1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 public: 37 // Following 2 enums are documented at: 38 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 39 enum class TrapHandlerAbi { 40 NONE = 0x00, 41 AMDHSA = 0x01, 42 }; 43 44 enum class TrapID { 45 LLVMAMDHSATrap = 0x02, 46 LLVMAMDHSADebugTrap = 0x03, 47 }; 48 49 private: 50 /// GlobalISel related APIs. 51 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 52 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 53 std::unique_ptr<InstructionSelector> InstSelector; 54 std::unique_ptr<LegalizerInfo> Legalizer; 55 std::unique_ptr<RegisterBankInfo> RegBankInfo; 56 57 protected: 58 // Basic subtarget description. 
59 Triple TargetTriple; 60 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 61 unsigned Gen = INVALID; 62 InstrItineraryData InstrItins; 63 int LDSBankCount = 0; 64 unsigned MaxPrivateElementSize = 0; 65 66 // Possibly statically set by tablegen, but may want to be overridden. 67 bool FastFMAF32 = false; 68 bool FastDenormalF32 = false; 69 bool HalfRate64Ops = false; 70 bool FullRate64Ops = false; 71 72 // Dynamically set bits that enable features. 73 bool FlatForGlobal = false; 74 bool AutoWaitcntBeforeBarrier = false; 75 bool UnalignedScratchAccess = false; 76 bool UnalignedAccessMode = false; 77 bool HasApertureRegs = false; 78 bool SupportsXNACK = false; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK = false; 83 84 bool EnableTgSplit = false; 85 bool EnableCuMode = false; 86 bool TrapHandler = false; 87 88 // Used as options. 89 bool EnableLoadStoreOpt = false; 90 bool EnableUnsafeDSOffsetFolding = false; 91 bool EnableSIScheduler = false; 92 bool EnableDS128 = false; 93 bool EnablePRTStrictNull = false; 94 bool DumpCode = false; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64 = false; 98 bool FMA = false; 99 bool MIMG_R128 = false; 100 bool CIInsts = false; 101 bool GFX8Insts = false; 102 bool GFX9Insts = false; 103 bool GFX90AInsts = false; 104 bool GFX940Insts = false; 105 bool GFX10Insts = false; 106 bool GFX11Insts = false; 107 bool GFX10_3Insts = false; 108 bool GFX7GFX8GFX9Insts = false; 109 bool SGPRInitBug = false; 110 bool UserSGPRInit16Bug = false; 111 bool NegativeScratchOffsetBug = false; 112 bool NegativeUnalignedScratchOffsetBug = false; 113 bool HasSMemRealTime = false; 114 bool HasIntClamp = false; 115 bool HasFmaMixInsts = false; 116 bool HasMovrel = false; 117 bool HasVGPRIndexMode = false; 118 bool HasScalarStores = false; 119 bool HasScalarAtomics = false; 120 bool HasSDWAOmod = false; 121 bool HasSDWAScalar = false; 122 bool HasSDWASdst = false; 123 bool 
HasSDWAMac = false; 124 bool HasSDWAOutModsVOPC = false; 125 bool HasDPP = false; 126 bool HasDPP8 = false; 127 bool Has64BitDPP = false; 128 bool HasPackedFP32Ops = false; 129 bool HasImageInsts = false; 130 bool HasExtendedImageInsts = false; 131 bool HasR128A16 = false; 132 bool HasGFX10A16 = false; 133 bool HasG16 = false; 134 bool HasNSAEncoding = false; 135 unsigned NSAMaxSize = 0; 136 bool GFX10_AEncoding = false; 137 bool GFX10_BEncoding = false; 138 bool HasDLInsts = false; 139 bool HasDot1Insts = false; 140 bool HasDot2Insts = false; 141 bool HasDot3Insts = false; 142 bool HasDot4Insts = false; 143 bool HasDot5Insts = false; 144 bool HasDot6Insts = false; 145 bool HasDot7Insts = false; 146 bool HasDot8Insts = false; 147 bool HasMAIInsts = false; 148 bool HasFP8Insts = false; 149 bool HasPkFmacF16Inst = false; 150 bool HasAtomicFaddRtnInsts = false; 151 bool HasAtomicFaddNoRtnInsts = false; 152 bool HasAtomicPkFaddNoRtnInsts = false; 153 bool SupportsSRAMECC = false; 154 155 // This should not be used directly. 'TargetID' tracks the dynamic settings 156 // for SRAMECC. 
bool EnableSRAMECC = false;

bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
bool ScalarizeGlobal = false;

// Hazard and hardware-bug flags.
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasVOPDInsts = false;

// Dummy feature to use for assembler in tablegen.
197 bool FeatureDisable = false; 198 199 SelectionDAGTargetInfo TSInfo; 200 private: 201 SIInstrInfo InstrInfo; 202 SITargetLowering TLInfo; 203 SIFrameLowering FrameLowering; 204 205 public: 206 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 207 const GCNTargetMachine &TM); 208 ~GCNSubtarget() override; 209 210 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 211 StringRef GPU, StringRef FS); 212 213 const SIInstrInfo *getInstrInfo() const override { 214 return &InstrInfo; 215 } 216 217 const SIFrameLowering *getFrameLowering() const override { 218 return &FrameLowering; 219 } 220 221 const SITargetLowering *getTargetLowering() const override { 222 return &TLInfo; 223 } 224 225 const SIRegisterInfo *getRegisterInfo() const override { 226 return &InstrInfo.getRegisterInfo(); 227 } 228 229 const CallLowering *getCallLowering() const override { 230 return CallLoweringInfo.get(); 231 } 232 233 const InlineAsmLowering *getInlineAsmLowering() const override { 234 return InlineAsmLoweringInfo.get(); 235 } 236 237 InstructionSelector *getInstructionSelector() const override { 238 return InstSelector.get(); 239 } 240 241 const LegalizerInfo *getLegalizerInfo() const override { 242 return Legalizer.get(); 243 } 244 245 const RegisterBankInfo *getRegBankInfo() const override { 246 return RegBankInfo.get(); 247 } 248 249 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 250 return TargetID; 251 } 252 253 // Nothing implemented, just prevent crashes on use. 254 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 255 return &TSInfo; 256 } 257 258 const InstrItineraryData *getInstrItineraryData() const override { 259 return &InstrItins; 260 } 261 262 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 263 264 Generation getGeneration() const { 265 return (Generation)Gen; 266 } 267 268 unsigned getMaxWaveScratchSize() const { 269 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 
270 if (getGeneration() < GFX11) { 271 // 13-bit field in units of 256-dword. 272 return (256 * 4) * ((1 << 13) - 1); 273 } 274 // 15-bit field in units of 64-dword. 275 return (64 * 4) * ((1 << 15) - 1); 276 } 277 278 /// Return the number of high bits known to be zero for a frame index. 279 unsigned getKnownHighZeroBitsForFrameIndex() const { 280 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 281 } 282 283 int getLDSBankCount() const { 284 return LDSBankCount; 285 } 286 287 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 288 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 289 } 290 291 unsigned getConstantBusLimit(unsigned Opcode) const; 292 293 /// Returns if the result of this instruction with a 16-bit result returned in 294 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 295 /// the original value. 296 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 297 298 bool hasIntClamp() const { 299 return HasIntClamp; 300 } 301 302 bool hasFP64() const { 303 return FP64; 304 } 305 306 bool hasMIMG_R128() const { 307 return MIMG_R128; 308 } 309 310 bool hasHWFP64() const { 311 return FP64; 312 } 313 314 bool hasFastFMAF32() const { 315 return FastFMAF32; 316 } 317 318 bool hasHalfRate64Ops() const { 319 return HalfRate64Ops; 320 } 321 322 bool hasFullRate64Ops() const { 323 return FullRate64Ops; 324 } 325 326 bool hasAddr64() const { 327 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 328 } 329 330 bool hasFlat() const { 331 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 332 } 333 334 // Return true if the target only has the reverse operand versions of VALU 335 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
336 bool hasOnlyRevVALUShifts() const { 337 return getGeneration() >= VOLCANIC_ISLANDS; 338 } 339 340 bool hasFractBug() const { 341 return getGeneration() == SOUTHERN_ISLANDS; 342 } 343 344 bool hasBFE() const { 345 return true; 346 } 347 348 bool hasBFI() const { 349 return true; 350 } 351 352 bool hasBFM() const { 353 return hasBFE(); 354 } 355 356 bool hasBCNT(unsigned Size) const { 357 return true; 358 } 359 360 bool hasFFBL() const { 361 return true; 362 } 363 364 bool hasFFBH() const { 365 return true; 366 } 367 368 bool hasMed3_16() const { 369 return getGeneration() >= AMDGPUSubtarget::GFX9; 370 } 371 372 bool hasMin3Max3_16() const { 373 return getGeneration() >= AMDGPUSubtarget::GFX9; 374 } 375 376 bool hasFmaMixInsts() const { 377 return HasFmaMixInsts; 378 } 379 380 bool hasCARRY() const { 381 return true; 382 } 383 384 bool hasFMA() const { 385 return FMA; 386 } 387 388 bool hasSwap() const { 389 return GFX9Insts; 390 } 391 392 bool hasScalarPackInsts() const { 393 return GFX9Insts; 394 } 395 396 bool hasScalarMulHiInsts() const { 397 return GFX9Insts; 398 } 399 400 TrapHandlerAbi getTrapHandlerAbi() const { 401 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 402 } 403 404 bool supportsGetDoorbellID() const { 405 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 406 return getGeneration() >= GFX9; 407 } 408 409 /// True if the offset field of DS instructions works as expected. On SI, the 410 /// offset uses a 16-bit adder and does not always wrap properly. 411 bool hasUsableDSOffset() const { 412 return getGeneration() >= SEA_ISLANDS; 413 } 414 415 bool unsafeDSOffsetFoldingEnabled() const { 416 return EnableUnsafeDSOffsetFolding; 417 } 418 419 /// Condition output from div_scale is usable. 420 bool hasUsableDivScaleConditionOutput() const { 421 return getGeneration() != SOUTHERN_ISLANDS; 422 } 423 424 /// Extra wait hazard is needed in some cases before 425 /// s_cbranch_vccnz/s_cbranch_vccz. 
426 bool hasReadVCCZBug() const { 427 return getGeneration() <= SEA_ISLANDS; 428 } 429 430 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 431 bool partialVCCWritesUpdateVCCZ() const { 432 return getGeneration() >= GFX10; 433 } 434 435 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 436 /// was written by a VALU instruction. 437 bool hasSMRDReadVALUDefHazard() const { 438 return getGeneration() == SOUTHERN_ISLANDS; 439 } 440 441 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 442 /// SGPR was written by a VALU Instruction. 443 bool hasVMEMReadSGPRVALUDefHazard() const { 444 return getGeneration() >= VOLCANIC_ISLANDS; 445 } 446 447 bool hasRFEHazards() const { 448 return getGeneration() >= VOLCANIC_ISLANDS; 449 } 450 451 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 452 unsigned getSetRegWaitStates() const { 453 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 454 } 455 456 bool dumpCode() const { 457 return DumpCode; 458 } 459 460 /// Return the amount of LDS that can be used that will not restrict the 461 /// occupancy lower than WaveCount. 462 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 463 const Function &) const; 464 465 bool supportsMinMaxDenormModes() const { 466 return getGeneration() >= AMDGPUSubtarget::GFX9; 467 } 468 469 /// \returns If target supports S_DENORM_MODE. 470 bool hasDenormModeInst() const { 471 return getGeneration() >= AMDGPUSubtarget::GFX10; 472 } 473 474 bool useFlatForGlobal() const { 475 return FlatForGlobal; 476 } 477 478 /// \returns If target supports ds_read/write_b128 and user enables generation 479 /// of ds_read/write_b128. 480 bool useDS128() const { 481 return CIInsts && EnableDS128; 482 } 483 484 /// \return If target supports ds_read/write_b96/128. 
485 bool hasDS96AndDS128() const { 486 return CIInsts; 487 } 488 489 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 490 bool haveRoundOpsF64() const { 491 return CIInsts; 492 } 493 494 /// \returns If MUBUF instructions always perform range checking, even for 495 /// buffer resources used for private memory access. 496 bool privateMemoryResourceIsRangeChecked() const { 497 return getGeneration() < AMDGPUSubtarget::GFX9; 498 } 499 500 /// \returns If target requires PRT Struct NULL support (zero result registers 501 /// for sparse texture support). 502 bool usePRTStrictNull() const { 503 return EnablePRTStrictNull; 504 } 505 506 bool hasAutoWaitcntBeforeBarrier() const { 507 return AutoWaitcntBeforeBarrier; 508 } 509 510 bool hasUnalignedBufferAccess() const { 511 return UnalignedBufferAccess; 512 } 513 514 bool hasUnalignedBufferAccessEnabled() const { 515 return UnalignedBufferAccess && UnalignedAccessMode; 516 } 517 518 bool hasUnalignedDSAccess() const { 519 return UnalignedDSAccess; 520 } 521 522 bool hasUnalignedDSAccessEnabled() const { 523 return UnalignedDSAccess && UnalignedAccessMode; 524 } 525 526 bool hasUnalignedScratchAccess() const { 527 return UnalignedScratchAccess; 528 } 529 530 bool hasUnalignedAccessMode() const { 531 return UnalignedAccessMode; 532 } 533 534 bool hasApertureRegs() const { 535 return HasApertureRegs; 536 } 537 538 bool isTrapHandlerEnabled() const { 539 return TrapHandler; 540 } 541 542 bool isXNACKEnabled() const { 543 return TargetID.isXnackOnOrAny(); 544 } 545 546 bool isTgSplitEnabled() const { 547 return EnableTgSplit; 548 } 549 550 bool isCuModeEnabled() const { 551 return EnableCuMode; 552 } 553 554 bool hasFlatAddressSpace() const { 555 return FlatAddressSpace; 556 } 557 558 bool hasFlatScrRegister() const { 559 return hasFlatAddressSpace(); 560 } 561 562 bool hasFlatInstOffsets() const { 563 return FlatInstOffsets; 564 } 565 566 bool hasFlatGlobalInsts() const { 567 return FlatGlobalInsts; 568 } 569 570 bool 
hasFlatScratchInsts() const { 571 return FlatScratchInsts; 572 } 573 574 // Check if target supports ST addressing mode with FLAT scratch instructions. 575 // The ST addressing mode means no registers are used, either VGPR or SGPR, 576 // but only immediate offset is swizzled and added to the FLAT scratch base. 577 bool hasFlatScratchSTMode() const { 578 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 579 } 580 581 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 582 583 bool hasScalarFlatScratchInsts() const { 584 return ScalarFlatScratchInsts; 585 } 586 587 bool enableFlatScratch() const { 588 return flatScratchIsArchitected() || 589 (EnableFlatScratch && hasFlatScratchInsts()); 590 } 591 592 bool hasGlobalAddTidInsts() const { 593 return GFX10_BEncoding; 594 } 595 596 bool hasAtomicCSub() const { 597 return GFX10_BEncoding; 598 } 599 600 bool hasMultiDwordFlatScratchAddressing() const { 601 return getGeneration() >= GFX9; 602 } 603 604 bool hasFlatSegmentOffsetBug() const { 605 return HasFlatSegmentOffsetBug; 606 } 607 608 bool hasFlatLgkmVMemCountInOrder() const { 609 return getGeneration() > GFX9; 610 } 611 612 bool hasD16LoadStore() const { 613 return getGeneration() >= GFX9; 614 } 615 616 bool d16PreservesUnusedBits() const { 617 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 618 } 619 620 bool hasD16Images() const { 621 return getGeneration() >= VOLCANIC_ISLANDS; 622 } 623 624 /// Return if most LDS instructions have an m0 use that require m0 to be 625 /// initialized. 626 bool ldsRequiresM0Init() const { 627 return getGeneration() < GFX9; 628 } 629 630 // True if the hardware rewinds and replays GWS operations if a wave is 631 // preempted. 632 // 633 // If this is false, a GWS operation requires testing if a nack set the 634 // MEM_VIOL bit, and repeating if so. 
635 bool hasGWSAutoReplay() const { 636 return getGeneration() >= GFX9; 637 } 638 639 /// \returns if target has ds_gws_sema_release_all instruction. 640 bool hasGWSSemaReleaseAll() const { 641 return CIInsts; 642 } 643 644 /// \returns true if the target has integer add/sub instructions that do not 645 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 646 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 647 /// for saturation. 648 bool hasAddNoCarry() const { 649 return AddNoCarryInsts; 650 } 651 652 bool hasUnpackedD16VMem() const { 653 return HasUnpackedD16VMem; 654 } 655 656 // Covers VS/PS/CS graphics shaders 657 bool isMesaGfxShader(const Function &F) const { 658 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 659 } 660 661 bool hasMad64_32() const { 662 return getGeneration() >= SEA_ISLANDS; 663 } 664 665 bool hasSDWAOmod() const { 666 return HasSDWAOmod; 667 } 668 669 bool hasSDWAScalar() const { 670 return HasSDWAScalar; 671 } 672 673 bool hasSDWASdst() const { 674 return HasSDWASdst; 675 } 676 677 bool hasSDWAMac() const { 678 return HasSDWAMac; 679 } 680 681 bool hasSDWAOutModsVOPC() const { 682 return HasSDWAOutModsVOPC; 683 } 684 685 bool hasDLInsts() const { 686 return HasDLInsts; 687 } 688 689 bool hasDot1Insts() const { 690 return HasDot1Insts; 691 } 692 693 bool hasDot2Insts() const { 694 return HasDot2Insts; 695 } 696 697 bool hasDot3Insts() const { 698 return HasDot3Insts; 699 } 700 701 bool hasDot4Insts() const { 702 return HasDot4Insts; 703 } 704 705 bool hasDot5Insts() const { 706 return HasDot5Insts; 707 } 708 709 bool hasDot6Insts() const { 710 return HasDot6Insts; 711 } 712 713 bool hasDot7Insts() const { 714 return HasDot7Insts; 715 } 716 717 bool hasDot8Insts() const { 718 return HasDot8Insts; 719 } 720 721 bool hasMAIInsts() const { 722 return HasMAIInsts; 723 } 724 725 bool hasFP8Insts() const { 726 return HasFP8Insts; 727 } 728 729 bool hasPkFmacF16Inst() const { 730 return 
HasPkFmacF16Inst; 731 } 732 733 bool hasAtomicFaddInsts() const { 734 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 735 } 736 737 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 738 739 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 740 741 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } 742 743 bool hasNoSdstCMPX() const { 744 return HasNoSdstCMPX; 745 } 746 747 bool hasVscnt() const { 748 return HasVscnt; 749 } 750 751 bool hasGetWaveIdInst() const { 752 return HasGetWaveIdInst; 753 } 754 755 bool hasSMemTimeInst() const { 756 return HasSMemTimeInst; 757 } 758 759 bool hasShaderCyclesRegister() const { 760 return HasShaderCyclesRegister; 761 } 762 763 bool hasVOP3Literal() const { 764 return HasVOP3Literal; 765 } 766 767 bool hasNoDataDepHazard() const { 768 return HasNoDataDepHazard; 769 } 770 771 bool vmemWriteNeedsExpWaitcnt() const { 772 return getGeneration() < SEA_ISLANDS; 773 } 774 775 // Scratch is allocated in 256 dword per wave blocks for the entire 776 // wavefront. When viewed from the perspective of an arbitrary workitem, this 777 // is 4-byte aligned. 778 // 779 // Only 4-byte alignment is really needed to access anything. Transformations 780 // on the pointer value itself may rely on the alignment / known low bits of 781 // the pointer. Set this to something above the minimum to avoid needing 782 // dynamic realignment in common cases. 
783 Align getStackAlignment() const { return Align(16); } 784 785 bool enableMachineScheduler() const override { 786 return true; 787 } 788 789 bool useAA() const override; 790 791 bool enableSubRegLiveness() const override { 792 return true; 793 } 794 795 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 796 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 797 798 // static wrappers 799 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 800 801 // XXX - Why is this here if it isn't in the default pass set? 802 bool enableEarlyIfConversion() const override { 803 return true; 804 } 805 806 void overrideSchedPolicy(MachineSchedPolicy &Policy, 807 unsigned NumRegionInstrs) const override; 808 809 unsigned getMaxNumUserSGPRs() const { 810 return 16; 811 } 812 813 bool hasSMemRealTime() const { 814 return HasSMemRealTime; 815 } 816 817 bool hasMovrel() const { 818 return HasMovrel; 819 } 820 821 bool hasVGPRIndexMode() const { 822 return HasVGPRIndexMode; 823 } 824 825 bool useVGPRIndexMode() const; 826 827 bool hasScalarCompareEq64() const { 828 return getGeneration() >= VOLCANIC_ISLANDS; 829 } 830 831 bool hasScalarStores() const { 832 return HasScalarStores; 833 } 834 835 bool hasScalarAtomics() const { 836 return HasScalarAtomics; 837 } 838 839 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 840 841 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 842 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 843 844 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
845 bool hasPermLane64() const { return getGeneration() >= GFX11; } 846 847 bool hasDPP() const { 848 return HasDPP; 849 } 850 851 bool hasDPPBroadcasts() const { 852 return HasDPP && getGeneration() < GFX10; 853 } 854 855 bool hasDPPWavefrontShifts() const { 856 return HasDPP && getGeneration() < GFX10; 857 } 858 859 bool hasDPP8() const { 860 return HasDPP8; 861 } 862 863 bool has64BitDPP() const { 864 return Has64BitDPP; 865 } 866 867 bool hasPackedFP32Ops() const { 868 return HasPackedFP32Ops; 869 } 870 871 bool hasFmaakFmamkF32Insts() const { 872 return getGeneration() >= GFX10 || hasGFX940Insts(); 873 } 874 875 bool hasImageInsts() const { 876 return HasImageInsts; 877 } 878 879 bool hasExtendedImageInsts() const { 880 return HasExtendedImageInsts; 881 } 882 883 bool hasR128A16() const { 884 return HasR128A16; 885 } 886 887 bool hasGFX10A16() const { 888 return HasGFX10A16; 889 } 890 891 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 892 893 bool hasG16() const { return HasG16; } 894 895 bool hasOffset3fBug() const { 896 return HasOffset3fBug; 897 } 898 899 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 900 901 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 902 903 bool hasNSAEncoding() const { return HasNSAEncoding; } 904 905 unsigned getNSAMaxSize() const { return NSAMaxSize; } 906 907 bool hasGFX10_AEncoding() const { 908 return GFX10_AEncoding; 909 } 910 911 bool hasGFX10_BEncoding() const { 912 return GFX10_BEncoding; 913 } 914 915 bool hasGFX10_3Insts() const { 916 return GFX10_3Insts; 917 } 918 919 bool hasMadF16() const; 920 921 bool hasMovB64() const { return GFX940Insts; } 922 923 bool hasLshlAddB64() const { return GFX940Insts; } 924 925 bool enableSIScheduler() const { 926 return EnableSIScheduler; 927 } 928 929 bool loadStoreOptEnabled() const { 930 return EnableLoadStoreOpt; 931 } 932 933 bool hasSGPRInitBug() const { 934 return SGPRInitBug; 935 } 936 937 bool hasUserSGPRInit16Bug() const { 
938 return UserSGPRInit16Bug && isWave32(); 939 } 940 941 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 942 943 bool hasNegativeUnalignedScratchOffsetBug() const { 944 return NegativeUnalignedScratchOffsetBug; 945 } 946 947 bool hasMFMAInlineLiteralBug() const { 948 return HasMFMAInlineLiteralBug; 949 } 950 951 bool has12DWordStoreHazard() const { 952 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 953 } 954 955 // \returns true if the subtarget supports DWORDX3 load/store instructions. 956 bool hasDwordx3LoadStores() const { 957 return CIInsts; 958 } 959 960 bool hasReadM0MovRelInterpHazard() const { 961 return getGeneration() == AMDGPUSubtarget::GFX9; 962 } 963 964 bool hasReadM0SendMsgHazard() const { 965 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 966 getGeneration() <= AMDGPUSubtarget::GFX9; 967 } 968 969 bool hasReadM0LdsDmaHazard() const { 970 return getGeneration() == AMDGPUSubtarget::GFX9; 971 } 972 973 bool hasReadM0LdsDirectHazard() const { 974 return getGeneration() == AMDGPUSubtarget::GFX9; 975 } 976 977 bool hasVcmpxPermlaneHazard() const { 978 return HasVcmpxPermlaneHazard; 979 } 980 981 bool hasVMEMtoScalarWriteHazard() const { 982 return HasVMEMtoScalarWriteHazard; 983 } 984 985 bool hasSMEMtoVectorWriteHazard() const { 986 return HasSMEMtoVectorWriteHazard; 987 } 988 989 bool hasLDSMisalignedBug() const { 990 return LDSMisalignedBug && !EnableCuMode; 991 } 992 993 bool hasInstFwdPrefetchBug() const { 994 return HasInstFwdPrefetchBug; 995 } 996 997 bool hasVcmpxExecWARHazard() const { 998 return HasVcmpxExecWARHazard; 999 } 1000 1001 bool hasLdsBranchVmemWARHazard() const { 1002 return HasLdsBranchVmemWARHazard; 1003 } 1004 1005 // Has one cycle hazard on transcendental instruction feeding a 1006 // non transcendental VALU. 
1007 bool hasTransForwardingHazard() const { return GFX940Insts; } 1008 1009 // Has one cycle hazard on a VALU instruction partially writing dst with 1010 // a shift of result bits feeding another VALU instruction. 1011 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1012 1013 // Cannot use op_sel with v_dot instructions. 1014 bool hasDOTOpSelHazard() const { return GFX940Insts; } 1015 1016 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1017 bool hasVDecCoExecHazard() const { 1018 return GFX940Insts; 1019 } 1020 1021 bool hasNSAtoVMEMBug() const { 1022 return HasNSAtoVMEMBug; 1023 } 1024 1025 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1026 1027 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1028 1029 bool hasGFX90AInsts() const { return GFX90AInsts; } 1030 1031 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1032 1033 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1034 1035 bool hasVALUPartialForwardingHazard() const { 1036 return getGeneration() >= GFX11; 1037 } 1038 1039 bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; } 1040 1041 /// Return if operations acting on VGPR tuples require even alignment. 1042 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1043 1044 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1045 bool hasSPackHL() const { return GFX11Insts; } 1046 1047 /// Return true if the target's EXP instruction has the COMPR flag, which 1048 /// affects the meaning of the EN (enable) bits. 1049 bool hasCompressedExport() const { return !GFX11Insts; } 1050 1051 /// Return true if the target's EXP instruction supports the NULL export 1052 /// target. 
1053 bool hasNullExportTarget() const { return !GFX11Insts; } 1054 1055 bool hasVOPDInsts() const { return HasVOPDInsts; } 1056 1057 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1058 1059 /// Return true if the target has the S_DELAY_ALU instruction. 1060 bool hasDelayAlu() const { return GFX11Insts; } 1061 1062 bool hasPackedTID() const { return HasPackedTID; } 1063 1064 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1065 // hasGFX90AInsts is also true. 1066 bool hasGFX940Insts() const { return GFX940Insts; } 1067 1068 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1069 /// SGPRs 1070 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1071 1072 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1073 /// VGPRs 1074 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1075 1076 /// Return occupancy for the given function. Used LDS and a number of 1077 /// registers if provided. 1078 /// Note, occupancy can be affected by the scratch allocation as well, but 1079 /// we do not have enough information to compute it. 1080 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1081 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1082 1083 /// \returns true if the flat_scratch register should be initialized with the 1084 /// pointer to the wave's scratch memory rather than a size and offset. 1085 bool flatScratchIsPointer() const { 1086 return getGeneration() >= AMDGPUSubtarget::GFX9; 1087 } 1088 1089 /// \returns true if the flat_scratch register is initialized by the HW. 1090 /// In this case it is readonly. 
1091 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1092 1093 /// \returns true if the machine has merged shaders in which s0-s7 are 1094 /// reserved by the hardware and user SGPRs start at s8 1095 bool hasMergedShaders() const { 1096 return getGeneration() >= GFX9; 1097 } 1098 1099 // \returns true if the target supports the pre-NGG legacy geometry path. 1100 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1101 1102 /// \returns SGPR allocation granularity supported by the subtarget. 1103 unsigned getSGPRAllocGranule() const { 1104 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1105 } 1106 1107 /// \returns SGPR encoding granularity supported by the subtarget. 1108 unsigned getSGPREncodingGranule() const { 1109 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1110 } 1111 1112 /// \returns Total number of SGPRs supported by the subtarget. 1113 unsigned getTotalNumSGPRs() const { 1114 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1115 } 1116 1117 /// \returns Addressable number of SGPRs supported by the subtarget. 1118 unsigned getAddressableNumSGPRs() const { 1119 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1120 } 1121 1122 /// \returns Minimum number of SGPRs that meets the given number of waves per 1123 /// execution unit requirement supported by the subtarget. 1124 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1125 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1126 } 1127 1128 /// \returns Maximum number of SGPRs that meets the given number of waves per 1129 /// execution unit requirement supported by the subtarget. 1130 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1131 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1132 } 1133 1134 /// \returns Reserved number of SGPRs. This is common 1135 /// utility function called by MachineFunction and 1136 /// Function variants of getReservedNumSGPRs. 
1137 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1138 /// \returns Reserved number of SGPRs for given machine function \p MF. 1139 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1140 1141 /// \returns Reserved number of SGPRs for given function \p F. 1142 unsigned getReservedNumSGPRs(const Function &F) const; 1143 1144 /// \returns max num SGPRs. This is the common utility 1145 /// function called by MachineFunction and Function 1146 /// variants of getMaxNumSGPRs. 1147 unsigned getBaseMaxNumSGPRs(const Function &F, 1148 std::pair<unsigned, unsigned> WavesPerEU, 1149 unsigned PreloadedSGPRs, 1150 unsigned ReservedNumSGPRs) const; 1151 1152 /// \returns Maximum number of SGPRs that meets number of waves per execution 1153 /// unit requirement for function \p MF, or number of SGPRs explicitly 1154 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1155 /// 1156 /// \returns Value that meets number of waves per execution unit requirement 1157 /// if explicitly requested value cannot be converted to integer, violates 1158 /// subtarget's specifications, or does not meet number of waves per execution 1159 /// unit requirement. 1160 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1161 1162 /// \returns Maximum number of SGPRs that meets number of waves per execution 1163 /// unit requirement for function \p F, or number of SGPRs explicitly 1164 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1165 /// 1166 /// \returns Value that meets number of waves per execution unit requirement 1167 /// if explicitly requested value cannot be converted to integer, violates 1168 /// subtarget's specifications, or does not meet number of waves per execution 1169 /// unit requirement. 1170 unsigned getMaxNumSGPRs(const Function &F) const; 1171 1172 /// \returns VGPR allocation granularity supported by the subtarget. 
1173 unsigned getVGPRAllocGranule() const { 1174 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1175 } 1176 1177 /// \returns VGPR encoding granularity supported by the subtarget. 1178 unsigned getVGPREncodingGranule() const { 1179 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1180 } 1181 1182 /// \returns Total number of VGPRs supported by the subtarget. 1183 unsigned getTotalNumVGPRs() const { 1184 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1185 } 1186 1187 /// \returns Addressable number of VGPRs supported by the subtarget. 1188 unsigned getAddressableNumVGPRs() const { 1189 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1190 } 1191 1192 /// \returns Minimum number of VGPRs that meets given number of waves per 1193 /// execution unit requirement supported by the subtarget. 1194 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1195 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1196 } 1197 1198 /// \returns Maximum number of VGPRs that meets given number of waves per 1199 /// execution unit requirement supported by the subtarget. 1200 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1201 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1202 } 1203 1204 /// \returns max num VGPRs. This is the common utility function 1205 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1206 unsigned getBaseMaxNumVGPRs(const Function &F, 1207 std::pair<unsigned, unsigned> WavesPerEU) const; 1208 /// \returns Maximum number of VGPRs that meets number of waves per execution 1209 /// unit requirement for function \p F, or number of VGPRs explicitly 1210 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1211 /// 1212 /// \returns Value that meets number of waves per execution unit requirement 1213 /// if explicitly requested value cannot be converted to integer, violates 1214 /// subtarget's specifications, or does not meet number of waves per execution 1215 /// unit requirement. 
1216 unsigned getMaxNumVGPRs(const Function &F) const; 1217 1218 unsigned getMaxNumAGPRs(const Function &F) const { 1219 return getMaxNumVGPRs(F); 1220 } 1221 1222 /// \returns Maximum number of VGPRs that meets number of waves per execution 1223 /// unit requirement for function \p MF, or number of VGPRs explicitly 1224 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1225 /// 1226 /// \returns Value that meets number of waves per execution unit requirement 1227 /// if explicitly requested value cannot be converted to integer, violates 1228 /// subtarget's specifications, or does not meet number of waves per execution 1229 /// unit requirement. 1230 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1231 1232 void getPostRAMutations( 1233 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1234 const override; 1235 1236 std::unique_ptr<ScheduleDAGMutation> 1237 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1238 1239 bool isWave32() const { 1240 return getWavefrontSize() == 32; 1241 } 1242 1243 bool isWave64() const { 1244 return getWavefrontSize() == 64; 1245 } 1246 1247 const TargetRegisterClass *getBoolRC() const { 1248 return getRegisterInfo()->getBoolRC(); 1249 } 1250 1251 /// \returns Maximum number of work groups per compute unit supported by the 1252 /// subtarget and limited by given \p FlatWorkGroupSize. 1253 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1254 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1255 } 1256 1257 /// \returns Minimum flat work group size supported by the subtarget. 1258 unsigned getMinFlatWorkGroupSize() const override { 1259 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1260 } 1261 1262 /// \returns Maximum flat work group size supported by the subtarget. 
1263 unsigned getMaxFlatWorkGroupSize() const override { 1264 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1265 } 1266 1267 /// \returns Number of waves per execution unit required to support the given 1268 /// \p FlatWorkGroupSize. 1269 unsigned 1270 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1271 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1272 } 1273 1274 /// \returns Minimum number of waves per execution unit supported by the 1275 /// subtarget. 1276 unsigned getMinWavesPerEU() const override { 1277 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1278 } 1279 1280 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1281 SDep &Dep) const override; 1282 1283 // \returns true if it's beneficial on this subtarget for the scheduler to 1284 // cluster stores as well as loads. 1285 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1286 }; 1287 1288 } // end namespace llvm 1289 1290 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1291