1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 public: 37 // Following 2 enums are documented at: 38 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 39 enum class TrapHandlerAbi { 40 NONE = 0x00, 41 AMDHSA = 0x01, 42 }; 43 44 enum class TrapID { 45 LLVMAMDHSATrap = 0x02, 46 LLVMAMDHSADebugTrap = 0x03, 47 }; 48 49 private: 50 /// GlobalISel related APIs. 51 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 52 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 53 std::unique_ptr<InstructionSelector> InstSelector; 54 std::unique_ptr<LegalizerInfo> Legalizer; 55 std::unique_ptr<RegisterBankInfo> RegBankInfo; 56 57 protected: 58 // Basic subtarget description. 59 Triple TargetTriple; 60 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 61 unsigned Gen; 62 InstrItineraryData InstrItins; 63 int LDSBankCount; 64 unsigned MaxPrivateElementSize; 65 66 // Possibly statically set by tablegen, but may want to be overridden. 67 bool FastFMAF32; 68 bool FastDenormalF32; 69 bool HalfRate64Ops; 70 bool FullRate64Ops; 71 72 // Dynamically set bits that enable features. 73 bool FlatForGlobal; 74 bool AutoWaitcntBeforeBarrier; 75 bool UnalignedScratchAccess; 76 bool UnalignedAccessMode; 77 bool HasApertureRegs; 78 bool SupportsXNACK; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK; 83 84 bool EnableTgSplit; 85 bool EnableCuMode; 86 bool TrapHandler; 87 88 // Used as options. 89 bool EnableLoadStoreOpt; 90 bool EnableUnsafeDSOffsetFolding; 91 bool EnableSIScheduler; 92 bool EnableDS128; 93 bool EnablePRTStrictNull; 94 bool DumpCode; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64; 98 bool FMA; 99 bool MIMG_R128; 100 bool CIInsts; 101 bool GFX8Insts; 102 bool GFX9Insts; 103 bool GFX90AInsts; 104 bool GFX10Insts; 105 bool GFX10_3Insts; 106 bool GFX7GFX8GFX9Insts; 107 bool SGPRInitBug; 108 bool NegativeScratchOffsetBug; 109 bool NegativeUnalignedScratchOffsetBug; 110 bool HasSMemRealTime; 111 bool HasIntClamp; 112 bool HasFmaMixInsts; 113 bool HasMovrel; 114 bool HasVGPRIndexMode; 115 bool HasScalarStores; 116 bool HasScalarAtomics; 117 bool HasSDWAOmod; 118 bool HasSDWAScalar; 119 bool HasSDWASdst; 120 bool HasSDWAMac; 121 bool HasSDWAOutModsVOPC; 122 bool HasDPP; 123 bool HasDPP8; 124 bool Has64BitDPP; 125 bool HasPackedFP32Ops; 126 bool HasExtendedImageInsts; 127 bool HasR128A16; 128 bool HasGFX10A16; 129 bool HasG16; 130 bool HasNSAEncoding; 131 unsigned NSAMaxSize; 132 bool GFX10_AEncoding; 133 bool GFX10_BEncoding; 134 bool HasDLInsts; 135 bool HasDot1Insts; 136 bool HasDot2Insts; 137 bool HasDot3Insts; 138 bool HasDot4Insts; 139 bool HasDot5Insts; 140 bool HasDot6Insts; 141 bool HasDot7Insts; 142 bool HasMAIInsts; 143 bool HasPkFmacF16Inst; 144 bool HasAtomicFaddInsts; 145 bool SupportsSRAMECC; 146 147 // This should not be used directly. 'TargetID' tracks the dynamic settings 148 // for SRAMECC. 149 bool EnableSRAMECC; 150 151 bool HasNoSdstCMPX; 152 bool HasVscnt; 153 bool HasGetWaveIdInst; 154 bool HasSMemTimeInst; 155 bool HasShaderCyclesRegister; 156 bool HasRegisterBanking; 157 bool HasVOP3Literal; 158 bool HasNoDataDepHazard; 159 bool FlatAddressSpace; 160 bool FlatInstOffsets; 161 bool FlatGlobalInsts; 162 bool FlatScratchInsts; 163 bool ScalarFlatScratchInsts; 164 bool HasArchitectedFlatScratch; 165 bool AddNoCarryInsts; 166 bool HasUnpackedD16VMem; 167 bool LDSMisalignedBug; 168 bool HasMFMAInlineLiteralBug; 169 bool UnalignedBufferAccess; 170 bool UnalignedDSAccess; 171 bool HasPackedTID; 172 bool ScalarizeGlobal; 173 174 bool HasVcmpxPermlaneHazard; 175 bool HasVMEMtoScalarWriteHazard; 176 bool HasSMEMtoVectorWriteHazard; 177 bool HasInstFwdPrefetchBug; 178 bool HasVcmpxExecWARHazard; 179 bool HasLdsBranchVmemWARHazard; 180 bool HasNSAtoVMEMBug; 181 bool HasNSAClauseBug; 182 bool HasOffset3fBug; 183 bool HasFlatSegmentOffsetBug; 184 bool HasImageStoreD16Bug; 185 bool HasImageGather4D16Bug; 186 187 // Dummy feature to use for assembler in tablegen. 188 bool FeatureDisable; 189 190 SelectionDAGTargetInfo TSInfo; 191 private: 192 SIInstrInfo InstrInfo; 193 SITargetLowering TLInfo; 194 SIFrameLowering FrameLowering; 195 196 public: 197 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 198 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 199 200 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 201 const GCNTargetMachine &TM); 202 ~GCNSubtarget() override; 203 204 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 205 StringRef GPU, StringRef FS); 206 207 const SIInstrInfo *getInstrInfo() const override { 208 return &InstrInfo; 209 } 210 211 const SIFrameLowering *getFrameLowering() const override { 212 return &FrameLowering; 213 } 214 215 const SITargetLowering *getTargetLowering() const override { 216 return &TLInfo; 217 } 218 219 const SIRegisterInfo *getRegisterInfo() const override { 220 return &InstrInfo.getRegisterInfo(); 221 } 222 223 const CallLowering *getCallLowering() const override { 224 return CallLoweringInfo.get(); 225 } 226 227 const InlineAsmLowering *getInlineAsmLowering() const override { 228 return InlineAsmLoweringInfo.get(); 229 } 230 231 InstructionSelector *getInstructionSelector() const override { 232 return InstSelector.get(); 233 } 234 235 const LegalizerInfo *getLegalizerInfo() const override { 236 return Legalizer.get(); 237 } 238 239 const RegisterBankInfo *getRegBankInfo() const override { 240 return RegBankInfo.get(); 241 } 242 243 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 244 return TargetID; 245 } 246 247 // Nothing implemented, just prevent crashes on use. 248 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 249 return &TSInfo; 250 } 251 252 const InstrItineraryData *getInstrItineraryData() const override { 253 return &InstrItins; 254 } 255 256 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 257 258 Generation getGeneration() const { 259 return (Generation)Gen; 260 } 261 262 /// Return the number of high bits known to be zero for a frame index. 263 unsigned getKnownHighZeroBitsForFrameIndex() const { 264 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 265 } 266 267 int getLDSBankCount() const { 268 return LDSBankCount; 269 } 270 271 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 272 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 273 } 274 275 unsigned getConstantBusLimit(unsigned Opcode) const; 276 277 /// Returns if the result of this instruction with a 16-bit result returned in 278 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 279 /// the original value. 280 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 281 282 bool hasIntClamp() const { 283 return HasIntClamp; 284 } 285 286 bool hasFP64() const { 287 return FP64; 288 } 289 290 bool hasMIMG_R128() const { 291 return MIMG_R128; 292 } 293 294 bool hasHWFP64() const { 295 return FP64; 296 } 297 298 bool hasFastFMAF32() const { 299 return FastFMAF32; 300 } 301 302 bool hasHalfRate64Ops() const { 303 return HalfRate64Ops; 304 } 305 306 bool hasFullRate64Ops() const { 307 return FullRate64Ops; 308 } 309 310 bool hasAddr64() const { 311 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 312 } 313 314 bool hasFlat() const { 315 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 316 } 317 318 // Return true if the target only has the reverse operand versions of VALU 319 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 320 bool hasOnlyRevVALUShifts() const { 321 return getGeneration() >= VOLCANIC_ISLANDS; 322 } 323 324 bool hasFractBug() const { 325 return getGeneration() == SOUTHERN_ISLANDS; 326 } 327 328 bool hasBFE() const { 329 return true; 330 } 331 332 bool hasBFI() const { 333 return true; 334 } 335 336 bool hasBFM() const { 337 return hasBFE(); 338 } 339 340 bool hasBCNT(unsigned Size) const { 341 return true; 342 } 343 344 bool hasFFBL() const { 345 return true; 346 } 347 348 bool hasFFBH() const { 349 return true; 350 } 351 352 bool hasMed3_16() const { 353 return getGeneration() >= AMDGPUSubtarget::GFX9; 354 } 355 356 bool hasMin3Max3_16() const { 357 return getGeneration() >= AMDGPUSubtarget::GFX9; 358 } 359 360 bool hasFmaMixInsts() const { 361 return HasFmaMixInsts; 362 } 363 364 bool hasCARRY() const { 365 return true; 366 } 367 368 bool hasFMA() const { 369 return FMA; 370 } 371 372 bool hasSwap() const { 373 return GFX9Insts; 374 } 375 376 bool hasScalarPackInsts() const { 377 return GFX9Insts; 378 } 379 380 bool hasScalarMulHiInsts() const { 381 return GFX9Insts; 382 } 383 384 TrapHandlerAbi getTrapHandlerAbi() const { 385 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 386 } 387 388 bool supportsGetDoorbellID() const { 389 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 390 return getGeneration() >= GFX9; 391 } 392 393 /// True if the offset field of DS instructions works as expected. On SI, the 394 /// offset uses a 16-bit adder and does not always wrap properly. 395 bool hasUsableDSOffset() const { 396 return getGeneration() >= SEA_ISLANDS; 397 } 398 399 bool unsafeDSOffsetFoldingEnabled() const { 400 return EnableUnsafeDSOffsetFolding; 401 } 402 403 /// Condition output from div_scale is usable. 404 bool hasUsableDivScaleConditionOutput() const { 405 return getGeneration() != SOUTHERN_ISLANDS; 406 } 407 408 /// Extra wait hazard is needed in some cases before 409 /// s_cbranch_vccnz/s_cbranch_vccz. 410 bool hasReadVCCZBug() const { 411 return getGeneration() <= SEA_ISLANDS; 412 } 413 414 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 415 bool partialVCCWritesUpdateVCCZ() const { 416 return getGeneration() >= GFX10; 417 } 418 419 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 420 /// was written by a VALU instruction. 421 bool hasSMRDReadVALUDefHazard() const { 422 return getGeneration() == SOUTHERN_ISLANDS; 423 } 424 425 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 426 /// SGPR was written by a VALU Instruction. 427 bool hasVMEMReadSGPRVALUDefHazard() const { 428 return getGeneration() >= VOLCANIC_ISLANDS; 429 } 430 431 bool hasRFEHazards() const { 432 return getGeneration() >= VOLCANIC_ISLANDS; 433 } 434 435 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 436 unsigned getSetRegWaitStates() const { 437 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 438 } 439 440 bool dumpCode() const { 441 return DumpCode; 442 } 443 444 /// Return the amount of LDS that can be used that will not restrict the 445 /// occupancy lower than WaveCount. 446 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 447 const Function &) const; 448 449 bool supportsMinMaxDenormModes() const { 450 return getGeneration() >= AMDGPUSubtarget::GFX9; 451 } 452 453 /// \returns If target supports S_DENORM_MODE. 454 bool hasDenormModeInst() const { 455 return getGeneration() >= AMDGPUSubtarget::GFX10; 456 } 457 458 bool useFlatForGlobal() const { 459 return FlatForGlobal; 460 } 461 462 /// \returns If target supports ds_read/write_b128 and user enables generation 463 /// of ds_read/write_b128. 464 bool useDS128() const { 465 return CIInsts && EnableDS128; 466 } 467 468 /// \return If target supports ds_read/write_b96/128. 469 bool hasDS96AndDS128() const { 470 return CIInsts; 471 } 472 473 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 474 bool haveRoundOpsF64() const { 475 return CIInsts; 476 } 477 478 /// \returns If MUBUF instructions always perform range checking, even for 479 /// buffer resources used for private memory access. 480 bool privateMemoryResourceIsRangeChecked() const { 481 return getGeneration() < AMDGPUSubtarget::GFX9; 482 } 483 484 /// \returns If target requires PRT Struct NULL support (zero result registers 485 /// for sparse texture support). 486 bool usePRTStrictNull() const { 487 return EnablePRTStrictNull; 488 } 489 490 bool hasAutoWaitcntBeforeBarrier() const { 491 return AutoWaitcntBeforeBarrier; 492 } 493 494 bool hasUnalignedBufferAccess() const { 495 return UnalignedBufferAccess; 496 } 497 498 bool hasUnalignedBufferAccessEnabled() const { 499 return UnalignedBufferAccess && UnalignedAccessMode; 500 } 501 502 bool hasUnalignedDSAccess() const { 503 return UnalignedDSAccess; 504 } 505 506 bool hasUnalignedDSAccessEnabled() const { 507 return UnalignedDSAccess && UnalignedAccessMode; 508 } 509 510 bool hasUnalignedScratchAccess() const { 511 return UnalignedScratchAccess; 512 } 513 514 bool hasUnalignedAccessMode() const { 515 return UnalignedAccessMode; 516 } 517 518 bool hasApertureRegs() const { 519 return HasApertureRegs; 520 } 521 522 bool isTrapHandlerEnabled() const { 523 return TrapHandler; 524 } 525 526 bool isXNACKEnabled() const { 527 return TargetID.isXnackOnOrAny(); 528 } 529 530 bool isTgSplitEnabled() const { 531 return EnableTgSplit; 532 } 533 534 bool isCuModeEnabled() const { 535 return EnableCuMode; 536 } 537 538 bool hasFlatAddressSpace() const { 539 return FlatAddressSpace; 540 } 541 542 bool hasFlatScrRegister() const { 543 return hasFlatAddressSpace(); 544 } 545 546 bool hasFlatInstOffsets() const { 547 return FlatInstOffsets; 548 } 549 550 bool hasFlatGlobalInsts() const { 551 return FlatGlobalInsts; 552 } 553 554 bool hasFlatScratchInsts() const { 555 return FlatScratchInsts; 556 } 557 558 // Check if target supports ST addressing mode with FLAT scratch instructions. 559 // The ST addressing mode means no registers are used, either VGPR or SGPR, 560 // but only immediate offset is swizzled and added to the FLAT scratch base. 561 bool hasFlatScratchSTMode() const { 562 return hasFlatScratchInsts() && hasGFX10_3Insts(); 563 } 564 565 bool hasScalarFlatScratchInsts() const { 566 return ScalarFlatScratchInsts; 567 } 568 569 bool hasGlobalAddTidInsts() const { 570 return GFX10_BEncoding; 571 } 572 573 bool hasAtomicCSub() const { 574 return GFX10_BEncoding; 575 } 576 577 bool hasMultiDwordFlatScratchAddressing() const { 578 return getGeneration() >= GFX9; 579 } 580 581 bool hasFlatSegmentOffsetBug() const { 582 return HasFlatSegmentOffsetBug; 583 } 584 585 bool hasFlatLgkmVMemCountInOrder() const { 586 return getGeneration() > GFX9; 587 } 588 589 bool hasD16LoadStore() const { 590 return getGeneration() >= GFX9; 591 } 592 593 bool d16PreservesUnusedBits() const { 594 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 595 } 596 597 bool hasD16Images() const { 598 return getGeneration() >= VOLCANIC_ISLANDS; 599 } 600 601 /// Return if most LDS instructions have an m0 use that require m0 to be 602 /// initialized. 603 bool ldsRequiresM0Init() const { 604 return getGeneration() < GFX9; 605 } 606 607 // True if the hardware rewinds and replays GWS operations if a wave is 608 // preempted. 609 // 610 // If this is false, a GWS operation requires testing if a nack set the 611 // MEM_VIOL bit, and repeating if so. 612 bool hasGWSAutoReplay() const { 613 return getGeneration() >= GFX9; 614 } 615 616 /// \returns if target has ds_gws_sema_release_all instruction. 617 bool hasGWSSemaReleaseAll() const { 618 return CIInsts; 619 } 620 621 /// \returns true if the target has integer add/sub instructions that do not 622 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 623 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 624 /// for saturation. 625 bool hasAddNoCarry() const { 626 return AddNoCarryInsts; 627 } 628 629 bool hasUnpackedD16VMem() const { 630 return HasUnpackedD16VMem; 631 } 632 633 // Covers VS/PS/CS graphics shaders 634 bool isMesaGfxShader(const Function &F) const { 635 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 636 } 637 638 bool hasMad64_32() const { 639 return getGeneration() >= SEA_ISLANDS; 640 } 641 642 bool hasSDWAOmod() const { 643 return HasSDWAOmod; 644 } 645 646 bool hasSDWAScalar() const { 647 return HasSDWAScalar; 648 } 649 650 bool hasSDWASdst() const { 651 return HasSDWASdst; 652 } 653 654 bool hasSDWAMac() const { 655 return HasSDWAMac; 656 } 657 658 bool hasSDWAOutModsVOPC() const { 659 return HasSDWAOutModsVOPC; 660 } 661 662 bool hasDLInsts() const { 663 return HasDLInsts; 664 } 665 666 bool hasDot1Insts() const { 667 return HasDot1Insts; 668 } 669 670 bool hasDot2Insts() const { 671 return HasDot2Insts; 672 } 673 674 bool hasDot3Insts() const { 675 return HasDot3Insts; 676 } 677 678 bool hasDot4Insts() const { 679 return HasDot4Insts; 680 } 681 682 bool hasDot5Insts() const { 683 return HasDot5Insts; 684 } 685 686 bool hasDot6Insts() const { 687 return HasDot6Insts; 688 } 689 690 bool hasDot7Insts() const { 691 return HasDot7Insts; 692 } 693 694 bool hasMAIInsts() const { 695 return HasMAIInsts; 696 } 697 698 bool hasPkFmacF16Inst() const { 699 return HasPkFmacF16Inst; 700 } 701 702 bool hasAtomicFaddInsts() const { 703 return HasAtomicFaddInsts; 704 } 705 706 bool hasNoSdstCMPX() const { 707 return HasNoSdstCMPX; 708 } 709 710 bool hasVscnt() const { 711 return HasVscnt; 712 } 713 714 bool hasGetWaveIdInst() const { 715 return HasGetWaveIdInst; 716 } 717 718 bool hasSMemTimeInst() const { 719 return HasSMemTimeInst; 720 } 721 722 bool hasShaderCyclesRegister() const { 723 return HasShaderCyclesRegister; 724 } 725 726 bool hasRegisterBanking() const { 727 return HasRegisterBanking; 728 } 729 730 bool hasVOP3Literal() const { 731 return HasVOP3Literal; 732 } 733 734 bool hasNoDataDepHazard() const { 735 return HasNoDataDepHazard; 736 } 737 738 bool vmemWriteNeedsExpWaitcnt() const { 739 return getGeneration() < SEA_ISLANDS; 740 } 741 742 // Scratch is allocated in 256 dword per wave blocks for the entire 743 // wavefront. When viewed from the perspective of an arbitrary workitem, this 744 // is 4-byte aligned. 745 // 746 // Only 4-byte alignment is really needed to access anything. Transformations 747 // on the pointer value itself may rely on the alignment / known low bits of 748 // the pointer. Set this to something above the minimum to avoid needing 749 // dynamic realignment in common cases. 750 Align getStackAlignment() const { return Align(16); } 751 752 bool enableMachineScheduler() const override { 753 return true; 754 } 755 756 bool useAA() const override; 757 758 bool enableSubRegLiveness() const override { 759 return true; 760 } 761 762 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 763 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 764 765 // static wrappers 766 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 767 768 // XXX - Why is this here if it isn't in the default pass set? 769 bool enableEarlyIfConversion() const override { 770 return true; 771 } 772 773 bool enableFlatScratch() const; 774 775 void overrideSchedPolicy(MachineSchedPolicy &Policy, 776 unsigned NumRegionInstrs) const override; 777 778 unsigned getMaxNumUserSGPRs() const { 779 return 16; 780 } 781 782 bool hasSMemRealTime() const { 783 return HasSMemRealTime; 784 } 785 786 bool hasMovrel() const { 787 return HasMovrel; 788 } 789 790 bool hasVGPRIndexMode() const { 791 return HasVGPRIndexMode; 792 } 793 794 bool useVGPRIndexMode() const; 795 796 bool hasScalarCompareEq64() const { 797 return getGeneration() >= VOLCANIC_ISLANDS; 798 } 799 800 bool hasScalarStores() const { 801 return HasScalarStores; 802 } 803 804 bool hasScalarAtomics() const { 805 return HasScalarAtomics; 806 } 807 808 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 809 810 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 811 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 812 813 bool hasDPP() const { 814 return HasDPP; 815 } 816 817 bool hasDPPBroadcasts() const { 818 return HasDPP && getGeneration() < GFX10; 819 } 820 821 bool hasDPPWavefrontShifts() const { 822 return HasDPP && getGeneration() < GFX10; 823 } 824 825 bool hasDPP8() const { 826 return HasDPP8; 827 } 828 829 bool has64BitDPP() const { 830 return Has64BitDPP; 831 } 832 833 bool hasPackedFP32Ops() const { 834 return HasPackedFP32Ops; 835 } 836 837 bool hasFmaakFmamkF32Insts() const { 838 return getGeneration() >= GFX10; 839 } 840 841 bool hasExtendedImageInsts() const { 842 return HasExtendedImageInsts; 843 } 844 845 bool hasR128A16() const { 846 return HasR128A16; 847 } 848 849 bool hasGFX10A16() const { 850 return HasGFX10A16; 851 } 852 853 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 854 855 bool hasG16() const { return HasG16; } 856 857 bool hasOffset3fBug() const { 858 return HasOffset3fBug; 859 } 860 861 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 862 863 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 864 865 bool hasNSAEncoding() const { return HasNSAEncoding; } 866 867 unsigned getNSAMaxSize() const { return NSAMaxSize; } 868 869 bool hasGFX10_AEncoding() const { 870 return GFX10_AEncoding; 871 } 872 873 bool hasGFX10_BEncoding() const { 874 return GFX10_BEncoding; 875 } 876 877 bool hasGFX10_3Insts() const { 878 return GFX10_3Insts; 879 } 880 881 bool hasMadF16() const; 882 883 bool enableSIScheduler() const { 884 return EnableSIScheduler; 885 } 886 887 bool loadStoreOptEnabled() const { 888 return EnableLoadStoreOpt; 889 } 890 891 bool hasSGPRInitBug() const { 892 return SGPRInitBug; 893 } 894 895 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 896 897 bool hasNegativeUnalignedScratchOffsetBug() const { 898 return NegativeUnalignedScratchOffsetBug; 899 } 900 901 bool hasMFMAInlineLiteralBug() const { 902 return HasMFMAInlineLiteralBug; 903 } 904 905 bool has12DWordStoreHazard() const { 906 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 907 } 908 909 // \returns true if the subtarget supports DWORDX3 load/store instructions. 910 bool hasDwordx3LoadStores() const { 911 return CIInsts; 912 } 913 914 bool hasReadM0MovRelInterpHazard() const { 915 return getGeneration() == AMDGPUSubtarget::GFX9; 916 } 917 918 bool hasReadM0SendMsgHazard() const { 919 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 920 getGeneration() <= AMDGPUSubtarget::GFX9; 921 } 922 923 bool hasVcmpxPermlaneHazard() const { 924 return HasVcmpxPermlaneHazard; 925 } 926 927 bool hasVMEMtoScalarWriteHazard() const { 928 return HasVMEMtoScalarWriteHazard; 929 } 930 931 bool hasSMEMtoVectorWriteHazard() const { 932 return HasSMEMtoVectorWriteHazard; 933 } 934 935 bool hasLDSMisalignedBug() const { 936 return LDSMisalignedBug && !EnableCuMode; 937 } 938 939 bool hasInstFwdPrefetchBug() const { 940 return HasInstFwdPrefetchBug; 941 } 942 943 bool hasVcmpxExecWARHazard() const { 944 return HasVcmpxExecWARHazard; 945 } 946 947 bool hasLdsBranchVmemWARHazard() const { 948 return HasLdsBranchVmemWARHazard; 949 } 950 951 bool hasNSAtoVMEMBug() const { 952 return HasNSAtoVMEMBug; 953 } 954 955 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 956 957 bool hasHardClauses() const { return getGeneration() >= GFX10; } 958 959 bool hasGFX90AInsts() const { return GFX90AInsts; } 960 961 /// Return if operations acting on VGPR tuples require even alignment. 962 bool needsAlignedVGPRs() const { return GFX90AInsts; } 963 964 bool hasPackedTID() const { return HasPackedTID; } 965 966 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 967 /// SGPRs 968 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 969 970 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 971 /// VGPRs 972 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 973 974 /// Return occupancy for the given function. Used LDS and a number of 975 /// registers if provided. 976 /// Note, occupancy can be affected by the scratch allocation as well, but 977 /// we do not have enough information to compute it. 978 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 979 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 980 981 /// \returns true if the flat_scratch register should be initialized with the 982 /// pointer to the wave's scratch memory rather than a size and offset. 983 bool flatScratchIsPointer() const { 984 return getGeneration() >= AMDGPUSubtarget::GFX9; 985 } 986 987 /// \returns true if the flat_scratch register is initialized by the HW. 988 /// In this case it is readonly. 989 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 990 991 /// \returns true if the machine has merged shaders in which s0-s7 are 992 /// reserved by the hardware and user SGPRs start at s8 993 bool hasMergedShaders() const { 994 return getGeneration() >= GFX9; 995 } 996 997 /// \returns SGPR allocation granularity supported by the subtarget. 998 unsigned getSGPRAllocGranule() const { 999 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1000 } 1001 1002 /// \returns SGPR encoding granularity supported by the subtarget. 1003 unsigned getSGPREncodingGranule() const { 1004 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1005 } 1006 1007 /// \returns Total number of SGPRs supported by the subtarget. 1008 unsigned getTotalNumSGPRs() const { 1009 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1010 } 1011 1012 /// \returns Addressable number of SGPRs supported by the subtarget. 1013 unsigned getAddressableNumSGPRs() const { 1014 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1015 } 1016 1017 /// \returns Minimum number of SGPRs that meets the given number of waves per 1018 /// execution unit requirement supported by the subtarget. 1019 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1020 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1021 } 1022 1023 /// \returns Maximum number of SGPRs that meets the given number of waves per 1024 /// execution unit requirement supported by the subtarget. 1025 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1026 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1027 } 1028 1029 /// \returns Reserved number of SGPRs. This is common 1030 /// utility function called by MachineFunction and 1031 /// Function variants of getReservedNumSGPRs. 1032 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; 1033 /// \returns Reserved number of SGPRs for given machine function \p MF. 1034 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1035 1036 /// \returns Reserved number of SGPRs for given function \p F. 1037 unsigned getReservedNumSGPRs(const Function &F) const; 1038 1039 /// \returns max num SGPRs. This is the common utility 1040 /// function called by MachineFunction and Function 1041 /// variants of getMaxNumSGPRs. 1042 unsigned getBaseMaxNumSGPRs(const Function &F, 1043 std::pair<unsigned, unsigned> WavesPerEU, 1044 unsigned PreloadedSGPRs, 1045 unsigned ReservedNumSGPRs) const; 1046 1047 /// \returns Maximum number of SGPRs that meets number of waves per execution 1048 /// unit requirement for function \p MF, or number of SGPRs explicitly 1049 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1050 /// 1051 /// \returns Value that meets number of waves per execution unit requirement 1052 /// if explicitly requested value cannot be converted to integer, violates 1053 /// subtarget's specifications, or does not meet number of waves per execution 1054 /// unit requirement. 1055 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1056 1057 /// \returns Maximum number of SGPRs that meets number of waves per execution 1058 /// unit requirement for function \p F, or number of SGPRs explicitly 1059 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1060 /// 1061 /// \returns Value that meets number of waves per execution unit requirement 1062 /// if explicitly requested value cannot be converted to integer, violates 1063 /// subtarget's specifications, or does not meet number of waves per execution 1064 /// unit requirement. 1065 unsigned getMaxNumSGPRs(const Function &F) const; 1066 1067 /// \returns VGPR allocation granularity supported by the subtarget. 1068 unsigned getVGPRAllocGranule() const { 1069 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1070 } 1071 1072 /// \returns VGPR encoding granularity supported by the subtarget. 1073 unsigned getVGPREncodingGranule() const { 1074 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1075 } 1076 1077 /// \returns Total number of VGPRs supported by the subtarget. 1078 unsigned getTotalNumVGPRs() const { 1079 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1080 } 1081 1082 /// \returns Addressable number of VGPRs supported by the subtarget. 1083 unsigned getAddressableNumVGPRs() const { 1084 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1085 } 1086 1087 /// \returns Minimum number of VGPRs that meets given number of waves per 1088 /// execution unit requirement supported by the subtarget. 1089 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1090 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1091 } 1092 1093 /// \returns Maximum number of VGPRs that meets given number of waves per 1094 /// execution unit requirement supported by the subtarget. 1095 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1096 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1097 } 1098 1099 /// \returns max num VGPRs. This is the common utility function 1100 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1101 unsigned getBaseMaxNumVGPRs(const Function &F, 1102 std::pair<unsigned, unsigned> WavesPerEU) const; 1103 /// \returns Maximum number of VGPRs that meets number of waves per execution 1104 /// unit requirement for function \p F, or number of VGPRs explicitly 1105 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1106 /// 1107 /// \returns Value that meets number of waves per execution unit requirement 1108 /// if explicitly requested value cannot be converted to integer, violates 1109 /// subtarget's specifications, or does not meet number of waves per execution 1110 /// unit requirement. 1111 unsigned getMaxNumVGPRs(const Function &F) const; 1112 1113 /// \returns Maximum number of VGPRs that meets number of waves per execution 1114 /// unit requirement for function \p MF, or number of VGPRs explicitly 1115 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1116 /// 1117 /// \returns Value that meets number of waves per execution unit requirement 1118 /// if explicitly requested value cannot be converted to integer, violates 1119 /// subtarget's specifications, or does not meet number of waves per execution 1120 /// unit requirement. 1121 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1122 1123 void getPostRAMutations( 1124 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1125 const override; 1126 1127 std::unique_ptr<ScheduleDAGMutation> 1128 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1129 1130 bool isWave32() const { 1131 return getWavefrontSize() == 32; 1132 } 1133 1134 bool isWave64() const { 1135 return getWavefrontSize() == 64; 1136 } 1137 1138 const TargetRegisterClass *getBoolRC() const { 1139 return getRegisterInfo()->getBoolRC(); 1140 } 1141 1142 /// \returns Maximum number of work groups per compute unit supported by the 1143 /// subtarget and limited by given \p FlatWorkGroupSize. 1144 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1145 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1146 } 1147 1148 /// \returns Minimum flat work group size supported by the subtarget. 1149 unsigned getMinFlatWorkGroupSize() const override { 1150 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1151 } 1152 1153 /// \returns Maximum flat work group size supported by the subtarget. 1154 unsigned getMaxFlatWorkGroupSize() const override { 1155 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1156 } 1157 1158 /// \returns Number of waves per execution unit required to support the given 1159 /// \p FlatWorkGroupSize. 1160 unsigned 1161 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1162 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1163 } 1164 1165 /// \returns Minimum number of waves per execution unit supported by the 1166 /// subtarget. 1167 unsigned getMinWavesPerEU() const override { 1168 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1169 } 1170 1171 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1172 SDep &Dep) const override; 1173 }; 1174 1175 } // end namespace llvm 1176 1177 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1178