1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 namespace llvm { 25 26 class MCInst; 27 class MCInstrInfo; 28 29 } // namespace llvm 30 31 #define GET_SUBTARGETINFO_HEADER 32 #include "AMDGPUGenSubtargetInfo.inc" 33 34 namespace llvm { 35 36 class GCNTargetMachine; 37 38 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 39 public AMDGPUSubtarget { 40 41 using AMDGPUSubtarget::getMaxWavesPerEU; 42 43 public: 44 // Following 2 enums are documented at: 45 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 46 enum class TrapHandlerAbi { 47 NONE = 0x00, 48 AMDHSA = 0x01, 49 }; 50 51 enum class TrapID { 52 LLVMAMDHSATrap = 0x02, 53 LLVMAMDHSADebugTrap = 0x03, 54 }; 55 56 private: 57 /// GlobalISel related APIs. 58 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 59 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 60 std::unique_ptr<InstructionSelector> InstSelector; 61 std::unique_ptr<LegalizerInfo> Legalizer; 62 std::unique_ptr<RegisterBankInfo> RegBankInfo; 63 64 protected: 65 // Basic subtarget description. 66 Triple TargetTriple; 67 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 68 unsigned Gen; 69 InstrItineraryData InstrItins; 70 int LDSBankCount; 71 unsigned MaxPrivateElementSize; 72 73 // Possibly statically set by tablegen, but may want to be overridden. 74 bool FastFMAF32; 75 bool FastDenormalF32; 76 bool HalfRate64Ops; 77 bool FullRate64Ops; 78 79 // Dynamically set bits that enable features. 80 bool FlatForGlobal; 81 bool AutoWaitcntBeforeBarrier; 82 bool UnalignedScratchAccess; 83 bool UnalignedAccessMode; 84 bool HasApertureRegs; 85 bool SupportsXNACK; 86 87 // This should not be used directly. 'TargetID' tracks the dynamic settings 88 // for XNACK. 89 bool EnableXNACK; 90 91 bool EnableTgSplit; 92 bool EnableCuMode; 93 bool TrapHandler; 94 95 // Used as options. 96 bool EnableLoadStoreOpt; 97 bool EnableUnsafeDSOffsetFolding; 98 bool EnableSIScheduler; 99 bool EnableDS128; 100 bool EnablePRTStrictNull; 101 bool DumpCode; 102 103 // Subtarget statically properties set by tablegen 104 bool FP64; 105 bool FMA; 106 bool MIMG_R128; 107 bool IsGCN; 108 bool CIInsts; 109 bool GFX8Insts; 110 bool GFX9Insts; 111 bool GFX90AInsts; 112 bool GFX10Insts; 113 bool GFX10_3Insts; 114 bool GFX7GFX8GFX9Insts; 115 bool SGPRInitBug; 116 bool NegativeScratchOffsetBug; 117 bool NegativeUnalignedScratchOffsetBug; 118 bool HasSMemRealTime; 119 bool HasIntClamp; 120 bool HasFmaMixInsts; 121 bool HasMovrel; 122 bool HasVGPRIndexMode; 123 bool HasScalarStores; 124 bool HasScalarAtomics; 125 bool HasSDWAOmod; 126 bool HasSDWAScalar; 127 bool HasSDWASdst; 128 bool HasSDWAMac; 129 bool HasSDWAOutModsVOPC; 130 bool HasDPP; 131 bool HasDPP8; 132 bool Has64BitDPP; 133 bool HasPackedFP32Ops; 134 bool HasExtendedImageInsts; 135 bool HasR128A16; 136 bool HasGFX10A16; 137 bool HasG16; 138 bool HasNSAEncoding; 139 unsigned NSAMaxSize; 140 bool GFX10_AEncoding; 141 bool GFX10_BEncoding; 142 bool HasDLInsts; 143 bool HasDot1Insts; 144 bool HasDot2Insts; 145 bool HasDot3Insts; 146 bool HasDot4Insts; 147 bool HasDot5Insts; 148 bool HasDot6Insts; 149 bool HasDot7Insts; 150 bool HasMAIInsts; 151 bool HasPkFmacF16Inst; 152 bool HasAtomicFaddInsts; 153 bool SupportsSRAMECC; 154 155 // This should not be used directly. 'TargetID' tracks the dynamic settings 156 // for SRAMECC. 157 bool EnableSRAMECC; 158 159 bool HasNoSdstCMPX; 160 bool HasVscnt; 161 bool HasGetWaveIdInst; 162 bool HasSMemTimeInst; 163 bool HasShaderCyclesRegister; 164 bool HasRegisterBanking; 165 bool HasVOP3Literal; 166 bool HasNoDataDepHazard; 167 bool FlatAddressSpace; 168 bool FlatInstOffsets; 169 bool FlatGlobalInsts; 170 bool FlatScratchInsts; 171 bool ScalarFlatScratchInsts; 172 bool HasArchitectedFlatScratch; 173 bool AddNoCarryInsts; 174 bool HasUnpackedD16VMem; 175 bool R600ALUInst; 176 bool CaymanISA; 177 bool CFALUBug; 178 bool LDSMisalignedBug; 179 bool HasMFMAInlineLiteralBug; 180 bool HasVertexCache; 181 short TexVTXClauseSize; 182 bool UnalignedBufferAccess; 183 bool UnalignedDSAccess; 184 bool HasPackedTID; 185 bool ScalarizeGlobal; 186 187 bool HasVcmpxPermlaneHazard; 188 bool HasVMEMtoScalarWriteHazard; 189 bool HasSMEMtoVectorWriteHazard; 190 bool HasInstFwdPrefetchBug; 191 bool HasVcmpxExecWARHazard; 192 bool HasLdsBranchVmemWARHazard; 193 bool HasNSAtoVMEMBug; 194 bool HasNSAClauseBug; 195 bool HasOffset3fBug; 196 bool HasFlatSegmentOffsetBug; 197 bool HasImageStoreD16Bug; 198 bool HasImageGather4D16Bug; 199 200 // Dummy feature to use for assembler in tablegen. 201 bool FeatureDisable; 202 203 SelectionDAGTargetInfo TSInfo; 204 private: 205 SIInstrInfo InstrInfo; 206 SITargetLowering TLInfo; 207 SIFrameLowering FrameLowering; 208 209 public: 210 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 211 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 212 213 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 214 const GCNTargetMachine &TM); 215 ~GCNSubtarget() override; 216 217 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 218 StringRef GPU, StringRef FS); 219 220 const SIInstrInfo *getInstrInfo() const override { 221 return &InstrInfo; 222 } 223 224 const SIFrameLowering *getFrameLowering() const override { 225 return &FrameLowering; 226 } 227 228 const SITargetLowering *getTargetLowering() const override { 229 return &TLInfo; 230 } 231 232 const SIRegisterInfo *getRegisterInfo() const override { 233 return &InstrInfo.getRegisterInfo(); 234 } 235 236 const CallLowering *getCallLowering() const override { 237 return CallLoweringInfo.get(); 238 } 239 240 const InlineAsmLowering *getInlineAsmLowering() const override { 241 return InlineAsmLoweringInfo.get(); 242 } 243 244 InstructionSelector *getInstructionSelector() const override { 245 return InstSelector.get(); 246 } 247 248 const LegalizerInfo *getLegalizerInfo() const override { 249 return Legalizer.get(); 250 } 251 252 const RegisterBankInfo *getRegBankInfo() const override { 253 return RegBankInfo.get(); 254 } 255 256 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 257 return TargetID; 258 } 259 260 // Nothing implemented, just prevent crashes on use. 261 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 262 return &TSInfo; 263 } 264 265 const InstrItineraryData *getInstrItineraryData() const override { 266 return &InstrItins; 267 } 268 269 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 270 271 Generation getGeneration() const { 272 return (Generation)Gen; 273 } 274 275 /// Return the number of high bits known to be zero fror a frame index. 276 unsigned getKnownHighZeroBitsForFrameIndex() const { 277 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 278 } 279 280 int getLDSBankCount() const { 281 return LDSBankCount; 282 } 283 284 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 285 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 286 } 287 288 unsigned getConstantBusLimit(unsigned Opcode) const; 289 290 /// Returns if the result of this instruction with a 16-bit result returned in 291 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 292 /// the original value. 293 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 294 295 bool hasIntClamp() const { 296 return HasIntClamp; 297 } 298 299 bool hasFP64() const { 300 return FP64; 301 } 302 303 bool hasMIMG_R128() const { 304 return MIMG_R128; 305 } 306 307 bool hasHWFP64() const { 308 return FP64; 309 } 310 311 bool hasFastFMAF32() const { 312 return FastFMAF32; 313 } 314 315 bool hasHalfRate64Ops() const { 316 return HalfRate64Ops; 317 } 318 319 bool hasFullRate64Ops() const { 320 return FullRate64Ops; 321 } 322 323 bool hasAddr64() const { 324 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 325 } 326 327 bool hasFlat() const { 328 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 329 } 330 331 // Return true if the target only has the reverse operand versions of VALU 332 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 333 bool hasOnlyRevVALUShifts() const { 334 return getGeneration() >= VOLCANIC_ISLANDS; 335 } 336 337 bool hasFractBug() const { 338 return getGeneration() == SOUTHERN_ISLANDS; 339 } 340 341 bool hasBFE() const { 342 return true; 343 } 344 345 bool hasBFI() const { 346 return true; 347 } 348 349 bool hasBFM() const { 350 return hasBFE(); 351 } 352 353 bool hasBCNT(unsigned Size) const { 354 return true; 355 } 356 357 bool hasFFBL() const { 358 return true; 359 } 360 361 bool hasFFBH() const { 362 return true; 363 } 364 365 bool hasMed3_16() const { 366 return getGeneration() >= AMDGPUSubtarget::GFX9; 367 } 368 369 bool hasMin3Max3_16() const { 370 return getGeneration() >= AMDGPUSubtarget::GFX9; 371 } 372 373 bool hasFmaMixInsts() const { 374 return HasFmaMixInsts; 375 } 376 377 bool hasCARRY() const { 378 return true; 379 } 380 381 bool hasFMA() const { 382 return FMA; 383 } 384 385 bool hasSwap() const { 386 return GFX9Insts; 387 } 388 389 bool hasScalarPackInsts() const { 390 return GFX9Insts; 391 } 392 393 bool hasScalarMulHiInsts() const { 394 return GFX9Insts; 395 } 396 397 TrapHandlerAbi getTrapHandlerAbi() const { 398 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 399 } 400 401 bool supportsGetDoorbellID() const { 402 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 403 return getGeneration() >= GFX9; 404 } 405 406 /// True if the offset field of DS instructions works as expected. On SI, the 407 /// offset uses a 16-bit adder and does not always wrap properly. 408 bool hasUsableDSOffset() const { 409 return getGeneration() >= SEA_ISLANDS; 410 } 411 412 bool unsafeDSOffsetFoldingEnabled() const { 413 return EnableUnsafeDSOffsetFolding; 414 } 415 416 /// Condition output from div_scale is usable. 417 bool hasUsableDivScaleConditionOutput() const { 418 return getGeneration() != SOUTHERN_ISLANDS; 419 } 420 421 /// Extra wait hazard is needed in some cases before 422 /// s_cbranch_vccnz/s_cbranch_vccz. 423 bool hasReadVCCZBug() const { 424 return getGeneration() <= SEA_ISLANDS; 425 } 426 427 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 428 bool partialVCCWritesUpdateVCCZ() const { 429 return getGeneration() >= GFX10; 430 } 431 432 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 433 /// was written by a VALU instruction. 434 bool hasSMRDReadVALUDefHazard() const { 435 return getGeneration() == SOUTHERN_ISLANDS; 436 } 437 438 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 439 /// SGPR was written by a VALU Instruction. 440 bool hasVMEMReadSGPRVALUDefHazard() const { 441 return getGeneration() >= VOLCANIC_ISLANDS; 442 } 443 444 bool hasRFEHazards() const { 445 return getGeneration() >= VOLCANIC_ISLANDS; 446 } 447 448 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 449 unsigned getSetRegWaitStates() const { 450 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 451 } 452 453 bool dumpCode() const { 454 return DumpCode; 455 } 456 457 /// Return the amount of LDS that can be used that will not restrict the 458 /// occupancy lower than WaveCount. 459 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 460 const Function &) const; 461 462 bool supportsMinMaxDenormModes() const { 463 return getGeneration() >= AMDGPUSubtarget::GFX9; 464 } 465 466 /// \returns If target supports S_DENORM_MODE. 467 bool hasDenormModeInst() const { 468 return getGeneration() >= AMDGPUSubtarget::GFX10; 469 } 470 471 bool useFlatForGlobal() const { 472 return FlatForGlobal; 473 } 474 475 /// \returns If target supports ds_read/write_b128 and user enables generation 476 /// of ds_read/write_b128. 477 bool useDS128() const { 478 return CIInsts && EnableDS128; 479 } 480 481 /// \return If target supports ds_read/write_b96/128. 482 bool hasDS96AndDS128() const { 483 return CIInsts; 484 } 485 486 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 487 bool haveRoundOpsF64() const { 488 return CIInsts; 489 } 490 491 /// \returns If MUBUF instructions always perform range checking, even for 492 /// buffer resources used for private memory access. 493 bool privateMemoryResourceIsRangeChecked() const { 494 return getGeneration() < AMDGPUSubtarget::GFX9; 495 } 496 497 /// \returns If target requires PRT Struct NULL support (zero result registers 498 /// for sparse texture support). 499 bool usePRTStrictNull() const { 500 return EnablePRTStrictNull; 501 } 502 503 bool hasAutoWaitcntBeforeBarrier() const { 504 return AutoWaitcntBeforeBarrier; 505 } 506 507 bool hasUnalignedBufferAccess() const { 508 return UnalignedBufferAccess; 509 } 510 511 bool hasUnalignedBufferAccessEnabled() const { 512 return UnalignedBufferAccess && UnalignedAccessMode; 513 } 514 515 bool hasUnalignedDSAccess() const { 516 return UnalignedDSAccess; 517 } 518 519 bool hasUnalignedDSAccessEnabled() const { 520 return UnalignedDSAccess && UnalignedAccessMode; 521 } 522 523 bool hasUnalignedScratchAccess() const { 524 return UnalignedScratchAccess; 525 } 526 527 bool hasUnalignedAccessMode() const { 528 return UnalignedAccessMode; 529 } 530 531 bool hasApertureRegs() const { 532 return HasApertureRegs; 533 } 534 535 bool isTrapHandlerEnabled() const { 536 return TrapHandler; 537 } 538 539 bool isXNACKEnabled() const { 540 return TargetID.isXnackOnOrAny(); 541 } 542 543 bool isTgSplitEnabled() const { 544 return EnableTgSplit; 545 } 546 547 bool isCuModeEnabled() const { 548 return EnableCuMode; 549 } 550 551 bool hasFlatAddressSpace() const { 552 return FlatAddressSpace; 553 } 554 555 bool hasFlatScrRegister() const { 556 return hasFlatAddressSpace(); 557 } 558 559 bool hasFlatInstOffsets() const { 560 return FlatInstOffsets; 561 } 562 563 bool hasFlatGlobalInsts() const { 564 return FlatGlobalInsts; 565 } 566 567 bool hasFlatScratchInsts() const { 568 return FlatScratchInsts; 569 } 570 571 // Check if target supports ST addressing mode with FLAT scratch instructions. 572 // The ST addressing mode means no registers are used, either VGPR or SGPR, 573 // but only immediate offset is swizzled and added to the FLAT scratch base. 574 bool hasFlatScratchSTMode() const { 575 return hasFlatScratchInsts() && hasGFX10_3Insts(); 576 } 577 578 bool hasScalarFlatScratchInsts() const { 579 return ScalarFlatScratchInsts; 580 } 581 582 bool hasGlobalAddTidInsts() const { 583 return GFX10_BEncoding; 584 } 585 586 bool hasAtomicCSub() const { 587 return GFX10_BEncoding; 588 } 589 590 bool hasMultiDwordFlatScratchAddressing() const { 591 return getGeneration() >= GFX9; 592 } 593 594 bool hasFlatSegmentOffsetBug() const { 595 return HasFlatSegmentOffsetBug; 596 } 597 598 bool hasFlatLgkmVMemCountInOrder() const { 599 return getGeneration() > GFX9; 600 } 601 602 bool hasD16LoadStore() const { 603 return getGeneration() >= GFX9; 604 } 605 606 bool d16PreservesUnusedBits() const { 607 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 608 } 609 610 bool hasD16Images() const { 611 return getGeneration() >= VOLCANIC_ISLANDS; 612 } 613 614 /// Return if most LDS instructions have an m0 use that require m0 to be 615 /// iniitalized. 616 bool ldsRequiresM0Init() const { 617 return getGeneration() < GFX9; 618 } 619 620 // True if the hardware rewinds and replays GWS operations if a wave is 621 // preempted. 622 // 623 // If this is false, a GWS operation requires testing if a nack set the 624 // MEM_VIOL bit, and repeating if so. 625 bool hasGWSAutoReplay() const { 626 return getGeneration() >= GFX9; 627 } 628 629 /// \returns if target has ds_gws_sema_release_all instruction. 630 bool hasGWSSemaReleaseAll() const { 631 return CIInsts; 632 } 633 634 /// \returns true if the target has integer add/sub instructions that do not 635 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 636 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 637 /// for saturation. 638 bool hasAddNoCarry() const { 639 return AddNoCarryInsts; 640 } 641 642 bool hasUnpackedD16VMem() const { 643 return HasUnpackedD16VMem; 644 } 645 646 // Covers VS/PS/CS graphics shaders 647 bool isMesaGfxShader(const Function &F) const { 648 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 649 } 650 651 bool hasMad64_32() const { 652 return getGeneration() >= SEA_ISLANDS; 653 } 654 655 bool hasSDWAOmod() const { 656 return HasSDWAOmod; 657 } 658 659 bool hasSDWAScalar() const { 660 return HasSDWAScalar; 661 } 662 663 bool hasSDWASdst() const { 664 return HasSDWASdst; 665 } 666 667 bool hasSDWAMac() const { 668 return HasSDWAMac; 669 } 670 671 bool hasSDWAOutModsVOPC() const { 672 return HasSDWAOutModsVOPC; 673 } 674 675 bool hasDLInsts() const { 676 return HasDLInsts; 677 } 678 679 bool hasDot1Insts() const { 680 return HasDot1Insts; 681 } 682 683 bool hasDot2Insts() const { 684 return HasDot2Insts; 685 } 686 687 bool hasDot3Insts() const { 688 return HasDot3Insts; 689 } 690 691 bool hasDot4Insts() const { 692 return HasDot4Insts; 693 } 694 695 bool hasDot5Insts() const { 696 return HasDot5Insts; 697 } 698 699 bool hasDot6Insts() const { 700 return HasDot6Insts; 701 } 702 703 bool hasDot7Insts() const { 704 return HasDot7Insts; 705 } 706 707 bool hasMAIInsts() const { 708 return HasMAIInsts; 709 } 710 711 bool hasPkFmacF16Inst() const { 712 return HasPkFmacF16Inst; 713 } 714 715 bool hasAtomicFaddInsts() const { 716 return HasAtomicFaddInsts; 717 } 718 719 bool hasNoSdstCMPX() const { 720 return HasNoSdstCMPX; 721 } 722 723 bool hasVscnt() const { 724 return HasVscnt; 725 } 726 727 bool hasGetWaveIdInst() const { 728 return HasGetWaveIdInst; 729 } 730 731 bool hasSMemTimeInst() const { 732 return HasSMemTimeInst; 733 } 734 735 bool hasShaderCyclesRegister() const { 736 return HasShaderCyclesRegister; 737 } 738 739 bool hasRegisterBanking() const { 740 return HasRegisterBanking; 741 } 742 743 bool hasVOP3Literal() const { 744 return HasVOP3Literal; 745 } 746 747 bool hasNoDataDepHazard() const { 748 return HasNoDataDepHazard; 749 } 750 751 bool vmemWriteNeedsExpWaitcnt() const { 752 return getGeneration() < SEA_ISLANDS; 753 } 754 755 // Scratch is allocated in 256 dword per wave blocks for the entire 756 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 757 // is 4-byte aligned. 758 // 759 // Only 4-byte alignment is really needed to access anything. Transformations 760 // on the pointer value itself may rely on the alignment / known low bits of 761 // the pointer. Set this to something above the minimum to avoid needing 762 // dynamic realignment in common cases. 763 Align getStackAlignment() const { return Align(16); } 764 765 bool enableMachineScheduler() const override { 766 return true; 767 } 768 769 bool useAA() const override; 770 771 bool enableSubRegLiveness() const override { 772 return true; 773 } 774 775 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 776 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 777 778 // static wrappers 779 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 780 781 // XXX - Why is this here if it isn't in the default pass set? 782 bool enableEarlyIfConversion() const override { 783 return true; 784 } 785 786 bool enableFlatScratch() const; 787 788 void overrideSchedPolicy(MachineSchedPolicy &Policy, 789 unsigned NumRegionInstrs) const override; 790 791 unsigned getMaxNumUserSGPRs() const { 792 return 16; 793 } 794 795 bool hasSMemRealTime() const { 796 return HasSMemRealTime; 797 } 798 799 bool hasMovrel() const { 800 return HasMovrel; 801 } 802 803 bool hasVGPRIndexMode() const { 804 return HasVGPRIndexMode; 805 } 806 807 bool useVGPRIndexMode() const; 808 809 bool hasScalarCompareEq64() const { 810 return getGeneration() >= VOLCANIC_ISLANDS; 811 } 812 813 bool hasScalarStores() const { 814 return HasScalarStores; 815 } 816 817 bool hasScalarAtomics() const { 818 return HasScalarAtomics; 819 } 820 821 bool hasLDSFPAtomics() const { 822 return GFX8Insts; 823 } 824 825 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 826 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 827 828 bool hasDPP() const { 829 return HasDPP; 830 } 831 832 bool hasDPPBroadcasts() const { 833 return HasDPP && getGeneration() < GFX10; 834 } 835 836 bool hasDPPWavefrontShifts() const { 837 return HasDPP && getGeneration() < GFX10; 838 } 839 840 bool hasDPP8() const { 841 return HasDPP8; 842 } 843 844 bool has64BitDPP() const { 845 return Has64BitDPP; 846 } 847 848 bool hasPackedFP32Ops() const { 849 return HasPackedFP32Ops; 850 } 851 852 bool hasFmaakFmamkF32Insts() const { 853 return getGeneration() >= GFX10; 854 } 855 856 bool hasExtendedImageInsts() const { 857 return HasExtendedImageInsts; 858 } 859 860 bool hasR128A16() const { 861 return HasR128A16; 862 } 863 864 bool hasGFX10A16() const { 865 return HasGFX10A16; 866 } 867 868 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 869 870 bool hasG16() const { return HasG16; } 871 872 bool hasOffset3fBug() const { 873 return HasOffset3fBug; 874 } 875 876 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 877 878 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 879 880 bool hasNSAEncoding() const { return HasNSAEncoding; } 881 882 unsigned getNSAMaxSize() const { return NSAMaxSize; } 883 884 bool hasGFX10_AEncoding() const { 885 return GFX10_AEncoding; 886 } 887 888 bool hasGFX10_BEncoding() const { 889 return GFX10_BEncoding; 890 } 891 892 bool hasGFX10_3Insts() const { 893 return GFX10_3Insts; 894 } 895 896 bool hasMadF16() const; 897 898 bool enableSIScheduler() const { 899 return EnableSIScheduler; 900 } 901 902 bool loadStoreOptEnabled() const { 903 return EnableLoadStoreOpt; 904 } 905 906 bool hasSGPRInitBug() const { 907 return SGPRInitBug; 908 } 909 910 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 911 912 bool hasNegativeUnalignedScratchOffsetBug() const { 913 return NegativeUnalignedScratchOffsetBug; 914 } 915 916 bool hasMFMAInlineLiteralBug() const { 917 return HasMFMAInlineLiteralBug; 918 } 919 920 bool has12DWordStoreHazard() const { 921 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 922 } 923 924 // \returns true if the subtarget supports DWORDX3 load/store instructions. 925 bool hasDwordx3LoadStores() const { 926 return CIInsts; 927 } 928 929 bool hasReadM0MovRelInterpHazard() const { 930 return getGeneration() == AMDGPUSubtarget::GFX9; 931 } 932 933 bool hasReadM0SendMsgHazard() const { 934 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 935 getGeneration() <= AMDGPUSubtarget::GFX9; 936 } 937 938 bool hasVcmpxPermlaneHazard() const { 939 return HasVcmpxPermlaneHazard; 940 } 941 942 bool hasVMEMtoScalarWriteHazard() const { 943 return HasVMEMtoScalarWriteHazard; 944 } 945 946 bool hasSMEMtoVectorWriteHazard() const { 947 return HasSMEMtoVectorWriteHazard; 948 } 949 950 bool hasLDSMisalignedBug() const { 951 return LDSMisalignedBug && !EnableCuMode; 952 } 953 954 bool hasInstFwdPrefetchBug() const { 955 return HasInstFwdPrefetchBug; 956 } 957 958 bool hasVcmpxExecWARHazard() const { 959 return HasVcmpxExecWARHazard; 960 } 961 962 bool hasLdsBranchVmemWARHazard() const { 963 return HasLdsBranchVmemWARHazard; 964 } 965 966 bool hasNSAtoVMEMBug() const { 967 return HasNSAtoVMEMBug; 968 } 969 970 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 971 972 bool hasHardClauses() const { return getGeneration() >= GFX10; } 973 974 bool hasGFX90AInsts() const { return GFX90AInsts; } 975 976 /// Return if operations acting on VGPR tuples require even alignment. 977 bool needsAlignedVGPRs() const { return GFX90AInsts; } 978 979 bool hasPackedTID() const { return HasPackedTID; } 980 981 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 982 /// SGPRs 983 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 984 985 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 986 /// VGPRs 987 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 988 989 /// Return occupancy for the given function. Used LDS and a number of 990 /// registers if provided. 991 /// Note, occupancy can be affected by the scratch allocation as well, but 992 /// we do not have enough information to compute it. 993 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 994 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 995 996 /// \returns true if the flat_scratch register should be initialized with the 997 /// pointer to the wave's scratch memory rather than a size and offset. 998 bool flatScratchIsPointer() const { 999 return getGeneration() >= AMDGPUSubtarget::GFX9; 1000 } 1001 1002 /// \returns true if the flat_scratch register is initialized by the HW. 1003 /// In this case it is readonly. 1004 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1005 1006 /// \returns true if the machine has merged shaders in which s0-s7 are 1007 /// reserved by the hardware and user SGPRs start at s8 1008 bool hasMergedShaders() const { 1009 return getGeneration() >= GFX9; 1010 } 1011 1012 /// \returns SGPR allocation granularity supported by the subtarget. 1013 unsigned getSGPRAllocGranule() const { 1014 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1015 } 1016 1017 /// \returns SGPR encoding granularity supported by the subtarget. 1018 unsigned getSGPREncodingGranule() const { 1019 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1020 } 1021 1022 /// \returns Total number of SGPRs supported by the subtarget. 1023 unsigned getTotalNumSGPRs() const { 1024 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1025 } 1026 1027 /// \returns Addressable number of SGPRs supported by the subtarget. 1028 unsigned getAddressableNumSGPRs() const { 1029 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1030 } 1031 1032 /// \returns Minimum number of SGPRs that meets the given number of waves per 1033 /// execution unit requirement supported by the subtarget. 1034 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1035 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1036 } 1037 1038 /// \returns Maximum number of SGPRs that meets the given number of waves per 1039 /// execution unit requirement supported by the subtarget. 1040 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1041 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1042 } 1043 1044 /// \returns Reserved number of SGPRs. This is common 1045 /// utility function called by MachineFunction and 1046 /// Function variants of getReservedNumSGPRs. 1047 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; 1048 /// \returns Reserved number of SGPRs for given machine function \p MF. 1049 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1050 1051 /// \returns Reserved number of SGPRs for given function \p F. 1052 unsigned getReservedNumSGPRs(const Function &F) const; 1053 1054 /// \returns max num SGPRs. This is the common utility 1055 /// function called by MachineFunction and Function 1056 /// variants of getMaxNumSGPRs. 1057 unsigned getBaseMaxNumSGPRs(const Function &F, 1058 std::pair<unsigned, unsigned> WavesPerEU, 1059 unsigned PreloadedSGPRs, 1060 unsigned ReservedNumSGPRs) const; 1061 1062 /// \returns Maximum number of SGPRs that meets number of waves per execution 1063 /// unit requirement for function \p MF, or number of SGPRs explicitly 1064 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1065 /// 1066 /// \returns Value that meets number of waves per execution unit requirement 1067 /// if explicitly requested value cannot be converted to integer, violates 1068 /// subtarget's specifications, or does not meet number of waves per execution 1069 /// unit requirement. 1070 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1071 1072 /// \returns Maximum number of SGPRs that meets number of waves per execution 1073 /// unit requirement for function \p F, or number of SGPRs explicitly 1074 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1075 /// 1076 /// \returns Value that meets number of waves per execution unit requirement 1077 /// if explicitly requested value cannot be converted to integer, violates 1078 /// subtarget's specifications, or does not meet number of waves per execution 1079 /// unit requirement. 1080 unsigned getMaxNumSGPRs(const Function &F) const; 1081 1082 /// \returns VGPR allocation granularity supported by the subtarget. 1083 unsigned getVGPRAllocGranule() const { 1084 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1085 } 1086 1087 /// \returns VGPR encoding granularity supported by the subtarget. 1088 unsigned getVGPREncodingGranule() const { 1089 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1090 } 1091 1092 /// \returns Total number of VGPRs supported by the subtarget. 1093 unsigned getTotalNumVGPRs() const { 1094 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1095 } 1096 1097 /// \returns Addressable number of VGPRs supported by the subtarget. 1098 unsigned getAddressableNumVGPRs() const { 1099 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1100 } 1101 1102 /// \returns Minimum number of VGPRs that meets given number of waves per 1103 /// execution unit requirement supported by the subtarget. 1104 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1105 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1106 } 1107 1108 /// \returns Maximum number of VGPRs that meets given number of waves per 1109 /// execution unit requirement supported by the subtarget. 1110 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1111 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1112 } 1113 1114 /// \returns max num VGPRs. This is the common utility function 1115 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1116 unsigned getBaseMaxNumVGPRs(const Function &F, 1117 std::pair<unsigned, unsigned> WavesPerEU) const; 1118 /// \returns Maximum number of VGPRs that meets number of waves per execution 1119 /// unit requirement for function \p F, or number of VGPRs explicitly 1120 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1121 /// 1122 /// \returns Value that meets number of waves per execution unit requirement 1123 /// if explicitly requested value cannot be converted to integer, violates 1124 /// subtarget's specifications, or does not meet number of waves per execution 1125 /// unit requirement. 1126 unsigned getMaxNumVGPRs(const Function &F) const; 1127 1128 /// \returns Maximum number of VGPRs that meets number of waves per execution 1129 /// unit requirement for function \p MF, or number of VGPRs explicitly 1130 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1131 /// 1132 /// \returns Value that meets number of waves per execution unit requirement 1133 /// if explicitly requested value cannot be converted to integer, violates 1134 /// subtarget's specifications, or does not meet number of waves per execution 1135 /// unit requirement. 1136 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1137 1138 void getPostRAMutations( 1139 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1140 const override; 1141 1142 bool isWave32() const { 1143 return getWavefrontSize() == 32; 1144 } 1145 1146 bool isWave64() const { 1147 return getWavefrontSize() == 64; 1148 } 1149 1150 const TargetRegisterClass *getBoolRC() const { 1151 return getRegisterInfo()->getBoolRC(); 1152 } 1153 1154 /// \returns Maximum number of work groups per compute unit supported by the 1155 /// subtarget and limited by given \p FlatWorkGroupSize. 1156 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1157 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1158 } 1159 1160 /// \returns Minimum flat work group size supported by the subtarget. 1161 unsigned getMinFlatWorkGroupSize() const override { 1162 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1163 } 1164 1165 /// \returns Maximum flat work group size supported by the subtarget. 1166 unsigned getMaxFlatWorkGroupSize() const override { 1167 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1168 } 1169 1170 /// \returns Number of waves per execution unit required to support the given 1171 /// \p FlatWorkGroupSize. 1172 unsigned 1173 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1174 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1175 } 1176 1177 /// \returns Minimum number of waves per execution unit supported by the 1178 /// subtarget. 1179 unsigned getMinWavesPerEU() const override { 1180 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1181 } 1182 1183 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1184 SDep &Dep) const override; 1185 }; 1186 1187 } // end namespace llvm 1188 1189 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1190