1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 public: 37 // Following 2 enums are documented at: 38 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 39 enum class TrapHandlerAbi { 40 NONE = 0x00, 41 AMDHSA = 0x01, 42 }; 43 44 enum class TrapID { 45 LLVMAMDHSATrap = 0x02, 46 LLVMAMDHSADebugTrap = 0x03, 47 }; 48 49 private: 50 /// GlobalISel related APIs. 51 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 52 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 53 std::unique_ptr<InstructionSelector> InstSelector; 54 std::unique_ptr<LegalizerInfo> Legalizer; 55 std::unique_ptr<RegisterBankInfo> RegBankInfo; 56 57 protected: 58 // Basic subtarget description. 59 Triple TargetTriple; 60 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 61 unsigned Gen; 62 InstrItineraryData InstrItins; 63 int LDSBankCount; 64 unsigned MaxPrivateElementSize; 65 66 // Possibly statically set by tablegen, but may want to be overridden. 67 bool FastFMAF32; 68 bool FastDenormalF32; 69 bool HalfRate64Ops; 70 bool FullRate64Ops; 71 72 // Dynamically set bits that enable features. 73 bool FlatForGlobal; 74 bool AutoWaitcntBeforeBarrier; 75 bool UnalignedScratchAccess; 76 bool UnalignedAccessMode; 77 bool HasApertureRegs; 78 bool SupportsXNACK; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK; 83 84 bool EnableTgSplit; 85 bool EnableCuMode; 86 bool TrapHandler; 87 88 // Used as options. 89 bool EnableLoadStoreOpt; 90 bool EnableUnsafeDSOffsetFolding; 91 bool EnableSIScheduler; 92 bool EnableDS128; 93 bool EnablePRTStrictNull; 94 bool DumpCode; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64; 98 bool FMA; 99 bool MIMG_R128; 100 bool CIInsts; 101 bool GFX8Insts; 102 bool GFX9Insts; 103 bool GFX90AInsts; 104 bool GFX10Insts; 105 bool GFX10_3Insts; 106 bool GFX7GFX8GFX9Insts; 107 bool SGPRInitBug; 108 bool NegativeScratchOffsetBug; 109 bool NegativeUnalignedScratchOffsetBug; 110 bool HasSMemRealTime; 111 bool HasIntClamp; 112 bool HasFmaMixInsts; 113 bool HasMovrel; 114 bool HasVGPRIndexMode; 115 bool HasScalarStores; 116 bool HasScalarAtomics; 117 bool HasSDWAOmod; 118 bool HasSDWAScalar; 119 bool HasSDWASdst; 120 bool HasSDWAMac; 121 bool HasSDWAOutModsVOPC; 122 bool HasDPP; 123 bool HasDPP8; 124 bool Has64BitDPP; 125 bool HasPackedFP32Ops; 126 bool HasExtendedImageInsts; 127 bool HasR128A16; 128 bool HasGFX10A16; 129 bool HasG16; 130 bool HasNSAEncoding; 131 unsigned NSAMaxSize; 132 bool GFX10_AEncoding; 133 bool GFX10_BEncoding; 134 bool HasDLInsts; 135 bool HasDot1Insts; 136 bool HasDot2Insts; 137 bool HasDot3Insts; 138 bool HasDot4Insts; 139 bool HasDot5Insts; 140 bool HasDot6Insts; 141 bool HasDot7Insts; 142 bool HasMAIInsts; 143 bool HasPkFmacF16Inst; 144 bool HasAtomicFaddInsts; 145 bool SupportsSRAMECC; 146 147 // This should not be used directly. 'TargetID' tracks the dynamic settings 148 // for SRAMECC. 149 bool EnableSRAMECC; 150 151 bool HasNoSdstCMPX; 152 bool HasVscnt; 153 bool HasGetWaveIdInst; 154 bool HasSMemTimeInst; 155 bool HasShaderCyclesRegister; 156 bool HasVOP3Literal; 157 bool HasNoDataDepHazard; 158 bool FlatAddressSpace; 159 bool FlatInstOffsets; 160 bool FlatGlobalInsts; 161 bool FlatScratchInsts; 162 bool ScalarFlatScratchInsts; 163 bool HasArchitectedFlatScratch; 164 bool AddNoCarryInsts; 165 bool HasUnpackedD16VMem; 166 bool LDSMisalignedBug; 167 bool HasMFMAInlineLiteralBug; 168 bool UnalignedBufferAccess; 169 bool UnalignedDSAccess; 170 bool HasPackedTID; 171 bool ScalarizeGlobal; 172 173 bool HasVcmpxPermlaneHazard; 174 bool HasVMEMtoScalarWriteHazard; 175 bool HasSMEMtoVectorWriteHazard; 176 bool HasInstFwdPrefetchBug; 177 bool HasVcmpxExecWARHazard; 178 bool HasLdsBranchVmemWARHazard; 179 bool HasNSAtoVMEMBug; 180 bool HasNSAClauseBug; 181 bool HasOffset3fBug; 182 bool HasFlatSegmentOffsetBug; 183 bool HasImageStoreD16Bug; 184 bool HasImageGather4D16Bug; 185 186 // Dummy feature to use for assembler in tablegen. 187 bool FeatureDisable; 188 189 SelectionDAGTargetInfo TSInfo; 190 private: 191 SIInstrInfo InstrInfo; 192 SITargetLowering TLInfo; 193 SIFrameLowering FrameLowering; 194 195 public: 196 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 197 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 198 199 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 200 const GCNTargetMachine &TM); 201 ~GCNSubtarget() override; 202 203 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 204 StringRef GPU, StringRef FS); 205 206 const SIInstrInfo *getInstrInfo() const override { 207 return &InstrInfo; 208 } 209 210 const SIFrameLowering *getFrameLowering() const override { 211 return &FrameLowering; 212 } 213 214 const SITargetLowering *getTargetLowering() const override { 215 return &TLInfo; 216 } 217 218 const SIRegisterInfo *getRegisterInfo() const override { 219 return &InstrInfo.getRegisterInfo(); 220 } 221 222 const CallLowering *getCallLowering() const override { 223 return CallLoweringInfo.get(); 224 } 225 226 const InlineAsmLowering *getInlineAsmLowering() const override { 227 return InlineAsmLoweringInfo.get(); 228 } 229 230 InstructionSelector *getInstructionSelector() const override { 231 return InstSelector.get(); 232 } 233 234 const LegalizerInfo *getLegalizerInfo() const override { 235 return Legalizer.get(); 236 } 237 238 const RegisterBankInfo *getRegBankInfo() const override { 239 return RegBankInfo.get(); 240 } 241 242 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 243 return TargetID; 244 } 245 246 // Nothing implemented, just prevent crashes on use. 247 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 248 return &TSInfo; 249 } 250 251 const InstrItineraryData *getInstrItineraryData() const override { 252 return &InstrItins; 253 } 254 255 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 256 257 Generation getGeneration() const { 258 return (Generation)Gen; 259 } 260 261 /// Return the number of high bits known to be zero for a frame index. 262 unsigned getKnownHighZeroBitsForFrameIndex() const { 263 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 264 } 265 266 int getLDSBankCount() const { 267 return LDSBankCount; 268 } 269 270 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 271 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 272 } 273 274 unsigned getConstantBusLimit(unsigned Opcode) const; 275 276 /// Returns if the result of this instruction with a 16-bit result returned in 277 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 278 /// the original value. 279 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 280 281 bool hasIntClamp() const { 282 return HasIntClamp; 283 } 284 285 bool hasFP64() const { 286 return FP64; 287 } 288 289 bool hasMIMG_R128() const { 290 return MIMG_R128; 291 } 292 293 bool hasHWFP64() const { 294 return FP64; 295 } 296 297 bool hasFastFMAF32() const { 298 return FastFMAF32; 299 } 300 301 bool hasHalfRate64Ops() const { 302 return HalfRate64Ops; 303 } 304 305 bool hasFullRate64Ops() const { 306 return FullRate64Ops; 307 } 308 309 bool hasAddr64() const { 310 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 311 } 312 313 bool hasFlat() const { 314 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 315 } 316 317 // Return true if the target only has the reverse operand versions of VALU 318 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 319 bool hasOnlyRevVALUShifts() const { 320 return getGeneration() >= VOLCANIC_ISLANDS; 321 } 322 323 bool hasFractBug() const { 324 return getGeneration() == SOUTHERN_ISLANDS; 325 } 326 327 bool hasBFE() const { 328 return true; 329 } 330 331 bool hasBFI() const { 332 return true; 333 } 334 335 bool hasBFM() const { 336 return hasBFE(); 337 } 338 339 bool hasBCNT(unsigned Size) const { 340 return true; 341 } 342 343 bool hasFFBL() const { 344 return true; 345 } 346 347 bool hasFFBH() const { 348 return true; 349 } 350 351 bool hasMed3_16() const { 352 return getGeneration() >= AMDGPUSubtarget::GFX9; 353 } 354 355 bool hasMin3Max3_16() const { 356 return getGeneration() >= AMDGPUSubtarget::GFX9; 357 } 358 359 bool hasFmaMixInsts() const { 360 return HasFmaMixInsts; 361 } 362 363 bool hasCARRY() const { 364 return true; 365 } 366 367 bool hasFMA() const { 368 return FMA; 369 } 370 371 bool hasSwap() const { 372 return GFX9Insts; 373 } 374 375 bool hasScalarPackInsts() const { 376 return GFX9Insts; 377 } 378 379 bool hasScalarMulHiInsts() const { 380 return GFX9Insts; 381 } 382 383 TrapHandlerAbi getTrapHandlerAbi() const { 384 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 385 } 386 387 bool supportsGetDoorbellID() const { 388 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 389 return getGeneration() >= GFX9; 390 } 391 392 /// True if the offset field of DS instructions works as expected. On SI, the 393 /// offset uses a 16-bit adder and does not always wrap properly. 394 bool hasUsableDSOffset() const { 395 return getGeneration() >= SEA_ISLANDS; 396 } 397 398 bool unsafeDSOffsetFoldingEnabled() const { 399 return EnableUnsafeDSOffsetFolding; 400 } 401 402 /// Condition output from div_scale is usable. 403 bool hasUsableDivScaleConditionOutput() const { 404 return getGeneration() != SOUTHERN_ISLANDS; 405 } 406 407 /// Extra wait hazard is needed in some cases before 408 /// s_cbranch_vccnz/s_cbranch_vccz. 409 bool hasReadVCCZBug() const { 410 return getGeneration() <= SEA_ISLANDS; 411 } 412 413 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 414 bool partialVCCWritesUpdateVCCZ() const { 415 return getGeneration() >= GFX10; 416 } 417 418 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 419 /// was written by a VALU instruction. 420 bool hasSMRDReadVALUDefHazard() const { 421 return getGeneration() == SOUTHERN_ISLANDS; 422 } 423 424 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 425 /// SGPR was written by a VALU Instruction. 426 bool hasVMEMReadSGPRVALUDefHazard() const { 427 return getGeneration() >= VOLCANIC_ISLANDS; 428 } 429 430 bool hasRFEHazards() const { 431 return getGeneration() >= VOLCANIC_ISLANDS; 432 } 433 434 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 435 unsigned getSetRegWaitStates() const { 436 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 437 } 438 439 bool dumpCode() const { 440 return DumpCode; 441 } 442 443 /// Return the amount of LDS that can be used that will not restrict the 444 /// occupancy lower than WaveCount. 445 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 446 const Function &) const; 447 448 bool supportsMinMaxDenormModes() const { 449 return getGeneration() >= AMDGPUSubtarget::GFX9; 450 } 451 452 /// \returns If target supports S_DENORM_MODE. 453 bool hasDenormModeInst() const { 454 return getGeneration() >= AMDGPUSubtarget::GFX10; 455 } 456 457 bool useFlatForGlobal() const { 458 return FlatForGlobal; 459 } 460 461 /// \returns If target supports ds_read/write_b128 and user enables generation 462 /// of ds_read/write_b128. 463 bool useDS128() const { 464 return CIInsts && EnableDS128; 465 } 466 467 /// \return If target supports ds_read/write_b96/128. 468 bool hasDS96AndDS128() const { 469 return CIInsts; 470 } 471 472 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 473 bool haveRoundOpsF64() const { 474 return CIInsts; 475 } 476 477 /// \returns If MUBUF instructions always perform range checking, even for 478 /// buffer resources used for private memory access. 479 bool privateMemoryResourceIsRangeChecked() const { 480 return getGeneration() < AMDGPUSubtarget::GFX9; 481 } 482 483 /// \returns If target requires PRT Struct NULL support (zero result registers 484 /// for sparse texture support). 485 bool usePRTStrictNull() const { 486 return EnablePRTStrictNull; 487 } 488 489 bool hasAutoWaitcntBeforeBarrier() const { 490 return AutoWaitcntBeforeBarrier; 491 } 492 493 bool hasUnalignedBufferAccess() const { 494 return UnalignedBufferAccess; 495 } 496 497 bool hasUnalignedBufferAccessEnabled() const { 498 return UnalignedBufferAccess && UnalignedAccessMode; 499 } 500 501 bool hasUnalignedDSAccess() const { 502 return UnalignedDSAccess; 503 } 504 505 bool hasUnalignedDSAccessEnabled() const { 506 return UnalignedDSAccess && UnalignedAccessMode; 507 } 508 509 bool hasUnalignedScratchAccess() const { 510 return UnalignedScratchAccess; 511 } 512 513 bool hasUnalignedAccessMode() const { 514 return UnalignedAccessMode; 515 } 516 517 bool hasApertureRegs() const { 518 return HasApertureRegs; 519 } 520 521 bool isTrapHandlerEnabled() const { 522 return TrapHandler; 523 } 524 525 bool isXNACKEnabled() const { 526 return TargetID.isXnackOnOrAny(); 527 } 528 529 bool isTgSplitEnabled() const { 530 return EnableTgSplit; 531 } 532 533 bool isCuModeEnabled() const { 534 return EnableCuMode; 535 } 536 537 bool hasFlatAddressSpace() const { 538 return FlatAddressSpace; 539 } 540 541 bool hasFlatScrRegister() const { 542 return hasFlatAddressSpace(); 543 } 544 545 bool hasFlatInstOffsets() const { 546 return FlatInstOffsets; 547 } 548 549 bool hasFlatGlobalInsts() const { 550 return FlatGlobalInsts; 551 } 552 553 bool hasFlatScratchInsts() const { 554 return FlatScratchInsts; 555 } 556 557 // Check if target supports ST addressing mode with FLAT scratch instructions. 558 // The ST addressing mode means no registers are used, either VGPR or SGPR, 559 // but only immediate offset is swizzled and added to the FLAT scratch base. 560 bool hasFlatScratchSTMode() const { 561 return hasFlatScratchInsts() && hasGFX10_3Insts(); 562 } 563 564 bool hasScalarFlatScratchInsts() const { 565 return ScalarFlatScratchInsts; 566 } 567 568 bool hasGlobalAddTidInsts() const { 569 return GFX10_BEncoding; 570 } 571 572 bool hasAtomicCSub() const { 573 return GFX10_BEncoding; 574 } 575 576 bool hasMultiDwordFlatScratchAddressing() const { 577 return getGeneration() >= GFX9; 578 } 579 580 bool hasFlatSegmentOffsetBug() const { 581 return HasFlatSegmentOffsetBug; 582 } 583 584 bool hasFlatLgkmVMemCountInOrder() const { 585 return getGeneration() > GFX9; 586 } 587 588 bool hasD16LoadStore() const { 589 return getGeneration() >= GFX9; 590 } 591 592 bool d16PreservesUnusedBits() const { 593 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 594 } 595 596 bool hasD16Images() const { 597 return getGeneration() >= VOLCANIC_ISLANDS; 598 } 599 600 /// Return if most LDS instructions have an m0 use that require m0 to be 601 /// initialized. 602 bool ldsRequiresM0Init() const { 603 return getGeneration() < GFX9; 604 } 605 606 // True if the hardware rewinds and replays GWS operations if a wave is 607 // preempted. 608 // 609 // If this is false, a GWS operation requires testing if a nack set the 610 // MEM_VIOL bit, and repeating if so. 611 bool hasGWSAutoReplay() const { 612 return getGeneration() >= GFX9; 613 } 614 615 /// \returns if target has ds_gws_sema_release_all instruction. 616 bool hasGWSSemaReleaseAll() const { 617 return CIInsts; 618 } 619 620 /// \returns true if the target has integer add/sub instructions that do not 621 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 622 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 623 /// for saturation. 624 bool hasAddNoCarry() const { 625 return AddNoCarryInsts; 626 } 627 628 bool hasUnpackedD16VMem() const { 629 return HasUnpackedD16VMem; 630 } 631 632 // Covers VS/PS/CS graphics shaders 633 bool isMesaGfxShader(const Function &F) const { 634 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 635 } 636 637 bool hasMad64_32() const { 638 return getGeneration() >= SEA_ISLANDS; 639 } 640 641 bool hasSDWAOmod() const { 642 return HasSDWAOmod; 643 } 644 645 bool hasSDWAScalar() const { 646 return HasSDWAScalar; 647 } 648 649 bool hasSDWASdst() const { 650 return HasSDWASdst; 651 } 652 653 bool hasSDWAMac() const { 654 return HasSDWAMac; 655 } 656 657 bool hasSDWAOutModsVOPC() const { 658 return HasSDWAOutModsVOPC; 659 } 660 661 bool hasDLInsts() const { 662 return HasDLInsts; 663 } 664 665 bool hasDot1Insts() const { 666 return HasDot1Insts; 667 } 668 669 bool hasDot2Insts() const { 670 return HasDot2Insts; 671 } 672 673 bool hasDot3Insts() const { 674 return HasDot3Insts; 675 } 676 677 bool hasDot4Insts() const { 678 return HasDot4Insts; 679 } 680 681 bool hasDot5Insts() const { 682 return HasDot5Insts; 683 } 684 685 bool hasDot6Insts() const { 686 return HasDot6Insts; 687 } 688 689 bool hasDot7Insts() const { 690 return HasDot7Insts; 691 } 692 693 bool hasMAIInsts() const { 694 return HasMAIInsts; 695 } 696 697 bool hasPkFmacF16Inst() const { 698 return HasPkFmacF16Inst; 699 } 700 701 bool hasAtomicFaddInsts() const { 702 return HasAtomicFaddInsts; 703 } 704 705 bool hasNoSdstCMPX() const { 706 return HasNoSdstCMPX; 707 } 708 709 bool hasVscnt() const { 710 return HasVscnt; 711 } 712 713 bool hasGetWaveIdInst() const { 714 return HasGetWaveIdInst; 715 } 716 717 bool hasSMemTimeInst() const { 718 return HasSMemTimeInst; 719 } 720 721 bool hasShaderCyclesRegister() const { 722 return HasShaderCyclesRegister; 723 } 724 725 bool hasVOP3Literal() const { 726 return HasVOP3Literal; 727 } 728 729 bool hasNoDataDepHazard() const { 730 return HasNoDataDepHazard; 731 } 732 733 bool vmemWriteNeedsExpWaitcnt() const { 734 return getGeneration() < SEA_ISLANDS; 735 } 736 737 // Scratch is allocated in 256 dword per wave blocks for the entire 738 // wavefront. When viewed from the perspective of an arbitrary workitem, this 739 // is 4-byte aligned. 740 // 741 // Only 4-byte alignment is really needed to access anything. Transformations 742 // on the pointer value itself may rely on the alignment / known low bits of 743 // the pointer. Set this to something above the minimum to avoid needing 744 // dynamic realignment in common cases. 745 Align getStackAlignment() const { return Align(16); } 746 747 bool enableMachineScheduler() const override { 748 return true; 749 } 750 751 bool useAA() const override; 752 753 bool enableSubRegLiveness() const override { 754 return true; 755 } 756 757 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 758 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 759 760 // static wrappers 761 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 762 763 // XXX - Why is this here if it isn't in the default pass set? 764 bool enableEarlyIfConversion() const override { 765 return true; 766 } 767 768 bool enableFlatScratch() const; 769 770 void overrideSchedPolicy(MachineSchedPolicy &Policy, 771 unsigned NumRegionInstrs) const override; 772 773 unsigned getMaxNumUserSGPRs() const { 774 return 16; 775 } 776 777 bool hasSMemRealTime() const { 778 return HasSMemRealTime; 779 } 780 781 bool hasMovrel() const { 782 return HasMovrel; 783 } 784 785 bool hasVGPRIndexMode() const { 786 return HasVGPRIndexMode; 787 } 788 789 bool useVGPRIndexMode() const; 790 791 bool hasScalarCompareEq64() const { 792 return getGeneration() >= VOLCANIC_ISLANDS; 793 } 794 795 bool hasScalarStores() const { 796 return HasScalarStores; 797 } 798 799 bool hasScalarAtomics() const { 800 return HasScalarAtomics; 801 } 802 803 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 804 805 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 806 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 807 808 bool hasDPP() const { 809 return HasDPP; 810 } 811 812 bool hasDPPBroadcasts() const { 813 return HasDPP && getGeneration() < GFX10; 814 } 815 816 bool hasDPPWavefrontShifts() const { 817 return HasDPP && getGeneration() < GFX10; 818 } 819 820 bool hasDPP8() const { 821 return HasDPP8; 822 } 823 824 bool has64BitDPP() const { 825 return Has64BitDPP; 826 } 827 828 bool hasPackedFP32Ops() const { 829 return HasPackedFP32Ops; 830 } 831 832 bool hasFmaakFmamkF32Insts() const { 833 return getGeneration() >= GFX10; 834 } 835 836 bool hasExtendedImageInsts() const { 837 return HasExtendedImageInsts; 838 } 839 840 bool hasR128A16() const { 841 return HasR128A16; 842 } 843 844 bool hasGFX10A16() const { 845 return HasGFX10A16; 846 } 847 848 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 849 850 bool hasG16() const { return HasG16; } 851 852 bool hasOffset3fBug() const { 853 return HasOffset3fBug; 854 } 855 856 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 857 858 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 859 860 bool hasNSAEncoding() const { return HasNSAEncoding; } 861 862 unsigned getNSAMaxSize() const { return NSAMaxSize; } 863 864 bool hasGFX10_AEncoding() const { 865 return GFX10_AEncoding; 866 } 867 868 bool hasGFX10_BEncoding() const { 869 return GFX10_BEncoding; 870 } 871 872 bool hasGFX10_3Insts() const { 873 return GFX10_3Insts; 874 } 875 876 bool hasMadF16() const; 877 878 bool enableSIScheduler() const { 879 return EnableSIScheduler; 880 } 881 882 bool loadStoreOptEnabled() const { 883 return EnableLoadStoreOpt; 884 } 885 886 bool hasSGPRInitBug() const { 887 return SGPRInitBug; 888 } 889 890 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 891 892 bool hasNegativeUnalignedScratchOffsetBug() const { 893 return NegativeUnalignedScratchOffsetBug; 894 } 895 896 bool hasMFMAInlineLiteralBug() const { 897 return HasMFMAInlineLiteralBug; 898 } 899 900 bool has12DWordStoreHazard() const { 901 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 902 } 903 904 // \returns true if the subtarget supports DWORDX3 load/store instructions. 905 bool hasDwordx3LoadStores() const { 906 return CIInsts; 907 } 908 909 bool hasReadM0MovRelInterpHazard() const { 910 return getGeneration() == AMDGPUSubtarget::GFX9; 911 } 912 913 bool hasReadM0SendMsgHazard() const { 914 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 915 getGeneration() <= AMDGPUSubtarget::GFX9; 916 } 917 918 bool hasVcmpxPermlaneHazard() const { 919 return HasVcmpxPermlaneHazard; 920 } 921 922 bool hasVMEMtoScalarWriteHazard() const { 923 return HasVMEMtoScalarWriteHazard; 924 } 925 926 bool hasSMEMtoVectorWriteHazard() const { 927 return HasSMEMtoVectorWriteHazard; 928 } 929 930 bool hasLDSMisalignedBug() const { 931 return LDSMisalignedBug && !EnableCuMode; 932 } 933 934 bool hasInstFwdPrefetchBug() const { 935 return HasInstFwdPrefetchBug; 936 } 937 938 bool hasVcmpxExecWARHazard() const { 939 return HasVcmpxExecWARHazard; 940 } 941 942 bool hasLdsBranchVmemWARHazard() const { 943 return HasLdsBranchVmemWARHazard; 944 } 945 946 bool hasNSAtoVMEMBug() const { 947 return HasNSAtoVMEMBug; 948 } 949 950 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 951 952 bool hasHardClauses() const { return getGeneration() >= GFX10; } 953 954 bool hasGFX90AInsts() const { return GFX90AInsts; } 955 956 /// Return if operations acting on VGPR tuples require even alignment. 957 bool needsAlignedVGPRs() const { return GFX90AInsts; } 958 959 bool hasPackedTID() const { return HasPackedTID; } 960 961 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 962 /// SGPRs 963 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 964 965 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 966 /// VGPRs 967 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 968 969 /// Return occupancy for the given function. Used LDS and a number of 970 /// registers if provided. 971 /// Note, occupancy can be affected by the scratch allocation as well, but 972 /// we do not have enough information to compute it. 973 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 974 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 975 976 /// \returns true if the flat_scratch register should be initialized with the 977 /// pointer to the wave's scratch memory rather than a size and offset. 978 bool flatScratchIsPointer() const { 979 return getGeneration() >= AMDGPUSubtarget::GFX9; 980 } 981 982 /// \returns true if the flat_scratch register is initialized by the HW. 983 /// In this case it is readonly. 984 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 985 986 /// \returns true if the machine has merged shaders in which s0-s7 are 987 /// reserved by the hardware and user SGPRs start at s8 988 bool hasMergedShaders() const { 989 return getGeneration() >= GFX9; 990 } 991 992 /// \returns SGPR allocation granularity supported by the subtarget. 993 unsigned getSGPRAllocGranule() const { 994 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 995 } 996 997 /// \returns SGPR encoding granularity supported by the subtarget. 998 unsigned getSGPREncodingGranule() const { 999 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1000 } 1001 1002 /// \returns Total number of SGPRs supported by the subtarget. 1003 unsigned getTotalNumSGPRs() const { 1004 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1005 } 1006 1007 /// \returns Addressable number of SGPRs supported by the subtarget. 1008 unsigned getAddressableNumSGPRs() const { 1009 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1010 } 1011 1012 /// \returns Minimum number of SGPRs that meets the given number of waves per 1013 /// execution unit requirement supported by the subtarget. 1014 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1015 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1016 } 1017 1018 /// \returns Maximum number of SGPRs that meets the given number of waves per 1019 /// execution unit requirement supported by the subtarget. 1020 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1021 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1022 } 1023 1024 /// \returns Reserved number of SGPRs. This is common 1025 /// utility function called by MachineFunction and 1026 /// Function variants of getReservedNumSGPRs. 1027 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1028 /// \returns Reserved number of SGPRs for given machine function \p MF. 1029 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1030 1031 /// \returns Reserved number of SGPRs for given function \p F. 1032 unsigned getReservedNumSGPRs(const Function &F) const; 1033 1034 /// \returns max num SGPRs. This is the common utility 1035 /// function called by MachineFunction and Function 1036 /// variants of getMaxNumSGPRs. 1037 unsigned getBaseMaxNumSGPRs(const Function &F, 1038 std::pair<unsigned, unsigned> WavesPerEU, 1039 unsigned PreloadedSGPRs, 1040 unsigned ReservedNumSGPRs) const; 1041 1042 /// \returns Maximum number of SGPRs that meets number of waves per execution 1043 /// unit requirement for function \p MF, or number of SGPRs explicitly 1044 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1045 /// 1046 /// \returns Value that meets number of waves per execution unit requirement 1047 /// if explicitly requested value cannot be converted to integer, violates 1048 /// subtarget's specifications, or does not meet number of waves per execution 1049 /// unit requirement. 1050 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1051 1052 /// \returns Maximum number of SGPRs that meets number of waves per execution 1053 /// unit requirement for function \p F, or number of SGPRs explicitly 1054 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1055 /// 1056 /// \returns Value that meets number of waves per execution unit requirement 1057 /// if explicitly requested value cannot be converted to integer, violates 1058 /// subtarget's specifications, or does not meet number of waves per execution 1059 /// unit requirement. 1060 unsigned getMaxNumSGPRs(const Function &F) const; 1061 1062 /// \returns VGPR allocation granularity supported by the subtarget. 1063 unsigned getVGPRAllocGranule() const { 1064 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1065 } 1066 1067 /// \returns VGPR encoding granularity supported by the subtarget. 1068 unsigned getVGPREncodingGranule() const { 1069 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1070 } 1071 1072 /// \returns Total number of VGPRs supported by the subtarget. 1073 unsigned getTotalNumVGPRs() const { 1074 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1075 } 1076 1077 /// \returns Addressable number of VGPRs supported by the subtarget. 1078 unsigned getAddressableNumVGPRs() const { 1079 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1080 } 1081 1082 /// \returns Minimum number of VGPRs that meets given number of waves per 1083 /// execution unit requirement supported by the subtarget. 1084 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1085 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1086 } 1087 1088 /// \returns Maximum number of VGPRs that meets given number of waves per 1089 /// execution unit requirement supported by the subtarget. 1090 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1091 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1092 } 1093 1094 /// \returns max num VGPRs. This is the common utility function 1095 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1096 unsigned getBaseMaxNumVGPRs(const Function &F, 1097 std::pair<unsigned, unsigned> WavesPerEU) const; 1098 /// \returns Maximum number of VGPRs that meets number of waves per execution 1099 /// unit requirement for function \p F, or number of VGPRs explicitly 1100 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1101 /// 1102 /// \returns Value that meets number of waves per execution unit requirement 1103 /// if explicitly requested value cannot be converted to integer, violates 1104 /// subtarget's specifications, or does not meet number of waves per execution 1105 /// unit requirement. 1106 unsigned getMaxNumVGPRs(const Function &F) const; 1107 1108 /// \returns Maximum number of VGPRs that meets number of waves per execution 1109 /// unit requirement for function \p MF, or number of VGPRs explicitly 1110 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1111 /// 1112 /// \returns Value that meets number of waves per execution unit requirement 1113 /// if explicitly requested value cannot be converted to integer, violates 1114 /// subtarget's specifications, or does not meet number of waves per execution 1115 /// unit requirement. 1116 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1117 1118 void getPostRAMutations( 1119 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1120 const override; 1121 1122 std::unique_ptr<ScheduleDAGMutation> 1123 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1124 1125 bool isWave32() const { 1126 return getWavefrontSize() == 32; 1127 } 1128 1129 bool isWave64() const { 1130 return getWavefrontSize() == 64; 1131 } 1132 1133 const TargetRegisterClass *getBoolRC() const { 1134 return getRegisterInfo()->getBoolRC(); 1135 } 1136 1137 /// \returns Maximum number of work groups per compute unit supported by the 1138 /// subtarget and limited by given \p FlatWorkGroupSize. 1139 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1140 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1141 } 1142 1143 /// \returns Minimum flat work group size supported by the subtarget. 1144 unsigned getMinFlatWorkGroupSize() const override { 1145 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1146 } 1147 1148 /// \returns Maximum flat work group size supported by the subtarget. 1149 unsigned getMaxFlatWorkGroupSize() const override { 1150 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1151 } 1152 1153 /// \returns Number of waves per execution unit required to support the given 1154 /// \p FlatWorkGroupSize. 1155 unsigned 1156 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1157 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1158 } 1159 1160 /// \returns Minimum number of waves per execution unit supported by the 1161 /// subtarget. 1162 unsigned getMinWavesPerEU() const override { 1163 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1164 } 1165 1166 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1167 SDep &Dep) const override; 1168 }; 1169 1170 } // end namespace llvm 1171 1172 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1173