1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "AMDGPU.h" 18 #include "AMDGPUCallLowering.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "R600FrameLowering.h" 21 #include "R600ISelLowering.h" 22 #include "R600InstrInfo.h" 23 #include "SIFrameLowering.h" 24 #include "SIISelLowering.h" 25 #include "SIInstrInfo.h" 26 #include "Utils/AMDGPUBaseInfo.h" 27 #include "llvm/ADT/Triple.h" 28 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" 29 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 30 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 31 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 32 #include "llvm/CodeGen/MachineFunction.h" 33 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 34 #include "llvm/MC/MCInstrItineraries.h" 35 #include "llvm/Support/MathExtras.h" 36 #include <cassert> 37 #include <cstdint> 38 #include <memory> 39 #include <utility> 40 41 #define GET_SUBTARGETINFO_HEADER 42 #include "AMDGPUGenSubtargetInfo.inc" 43 #define GET_SUBTARGETINFO_HEADER 44 #include "R600GenSubtargetInfo.inc" 45 46 namespace llvm { 47 48 class StringRef; 49 50 class AMDGPUSubtarget { 51 public: 52 enum Generation { 53 R600 = 0, 54 R700 = 1, 55 EVERGREEN = 2, 56 NORTHERN_ISLANDS = 3, 57 SOUTHERN_ISLANDS = 4, 58 SEA_ISLANDS = 5, 59 VOLCANIC_ISLANDS = 6, 60 GFX9 = 7, 61 GFX10 = 8 62 }; 63 64 private: 65 Triple TargetTriple; 66 67 protected: 68 bool Has16BitInsts; 69 bool HasMadMixInsts; 70 bool HasMadMacF32Insts; 71 bool HasDsSrc2Insts; 72 bool HasSDWA; 73 bool HasVOP3PInsts; 74 bool HasMulI24; 75 bool HasMulU24; 76 bool HasInv2PiInlineImm; 77 bool HasFminFmaxLegacy; 78 bool EnablePromoteAlloca; 79 bool HasTrigReducedRange; 80 unsigned MaxWavesPerEU; 81 int LocalMemorySize; 82 char WavefrontSizeLog2; 83 84 public: 85 AMDGPUSubtarget(const Triple &TT); 86 87 static const AMDGPUSubtarget &get(const MachineFunction &MF); 88 static const AMDGPUSubtarget &get(const TargetMachine &TM, 89 const Function &F); 90 91 /// \returns Default range flat work group size for a calling convention. 92 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 93 94 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 95 /// for function \p F, or minimum/maximum flat work group sizes explicitly 96 /// requested using "amdgpu-flat-work-group-size" attribute attached to 97 /// function \p F. 98 /// 99 /// \returns Subtarget's default values if explicitly requested values cannot 100 /// be converted to integer, or violate subtarget's specifications. 101 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 102 103 /// \returns Subtarget's default pair of minimum/maximum number of waves per 104 /// execution unit for function \p F, or minimum/maximum number of waves per 105 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 106 /// attached to function \p F. 107 /// 108 /// \returns Subtarget's default values if explicitly requested values cannot 109 /// be converted to integer, violate subtarget's specifications, or are not 110 /// compatible with minimum/maximum number of waves limited by flat work group 111 /// size, register usage, and/or lds usage. 112 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 113 114 /// Return the amount of LDS that can be used that will not restrict the 115 /// occupancy lower than WaveCount. 116 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 117 const Function &) const; 118 119 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 120 /// the given LDS memory size is the only constraint. 121 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 122 123 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 124 125 bool isAmdHsaOS() const { 126 return TargetTriple.getOS() == Triple::AMDHSA; 127 } 128 129 bool isAmdPalOS() const { 130 return TargetTriple.getOS() == Triple::AMDPAL; 131 } 132 133 bool isMesa3DOS() const { 134 return TargetTriple.getOS() == Triple::Mesa3D; 135 } 136 137 bool isMesaKernel(const Function &F) const { 138 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 139 } 140 141 bool isAmdHsaOrMesa(const Function &F) const { 142 return isAmdHsaOS() || isMesaKernel(F); 143 } 144 145 bool isGCN() const { 146 return TargetTriple.getArch() == Triple::amdgcn; 147 } 148 149 bool has16BitInsts() const { 150 return Has16BitInsts; 151 } 152 153 bool hasMadMixInsts() const { 154 return HasMadMixInsts; 155 } 156 157 bool hasMadMacF32Insts() const { 158 return HasMadMacF32Insts || !isGCN(); 159 } 160 161 bool hasDsSrc2Insts() const { 162 return HasDsSrc2Insts; 163 } 164 165 bool hasSDWA() const { 166 return HasSDWA; 167 } 168 169 bool hasVOP3PInsts() const { 170 return HasVOP3PInsts; 171 } 172 173 bool hasMulI24() const { 174 return HasMulI24; 175 } 176 177 bool hasMulU24() const { 178 return HasMulU24; 179 } 180 181 bool hasInv2PiInlineImm() const { 182 return HasInv2PiInlineImm; 183 } 184 185 bool hasFminFmaxLegacy() const { 186 return HasFminFmaxLegacy; 187 } 188 189 bool hasTrigReducedRange() const { 190 return HasTrigReducedRange; 191 } 192 193 bool isPromoteAllocaEnabled() const { 194 return EnablePromoteAlloca; 195 } 196 197 unsigned getWavefrontSize() const { 198 return 1 << WavefrontSizeLog2; 199 } 200 201 unsigned getWavefrontSizeLog2() const { 202 return WavefrontSizeLog2; 203 } 204 205 int getLocalMemorySize() const { 206 return LocalMemorySize; 207 } 208 209 Align getAlignmentForImplicitArgPtr() const { 210 return isAmdHsaOS() ? Align(8) : Align(4); 211 } 212 213 /// Returns the offset in bytes from the start of the input buffer 214 /// of the first explicit kernel argument. 215 unsigned getExplicitKernelArgOffset(const Function &F) const { 216 return isAmdHsaOrMesa(F) ? 0 : 36; 217 } 218 219 /// \returns Maximum number of work groups per compute unit supported by the 220 /// subtarget and limited by given \p FlatWorkGroupSize. 221 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 222 223 /// \returns Minimum flat work group size supported by the subtarget. 224 virtual unsigned getMinFlatWorkGroupSize() const = 0; 225 226 /// \returns Maximum flat work group size supported by the subtarget. 227 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 228 229 /// \returns Number of waves per execution unit required to support the given 230 /// \p FlatWorkGroupSize. 231 virtual unsigned 232 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; 233 234 /// \returns Minimum number of waves per execution unit supported by the 235 /// subtarget. 236 virtual unsigned getMinWavesPerEU() const = 0; 237 238 /// \returns Maximum number of waves per execution unit supported by the 239 /// subtarget without any kind of limitation. 240 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 241 242 /// Creates value range metadata on an workitemid.* inrinsic call or load. 243 bool makeLIDRangeMetadata(Instruction *I) const; 244 245 /// \returns Number of bytes of arguments that are passed to a shader or 246 /// kernel in addition to the explicit ones declared for the function. 247 unsigned getImplicitArgNumBytes(const Function &F) const { 248 if (isMesaKernel(F)) 249 return 16; 250 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 251 } 252 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 253 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 254 255 /// \returns Corresponsing DWARF register number mapping flavour for the 256 /// \p WavefrontSize. 257 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const { 258 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32 259 : AMDGPUDwarfFlavour::Wave64; 260 } 261 262 virtual ~AMDGPUSubtarget() {} 263 }; 264 265 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 266 public AMDGPUSubtarget { 267 268 using AMDGPUSubtarget::getMaxWavesPerEU; 269 270 public: 271 enum TrapHandlerAbi { 272 TrapHandlerAbiNone = 0, 273 TrapHandlerAbiHsa = 1 274 }; 275 276 enum TrapID { 277 TrapIDHardwareReserved = 0, 278 TrapIDHSADebugTrap = 1, 279 TrapIDLLVMTrap = 2, 280 TrapIDLLVMDebugTrap = 3, 281 TrapIDDebugBreakpoint = 7, 282 TrapIDDebugReserved8 = 8, 283 TrapIDDebugReservedFE = 0xfe, 284 TrapIDDebugReservedFF = 0xff 285 }; 286 287 enum TrapRegValues { 288 LLVMTrapHandlerRegValue = 1 289 }; 290 291 private: 292 /// GlobalISel related APIs. 293 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 294 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 295 std::unique_ptr<InstructionSelector> InstSelector; 296 std::unique_ptr<LegalizerInfo> Legalizer; 297 std::unique_ptr<RegisterBankInfo> RegBankInfo; 298 299 protected: 300 // Basic subtarget description. 301 Triple TargetTriple; 302 unsigned Gen; 303 InstrItineraryData InstrItins; 304 int LDSBankCount; 305 unsigned MaxPrivateElementSize; 306 307 // Possibly statically set by tablegen, but may want to be overridden. 308 bool FastFMAF32; 309 bool FastDenormalF32; 310 bool HalfRate64Ops; 311 312 // Dynamially set bits that enable features. 313 bool FlatForGlobal; 314 bool AutoWaitcntBeforeBarrier; 315 bool CodeObjectV3; 316 bool UnalignedScratchAccess; 317 bool UnalignedBufferAccess; 318 bool HasApertureRegs; 319 bool EnableXNACK; 320 bool DoesNotSupportXNACK; 321 bool EnableCuMode; 322 bool TrapHandler; 323 324 // Used as options. 325 bool EnableLoadStoreOpt; 326 bool EnableUnsafeDSOffsetFolding; 327 bool EnableSIScheduler; 328 bool EnableDS128; 329 bool EnablePRTStrictNull; 330 bool DumpCode; 331 332 // Subtarget statically properties set by tablegen 333 bool FP64; 334 bool FMA; 335 bool MIMG_R128; 336 bool IsGCN; 337 bool GCN3Encoding; 338 bool CIInsts; 339 bool GFX8Insts; 340 bool GFX9Insts; 341 bool GFX10Insts; 342 bool GFX10_3Insts; 343 bool GFX7GFX8GFX9Insts; 344 bool SGPRInitBug; 345 bool HasSMemRealTime; 346 bool HasIntClamp; 347 bool HasFmaMixInsts; 348 bool HasMovrel; 349 bool HasVGPRIndexMode; 350 bool HasScalarStores; 351 bool HasScalarAtomics; 352 bool HasSDWAOmod; 353 bool HasSDWAScalar; 354 bool HasSDWASdst; 355 bool HasSDWAMac; 356 bool HasSDWAOutModsVOPC; 357 bool HasDPP; 358 bool HasDPP8; 359 bool HasR128A16; 360 bool HasGFX10A16; 361 bool HasG16; 362 bool HasNSAEncoding; 363 bool GFX10_BEncoding; 364 bool HasDLInsts; 365 bool HasDot1Insts; 366 bool HasDot2Insts; 367 bool HasDot3Insts; 368 bool HasDot4Insts; 369 bool HasDot5Insts; 370 bool HasDot6Insts; 371 bool HasMAIInsts; 372 bool HasPkFmacF16Inst; 373 bool HasAtomicFaddInsts; 374 bool EnableSRAMECC; 375 bool DoesNotSupportSRAMECC; 376 bool HasNoSdstCMPX; 377 bool HasVscnt; 378 bool HasGetWaveIdInst; 379 bool HasSMemTimeInst; 380 bool HasRegisterBanking; 381 bool HasVOP3Literal; 382 bool HasNoDataDepHazard; 383 bool FlatAddressSpace; 384 bool FlatInstOffsets; 385 bool FlatGlobalInsts; 386 bool FlatScratchInsts; 387 bool ScalarFlatScratchInsts; 388 bool AddNoCarryInsts; 389 bool HasUnpackedD16VMem; 390 bool R600ALUInst; 391 bool CaymanISA; 392 bool CFALUBug; 393 bool LDSMisalignedBug; 394 bool HasMFMAInlineLiteralBug; 395 bool HasVertexCache; 396 short TexVTXClauseSize; 397 bool ScalarizeGlobal; 398 399 bool HasVcmpxPermlaneHazard; 400 bool HasVMEMtoScalarWriteHazard; 401 bool HasSMEMtoVectorWriteHazard; 402 bool HasInstFwdPrefetchBug; 403 bool HasVcmpxExecWARHazard; 404 bool HasLdsBranchVmemWARHazard; 405 bool HasNSAtoVMEMBug; 406 bool HasOffset3fBug; 407 bool HasFlatSegmentOffsetBug; 408 409 // Dummy feature to use for assembler in tablegen. 410 bool FeatureDisable; 411 412 SelectionDAGTargetInfo TSInfo; 413 private: 414 SIInstrInfo InstrInfo; 415 SITargetLowering TLInfo; 416 SIFrameLowering FrameLowering; 417 418 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 419 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 420 421 public: 422 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 423 const GCNTargetMachine &TM); 424 ~GCNSubtarget() override; 425 426 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 427 StringRef GPU, StringRef FS); 428 429 const SIInstrInfo *getInstrInfo() const override { 430 return &InstrInfo; 431 } 432 433 const SIFrameLowering *getFrameLowering() const override { 434 return &FrameLowering; 435 } 436 437 const SITargetLowering *getTargetLowering() const override { 438 return &TLInfo; 439 } 440 441 const SIRegisterInfo *getRegisterInfo() const override { 442 return &InstrInfo.getRegisterInfo(); 443 } 444 445 const CallLowering *getCallLowering() const override { 446 return CallLoweringInfo.get(); 447 } 448 449 const InlineAsmLowering *getInlineAsmLowering() const override { 450 return InlineAsmLoweringInfo.get(); 451 } 452 453 InstructionSelector *getInstructionSelector() const override { 454 return InstSelector.get(); 455 } 456 457 const LegalizerInfo *getLegalizerInfo() const override { 458 return Legalizer.get(); 459 } 460 461 const RegisterBankInfo *getRegBankInfo() const override { 462 return RegBankInfo.get(); 463 } 464 465 // Nothing implemented, just prevent crashes on use. 466 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 467 return &TSInfo; 468 } 469 470 const InstrItineraryData *getInstrItineraryData() const override { 471 return &InstrItins; 472 } 473 474 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 475 476 Generation getGeneration() const { 477 return (Generation)Gen; 478 } 479 480 /// Return the number of high bits known to be zero fror a frame index. 481 unsigned getKnownHighZeroBitsForFrameIndex() const { 482 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 483 } 484 485 int getLDSBankCount() const { 486 return LDSBankCount; 487 } 488 489 unsigned getMaxPrivateElementSize() const { 490 return MaxPrivateElementSize; 491 } 492 493 unsigned getConstantBusLimit(unsigned Opcode) const; 494 495 bool hasIntClamp() const { 496 return HasIntClamp; 497 } 498 499 bool hasFP64() const { 500 return FP64; 501 } 502 503 bool hasMIMG_R128() const { 504 return MIMG_R128; 505 } 506 507 bool hasHWFP64() const { 508 return FP64; 509 } 510 511 bool hasFastFMAF32() const { 512 return FastFMAF32; 513 } 514 515 bool hasHalfRate64Ops() const { 516 return HalfRate64Ops; 517 } 518 519 bool hasAddr64() const { 520 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 521 } 522 523 // Return true if the target only has the reverse operand versions of VALU 524 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 525 bool hasOnlyRevVALUShifts() const { 526 return getGeneration() >= VOLCANIC_ISLANDS; 527 } 528 529 bool hasFractBug() const { 530 return getGeneration() == SOUTHERN_ISLANDS; 531 } 532 533 bool hasBFE() const { 534 return true; 535 } 536 537 bool hasBFI() const { 538 return true; 539 } 540 541 bool hasBFM() const { 542 return hasBFE(); 543 } 544 545 bool hasBCNT(unsigned Size) const { 546 return true; 547 } 548 549 bool hasFFBL() const { 550 return true; 551 } 552 553 bool hasFFBH() const { 554 return true; 555 } 556 557 bool hasMed3_16() const { 558 return getGeneration() >= AMDGPUSubtarget::GFX9; 559 } 560 561 bool hasMin3Max3_16() const { 562 return getGeneration() >= AMDGPUSubtarget::GFX9; 563 } 564 565 bool hasFmaMixInsts() const { 566 return HasFmaMixInsts; 567 } 568 569 bool hasCARRY() const { 570 return true; 571 } 572 573 bool hasFMA() const { 574 return FMA; 575 } 576 577 bool hasSwap() const { 578 return GFX9Insts; 579 } 580 581 bool hasScalarPackInsts() const { 582 return GFX9Insts; 583 } 584 585 bool hasScalarMulHiInsts() const { 586 return GFX9Insts; 587 } 588 589 TrapHandlerAbi getTrapHandlerAbi() const { 590 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 591 } 592 593 /// True if the offset field of DS instructions works as expected. On SI, the 594 /// offset uses a 16-bit adder and does not always wrap properly. 595 bool hasUsableDSOffset() const { 596 return getGeneration() >= SEA_ISLANDS; 597 } 598 599 bool unsafeDSOffsetFoldingEnabled() const { 600 return EnableUnsafeDSOffsetFolding; 601 } 602 603 /// Condition output from div_scale is usable. 604 bool hasUsableDivScaleConditionOutput() const { 605 return getGeneration() != SOUTHERN_ISLANDS; 606 } 607 608 /// Extra wait hazard is needed in some cases before 609 /// s_cbranch_vccnz/s_cbranch_vccz. 610 bool hasReadVCCZBug() const { 611 return getGeneration() <= SEA_ISLANDS; 612 } 613 614 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 615 bool partialVCCWritesUpdateVCCZ() const { 616 return getGeneration() >= GFX10; 617 } 618 619 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 620 /// was written by a VALU instruction. 621 bool hasSMRDReadVALUDefHazard() const { 622 return getGeneration() == SOUTHERN_ISLANDS; 623 } 624 625 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 626 /// SGPR was written by a VALU Instruction. 627 bool hasVMEMReadSGPRVALUDefHazard() const { 628 return getGeneration() >= VOLCANIC_ISLANDS; 629 } 630 631 bool hasRFEHazards() const { 632 return getGeneration() >= VOLCANIC_ISLANDS; 633 } 634 635 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 636 unsigned getSetRegWaitStates() const { 637 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 638 } 639 640 bool dumpCode() const { 641 return DumpCode; 642 } 643 644 /// Return the amount of LDS that can be used that will not restrict the 645 /// occupancy lower than WaveCount. 646 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 647 const Function &) const; 648 649 bool supportsMinMaxDenormModes() const { 650 return getGeneration() >= AMDGPUSubtarget::GFX9; 651 } 652 653 /// \returns If target supports S_DENORM_MODE. 654 bool hasDenormModeInst() const { 655 return getGeneration() >= AMDGPUSubtarget::GFX10; 656 } 657 658 bool useFlatForGlobal() const { 659 return FlatForGlobal; 660 } 661 662 /// \returns If target supports ds_read/write_b128 and user enables generation 663 /// of ds_read/write_b128. 664 bool useDS128() const { 665 return CIInsts && EnableDS128; 666 } 667 668 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 669 bool haveRoundOpsF64() const { 670 return CIInsts; 671 } 672 673 /// \returns If MUBUF instructions always perform range checking, even for 674 /// buffer resources used for private memory access. 675 bool privateMemoryResourceIsRangeChecked() const { 676 return getGeneration() < AMDGPUSubtarget::GFX9; 677 } 678 679 /// \returns If target requires PRT Struct NULL support (zero result registers 680 /// for sparse texture support). 681 bool usePRTStrictNull() const { 682 return EnablePRTStrictNull; 683 } 684 685 bool hasAutoWaitcntBeforeBarrier() const { 686 return AutoWaitcntBeforeBarrier; 687 } 688 689 bool hasCodeObjectV3() const { 690 // FIXME: Need to add code object v3 support for mesa and pal. 691 return isAmdHsaOS() ? CodeObjectV3 : false; 692 } 693 694 bool hasUnalignedBufferAccess() const { 695 return UnalignedBufferAccess; 696 } 697 698 bool hasUnalignedScratchAccess() const { 699 return UnalignedScratchAccess; 700 } 701 702 bool hasApertureRegs() const { 703 return HasApertureRegs; 704 } 705 706 bool isTrapHandlerEnabled() const { 707 return TrapHandler; 708 } 709 710 bool isXNACKEnabled() const { 711 return EnableXNACK; 712 } 713 714 bool isCuModeEnabled() const { 715 return EnableCuMode; 716 } 717 718 bool hasFlatAddressSpace() const { 719 return FlatAddressSpace; 720 } 721 722 bool hasFlatScrRegister() const { 723 return hasFlatAddressSpace(); 724 } 725 726 bool hasFlatInstOffsets() const { 727 return FlatInstOffsets; 728 } 729 730 bool hasFlatGlobalInsts() const { 731 return FlatGlobalInsts; 732 } 733 734 bool hasFlatScratchInsts() const { 735 return FlatScratchInsts; 736 } 737 738 bool hasScalarFlatScratchInsts() const { 739 return ScalarFlatScratchInsts; 740 } 741 742 bool hasGlobalAddTidInsts() const { 743 return GFX10_BEncoding; 744 } 745 746 bool hasAtomicCSub() const { 747 return GFX10_BEncoding; 748 } 749 750 bool hasMultiDwordFlatScratchAddressing() const { 751 return getGeneration() >= GFX9; 752 } 753 754 bool hasFlatSegmentOffsetBug() const { 755 return HasFlatSegmentOffsetBug; 756 } 757 758 bool hasFlatLgkmVMemCountInOrder() const { 759 return getGeneration() > GFX9; 760 } 761 762 bool hasD16LoadStore() const { 763 return getGeneration() >= GFX9; 764 } 765 766 bool d16PreservesUnusedBits() const { 767 return hasD16LoadStore() && !isSRAMECCEnabled(); 768 } 769 770 bool hasD16Images() const { 771 return getGeneration() >= VOLCANIC_ISLANDS; 772 } 773 774 /// Return if most LDS instructions have an m0 use that require m0 to be 775 /// iniitalized. 776 bool ldsRequiresM0Init() const { 777 return getGeneration() < GFX9; 778 } 779 780 // True if the hardware rewinds and replays GWS operations if a wave is 781 // preempted. 782 // 783 // If this is false, a GWS operation requires testing if a nack set the 784 // MEM_VIOL bit, and repeating if so. 785 bool hasGWSAutoReplay() const { 786 return getGeneration() >= GFX9; 787 } 788 789 /// \returns if target has ds_gws_sema_release_all instruction. 790 bool hasGWSSemaReleaseAll() const { 791 return CIInsts; 792 } 793 794 bool hasAddNoCarry() const { 795 return AddNoCarryInsts; 796 } 797 798 bool hasUnpackedD16VMem() const { 799 return HasUnpackedD16VMem; 800 } 801 802 // Covers VS/PS/CS graphics shaders 803 bool isMesaGfxShader(const Function &F) const { 804 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 805 } 806 807 bool hasMad64_32() const { 808 return getGeneration() >= SEA_ISLANDS; 809 } 810 811 bool hasSDWAOmod() const { 812 return HasSDWAOmod; 813 } 814 815 bool hasSDWAScalar() const { 816 return HasSDWAScalar; 817 } 818 819 bool hasSDWASdst() const { 820 return HasSDWASdst; 821 } 822 823 bool hasSDWAMac() const { 824 return HasSDWAMac; 825 } 826 827 bool hasSDWAOutModsVOPC() const { 828 return HasSDWAOutModsVOPC; 829 } 830 831 bool hasDLInsts() const { 832 return HasDLInsts; 833 } 834 835 bool hasDot1Insts() const { 836 return HasDot1Insts; 837 } 838 839 bool hasDot2Insts() const { 840 return HasDot2Insts; 841 } 842 843 bool hasDot3Insts() const { 844 return HasDot3Insts; 845 } 846 847 bool hasDot4Insts() const { 848 return HasDot4Insts; 849 } 850 851 bool hasDot5Insts() const { 852 return HasDot5Insts; 853 } 854 855 bool hasDot6Insts() const { 856 return HasDot6Insts; 857 } 858 859 bool hasMAIInsts() const { 860 return HasMAIInsts; 861 } 862 863 bool hasPkFmacF16Inst() const { 864 return HasPkFmacF16Inst; 865 } 866 867 bool hasAtomicFaddInsts() const { 868 return HasAtomicFaddInsts; 869 } 870 871 bool isSRAMECCEnabled() const { 872 return EnableSRAMECC; 873 } 874 875 bool hasNoSdstCMPX() const { 876 return HasNoSdstCMPX; 877 } 878 879 bool hasVscnt() const { 880 return HasVscnt; 881 } 882 883 bool hasGetWaveIdInst() const { 884 return HasGetWaveIdInst; 885 } 886 887 bool hasSMemTimeInst() const { 888 return HasSMemTimeInst; 889 } 890 891 bool hasRegisterBanking() const { 892 return HasRegisterBanking; 893 } 894 895 bool hasVOP3Literal() const { 896 return HasVOP3Literal; 897 } 898 899 bool hasNoDataDepHazard() const { 900 return HasNoDataDepHazard; 901 } 902 903 bool vmemWriteNeedsExpWaitcnt() const { 904 return getGeneration() < SEA_ISLANDS; 905 } 906 907 // Scratch is allocated in 256 dword per wave blocks for the entire 908 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 909 // is 4-byte aligned. 910 // 911 // Only 4-byte alignment is really needed to access anything. Transformations 912 // on the pointer value itself may rely on the alignment / known low bits of 913 // the pointer. Set this to something above the minimum to avoid needing 914 // dynamic realignment in common cases. 915 Align getStackAlignment() const { return Align(16); } 916 917 bool enableMachineScheduler() const override { 918 return true; 919 } 920 921 bool enableSubRegLiveness() const override { 922 return true; 923 } 924 925 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 926 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 927 928 // static wrappers 929 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 930 931 // XXX - Why is this here if it isn't in the default pass set? 932 bool enableEarlyIfConversion() const override { 933 return true; 934 } 935 936 void overrideSchedPolicy(MachineSchedPolicy &Policy, 937 unsigned NumRegionInstrs) const override; 938 939 unsigned getMaxNumUserSGPRs() const { 940 return 16; 941 } 942 943 bool hasSMemRealTime() const { 944 return HasSMemRealTime; 945 } 946 947 bool hasMovrel() const { 948 return HasMovrel; 949 } 950 951 bool hasVGPRIndexMode() const { 952 return HasVGPRIndexMode; 953 } 954 955 bool useVGPRIndexMode() const; 956 957 bool hasScalarCompareEq64() const { 958 return getGeneration() >= VOLCANIC_ISLANDS; 959 } 960 961 bool hasScalarStores() const { 962 return HasScalarStores; 963 } 964 965 bool hasScalarAtomics() const { 966 return HasScalarAtomics; 967 } 968 969 bool hasLDSFPAtomics() const { 970 return GFX8Insts; 971 } 972 973 bool hasDPP() const { 974 return HasDPP; 975 } 976 977 bool hasDPPBroadcasts() const { 978 return HasDPP && getGeneration() < GFX10; 979 } 980 981 bool hasDPPWavefrontShifts() const { 982 return HasDPP && getGeneration() < GFX10; 983 } 984 985 bool hasDPP8() const { 986 return HasDPP8; 987 } 988 989 bool hasR128A16() const { 990 return HasR128A16; 991 } 992 993 bool hasGFX10A16() const { 994 return HasGFX10A16; 995 } 996 997 bool hasA16() const { return hasR128A16() || hasGFX10A16(); } 998 999 bool hasG16() const { return HasG16; } 1000 1001 bool hasOffset3fBug() const { 1002 return HasOffset3fBug; 1003 } 1004 1005 bool hasNSAEncoding() const { 1006 return HasNSAEncoding; 1007 } 1008 1009 bool hasGFX10_BEncoding() const { 1010 return GFX10_BEncoding; 1011 } 1012 1013 bool hasGFX10_3Insts() const { 1014 return GFX10_3Insts; 1015 } 1016 1017 bool hasMadF16() const; 1018 1019 bool enableSIScheduler() const { 1020 return EnableSIScheduler; 1021 } 1022 1023 bool loadStoreOptEnabled() const { 1024 return EnableLoadStoreOpt; 1025 } 1026 1027 bool hasSGPRInitBug() const { 1028 return SGPRInitBug; 1029 } 1030 1031 bool hasMFMAInlineLiteralBug() const { 1032 return HasMFMAInlineLiteralBug; 1033 } 1034 1035 bool has12DWordStoreHazard() const { 1036 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1037 } 1038 1039 // \returns true if the subtarget supports DWORDX3 load/store instructions. 1040 bool hasDwordx3LoadStores() const { 1041 return CIInsts; 1042 } 1043 1044 bool hasSMovFedHazard() const { 1045 return getGeneration() == AMDGPUSubtarget::GFX9; 1046 } 1047 1048 bool hasReadM0MovRelInterpHazard() const { 1049 return getGeneration() == AMDGPUSubtarget::GFX9; 1050 } 1051 1052 bool hasReadM0SendMsgHazard() const { 1053 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1054 getGeneration() <= AMDGPUSubtarget::GFX9; 1055 } 1056 1057 bool hasVcmpxPermlaneHazard() const { 1058 return HasVcmpxPermlaneHazard; 1059 } 1060 1061 bool hasVMEMtoScalarWriteHazard() const { 1062 return HasVMEMtoScalarWriteHazard; 1063 } 1064 1065 bool hasSMEMtoVectorWriteHazard() const { 1066 return HasSMEMtoVectorWriteHazard; 1067 } 1068 1069 bool hasLDSMisalignedBug() const { 1070 return LDSMisalignedBug && !EnableCuMode; 1071 } 1072 1073 bool hasInstFwdPrefetchBug() const { 1074 return HasInstFwdPrefetchBug; 1075 } 1076 1077 bool hasVcmpxExecWARHazard() const { 1078 return HasVcmpxExecWARHazard; 1079 } 1080 1081 bool hasLdsBranchVmemWARHazard() const { 1082 return HasLdsBranchVmemWARHazard; 1083 } 1084 1085 bool hasNSAtoVMEMBug() const { 1086 return HasNSAtoVMEMBug; 1087 } 1088 1089 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1090 1091 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1092 /// SGPRs 1093 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1094 1095 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1096 /// VGPRs 1097 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1098 1099 /// Return occupancy for the given function. Used LDS and a number of 1100 /// registers if provided. 1101 /// Note, occupancy can be affected by the scratch allocation as well, but 1102 /// we do not have enough information to compute it. 1103 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1104 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1105 1106 /// \returns true if the flat_scratch register should be initialized with the 1107 /// pointer to the wave's scratch memory rather than a size and offset. 1108 bool flatScratchIsPointer() const { 1109 return getGeneration() >= AMDGPUSubtarget::GFX9; 1110 } 1111 1112 /// \returns true if the machine has merged shaders in which s0-s7 are 1113 /// reserved by the hardware and user SGPRs start at s8 1114 bool hasMergedShaders() const { 1115 return getGeneration() >= GFX9; 1116 } 1117 1118 /// \returns SGPR allocation granularity supported by the subtarget. 1119 unsigned getSGPRAllocGranule() const { 1120 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1121 } 1122 1123 /// \returns SGPR encoding granularity supported by the subtarget. 1124 unsigned getSGPREncodingGranule() const { 1125 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1126 } 1127 1128 /// \returns Total number of SGPRs supported by the subtarget. 1129 unsigned getTotalNumSGPRs() const { 1130 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1131 } 1132 1133 /// \returns Addressable number of SGPRs supported by the subtarget. 1134 unsigned getAddressableNumSGPRs() const { 1135 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1136 } 1137 1138 /// \returns Minimum number of SGPRs that meets the given number of waves per 1139 /// execution unit requirement supported by the subtarget. 1140 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1141 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1142 } 1143 1144 /// \returns Maximum number of SGPRs that meets the given number of waves per 1145 /// execution unit requirement supported by the subtarget. 1146 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1147 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1148 } 1149 1150 /// \returns Reserved number of SGPRs for given function \p MF. 1151 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1152 1153 /// \returns Maximum number of SGPRs that meets number of waves per execution 1154 /// unit requirement for function \p MF, or number of SGPRs explicitly 1155 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1156 /// 1157 /// \returns Value that meets number of waves per execution unit requirement 1158 /// if explicitly requested value cannot be converted to integer, violates 1159 /// subtarget's specifications, or does not meet number of waves per execution 1160 /// unit requirement. 1161 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1162 1163 /// \returns VGPR allocation granularity supported by the subtarget. 1164 unsigned getVGPRAllocGranule() const { 1165 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1166 } 1167 1168 /// \returns VGPR encoding granularity supported by the subtarget. 1169 unsigned getVGPREncodingGranule() const { 1170 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1171 } 1172 1173 /// \returns Total number of VGPRs supported by the subtarget. 1174 unsigned getTotalNumVGPRs() const { 1175 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1176 } 1177 1178 /// \returns Addressable number of VGPRs supported by the subtarget. 1179 unsigned getAddressableNumVGPRs() const { 1180 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1181 } 1182 1183 /// \returns Minimum number of VGPRs that meets given number of waves per 1184 /// execution unit requirement supported by the subtarget. 1185 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1186 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1187 } 1188 1189 /// \returns Maximum number of VGPRs that meets given number of waves per 1190 /// execution unit requirement supported by the subtarget. 1191 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1192 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1193 } 1194 1195 /// \returns Maximum number of VGPRs that meets number of waves per execution 1196 /// unit requirement for function \p MF, or number of VGPRs explicitly 1197 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1198 /// 1199 /// \returns Value that meets number of waves per execution unit requirement 1200 /// if explicitly requested value cannot be converted to integer, violates 1201 /// subtarget's specifications, or does not meet number of waves per execution 1202 /// unit requirement. 1203 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1204 1205 void getPostRAMutations( 1206 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1207 const override; 1208 1209 bool isWave32() const { 1210 return getWavefrontSize() == 32; 1211 } 1212 1213 const TargetRegisterClass *getBoolRC() const { 1214 return getRegisterInfo()->getBoolRC(); 1215 } 1216 1217 /// \returns Maximum number of work groups per compute unit supported by the 1218 /// subtarget and limited by given \p FlatWorkGroupSize. 1219 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1220 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1221 } 1222 1223 /// \returns Minimum flat work group size supported by the subtarget. 1224 unsigned getMinFlatWorkGroupSize() const override { 1225 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1226 } 1227 1228 /// \returns Maximum flat work group size supported by the subtarget. 1229 unsigned getMaxFlatWorkGroupSize() const override { 1230 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1231 } 1232 1233 /// \returns Number of waves per execution unit required to support the given 1234 /// \p FlatWorkGroupSize. 1235 unsigned 1236 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1237 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1238 } 1239 1240 /// \returns Minimum number of waves per execution unit supported by the 1241 /// subtarget. 1242 unsigned getMinWavesPerEU() const override { 1243 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1244 } 1245 1246 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1247 SDep &Dep) const override; 1248 }; 1249 1250 class R600Subtarget final : public R600GenSubtargetInfo, 1251 public AMDGPUSubtarget { 1252 private: 1253 R600InstrInfo InstrInfo; 1254 R600FrameLowering FrameLowering; 1255 bool FMA; 1256 bool CaymanISA; 1257 bool CFALUBug; 1258 bool HasVertexCache; 1259 bool R600ALUInst; 1260 bool FP64; 1261 short TexVTXClauseSize; 1262 Generation Gen; 1263 R600TargetLowering TLInfo; 1264 InstrItineraryData InstrItins; 1265 SelectionDAGTargetInfo TSInfo; 1266 1267 public: 1268 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 1269 const TargetMachine &TM); 1270 1271 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 1272 1273 const R600FrameLowering *getFrameLowering() const override { 1274 return &FrameLowering; 1275 } 1276 1277 const R600TargetLowering *getTargetLowering() const override { 1278 return &TLInfo; 1279 } 1280 1281 const R600RegisterInfo *getRegisterInfo() const override { 1282 return &InstrInfo.getRegisterInfo(); 1283 } 1284 1285 const InstrItineraryData *getInstrItineraryData() const override { 1286 return &InstrItins; 1287 } 1288 1289 // Nothing implemented, just prevent crashes on use. 1290 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 1291 return &TSInfo; 1292 } 1293 1294 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 1295 1296 Generation getGeneration() const { 1297 return Gen; 1298 } 1299 1300 Align getStackAlignment() const { return Align(4); } 1301 1302 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1303 StringRef GPU, StringRef FS); 1304 1305 bool hasBFE() const { 1306 return (getGeneration() >= EVERGREEN); 1307 } 1308 1309 bool hasBFI() const { 1310 return (getGeneration() >= EVERGREEN); 1311 } 1312 1313 bool hasBCNT(unsigned Size) const { 1314 if (Size == 32) 1315 return (getGeneration() >= EVERGREEN); 1316 1317 return false; 1318 } 1319 1320 bool hasBORROW() const { 1321 return (getGeneration() >= EVERGREEN); 1322 } 1323 1324 bool hasCARRY() const { 1325 return (getGeneration() >= EVERGREEN); 1326 } 1327 1328 bool hasCaymanISA() const { 1329 return CaymanISA; 1330 } 1331 1332 bool hasFFBL() const { 1333 return (getGeneration() >= EVERGREEN); 1334 } 1335 1336 bool hasFFBH() const { 1337 return (getGeneration() >= EVERGREEN); 1338 } 1339 1340 bool hasFMA() const { return FMA; } 1341 1342 bool hasCFAluBug() const { return CFALUBug; } 1343 1344 bool hasVertexCache() const { return HasVertexCache; } 1345 1346 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1347 1348 bool enableMachineScheduler() const override { 1349 return true; 1350 } 1351 1352 bool enableSubRegLiveness() const override { 1353 return true; 1354 } 1355 1356 /// \returns Maximum number of work groups per compute unit supported by the 1357 /// subtarget and limited by given \p FlatWorkGroupSize. 1358 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1359 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1360 } 1361 1362 /// \returns Minimum flat work group size supported by the subtarget. 1363 unsigned getMinFlatWorkGroupSize() const override { 1364 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1365 } 1366 1367 /// \returns Maximum flat work group size supported by the subtarget. 1368 unsigned getMaxFlatWorkGroupSize() const override { 1369 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1370 } 1371 1372 /// \returns Number of waves per execution unit required to support the given 1373 /// \p FlatWorkGroupSize. 1374 unsigned 1375 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1376 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1377 } 1378 1379 /// \returns Minimum number of waves per execution unit supported by the 1380 /// subtarget. 1381 unsigned getMinWavesPerEU() const override { 1382 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1383 } 1384 }; 1385 1386 } // end namespace llvm 1387 1388 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1389