1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "AMDGPU.h" 18 #include "AMDGPUCallLowering.h" 19 #include "R600FrameLowering.h" 20 #include "R600ISelLowering.h" 21 #include "R600InstrInfo.h" 22 #include "SIFrameLowering.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "Utils/AMDGPUBaseInfo.h" 26 #include "llvm/ADT/Triple.h" 27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 28 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 29 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 32 #include "llvm/MC/MCInstrItineraries.h" 33 #include "llvm/Support/MathExtras.h" 34 #include <cassert> 35 #include <cstdint> 36 #include <memory> 37 #include <utility> 38 39 #define GET_SUBTARGETINFO_HEADER 40 #include "AMDGPUGenSubtargetInfo.inc" 41 #define GET_SUBTARGETINFO_HEADER 42 #include "R600GenSubtargetInfo.inc" 43 44 namespace llvm { 45 46 class StringRef; 47 48 class AMDGPUSubtarget { 49 public: 50 enum Generation { 51 R600 = 0, 52 R700 = 1, 53 EVERGREEN = 2, 54 NORTHERN_ISLANDS = 3, 55 SOUTHERN_ISLANDS = 4, 56 SEA_ISLANDS = 5, 57 VOLCANIC_ISLANDS = 6, 58 GFX9 = 7, 59 GFX10 = 8 60 }; 61 62 private: 63 Triple TargetTriple; 64 65 protected: 66 bool Has16BitInsts; 67 bool HasMadMixInsts; 68 bool FP32Denormals; 69 bool FPExceptions; 70 bool HasSDWA; 71 bool HasVOP3PInsts; 72 bool HasMulI24; 73 bool HasMulU24; 74 bool HasInv2PiInlineImm; 75 bool HasFminFmaxLegacy; 76 bool EnablePromoteAlloca; 77 bool HasTrigReducedRange; 78 unsigned MaxWavesPerEU; 79 int LocalMemorySize; 80 unsigned WavefrontSize; 81 82 public: 83 AMDGPUSubtarget(const Triple &TT); 84 85 static const AMDGPUSubtarget &get(const MachineFunction &MF); 86 static const AMDGPUSubtarget &get(const TargetMachine &TM, 87 const Function &F); 88 89 /// \returns Default range flat work group size for a calling convention. 90 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 91 92 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 93 /// for function \p F, or minimum/maximum flat work group sizes explicitly 94 /// requested using "amdgpu-flat-work-group-size" attribute attached to 95 /// function \p F. 96 /// 97 /// \returns Subtarget's default values if explicitly requested values cannot 98 /// be converted to integer, or violate subtarget's specifications. 99 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 100 101 /// \returns Subtarget's default pair of minimum/maximum number of waves per 102 /// execution unit for function \p F, or minimum/maximum number of waves per 103 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 104 /// attached to function \p F. 105 /// 106 /// \returns Subtarget's default values if explicitly requested values cannot 107 /// be converted to integer, violate subtarget's specifications, or are not 108 /// compatible with minimum/maximum number of waves limited by flat work group 109 /// size, register usage, and/or lds usage. 110 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 111 112 /// Return the amount of LDS that can be used that will not restrict the 113 /// occupancy lower than WaveCount. 114 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 115 const Function &) const; 116 117 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 118 /// the given LDS memory size is the only constraint. 119 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 120 121 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 122 123 bool isAmdHsaOS() const { 124 return TargetTriple.getOS() == Triple::AMDHSA; 125 } 126 127 bool isAmdPalOS() const { 128 return TargetTriple.getOS() == Triple::AMDPAL; 129 } 130 131 bool isMesa3DOS() const { 132 return TargetTriple.getOS() == Triple::Mesa3D; 133 } 134 135 bool isMesaKernel(const Function &F) const { 136 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 137 } 138 139 bool isAmdHsaOrMesa(const Function &F) const { 140 return isAmdHsaOS() || isMesaKernel(F); 141 } 142 143 bool has16BitInsts() const { 144 return Has16BitInsts; 145 } 146 147 bool hasMadMixInsts() const { 148 return HasMadMixInsts; 149 } 150 151 bool hasFP32Denormals(const Function &F) const { 152 // FIXME: This should not be a property of the subtarget. This should be a 153 // property with a default set by the calling convention which can be 154 // overridden by attributes. For now, use the subtarget feature as a 155 // placeholder attribute. The function arguments only purpose is to 156 // discourage use without a function context until this is removed. 157 return FP32Denormals; 158 } 159 160 bool hasFPExceptions() const { 161 return FPExceptions; 162 } 163 164 bool hasSDWA() const { 165 return HasSDWA; 166 } 167 168 bool hasVOP3PInsts() const { 169 return HasVOP3PInsts; 170 } 171 172 bool hasMulI24() const { 173 return HasMulI24; 174 } 175 176 bool hasMulU24() const { 177 return HasMulU24; 178 } 179 180 bool hasInv2PiInlineImm() const { 181 return HasInv2PiInlineImm; 182 } 183 184 bool hasFminFmaxLegacy() const { 185 return HasFminFmaxLegacy; 186 } 187 188 bool hasTrigReducedRange() const { 189 return HasTrigReducedRange; 190 } 191 192 bool isPromoteAllocaEnabled() const { 193 return EnablePromoteAlloca; 194 } 195 196 unsigned getWavefrontSize() const { 197 return WavefrontSize; 198 } 199 200 int getLocalMemorySize() const { 201 return LocalMemorySize; 202 } 203 204 Align getAlignmentForImplicitArgPtr() const { 205 return isAmdHsaOS() ? Align(8) : Align(4); 206 } 207 208 /// Returns the offset in bytes from the start of the input buffer 209 /// of the first explicit kernel argument. 210 unsigned getExplicitKernelArgOffset(const Function &F) const { 211 return isAmdHsaOrMesa(F) ? 0 : 36; 212 } 213 214 /// \returns Maximum number of work groups per compute unit supported by the 215 /// subtarget and limited by given \p FlatWorkGroupSize. 216 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 217 218 /// \returns Minimum flat work group size supported by the subtarget. 219 virtual unsigned getMinFlatWorkGroupSize() const = 0; 220 221 /// \returns Maximum flat work group size supported by the subtarget. 222 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 223 224 /// \returns Maximum number of waves per execution unit supported by the 225 /// subtarget and limited by given \p FlatWorkGroupSize. 226 virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; 227 228 /// \returns Minimum number of waves per execution unit supported by the 229 /// subtarget. 230 virtual unsigned getMinWavesPerEU() const = 0; 231 232 /// \returns Maximum number of waves per execution unit supported by the 233 /// subtarget without any kind of limitation. 234 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 235 236 /// Creates value range metadata on an workitemid.* inrinsic call or load. 237 bool makeLIDRangeMetadata(Instruction *I) const; 238 239 /// \returns Number of bytes of arguments that are passed to a shader or 240 /// kernel in addition to the explicit ones declared for the function. 241 unsigned getImplicitArgNumBytes(const Function &F) const { 242 if (isMesaKernel(F)) 243 return 16; 244 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 245 } 246 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 247 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 248 249 virtual ~AMDGPUSubtarget() {} 250 }; 251 252 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 253 public AMDGPUSubtarget { 254 255 using AMDGPUSubtarget::getMaxWavesPerEU; 256 257 public: 258 enum TrapHandlerAbi { 259 TrapHandlerAbiNone = 0, 260 TrapHandlerAbiHsa = 1 261 }; 262 263 enum TrapID { 264 TrapIDHardwareReserved = 0, 265 TrapIDHSADebugTrap = 1, 266 TrapIDLLVMTrap = 2, 267 TrapIDLLVMDebugTrap = 3, 268 TrapIDDebugBreakpoint = 7, 269 TrapIDDebugReserved8 = 8, 270 TrapIDDebugReservedFE = 0xfe, 271 TrapIDDebugReservedFF = 0xff 272 }; 273 274 enum TrapRegValues { 275 LLVMTrapHandlerRegValue = 1 276 }; 277 278 private: 279 /// GlobalISel related APIs. 280 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 281 std::unique_ptr<InstructionSelector> InstSelector; 282 std::unique_ptr<LegalizerInfo> Legalizer; 283 std::unique_ptr<RegisterBankInfo> RegBankInfo; 284 285 protected: 286 // Basic subtarget description. 287 Triple TargetTriple; 288 unsigned Gen; 289 InstrItineraryData InstrItins; 290 int LDSBankCount; 291 unsigned MaxPrivateElementSize; 292 293 // Possibly statically set by tablegen, but may want to be overridden. 294 bool FastFMAF32; 295 bool HalfRate64Ops; 296 297 // Dynamially set bits that enable features. 298 bool FP64FP16Denormals; 299 bool FlatForGlobal; 300 bool AutoWaitcntBeforeBarrier; 301 bool CodeObjectV3; 302 bool UnalignedScratchAccess; 303 bool UnalignedBufferAccess; 304 bool HasApertureRegs; 305 bool EnableXNACK; 306 bool DoesNotSupportXNACK; 307 bool EnableCuMode; 308 bool TrapHandler; 309 310 // Used as options. 311 bool EnableLoadStoreOpt; 312 bool EnableUnsafeDSOffsetFolding; 313 bool EnableSIScheduler; 314 bool EnableDS128; 315 bool EnablePRTStrictNull; 316 bool DumpCode; 317 318 // Subtarget statically properties set by tablegen 319 bool FP64; 320 bool FMA; 321 bool MIMG_R128; 322 bool IsGCN; 323 bool GCN3Encoding; 324 bool CIInsts; 325 bool GFX8Insts; 326 bool GFX9Insts; 327 bool GFX10Insts; 328 bool GFX7GFX8GFX9Insts; 329 bool SGPRInitBug; 330 bool HasSMemRealTime; 331 bool HasIntClamp; 332 bool HasFmaMixInsts; 333 bool HasMovrel; 334 bool HasVGPRIndexMode; 335 bool HasScalarStores; 336 bool HasScalarAtomics; 337 bool HasSDWAOmod; 338 bool HasSDWAScalar; 339 bool HasSDWASdst; 340 bool HasSDWAMac; 341 bool HasSDWAOutModsVOPC; 342 bool HasDPP; 343 bool HasDPP8; 344 bool HasR128A16; 345 bool HasNSAEncoding; 346 bool HasDLInsts; 347 bool HasDot1Insts; 348 bool HasDot2Insts; 349 bool HasDot3Insts; 350 bool HasDot4Insts; 351 bool HasDot5Insts; 352 bool HasDot6Insts; 353 bool HasMAIInsts; 354 bool HasPkFmacF16Inst; 355 bool HasAtomicFaddInsts; 356 bool EnableSRAMECC; 357 bool DoesNotSupportSRAMECC; 358 bool HasNoSdstCMPX; 359 bool HasVscnt; 360 bool HasRegisterBanking; 361 bool HasVOP3Literal; 362 bool HasNoDataDepHazard; 363 bool FlatAddressSpace; 364 bool FlatInstOffsets; 365 bool FlatGlobalInsts; 366 bool FlatScratchInsts; 367 bool ScalarFlatScratchInsts; 368 bool AddNoCarryInsts; 369 bool HasUnpackedD16VMem; 370 bool R600ALUInst; 371 bool CaymanISA; 372 bool CFALUBug; 373 bool LDSMisalignedBug; 374 bool HasMFMAInlineLiteralBug; 375 bool HasVertexCache; 376 short TexVTXClauseSize; 377 bool ScalarizeGlobal; 378 379 bool HasVcmpxPermlaneHazard; 380 bool HasVMEMtoScalarWriteHazard; 381 bool HasSMEMtoVectorWriteHazard; 382 bool HasInstFwdPrefetchBug; 383 bool HasVcmpxExecWARHazard; 384 bool HasLdsBranchVmemWARHazard; 385 bool HasNSAtoVMEMBug; 386 bool HasOffset3fBug; 387 bool HasFlatSegmentOffsetBug; 388 389 // Dummy feature to use for assembler in tablegen. 390 bool FeatureDisable; 391 392 SelectionDAGTargetInfo TSInfo; 393 private: 394 SIInstrInfo InstrInfo; 395 SITargetLowering TLInfo; 396 SIFrameLowering FrameLowering; 397 398 // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. 399 static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); 400 401 public: 402 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 403 const GCNTargetMachine &TM); 404 ~GCNSubtarget() override; 405 406 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 407 StringRef GPU, StringRef FS); 408 409 const SIInstrInfo *getInstrInfo() const override { 410 return &InstrInfo; 411 } 412 413 const SIFrameLowering *getFrameLowering() const override { 414 return &FrameLowering; 415 } 416 417 const SITargetLowering *getTargetLowering() const override { 418 return &TLInfo; 419 } 420 421 const SIRegisterInfo *getRegisterInfo() const override { 422 return &InstrInfo.getRegisterInfo(); 423 } 424 425 const CallLowering *getCallLowering() const override { 426 return CallLoweringInfo.get(); 427 } 428 429 InstructionSelector *getInstructionSelector() const override { 430 return InstSelector.get(); 431 } 432 433 const LegalizerInfo *getLegalizerInfo() const override { 434 return Legalizer.get(); 435 } 436 437 const RegisterBankInfo *getRegBankInfo() const override { 438 return RegBankInfo.get(); 439 } 440 441 // Nothing implemented, just prevent crashes on use. 442 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 443 return &TSInfo; 444 } 445 446 const InstrItineraryData *getInstrItineraryData() const override { 447 return &InstrItins; 448 } 449 450 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 451 452 Generation getGeneration() const { 453 return (Generation)Gen; 454 } 455 456 unsigned getWavefrontSizeLog2() const { 457 return Log2_32(WavefrontSize); 458 } 459 460 /// Return the number of high bits known to be zero fror a frame index. 461 unsigned getKnownHighZeroBitsForFrameIndex() const { 462 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 463 } 464 465 int getLDSBankCount() const { 466 return LDSBankCount; 467 } 468 469 unsigned getMaxPrivateElementSize() const { 470 return MaxPrivateElementSize; 471 } 472 473 unsigned getConstantBusLimit(unsigned Opcode) const; 474 475 bool hasIntClamp() const { 476 return HasIntClamp; 477 } 478 479 bool hasFP64() const { 480 return FP64; 481 } 482 483 bool hasMIMG_R128() const { 484 return MIMG_R128; 485 } 486 487 bool hasHWFP64() const { 488 return FP64; 489 } 490 491 bool hasFastFMAF32() const { 492 return FastFMAF32; 493 } 494 495 bool hasHalfRate64Ops() const { 496 return HalfRate64Ops; 497 } 498 499 bool hasAddr64() const { 500 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 501 } 502 503 // Return true if the target only has the reverse operand versions of VALU 504 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 505 bool hasOnlyRevVALUShifts() const { 506 return getGeneration() >= VOLCANIC_ISLANDS; 507 } 508 509 bool hasBFE() const { 510 return true; 511 } 512 513 bool hasBFI() const { 514 return true; 515 } 516 517 bool hasBFM() const { 518 return hasBFE(); 519 } 520 521 bool hasBCNT(unsigned Size) const { 522 return true; 523 } 524 525 bool hasFFBL() const { 526 return true; 527 } 528 529 bool hasFFBH() const { 530 return true; 531 } 532 533 bool hasMed3_16() const { 534 return getGeneration() >= AMDGPUSubtarget::GFX9; 535 } 536 537 bool hasMin3Max3_16() const { 538 return getGeneration() >= AMDGPUSubtarget::GFX9; 539 } 540 541 bool hasFmaMixInsts() const { 542 return HasFmaMixInsts; 543 } 544 545 bool hasCARRY() const { 546 return true; 547 } 548 549 bool hasFMA() const { 550 return FMA; 551 } 552 553 bool hasSwap() const { 554 return GFX9Insts; 555 } 556 557 bool hasScalarPackInsts() const { 558 return GFX9Insts; 559 } 560 561 bool hasScalarMulHiInsts() const { 562 return GFX9Insts; 563 } 564 565 TrapHandlerAbi getTrapHandlerAbi() const { 566 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 567 } 568 569 /// True if the offset field of DS instructions works as expected. On SI, the 570 /// offset uses a 16-bit adder and does not always wrap properly. 571 bool hasUsableDSOffset() const { 572 return getGeneration() >= SEA_ISLANDS; 573 } 574 575 bool unsafeDSOffsetFoldingEnabled() const { 576 return EnableUnsafeDSOffsetFolding; 577 } 578 579 /// Condition output from div_scale is usable. 580 bool hasUsableDivScaleConditionOutput() const { 581 return getGeneration() != SOUTHERN_ISLANDS; 582 } 583 584 /// Extra wait hazard is needed in some cases before 585 /// s_cbranch_vccnz/s_cbranch_vccz. 586 bool hasReadVCCZBug() const { 587 return getGeneration() <= SEA_ISLANDS; 588 } 589 590 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 591 /// was written by a VALU instruction. 592 bool hasSMRDReadVALUDefHazard() const { 593 return getGeneration() == SOUTHERN_ISLANDS; 594 } 595 596 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 597 /// SGPR was written by a VALU Instruction. 598 bool hasVMEMReadSGPRVALUDefHazard() const { 599 return getGeneration() >= VOLCANIC_ISLANDS; 600 } 601 602 bool hasRFEHazards() const { 603 return getGeneration() >= VOLCANIC_ISLANDS; 604 } 605 606 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 607 unsigned getSetRegWaitStates() const { 608 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 609 } 610 611 bool dumpCode() const { 612 return DumpCode; 613 } 614 615 /// Return the amount of LDS that can be used that will not restrict the 616 /// occupancy lower than WaveCount. 617 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 618 const Function &) const; 619 620 /// Alias for hasFP64FP16Denormals 621 bool hasFP16Denormals(const Function &F) const { 622 return FP64FP16Denormals; 623 } 624 625 /// Alias for hasFP64FP16Denormals 626 bool hasFP64Denormals(const Function &F) const { 627 return FP64FP16Denormals; 628 } 629 630 bool hasFP64FP16Denormals(const Function &F) const { 631 return FP64FP16Denormals; 632 } 633 634 bool supportsMinMaxDenormModes() const { 635 return getGeneration() >= AMDGPUSubtarget::GFX9; 636 } 637 638 /// \returns If target supports S_DENORM_MODE. 639 bool hasDenormModeInst() const { 640 return getGeneration() >= AMDGPUSubtarget::GFX10; 641 } 642 643 bool useFlatForGlobal() const { 644 return FlatForGlobal; 645 } 646 647 /// \returns If target supports ds_read/write_b128 and user enables generation 648 /// of ds_read/write_b128. 649 bool useDS128() const { 650 return CIInsts && EnableDS128; 651 } 652 653 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 654 bool haveRoundOpsF64() const { 655 return CIInsts; 656 } 657 658 /// \returns If MUBUF instructions always perform range checking, even for 659 /// buffer resources used for private memory access. 660 bool privateMemoryResourceIsRangeChecked() const { 661 return getGeneration() < AMDGPUSubtarget::GFX9; 662 } 663 664 /// \returns If target requires PRT Struct NULL support (zero result registers 665 /// for sparse texture support). 666 bool usePRTStrictNull() const { 667 return EnablePRTStrictNull; 668 } 669 670 bool hasAutoWaitcntBeforeBarrier() const { 671 return AutoWaitcntBeforeBarrier; 672 } 673 674 bool hasCodeObjectV3() const { 675 // FIXME: Need to add code object v3 support for mesa and pal. 676 return isAmdHsaOS() ? CodeObjectV3 : false; 677 } 678 679 bool hasUnalignedBufferAccess() const { 680 return UnalignedBufferAccess; 681 } 682 683 bool hasUnalignedScratchAccess() const { 684 return UnalignedScratchAccess; 685 } 686 687 bool hasApertureRegs() const { 688 return HasApertureRegs; 689 } 690 691 bool isTrapHandlerEnabled() const { 692 return TrapHandler; 693 } 694 695 bool isXNACKEnabled() const { 696 return EnableXNACK; 697 } 698 699 bool isCuModeEnabled() const { 700 return EnableCuMode; 701 } 702 703 bool hasFlatAddressSpace() const { 704 return FlatAddressSpace; 705 } 706 707 bool hasFlatScrRegister() const { 708 return hasFlatAddressSpace(); 709 } 710 711 bool hasFlatInstOffsets() const { 712 return FlatInstOffsets; 713 } 714 715 bool hasFlatGlobalInsts() const { 716 return FlatGlobalInsts; 717 } 718 719 bool hasFlatScratchInsts() const { 720 return FlatScratchInsts; 721 } 722 723 bool hasScalarFlatScratchInsts() const { 724 return ScalarFlatScratchInsts; 725 } 726 727 bool hasFlatSegmentOffsetBug() const { 728 return HasFlatSegmentOffsetBug; 729 } 730 731 bool hasFlatLgkmVMemCountInOrder() const { 732 return getGeneration() > GFX9; 733 } 734 735 bool hasD16LoadStore() const { 736 return getGeneration() >= GFX9; 737 } 738 739 bool d16PreservesUnusedBits() const { 740 return hasD16LoadStore() && !isSRAMECCEnabled(); 741 } 742 743 bool hasD16Images() const { 744 return getGeneration() >= VOLCANIC_ISLANDS; 745 } 746 747 /// Return if most LDS instructions have an m0 use that require m0 to be 748 /// iniitalized. 749 bool ldsRequiresM0Init() const { 750 return getGeneration() < GFX9; 751 } 752 753 // True if the hardware rewinds and replays GWS operations if a wave is 754 // preempted. 755 // 756 // If this is false, a GWS operation requires testing if a nack set the 757 // MEM_VIOL bit, and repeating if so. 758 bool hasGWSAutoReplay() const { 759 return getGeneration() >= GFX9; 760 } 761 762 /// \returns if target has ds_gws_sema_release_all instruction. 763 bool hasGWSSemaReleaseAll() const { 764 return CIInsts; 765 } 766 767 bool hasAddNoCarry() const { 768 return AddNoCarryInsts; 769 } 770 771 bool hasUnpackedD16VMem() const { 772 return HasUnpackedD16VMem; 773 } 774 775 // Covers VS/PS/CS graphics shaders 776 bool isMesaGfxShader(const Function &F) const { 777 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 778 } 779 780 bool hasMad64_32() const { 781 return getGeneration() >= SEA_ISLANDS; 782 } 783 784 bool hasSDWAOmod() const { 785 return HasSDWAOmod; 786 } 787 788 bool hasSDWAScalar() const { 789 return HasSDWAScalar; 790 } 791 792 bool hasSDWASdst() const { 793 return HasSDWASdst; 794 } 795 796 bool hasSDWAMac() const { 797 return HasSDWAMac; 798 } 799 800 bool hasSDWAOutModsVOPC() const { 801 return HasSDWAOutModsVOPC; 802 } 803 804 bool hasDLInsts() const { 805 return HasDLInsts; 806 } 807 808 bool hasDot1Insts() const { 809 return HasDot1Insts; 810 } 811 812 bool hasDot2Insts() const { 813 return HasDot2Insts; 814 } 815 816 bool hasDot3Insts() const { 817 return HasDot3Insts; 818 } 819 820 bool hasDot4Insts() const { 821 return HasDot4Insts; 822 } 823 824 bool hasDot5Insts() const { 825 return HasDot5Insts; 826 } 827 828 bool hasDot6Insts() const { 829 return HasDot6Insts; 830 } 831 832 bool hasMAIInsts() const { 833 return HasMAIInsts; 834 } 835 836 bool hasPkFmacF16Inst() const { 837 return HasPkFmacF16Inst; 838 } 839 840 bool hasAtomicFaddInsts() const { 841 return HasAtomicFaddInsts; 842 } 843 844 bool isSRAMECCEnabled() const { 845 return EnableSRAMECC; 846 } 847 848 bool hasNoSdstCMPX() const { 849 return HasNoSdstCMPX; 850 } 851 852 bool hasVscnt() const { 853 return HasVscnt; 854 } 855 856 bool hasRegisterBanking() const { 857 return HasRegisterBanking; 858 } 859 860 bool hasVOP3Literal() const { 861 return HasVOP3Literal; 862 } 863 864 bool hasNoDataDepHazard() const { 865 return HasNoDataDepHazard; 866 } 867 868 bool vmemWriteNeedsExpWaitcnt() const { 869 return getGeneration() < SEA_ISLANDS; 870 } 871 872 // Scratch is allocated in 256 dword per wave blocks for the entire 873 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 874 // is 4-byte aligned. 875 // 876 // Only 4-byte alignment is really needed to access anything. Transformations 877 // on the pointer value itself may rely on the alignment / known low bits of 878 // the pointer. Set this to something above the minimum to avoid needing 879 // dynamic realignment in common cases. 880 Align getStackAlignment() const { return Align(16); } 881 882 bool enableMachineScheduler() const override { 883 return true; 884 } 885 886 bool enableSubRegLiveness() const override { 887 return true; 888 } 889 890 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 891 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 892 893 /// \returns Number of execution units per compute unit supported by the 894 /// subtarget. 895 unsigned getEUsPerCU() const { 896 return AMDGPU::IsaInfo::getEUsPerCU(this); 897 } 898 899 /// \returns Maximum number of waves per compute unit supported by the 900 /// subtarget without any kind of limitation. 901 unsigned getMaxWavesPerCU() const { 902 return AMDGPU::IsaInfo::getMaxWavesPerCU(this); 903 } 904 905 /// \returns Maximum number of waves per compute unit supported by the 906 /// subtarget and limited by given \p FlatWorkGroupSize. 907 unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { 908 return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); 909 } 910 911 /// \returns Number of waves per work group supported by the subtarget and 912 /// limited by given \p FlatWorkGroupSize. 913 unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { 914 return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); 915 } 916 917 // static wrappers 918 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 919 920 // XXX - Why is this here if it isn't in the default pass set? 921 bool enableEarlyIfConversion() const override { 922 return true; 923 } 924 925 void overrideSchedPolicy(MachineSchedPolicy &Policy, 926 unsigned NumRegionInstrs) const override; 927 928 unsigned getMaxNumUserSGPRs() const { 929 return 16; 930 } 931 932 bool hasSMemRealTime() const { 933 return HasSMemRealTime; 934 } 935 936 bool hasMovrel() const { 937 return HasMovrel; 938 } 939 940 bool hasVGPRIndexMode() const { 941 return HasVGPRIndexMode; 942 } 943 944 bool useVGPRIndexMode() const; 945 946 bool hasScalarCompareEq64() const { 947 return getGeneration() >= VOLCANIC_ISLANDS; 948 } 949 950 bool hasScalarStores() const { 951 return HasScalarStores; 952 } 953 954 bool hasScalarAtomics() const { 955 return HasScalarAtomics; 956 } 957 958 bool hasLDSFPAtomics() const { 959 return GFX8Insts; 960 } 961 962 bool hasDPP() const { 963 return HasDPP; 964 } 965 966 bool hasDPPBroadcasts() const { 967 return HasDPP && getGeneration() < GFX10; 968 } 969 970 bool hasDPPWavefrontShifts() const { 971 return HasDPP && getGeneration() < GFX10; 972 } 973 974 bool hasDPP8() const { 975 return HasDPP8; 976 } 977 978 bool hasR128A16() const { 979 return HasR128A16; 980 } 981 982 bool hasOffset3fBug() const { 983 return HasOffset3fBug; 984 } 985 986 bool hasNSAEncoding() const { 987 return HasNSAEncoding; 988 } 989 990 bool hasMadF16() const; 991 992 bool enableSIScheduler() const { 993 return EnableSIScheduler; 994 } 995 996 bool loadStoreOptEnabled() const { 997 return EnableLoadStoreOpt; 998 } 999 1000 bool hasSGPRInitBug() const { 1001 return SGPRInitBug; 1002 } 1003 1004 bool hasMFMAInlineLiteralBug() const { 1005 return HasMFMAInlineLiteralBug; 1006 } 1007 1008 bool has12DWordStoreHazard() const { 1009 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1010 } 1011 1012 // \returns true if the subtarget supports DWORDX3 load/store instructions. 1013 bool hasDwordx3LoadStores() const { 1014 return CIInsts; 1015 } 1016 1017 bool hasSMovFedHazard() const { 1018 return getGeneration() == AMDGPUSubtarget::GFX9; 1019 } 1020 1021 bool hasReadM0MovRelInterpHazard() const { 1022 return getGeneration() == AMDGPUSubtarget::GFX9; 1023 } 1024 1025 bool hasReadM0SendMsgHazard() const { 1026 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1027 getGeneration() <= AMDGPUSubtarget::GFX9; 1028 } 1029 1030 bool hasVcmpxPermlaneHazard() const { 1031 return HasVcmpxPermlaneHazard; 1032 } 1033 1034 bool hasVMEMtoScalarWriteHazard() const { 1035 return HasVMEMtoScalarWriteHazard; 1036 } 1037 1038 bool hasSMEMtoVectorWriteHazard() const { 1039 return HasSMEMtoVectorWriteHazard; 1040 } 1041 1042 bool hasLDSMisalignedBug() const { 1043 return LDSMisalignedBug && !EnableCuMode; 1044 } 1045 1046 bool hasInstFwdPrefetchBug() const { 1047 return HasInstFwdPrefetchBug; 1048 } 1049 1050 bool hasVcmpxExecWARHazard() const { 1051 return HasVcmpxExecWARHazard; 1052 } 1053 1054 bool hasLdsBranchVmemWARHazard() const { 1055 return HasLdsBranchVmemWARHazard; 1056 } 1057 1058 bool hasNSAtoVMEMBug() const { 1059 return HasNSAtoVMEMBug; 1060 } 1061 1062 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1063 /// SGPRs 1064 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1065 1066 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1067 /// VGPRs 1068 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1069 1070 /// Return occupancy for the given function. Used LDS and a number of 1071 /// registers if provided. 1072 /// Note, occupancy can be affected by the scratch allocation as well, but 1073 /// we do not have enough information to compute it. 1074 unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, 1075 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1076 1077 /// \returns true if the flat_scratch register should be initialized with the 1078 /// pointer to the wave's scratch memory rather than a size and offset. 1079 bool flatScratchIsPointer() const { 1080 return getGeneration() >= AMDGPUSubtarget::GFX9; 1081 } 1082 1083 /// \returns true if the machine has merged shaders in which s0-s7 are 1084 /// reserved by the hardware and user SGPRs start at s8 1085 bool hasMergedShaders() const { 1086 return getGeneration() >= GFX9; 1087 } 1088 1089 /// \returns SGPR allocation granularity supported by the subtarget. 1090 unsigned getSGPRAllocGranule() const { 1091 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1092 } 1093 1094 /// \returns SGPR encoding granularity supported by the subtarget. 1095 unsigned getSGPREncodingGranule() const { 1096 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1097 } 1098 1099 /// \returns Total number of SGPRs supported by the subtarget. 1100 unsigned getTotalNumSGPRs() const { 1101 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1102 } 1103 1104 /// \returns Addressable number of SGPRs supported by the subtarget. 1105 unsigned getAddressableNumSGPRs() const { 1106 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1107 } 1108 1109 /// \returns Minimum number of SGPRs that meets the given number of waves per 1110 /// execution unit requirement supported by the subtarget. 1111 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1112 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1113 } 1114 1115 /// \returns Maximum number of SGPRs that meets the given number of waves per 1116 /// execution unit requirement supported by the subtarget. 1117 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1118 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1119 } 1120 1121 /// \returns Reserved number of SGPRs for given function \p MF. 1122 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1123 1124 /// \returns Maximum number of SGPRs that meets number of waves per execution 1125 /// unit requirement for function \p MF, or number of SGPRs explicitly 1126 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1127 /// 1128 /// \returns Value that meets number of waves per execution unit requirement 1129 /// if explicitly requested value cannot be converted to integer, violates 1130 /// subtarget's specifications, or does not meet number of waves per execution 1131 /// unit requirement. 1132 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1133 1134 /// \returns VGPR allocation granularity supported by the subtarget. 1135 unsigned getVGPRAllocGranule() const { 1136 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1137 } 1138 1139 /// \returns VGPR encoding granularity supported by the subtarget. 1140 unsigned getVGPREncodingGranule() const { 1141 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1142 } 1143 1144 /// \returns Total number of VGPRs supported by the subtarget. 1145 unsigned getTotalNumVGPRs() const { 1146 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1147 } 1148 1149 /// \returns Addressable number of VGPRs supported by the subtarget. 1150 unsigned getAddressableNumVGPRs() const { 1151 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1152 } 1153 1154 /// \returns Minimum number of VGPRs that meets given number of waves per 1155 /// execution unit requirement supported by the subtarget. 1156 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1157 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1158 } 1159 1160 /// \returns Maximum number of VGPRs that meets given number of waves per 1161 /// execution unit requirement supported by the subtarget. 1162 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1163 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1164 } 1165 1166 /// \returns Maximum number of VGPRs that meets number of waves per execution 1167 /// unit requirement for function \p MF, or number of VGPRs explicitly 1168 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1169 /// 1170 /// \returns Value that meets number of waves per execution unit requirement 1171 /// if explicitly requested value cannot be converted to integer, violates 1172 /// subtarget's specifications, or does not meet number of waves per execution 1173 /// unit requirement. 1174 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1175 1176 void getPostRAMutations( 1177 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1178 const override; 1179 1180 bool isWave32() const { 1181 return WavefrontSize == 32; 1182 } 1183 1184 const TargetRegisterClass *getBoolRC() const { 1185 return getRegisterInfo()->getBoolRC(); 1186 } 1187 1188 /// \returns Maximum number of work groups per compute unit supported by the 1189 /// subtarget and limited by given \p FlatWorkGroupSize. 1190 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1191 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1192 } 1193 1194 /// \returns Minimum flat work group size supported by the subtarget. 1195 unsigned getMinFlatWorkGroupSize() const override { 1196 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1197 } 1198 1199 /// \returns Maximum flat work group size supported by the subtarget. 1200 unsigned getMaxFlatWorkGroupSize() const override { 1201 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1202 } 1203 1204 /// \returns Maximum number of waves per execution unit supported by the 1205 /// subtarget and limited by given \p FlatWorkGroupSize. 1206 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 1207 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 1208 } 1209 1210 /// \returns Minimum number of waves per execution unit supported by the 1211 /// subtarget. 1212 unsigned getMinWavesPerEU() const override { 1213 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1214 } 1215 1216 void adjustSchedDependency(SUnit *Src, SUnit *Dst, SDep &Dep) const override; 1217 }; 1218 1219 class R600Subtarget final : public R600GenSubtargetInfo, 1220 public AMDGPUSubtarget { 1221 private: 1222 R600InstrInfo InstrInfo; 1223 R600FrameLowering FrameLowering; 1224 bool FMA; 1225 bool CaymanISA; 1226 bool CFALUBug; 1227 bool HasVertexCache; 1228 bool R600ALUInst; 1229 bool FP64; 1230 short TexVTXClauseSize; 1231 Generation Gen; 1232 R600TargetLowering TLInfo; 1233 InstrItineraryData InstrItins; 1234 SelectionDAGTargetInfo TSInfo; 1235 1236 public: 1237 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 1238 const TargetMachine &TM); 1239 1240 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 1241 1242 const R600FrameLowering *getFrameLowering() const override { 1243 return &FrameLowering; 1244 } 1245 1246 const R600TargetLowering *getTargetLowering() const override { 1247 return &TLInfo; 1248 } 1249 1250 const R600RegisterInfo *getRegisterInfo() const override { 1251 return &InstrInfo.getRegisterInfo(); 1252 } 1253 1254 const InstrItineraryData *getInstrItineraryData() const override { 1255 return &InstrItins; 1256 } 1257 1258 // Nothing implemented, just prevent crashes on use. 1259 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 1260 return &TSInfo; 1261 } 1262 1263 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 1264 1265 Generation getGeneration() const { 1266 return Gen; 1267 } 1268 1269 Align getStackAlignment() const { return Align(4); } 1270 1271 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1272 StringRef GPU, StringRef FS); 1273 1274 bool hasBFE() const { 1275 return (getGeneration() >= EVERGREEN); 1276 } 1277 1278 bool hasBFI() const { 1279 return (getGeneration() >= EVERGREEN); 1280 } 1281 1282 bool hasBCNT(unsigned Size) const { 1283 if (Size == 32) 1284 return (getGeneration() >= EVERGREEN); 1285 1286 return false; 1287 } 1288 1289 bool hasBORROW() const { 1290 return (getGeneration() >= EVERGREEN); 1291 } 1292 1293 bool hasCARRY() const { 1294 return (getGeneration() >= EVERGREEN); 1295 } 1296 1297 bool hasCaymanISA() const { 1298 return CaymanISA; 1299 } 1300 1301 bool hasFFBL() const { 1302 return (getGeneration() >= EVERGREEN); 1303 } 1304 1305 bool hasFFBH() const { 1306 return (getGeneration() >= EVERGREEN); 1307 } 1308 1309 bool hasFMA() const { return FMA; } 1310 1311 bool hasCFAluBug() const { return CFALUBug; } 1312 1313 bool hasVertexCache() const { return HasVertexCache; } 1314 1315 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1316 1317 bool enableMachineScheduler() const override { 1318 return true; 1319 } 1320 1321 bool enableSubRegLiveness() const override { 1322 return true; 1323 } 1324 1325 /// \returns Maximum number of work groups per compute unit supported by the 1326 /// subtarget and limited by given \p FlatWorkGroupSize. 1327 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1328 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1329 } 1330 1331 /// \returns Minimum flat work group size supported by the subtarget. 1332 unsigned getMinFlatWorkGroupSize() const override { 1333 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1334 } 1335 1336 /// \returns Maximum flat work group size supported by the subtarget. 1337 unsigned getMaxFlatWorkGroupSize() const override { 1338 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1339 } 1340 1341 /// \returns Maximum number of waves per execution unit supported by the 1342 /// subtarget and limited by given \p FlatWorkGroupSize. 1343 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 1344 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 1345 } 1346 1347 /// \returns Minimum number of waves per execution unit supported by the 1348 /// subtarget. 1349 unsigned getMinWavesPerEU() const override { 1350 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1351 } 1352 }; 1353 1354 } // end namespace llvm 1355 1356 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1357