1 //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file declares the NVPTX specific subclass of TargetSubtarget. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H 14 #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H 15 16 #include "NVPTX.h" 17 #include "NVPTXFrameLowering.h" 18 #include "NVPTXISelLowering.h" 19 #include "NVPTXInstrInfo.h" 20 #include "NVPTXRegisterInfo.h" 21 #include "llvm/CodeGen/TargetSubtargetInfo.h" 22 #include "llvm/IR/DataLayout.h" 23 #include "llvm/Support/NVPTXAddrSpace.h" 24 #include <string> 25 26 #define GET_SUBTARGETINFO_HEADER 27 #include "NVPTXGenSubtargetInfo.inc" 28 29 namespace llvm { 30 31 class NVPTXSubtarget : public NVPTXGenSubtargetInfo { 32 virtual void anchor(); 33 std::string TargetName; 34 35 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 36 unsigned PTXVersion; 37 38 // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310 39 // sm_90a == 901 40 unsigned int FullSmVersion; 41 42 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from 43 // FullSmVersion. 44 unsigned int SmVersion; 45 46 NVPTXInstrInfo InstrInfo; 47 NVPTXTargetLowering TLInfo; 48 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; 49 50 // NVPTX does not have any call stack frame, but need a NVPTX specific 51 // FrameLowering class because TargetFrameLowering is abstract. 52 NVPTXFrameLowering FrameLowering; 53 54 public: 55 /// This constructor initializes the data members to match that 56 /// of the specified module. 57 /// 58 NVPTXSubtarget(const Triple &TT, const std::string &CPU, 59 const std::string &FS, const NVPTXTargetMachine &TM); 60 61 ~NVPTXSubtarget() override; 62 getFrameLowering()63 const TargetFrameLowering *getFrameLowering() const override { 64 return &FrameLowering; 65 } getInstrInfo()66 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } getRegisterInfo()67 const NVPTXRegisterInfo *getRegisterInfo() const override { 68 return &InstrInfo.getRegisterInfo(); 69 } getTargetLowering()70 const NVPTXTargetLowering *getTargetLowering() const override { 71 return &TLInfo; 72 } 73 74 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; 75 has256BitVectorLoadStore(unsigned AS)76 bool has256BitVectorLoadStore(unsigned AS) const { 77 return SmVersion >= 100 && PTXVersion >= 88 && 78 AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; 79 } hasAtomAddF64()80 bool hasAtomAddF64() const { return SmVersion >= 60; } hasAtomScope()81 bool hasAtomScope() const { return SmVersion >= 60; } hasAtomBitwise64()82 bool hasAtomBitwise64() const { return SmVersion >= 32; } hasAtomMinMax64()83 bool hasAtomMinMax64() const { return SmVersion >= 32; } hasAtomCas16()84 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } hasClusters()85 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } hasLDG()86 bool hasLDG() const { return SmVersion >= 32; } hasHWROT32()87 bool hasHWROT32() const { return SmVersion >= 32; } hasFP16Math()88 bool hasFP16Math() const { return SmVersion >= 53; } hasBF16Math()89 bool hasBF16Math() const { return SmVersion >= 80; } 90 bool allowFP16Math() const; hasMaskOperator()91 bool hasMaskOperator() const { return PTXVersion >= 71; } hasNoReturn()92 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } 93 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, 94 // release, acq_rel, sc) ? hasMemoryOrdering()95 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } 96 // Does SM & PTX support .acquire and .release qualifiers for fence? hasSplitAcquireAndReleaseFences()97 bool hasSplitAcquireAndReleaseFences() const { 98 return SmVersion >= 90 && PTXVersion >= 86; 99 } 100 // Does SM & PTX support atomic relaxed MMIO operations ? hasRelaxedMMIO()101 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } hasDotInstructions()102 bool hasDotInstructions() const { 103 return SmVersion >= 61 && PTXVersion >= 50; 104 } 105 // Tcgen05 instructions in Blackwell family hasTcgen05Instructions()106 bool hasTcgen05Instructions() const { 107 bool HasTcgen05 = false; 108 switch (FullSmVersion) { 109 default: 110 break; 111 case 1003: // sm_100a 112 case 1013: // sm_101a 113 HasTcgen05 = true; 114 break; 115 } 116 117 return HasTcgen05 && PTXVersion >= 86; 118 } 119 // f32x2 instructions in Blackwell family hasF32x2Instructions()120 bool hasF32x2Instructions() const { 121 return SmVersion >= 100 && PTXVersion >= 86; 122 } 123 124 // TMA G2S copy with cta_group::1/2 support hasCpAsyncBulkTensorCTAGroupSupport()125 bool hasCpAsyncBulkTensorCTAGroupSupport() const { 126 // TODO: Update/tidy-up after the family-conditional support arrives 127 switch (FullSmVersion) { 128 case 1003: 129 case 1013: 130 return PTXVersion >= 86; 131 case 1033: 132 return PTXVersion >= 88; 133 default: 134 return false; 135 } 136 } 137 138 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction 139 // terminates a basic block. Instead, it would assume that control flow 140 // continued to the next instruction. The next instruction could be in the 141 // block that's lexically below it. This would lead to a phantom CFG edges 142 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when 143 // PTX ISA versions 8.3+ we can confidently say that the bug will not be 144 // present. hasPTXASUnreachableBug()145 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } hasCvtaParam()146 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } getFullSmVersion()147 unsigned int getFullSmVersion() const { return FullSmVersion; } getSmVersion()148 unsigned int getSmVersion() const { return getFullSmVersion() / 10; } 149 // GPUs with "a" suffix have architecture-accelerated features that are 150 // supported on the specified architecture only, hence such targets do not 151 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing 152 // such GPU variants from the base GPU architecture. 153 // - false represents non-accelerated architecture. 154 // - true represents architecture-accelerated variant. hasArchAccelFeatures()155 bool hasArchAccelFeatures() const { 156 return (getFullSmVersion() & 1) && PTXVersion >= 80; 157 } 158 // GPUs with 'f' suffix have architecture-accelerated features which are 159 // portable across all future architectures under same SM major. For example, 160 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. 161 // - false represents non-family-specific architecture. 162 // - true represents family-specific variant. hasFamilySpecificFeatures()163 bool hasFamilySpecificFeatures() const { 164 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 165 : hasArchAccelFeatures(); 166 } 167 // If the user did not provide a target we default to the `sm_30` target. getTargetName()168 std::string getTargetName() const { 169 return TargetName.empty() ? "sm_30" : TargetName; 170 } hasTargetName()171 bool hasTargetName() const { return !TargetName.empty(); } 172 173 bool hasNativeBF16Support(int Opcode) const; 174 175 // Get maximum value of required alignments among the supported data types. 176 // From the PTX ISA doc, section 8.2.3: 177 // The memory consistency model relates operations executed on memory 178 // locations with scalar data-types, which have a maximum size and alignment 179 // of 64 bits. Memory operations with a vector data-type are modelled as a 180 // set of equivalent memory operations with a scalar data-type, executed in 181 // an unspecified order on the elements in the vector. getMaxRequiredAlignment()182 unsigned getMaxRequiredAlignment() const { return 8; } 183 // Get the smallest cmpxchg word size that the hardware supports. getMinCmpXchgSizeInBits()184 unsigned getMinCmpXchgSizeInBits() const { return 32; } 185 getPTXVersion()186 unsigned getPTXVersion() const { return PTXVersion; } 187 188 NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); 189 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 190 191 void failIfClustersUnsupported(std::string const &FailureMessage) const; 192 }; 193 194 } // End llvm namespace 195 196 #endif 197