xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file declares the NVPTX specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14 #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15 
16 #include "NVPTX.h"
17 #include "NVPTXFrameLowering.h"
18 #include "NVPTXISelLowering.h"
19 #include "NVPTXInstrInfo.h"
20 #include "NVPTXRegisterInfo.h"
21 #include "llvm/CodeGen/TargetSubtargetInfo.h"
22 #include "llvm/IR/DataLayout.h"
23 #include "llvm/Support/NVPTXAddrSpace.h"
24 #include <string>
25 
26 #define GET_SUBTARGETINFO_HEADER
27 #include "NVPTXGenSubtargetInfo.inc"
28 
29 namespace llvm {
30 
31 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
32   virtual void anchor();
33   std::string TargetName;
34 
35   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
36   unsigned PTXVersion;
37 
38   // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
39   // sm_90a == 901
40   unsigned int FullSmVersion;
41 
42   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
43   // FullSmVersion.
44   unsigned int SmVersion;
45 
46   NVPTXInstrInfo InstrInfo;
47   NVPTXTargetLowering TLInfo;
48   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
49 
50   // NVPTX does not have any call stack frame, but need a NVPTX specific
51   // FrameLowering class because TargetFrameLowering is abstract.
52   NVPTXFrameLowering FrameLowering;
53 
54 public:
55   /// This constructor initializes the data members to match that
56   /// of the specified module.
57   ///
58   NVPTXSubtarget(const Triple &TT, const std::string &CPU,
59                  const std::string &FS, const NVPTXTargetMachine &TM);
60 
61   ~NVPTXSubtarget() override;
62 
getFrameLowering()63   const TargetFrameLowering *getFrameLowering() const override {
64     return &FrameLowering;
65   }
getInstrInfo()66   const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
getRegisterInfo()67   const NVPTXRegisterInfo *getRegisterInfo() const override {
68     return &InstrInfo.getRegisterInfo();
69   }
getTargetLowering()70   const NVPTXTargetLowering *getTargetLowering() const override {
71     return &TLInfo;
72   }
73 
74   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
75 
has256BitVectorLoadStore(unsigned AS)76   bool has256BitVectorLoadStore(unsigned AS) const {
77     return SmVersion >= 100 && PTXVersion >= 88 &&
78            AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
79   }
hasAtomAddF64()80   bool hasAtomAddF64() const { return SmVersion >= 60; }
hasAtomScope()81   bool hasAtomScope() const { return SmVersion >= 60; }
hasAtomBitwise64()82   bool hasAtomBitwise64() const { return SmVersion >= 32; }
hasAtomMinMax64()83   bool hasAtomMinMax64() const { return SmVersion >= 32; }
hasAtomCas16()84   bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
hasClusters()85   bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
hasLDG()86   bool hasLDG() const { return SmVersion >= 32; }
hasHWROT32()87   bool hasHWROT32() const { return SmVersion >= 32; }
hasFP16Math()88   bool hasFP16Math() const { return SmVersion >= 53; }
hasBF16Math()89   bool hasBF16Math() const { return SmVersion >= 80; }
90   bool allowFP16Math() const;
hasMaskOperator()91   bool hasMaskOperator() const { return PTXVersion >= 71; }
hasNoReturn()92   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
93   // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
94   // release, acq_rel, sc) ?
hasMemoryOrdering()95   bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
96   // Does SM & PTX support .acquire and .release qualifiers for fence?
hasSplitAcquireAndReleaseFences()97   bool hasSplitAcquireAndReleaseFences() const {
98     return SmVersion >= 90 && PTXVersion >= 86;
99   }
100   // Does SM & PTX support atomic relaxed MMIO operations ?
hasRelaxedMMIO()101   bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
hasDotInstructions()102   bool hasDotInstructions() const {
103     return SmVersion >= 61 && PTXVersion >= 50;
104   }
105   // Tcgen05 instructions in Blackwell family
hasTcgen05Instructions()106   bool hasTcgen05Instructions() const {
107     bool HasTcgen05 = false;
108     switch (FullSmVersion) {
109     default:
110       break;
111     case 1003: // sm_100a
112     case 1013: // sm_101a
113       HasTcgen05 = true;
114       break;
115     }
116 
117     return HasTcgen05 && PTXVersion >= 86;
118   }
119   // f32x2 instructions in Blackwell family
hasF32x2Instructions()120   bool hasF32x2Instructions() const {
121     return SmVersion >= 100 && PTXVersion >= 86;
122   }
123 
124   // TMA G2S copy with cta_group::1/2 support
hasCpAsyncBulkTensorCTAGroupSupport()125   bool hasCpAsyncBulkTensorCTAGroupSupport() const {
126     // TODO: Update/tidy-up after the family-conditional support arrives
127     switch (FullSmVersion) {
128     case 1003:
129     case 1013:
130       return PTXVersion >= 86;
131     case 1033:
132       return PTXVersion >= 88;
133     default:
134       return false;
135     }
136   }
137 
138   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
139   // terminates a basic block. Instead, it would assume that control flow
140   // continued to the next instruction. The next instruction could be in the
141   // block that's lexically below it. This would lead to a phantom CFG edges
142   // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
143   // PTX ISA versions 8.3+ we can confidently say that the bug will not be
144   // present.
hasPTXASUnreachableBug()145   bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
hasCvtaParam()146   bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
getFullSmVersion()147   unsigned int getFullSmVersion() const { return FullSmVersion; }
getSmVersion()148   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
149   // GPUs with "a" suffix have architecture-accelerated features that are
150   // supported on the specified architecture only, hence such targets do not
151   // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
152   // such GPU variants from the base GPU architecture.
153   // - false represents non-accelerated architecture.
154   // - true represents architecture-accelerated variant.
hasArchAccelFeatures()155   bool hasArchAccelFeatures() const {
156     return (getFullSmVersion() & 1) && PTXVersion >= 80;
157   }
158   // GPUs with 'f' suffix have architecture-accelerated features which are
159   // portable across all future architectures under same SM major. For example,
160   // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
161   // - false represents non-family-specific architecture.
162   // - true represents family-specific variant.
hasFamilySpecificFeatures()163   bool hasFamilySpecificFeatures() const {
164     return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
165                                         : hasArchAccelFeatures();
166   }
167   // If the user did not provide a target we default to the `sm_30` target.
getTargetName()168   std::string getTargetName() const {
169     return TargetName.empty() ? "sm_30" : TargetName;
170   }
hasTargetName()171   bool hasTargetName() const { return !TargetName.empty(); }
172 
173   bool hasNativeBF16Support(int Opcode) const;
174 
175   // Get maximum value of required alignments among the supported data types.
176   // From the PTX ISA doc, section 8.2.3:
177   //  The memory consistency model relates operations executed on memory
178   //  locations with scalar data-types, which have a maximum size and alignment
179   //  of 64 bits. Memory operations with a vector data-type are modelled as a
180   //  set of equivalent memory operations with a scalar data-type, executed in
181   //  an unspecified order on the elements in the vector.
getMaxRequiredAlignment()182   unsigned getMaxRequiredAlignment() const { return 8; }
183   // Get the smallest cmpxchg word size that the hardware supports.
getMinCmpXchgSizeInBits()184   unsigned getMinCmpXchgSizeInBits() const { return 32; }
185 
getPTXVersion()186   unsigned getPTXVersion() const { return PTXVersion; }
187 
188   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
189   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
190 
191   void failIfClustersUnsupported(std::string const &FailureMessage) const;
192 };
193 
194 } // End llvm namespace
195 
196 #endif
197