xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19 
20 #include "AMDGPU.h"
21 #include "llvm/CodeGen/BasicTTIImpl.h"
22 #include <optional>
23 
24 namespace llvm {
25 
26 class AMDGPUTargetMachine;
27 class GCNSubtarget;
28 class InstCombiner;
29 class Loop;
30 class ScalarEvolution;
31 class SITargetLowering;
32 class Type;
33 class Value;
34 
35 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
36   using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
37   using TTI = TargetTransformInfo;
38 
39   friend BaseT;
40 
41   Triple TargetTriple;
42 
43   const TargetSubtargetInfo *ST;
44   const TargetLoweringBase *TLI;
45 
46   const TargetSubtargetInfo *getST() const { return ST; }
47   const TargetLoweringBase *getTLI() const { return TLI; }
48 
49 public:
50   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
51 
52   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
53                                TTI::UnrollingPreferences &UP,
54                                OptimizationRemarkEmitter *ORE);
55 
56   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
57                              TTI::PeelingPreferences &PP);
58 
59   int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
60 };
61 
62 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
63   using BaseT = BasicTTIImplBase<GCNTTIImpl>;
64   using TTI = TargetTransformInfo;
65 
66   friend BaseT;
67 
68   const GCNSubtarget *ST;
69   const SITargetLowering *TLI;
70   AMDGPUTTIImpl CommonTTI;
71   bool IsGraphics;
72   bool HasFP32Denormals;
73   bool HasFP64FP16Denormals;
74   static constexpr bool InlinerVectorBonusPercent = 0;
75 
76   static const FeatureBitset InlineFeatureIgnoreList;
77 
78   const GCNSubtarget *getST() const { return ST; }
79   const SITargetLowering *getTLI() const { return TLI; }
80 
81   static inline int getFullRateInstrCost() {
82     return TargetTransformInfo::TCC_Basic;
83   }
84 
85   static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
86     return CostKind == TTI::TCK_CodeSize ? 2
87                                          : 2 * TargetTransformInfo::TCC_Basic;
88   }
89 
90   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
91   // should be 2 or 4.
92   static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
93     return CostKind == TTI::TCK_CodeSize ? 2
94                                          : 4 * TargetTransformInfo::TCC_Basic;
95   }
96 
97   // On some parts, normal fp64 operations are half rate, and others
98   // quarter. This also applies to some integer operations.
99   int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
100 
101   std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
102 
103 public:
104   explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
105 
106   bool hasBranchDivergence(const Function *F = nullptr) const;
107 
108   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
109                                TTI::UnrollingPreferences &UP,
110                                OptimizationRemarkEmitter *ORE);
111 
112   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
113                              TTI::PeelingPreferences &PP);
114 
115   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
116     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
117     return TTI::PSK_FastHardware;
118   }
119 
120   unsigned getNumberOfRegisters(unsigned RCID) const;
121   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
122   unsigned getMinVectorRegisterBitWidth() const;
123   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
124   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
125                                unsigned ChainSizeInBytes,
126                                VectorType *VecTy) const;
127   unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
128                                 unsigned ChainSizeInBytes,
129                                 VectorType *VecTy) const;
130   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
131 
132   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
133                                   unsigned AddrSpace) const;
134   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
135                                    unsigned AddrSpace) const;
136   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
137                                     unsigned AddrSpace) const;
138 
139   int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
140   Type *getMemcpyLoopLoweringType(
141       LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
142       unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
143       std::optional<uint32_t> AtomicElementSize) const;
144 
145   void getMemcpyLoopResidualLoweringType(
146       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
147       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
148       unsigned SrcAlign, unsigned DestAlign,
149       std::optional<uint32_t> AtomicCpySize) const;
150   unsigned getMaxInterleaveFactor(ElementCount VF);
151 
152   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
153 
154   InstructionCost getArithmeticInstrCost(
155       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
156       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
157       TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
158       ArrayRef<const Value *> Args = std::nullopt,
159       const Instruction *CxtI = nullptr);
160 
161   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
162                                  const Instruction *I = nullptr);
163 
164   bool isInlineAsmSourceOfDivergence(const CallInst *CI,
165                                      ArrayRef<unsigned> Indices = {}) const;
166 
167   using BaseT::getVectorInstrCost;
168   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
169                                      TTI::TargetCostKind CostKind,
170                                      unsigned Index, Value *Op0, Value *Op1);
171 
172   bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
173   bool isSourceOfDivergence(const Value *V) const;
174   bool isAlwaysUniform(const Value *V) const;
175 
176   bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
177     if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
178       switch (FromAS) {
179       case AMDGPUAS::GLOBAL_ADDRESS:
180       case AMDGPUAS::CONSTANT_ADDRESS:
181       case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
182       case AMDGPUAS::LOCAL_ADDRESS:
183       case AMDGPUAS::PRIVATE_ADDRESS:
184         return true;
185       default:
186         break;
187       }
188       return false;
189     }
190     if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
191          ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
192         (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
193          ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
194       return true;
195     return false;
196   }
197 
198   bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
199     return AMDGPU::addrspacesMayAlias(AS0, AS1);
200   }
201 
202   unsigned getFlatAddressSpace() const {
203     // Don't bother running InferAddressSpaces pass on graphics shaders which
204     // don't use flat addressing.
205     if (IsGraphics)
206       return -1;
207     return AMDGPUAS::FLAT_ADDRESS;
208   }
209 
210   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
211                                   Intrinsic::ID IID) const;
212 
213   bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
214     return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
215            AS != AMDGPUAS::PRIVATE_ADDRESS;
216   }
217 
218   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
219                                           Value *NewV) const;
220 
221   bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
222                                  const Value *Op1, InstCombiner &IC) const;
223   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
224                                                     IntrinsicInst &II) const;
225   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
226       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
227       APInt &UndefElts2, APInt &UndefElts3,
228       std::function<void(Instruction *, unsigned, APInt, APInt &)>
229           SimplifyAndSetOp) const;
230 
231   InstructionCost getVectorSplitCost() { return 0; }
232 
233   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
234                                  ArrayRef<int> Mask,
235                                  TTI::TargetCostKind CostKind, int Index,
236                                  VectorType *SubTp,
237                                  ArrayRef<const Value *> Args = std::nullopt,
238                                  const Instruction *CxtI = nullptr);
239 
240   bool areInlineCompatible(const Function *Caller,
241                            const Function *Callee) const;
242 
243   unsigned getInliningThresholdMultiplier() const { return 11; }
244   unsigned adjustInliningThreshold(const CallBase *CB) const;
245   unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
246 
247   int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
248 
249   InstructionCost getArithmeticReductionCost(
250       unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
251       TTI::TargetCostKind CostKind);
252 
253   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
254                                         TTI::TargetCostKind CostKind);
255   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
256                                          FastMathFlags FMF,
257                                          TTI::TargetCostKind CostKind);
258 
259   /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
260   unsigned getCacheLineSize() const override { return 128; }
261 
262   /// How much before a load we should place the prefetch instruction.
263   /// This is currently measured in number of IR instructions.
264   unsigned getPrefetchDistance() const override;
265 
266   /// \return if target want to issue a prefetch in address space \p AS.
267   bool shouldPrefetchAddressSpace(unsigned AS) const override;
268 };
269 
270 } // end namespace llvm
271 
272 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
273