//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfoImplBase conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

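// GCN-subtarget-specific TTI implementation. The AMDGPUTTIImpl above holds
// logic shared across AMDGPU targets and is embedded below as CommonTTI.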
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr int InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
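  // Illustrative example: with TCK_RecipThroughput this returns
  // getHalfRateInstrCost (2 * TCC_Basic) on half-rate parts and
  // getQuarterRateInstrCost (4 * TCC_Basic) on quarter-rate parts.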
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

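  // GCN has single-instruction hardware popcount (e.g. S_BCNT1_I32_B32 and
  // V_BCNT_U32_B32), so it is uniformly fast.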
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
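    // For example, FLAT <-> GLOBAL casts are valid since those spaces may
    // alias, while a LOCAL -> GLOBAL cast is not.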
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

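  // Hook for the InferAddressSpaces pass: reporting a flat address space here
  // lets the pass rewrite flat accesses into more specific address spaces.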
  unsigned getFlatAddressSpace() const override {
    // Don't bother running the InferAddressSpaces pass on graphics shaders,
    // which don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

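  // Local (LDS), region (GDS) and private (scratch) memory is not initialized
  // at module load, so globals there cannot carry a non-undef initializer.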
  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  int getInliningLastCallToStaticBonus() const override;
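  // Calls are comparatively expensive on AMDGPU, so inlining is weighted far
  // more aggressively than the default multiplier of 1.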
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true", and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types to have reduced cost. For example,
  /// the cost of loading 4 i8 values is the cost of loading a single i32
  /// value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped together
  /// under a single i32 value. Otherwise fall back to the base implementation.
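  /// For example, a <4 x i8> vector counts as one part (a single i32) rather
  /// than four scalar parts.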
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H