//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains a TargetTransformInfoImplBase conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr int InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }
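  // Editorial sketch (not part of the upstream interface): these helpers
  // express issue rate as multiples of TCC_Basic, so under
  // TCK_RecipThroughput a client of the TTI would see roughly 1x/2x/4x
  // TCC_Basic for full/half/quarter-rate instructions. For example, on a
  // part where f64 multiplies are quarter rate (hypothetical usage,
  // assuming a constructed GCNTTIImpl `TTIImpl` and an LLVMContext `Ctx`):
  //
  //   InstructionCost C = TTIImpl.getArithmeticInstrCost(
  //       Instruction::FMul, Type::getDoubleTy(Ctx),
  //       TTI::TCK_RecipThroughput); // ~4 * TCC_Basic via quarter rate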
  // On some parts, normal fp64 operations are half rate, and on others
  // quarter rate. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const override;
  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
  unsigned getMinVectorRegisterBitWidth() const override;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const override;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const override;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicElementSize) const override;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const override;
  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const override;
  bool isAlwaysUniform(const Value *V) const override;
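  // Editorial sketch (assumed client code, not part of this header): the
  // divergence queries above are normally reached through the generic
  // TargetTransformInfo wrapper from a pass, e.g.:
  //
  //   const TargetTransformInfo &TTInfo =
  //       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  //   if (TTInfo.isSourceOfDivergence(V)) // e.g. workitem id intrinsics
  //     ...
  //   if (TTInfo.isAlwaysUniform(V))      // e.g. readfirstlane results
  //     ...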
  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    // Casts between any aliasing address spaces are valid.
    return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const override {
    // Don't bother running the InferAddressSpaces pass on graphics shaders,
    // which don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const override;

  bool
  canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const override;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             const APInt &DemandedElts,
                                             APInt &UndefElts) const;

  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                                IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  InstructionCost getVectorSplitCost() const { return 0; }

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  int getInliningLastCallToStaticBonus() const override;
  unsigned getInliningThresholdMultiplier() const override { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const override;
  unsigned getCallerAllocaCost(const CallBase *CB,
                               const AllocaInst *AI) const override;

  int getInlinerVectorBonusPercent() const override {
    return InlinerVectorBonusPercent;
  }

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;
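  // Editorial note (illustrative arithmetic, not normative): with the 11x
  // threshold multiplier above and a 0% vector bonus, a call site whose
  // base inline threshold is T is evaluated against a budget of roughly
  // 11 * T, with no extra credit for vector instructions in the callee;
  // e.g. a base threshold of 225 becomes an effective budget of ~2475.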
  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far before a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;

  enum class KnownIEEEMode { Unknown, On, Off };

  /// Return KnownIEEEMode::On if we know the use context can assume
  /// "amdgpu-ieee"="true", and KnownIEEEMode::Off if we can assume
  /// "amdgpu-ieee"="false".
  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

  /// Account for loads of i8 vector types to have reduced cost. For
  /// example, the cost of loading 4 i8 values is the same as the cost of
  /// loading a single i32 value.
  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  /// When counting parts on AMD GPUs, account for i8s being grouped
  /// together under a single i32 value. Otherwise fall back to the base
  /// implementation.
  unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H