1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file a TargetTransformInfo::Concept conforming object specific to the 11 /// AMDGPU target machine. It uses the target's detailed information to 12 /// provide more precise answers to certain TTI queries, while letting the 13 /// target independent and default TTI implementations handle the rest. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 19 20 #include "AMDGPU.h" 21 #include "llvm/CodeGen/BasicTTIImpl.h" 22 #include <optional> 23 24 namespace llvm { 25 26 class AMDGPUTargetMachine; 27 class GCNSubtarget; 28 class InstCombiner; 29 class Loop; 30 class ScalarEvolution; 31 class SITargetLowering; 32 class Type; 33 class Value; 34 35 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { 36 using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>; 37 using TTI = TargetTransformInfo; 38 39 friend BaseT; 40 41 Triple TargetTriple; 42 43 const TargetSubtargetInfo *ST; 44 const TargetLoweringBase *TLI; 45 46 const TargetSubtargetInfo *getST() const { return ST; } 47 const TargetLoweringBase *getTLI() const { return TLI; } 48 49 public: 50 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); 51 52 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 53 TTI::UnrollingPreferences &UP, 54 OptimizationRemarkEmitter *ORE); 55 56 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 57 TTI::PeelingPreferences &PP); 58 59 int64_t getMaxMemIntrinsicInlineSizeThreshold() const; 60 }; 61 62 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { 63 using BaseT = BasicTTIImplBase<GCNTTIImpl>; 64 using TTI = TargetTransformInfo; 65 66 friend BaseT; 67 68 const GCNSubtarget *ST; 69 const SITargetLowering *TLI; 70 AMDGPUTTIImpl CommonTTI; 71 bool IsGraphics; 72 bool HasFP32Denormals; 73 bool HasFP64FP16Denormals; 74 static constexpr bool InlinerVectorBonusPercent = 0; 75 76 static const FeatureBitset InlineFeatureIgnoreList; 77 78 const GCNSubtarget *getST() const { return ST; } 79 const SITargetLowering *getTLI() const { return TLI; } 80 81 static inline int getFullRateInstrCost() { 82 return TargetTransformInfo::TCC_Basic; 83 } 84 85 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) { 86 return CostKind == TTI::TCK_CodeSize ? 2 87 : 2 * TargetTransformInfo::TCC_Basic; 88 } 89 90 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe 91 // should be 2 or 4. 92 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) { 93 return CostKind == TTI::TCK_CodeSize ? 2 94 : 4 * TargetTransformInfo::TCC_Basic; 95 } 96 97 // On some parts, normal fp64 operations are half rate, and others 98 // quarter. This also applies to some integer operations. 99 int get64BitInstrCost(TTI::TargetCostKind CostKind) const; 100 101 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const; 102 103 public: 104 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); 105 106 bool hasBranchDivergence(const Function *F = nullptr) const; 107 108 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 109 TTI::UnrollingPreferences &UP, 110 OptimizationRemarkEmitter *ORE); 111 112 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 113 TTI::PeelingPreferences &PP); 114 115 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { 116 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 117 return TTI::PSK_FastHardware; 118 } 119 120 unsigned getNumberOfRegisters(unsigned RCID) const; 121 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; 122 unsigned getMinVectorRegisterBitWidth() const; 123 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; 124 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, 125 unsigned ChainSizeInBytes, 126 VectorType *VecTy) const; 127 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, 128 unsigned ChainSizeInBytes, 129 VectorType *VecTy) const; 130 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; 131 132 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, 133 unsigned AddrSpace) const; 134 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, 135 unsigned AddrSpace) const; 136 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, 137 unsigned AddrSpace) const; 138 139 int64_t getMaxMemIntrinsicInlineSizeThreshold() const; 140 Type *getMemcpyLoopLoweringType( 141 LLVMContext & Context, Value * Length, unsigned SrcAddrSpace, 142 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, 143 std::optional<uint32_t> AtomicElementSize) const; 144 145 void getMemcpyLoopResidualLoweringType( 146 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, 147 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, 148 unsigned SrcAlign, unsigned DestAlign, 149 std::optional<uint32_t> AtomicCpySize) const; 150 unsigned getMaxInterleaveFactor(ElementCount VF); 151 152 bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; 153 154 InstructionCost getArithmeticInstrCost( 155 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 156 TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None}, 157 TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, 158 ArrayRef<const Value *> Args = std::nullopt, 159 const Instruction *CxtI = nullptr); 160 161 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 162 const Instruction *I = nullptr); 163 164 bool isInlineAsmSourceOfDivergence(const CallInst *CI, 165 ArrayRef<unsigned> Indices = {}) const; 166 167 using BaseT::getVectorInstrCost; 168 InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, 169 TTI::TargetCostKind CostKind, 170 unsigned Index, Value *Op0, Value *Op1); 171 172 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const; 173 bool isSourceOfDivergence(const Value *V) const; 174 bool isAlwaysUniform(const Value *V) const; 175 176 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 177 if (ToAS == AMDGPUAS::FLAT_ADDRESS) { 178 switch (FromAS) { 179 case AMDGPUAS::GLOBAL_ADDRESS: 180 case AMDGPUAS::CONSTANT_ADDRESS: 181 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 182 case AMDGPUAS::LOCAL_ADDRESS: 183 case AMDGPUAS::PRIVATE_ADDRESS: 184 return true; 185 default: 186 break; 187 } 188 return false; 189 } 190 if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 191 ToAS == AMDGPUAS::CONSTANT_ADDRESS) || 192 (FromAS == AMDGPUAS::CONSTANT_ADDRESS && 193 ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)) 194 return true; 195 return false; 196 } 197 198 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { 199 return AMDGPU::addrspacesMayAlias(AS0, AS1); 200 } 201 202 unsigned getFlatAddressSpace() const { 203 // Don't bother running InferAddressSpaces pass on graphics shaders which 204 // don't use flat addressing. 205 if (IsGraphics) 206 return -1; 207 return AMDGPUAS::FLAT_ADDRESS; 208 } 209 210 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 211 Intrinsic::ID IID) const; 212 213 bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const { 214 return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && 215 AS != AMDGPUAS::PRIVATE_ADDRESS; 216 } 217 218 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 219 Value *NewV) const; 220 221 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, 222 const Value *Op1, InstCombiner &IC) const; 223 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 224 IntrinsicInst &II) const; 225 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( 226 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 227 APInt &UndefElts2, APInt &UndefElts3, 228 std::function<void(Instruction *, unsigned, APInt, APInt &)> 229 SimplifyAndSetOp) const; 230 231 InstructionCost getVectorSplitCost() { return 0; } 232 233 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 234 ArrayRef<int> Mask, 235 TTI::TargetCostKind CostKind, int Index, 236 VectorType *SubTp, 237 ArrayRef<const Value *> Args = std::nullopt, 238 const Instruction *CxtI = nullptr); 239 240 bool areInlineCompatible(const Function *Caller, 241 const Function *Callee) const; 242 243 unsigned getInliningThresholdMultiplier() const { return 11; } 244 unsigned adjustInliningThreshold(const CallBase *CB) const; 245 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const; 246 247 int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; } 248 249 InstructionCost getArithmeticReductionCost( 250 unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, 251 TTI::TargetCostKind CostKind); 252 253 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 254 TTI::TargetCostKind CostKind); 255 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 256 FastMathFlags FMF, 257 TTI::TargetCostKind CostKind); 258 259 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12. 260 unsigned getCacheLineSize() const override { return 128; } 261 262 /// How much before a load we should place the prefetch instruction. 263 /// This is currently measured in number of IR instructions. 264 unsigned getPrefetchDistance() const override; 265 266 /// \return if target want to issue a prefetch in address space \p AS. 267 bool shouldPrefetchAddressSpace(unsigned AS) const override; 268 }; 269 270 } // end namespace llvm 271 272 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 273