1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file a TargetTransformInfo::Concept conforming object specific to the 11 /// AMDGPU target machine. It uses the target's detailed information to 12 /// provide more precise answers to certain TTI queries, while letting the 13 /// target independent and default TTI implementations handle the rest. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 19 20 #include "AMDGPU.h" 21 #include "AMDGPUSubtarget.h" 22 #include "AMDGPUTargetMachine.h" 23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 24 #include "Utils/AMDGPUBaseInfo.h" 25 #include "llvm/ADT/ArrayRef.h" 26 #include "llvm/Analysis/TargetTransformInfo.h" 27 #include "llvm/CodeGen/BasicTTIImpl.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/MC/SubtargetFeature.h" 30 #include "llvm/Support/MathExtras.h" 31 #include <cassert> 32 33 namespace llvm { 34 35 class AMDGPUTargetLowering; 36 class Loop; 37 class ScalarEvolution; 38 class Type; 39 class Value; 40 41 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { 42 using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>; 43 using TTI = TargetTransformInfo; 44 45 friend BaseT; 46 47 Triple TargetTriple; 48 49 const GCNSubtarget *ST; 50 const TargetLoweringBase *TLI; 51 52 const TargetSubtargetInfo *getST() const { return ST; } 53 const TargetLoweringBase *getTLI() const { return TLI; } 54 55 public: 56 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) 57 : BaseT(TM, F.getParent()->getDataLayout()), 58 TargetTriple(TM->getTargetTriple()), 59 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), 60 TLI(ST->getTargetLowering()) {} 61 62 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 63 TTI::UnrollingPreferences &UP); 64 }; 65 66 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { 67 using BaseT = BasicTTIImplBase<GCNTTIImpl>; 68 using TTI = TargetTransformInfo; 69 70 friend BaseT; 71 72 const GCNSubtarget *ST; 73 const AMDGPUTargetLowering *TLI; 74 AMDGPUTTIImpl CommonTTI; 75 bool IsGraphicsShader; 76 bool HasFP32Denormals; 77 78 const FeatureBitset InlineFeatureIgnoreList = { 79 // Codegen control options which don't matter. 80 AMDGPU::FeatureEnableLoadStoreOpt, 81 AMDGPU::FeatureEnableSIScheduler, 82 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, 83 AMDGPU::FeatureFlatForGlobal, 84 AMDGPU::FeaturePromoteAlloca, 85 AMDGPU::FeatureUnalignedBufferAccess, 86 AMDGPU::FeatureUnalignedScratchAccess, 87 88 AMDGPU::FeatureAutoWaitcntBeforeBarrier, 89 90 // Property of the kernel/environment which can't actually differ. 91 AMDGPU::FeatureSGPRInitBug, 92 AMDGPU::FeatureXNACK, 93 AMDGPU::FeatureTrapHandler, 94 AMDGPU::FeatureCodeObjectV3, 95 96 // The default assumption needs to be ecc is enabled, but no directly 97 // exposed operations depend on it, so it can be safely inlined. 98 AMDGPU::FeatureSRAMECC, 99 100 // Perf-tuning features 101 AMDGPU::FeatureFastFMAF32, 102 AMDGPU::HalfRate64Ops 103 }; 104 105 const GCNSubtarget *getST() const { return ST; } 106 const AMDGPUTargetLowering *getTLI() const { return TLI; } 107 108 static inline int getFullRateInstrCost() { 109 return TargetTransformInfo::TCC_Basic; 110 } 111 112 static inline int getHalfRateInstrCost() { 113 return 2 * TargetTransformInfo::TCC_Basic; 114 } 115 116 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe 117 // should be 2 or 4. 118 static inline int getQuarterRateInstrCost() { 119 return 3 * TargetTransformInfo::TCC_Basic; 120 } 121 122 // On some parts, normal fp64 operations are half rate, and others 123 // quarter. This also applies to some integer operations. 124 inline int get64BitInstrCost() const { 125 return ST->hasHalfRate64Ops() ? 126 getHalfRateInstrCost() : getQuarterRateInstrCost(); 127 } 128 129 public: 130 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) 131 : BaseT(TM, F.getParent()->getDataLayout()), 132 ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))), 133 TLI(ST->getTargetLowering()), 134 CommonTTI(TM, F), 135 IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())), 136 HasFP32Denormals(ST->hasFP32Denormals(F)) { } 137 138 bool hasBranchDivergence() { return true; } 139 140 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 141 TTI::UnrollingPreferences &UP); 142 143 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { 144 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 145 return TTI::PSK_FastHardware; 146 } 147 148 unsigned getHardwareNumberOfRegisters(bool Vector) const; 149 unsigned getNumberOfRegisters(bool Vector) const; 150 unsigned getRegisterBitWidth(bool Vector) const; 151 unsigned getMinVectorRegisterBitWidth() const; 152 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, 153 unsigned ChainSizeInBytes, 154 VectorType *VecTy) const; 155 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, 156 unsigned ChainSizeInBytes, 157 VectorType *VecTy) const; 158 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; 159 160 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, 161 unsigned Alignment, 162 unsigned AddrSpace) const; 163 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, 164 unsigned Alignment, 165 unsigned AddrSpace) const; 166 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, 167 unsigned Alignment, 168 unsigned AddrSpace) const; 169 170 unsigned getMaxInterleaveFactor(unsigned VF); 171 172 bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; 173 174 int getArithmeticInstrCost( 175 unsigned Opcode, Type *Ty, 176 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, 177 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, 178 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, 179 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 180 ArrayRef<const Value *> Args = ArrayRef<const Value *>(), 181 const Instruction *CxtI = nullptr); 182 183 unsigned getCFInstrCost(unsigned Opcode); 184 185 int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); 186 bool isSourceOfDivergence(const Value *V) const; 187 bool isAlwaysUniform(const Value *V) const; 188 189 unsigned getFlatAddressSpace() const { 190 // Don't bother running InferAddressSpaces pass on graphics shaders which 191 // don't use flat addressing. 192 if (IsGraphicsShader) 193 return -1; 194 return AMDGPUAS::FLAT_ADDRESS; 195 } 196 197 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 198 Intrinsic::ID IID) const; 199 bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, 200 Value *OldV, Value *NewV) const; 201 202 unsigned getVectorSplitCost() { return 0; } 203 204 unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, 205 Type *SubTp); 206 207 bool areInlineCompatible(const Function *Caller, 208 const Function *Callee) const; 209 210 unsigned getInliningThresholdMultiplier() { return 11; } 211 212 int getInlinerVectorBonusPercent() { return 0; } 213 214 int getArithmeticReductionCost(unsigned Opcode, 215 Type *Ty, 216 bool IsPairwise); 217 template <typename T> 218 int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 219 ArrayRef<T *> Args, FastMathFlags FMF, 220 unsigned VF); 221 int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 222 ArrayRef<Type *> Tys, FastMathFlags FMF, 223 unsigned ScalarizationCostPassed = UINT_MAX); 224 int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 225 ArrayRef<Value *> Args, FastMathFlags FMF, 226 unsigned VF = 1); 227 int getMinMaxReductionCost(Type *Ty, Type *CondTy, 228 bool IsPairwiseForm, 229 bool IsUnsigned); 230 unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); 231 }; 232 233 class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { 234 using BaseT = BasicTTIImplBase<R600TTIImpl>; 235 using TTI = TargetTransformInfo; 236 237 friend BaseT; 238 239 const R600Subtarget *ST; 240 const AMDGPUTargetLowering *TLI; 241 AMDGPUTTIImpl CommonTTI; 242 243 public: 244 explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) 245 : BaseT(TM, F.getParent()->getDataLayout()), 246 ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))), 247 TLI(ST->getTargetLowering()), 248 CommonTTI(TM, F) {} 249 250 const R600Subtarget *getST() const { return ST; } 251 const AMDGPUTargetLowering *getTLI() const { return TLI; } 252 253 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 254 TTI::UnrollingPreferences &UP); 255 unsigned getHardwareNumberOfRegisters(bool Vec) const; 256 unsigned getNumberOfRegisters(bool Vec) const; 257 unsigned getRegisterBitWidth(bool Vector) const; 258 unsigned getMinVectorRegisterBitWidth() const; 259 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; 260 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, 261 unsigned AddrSpace) const; 262 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, 263 unsigned Alignment, 264 unsigned AddrSpace) const; 265 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, 266 unsigned Alignment, 267 unsigned AddrSpace) const; 268 unsigned getMaxInterleaveFactor(unsigned VF); 269 unsigned getCFInstrCost(unsigned Opcode); 270 int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); 271 }; 272 273 } // end namespace llvm 274 275 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H 276