//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file declares a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

namespace llvm {

class AMDGPUTargetLowering;
class Loop;
class ScalarEvolution;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const GCNSubtarget *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        TargetTriple(TM->getTargetTriple()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphicsShader;
  bool HasFP32Denormals;
  unsigned MaxVGPRs;

  const FeatureBitset InlineFeatureIgnoreList = {
      // Codegen control options which don't matter.
      AMDGPU::FeatureEnableLoadStoreOpt,
      AMDGPU::FeatureEnableSIScheduler,
      AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
      AMDGPU::FeatureFlatForGlobal,
      AMDGPU::FeaturePromoteAlloca,
      AMDGPU::FeatureUnalignedBufferAccess,
      AMDGPU::FeatureUnalignedScratchAccess,

      AMDGPU::FeatureAutoWaitcntBeforeBarrier,

      // Properties of the kernel/environment which can't actually differ.
      AMDGPU::FeatureSGPRInitBug,
      AMDGPU::FeatureXNACK,
      AMDGPU::FeatureTrapHandler,
      AMDGPU::FeatureCodeObjectV3,

      // The default assumption needs to be that ECC is enabled, but no
      // directly exposed operations depend on it, so it can be safely inlined.
      AMDGPU::FeatureSRAMECC,

      // Perf-tuning features
      AMDGPU::FeatureFastFMAF32,
      AMDGPU::HalfRate64Ops
  };

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost() {
    return 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // this should be 2 or 4.
  static inline int getQuarterRateInstrCost() {
    return 3 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate; on others they are
  // quarter rate. This also applies to some integer operations.
  inline int get64BitInstrCost() const {
    return ST->hasHalfRate64Ops() ? getHalfRateInstrCost()
                                  : getQuarterRateInstrCost();
  }
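
  // Worked example: with TargetTransformInfo::TCC_Basic equal to 1, the
  // helpers above price a full-rate instruction at 1, a half-rate instruction
  // at 2 and a quarter-rate instruction at 3, so get64BitInstrCost() evaluates
  // to 2 when hasHalfRate64Ops() is true and to 3 otherwise.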

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()),
        CommonTTI(TM, F),
        IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
        HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
        MaxVGPRs(ST->getMaxNumVGPRs(
            std::max(ST->getWavesPerEU(F).first,
                     ST->getWavesPerEUForWorkGroup(
                         ST->getFlatWorkGroupSizes(F).second)))) {}
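
  // Branches on AMDGPU can diverge across the lanes of a wavefront, so the
  // target always reports branch divergence; the divergence queries below
  // refine which individual values are divergent or uniform.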
  bool hasBranchDivergence() { return true; }
  bool useGPUDivergenceAnalysis() const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(unsigned RCID) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
                                  unsigned SrcAlign, unsigned DestAlign) const;

  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
                                         LLVMContext &Context,
                                         unsigned RemainingBytes,
                                         unsigned SrcAddrSpace,
                                         unsigned DestAddrSpace,
                                         unsigned SrcAlign,
                                         unsigned DestAlign) const;
  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;
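
  // Returning -1 (which wraps to UINT_MAX in the unsigned return type) is the
  // TTI convention for "no flat address space worth optimizing".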
  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphicsShader)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  unsigned getInliningThresholdMultiplier() { return 11; }

  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, bool IsPairwise,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);

  int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                            TTI::TargetCostKind CostKind);
  int getMinMaxReductionCost(
      VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};

class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()),
        CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H