//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
    "amdgpu-use-legacy-divergence-analysis",
    cl::desc("Enable legacy divergence analysis for AMDGPU"),
    cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                 return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                 return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in the unroller.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();
  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
                                                       : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {
  assert(RemainingBytes < 16);

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Assume a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.startswith("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
      return isReadRegisterSourceOfDivergence(Intrinsic);

    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2; we just do not currently have infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the
      // new address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin: {
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
                                                  {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // A single BB does not increase the total BB count.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
      continue;

    PtrArg = getUnderlyingObject(PtrArg);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
  if (AllocaSize)
    return ArgAllocaCost;
  return 0;
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}

std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // Maximum load or store can handle 8 dwords for scalar and 4 for
  // vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}