1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIModeRegisterDefaults.h"
21 #include "llvm/Analysis/InlineCost.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/PatternMatch.h"
28 #include "llvm/Support/KnownBits.h"
29 #include <optional>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "AMDGPUtti"
34
// Command-line tuning knobs for the AMDGPU TTI heuristics in this file.

// Unroll threshold used by getUnrollingPreferences() for loops containing a
// qualifying GEP into the private address space.
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

// Unroll threshold used by getUnrollingPreferences() for loops containing a
// qualifying GEP into the local or region address space.
static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

// Amount added to the unroll threshold for each conditional branch whose
// condition depends on a PHI of the loop (see dependsOnLocalPhi()).
static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

// Value assigned to UP.Runtime when a qualifying local/region GEP is seen.
static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

// Blocks of an innermost loop smaller than this raise
// UP.MaxIterationsCountToAnalyze in getUnrollingPreferences().
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

// NOTE(review): the three inliner options below are not referenced in the
// portion of the file visible here; presumably they are consumed by the
// inlining cost hooks further down.
static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
77
dependsOnLocalPhi(const Loop * L,const Value * Cond,unsigned Depth=0)78 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79 unsigned Depth = 0) {
80 const Instruction *I = dyn_cast<Instruction>(Cond);
81 if (!I)
82 return false;
83
84 for (const Value *V : I->operand_values()) {
85 if (!L->contains(I))
86 continue;
87 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89 return SubLoop->contains(PHI); }))
90 return true;
91 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92 return true;
93 }
94 return false;
95 }
96
// Build the common AMDGPU TTI for function F: forwards the target machine and
// F's data layout to the base class, and caches the target triple, the
// subtarget selected for F, and that subtarget's lowering object.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
102
// Fill in the unrolling preferences UP for loop L.
//
// The base threshold comes from the "amdgpu-unroll-threshold" function
// attribute (default 300) and may be replaced by the loop's
// "amdgpu.loop.unroll.threshold" metadata. The blocks that belong directly to
// L (not to a sub-loop) are then scanned and the threshold is raised for:
//   * conditional branches whose condition depends on a PHI of the loop, and
//   * GEPs into private / local / region memory that use a value defined in
//     the loop.
// The scan returns as soon as the threshold reaches MaxBoost. SE and ORE are
// not used by this implementation.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        // The metadata value also caps the per-address-space boosts.
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  // Largest threshold any heuristic below can produce; once reached there is
  // nothing left to gain from scanning further.
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    // Counts qualifying local/region GEPs within this block only.
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip branches with an in-loop successor that is itself a loop
          // exiting block.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      // Pick the candidate threshold from the GEP's address space; other
      // address spaces do not influence unrolling here.
      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // Already at or above what this GEP could grant.
      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only static allocas whose allocated type fits in MaxAlloca bytes
        // qualify for the private-memory boost.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // Definitions inside a sub-loop do not count as defined by L itself.
        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
264
// Peeling preferences for loop L: no AMDGPU-specific adjustment is made, the
// base implementation is used as-is.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
269
// Size threshold reported for inline expansion of memory intrinsics. Always
// 1024; presumably measured in bytes -- confirm against the TTI interface.
int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
273
// Subtarget features collected into InlineFeatureIgnoreList.
// NOTE(review): the consumer is not visible in this part of the file;
// presumably these bits are masked out when comparing caller and callee
// features for inlining compatibility -- confirm at the use site.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
// Build the GCN TTI for function F. Besides caching the subtarget, its
// lowering object and a common AMDGPU TTI instance, this records whether F
// uses a graphics calling convention and derives the denormal flags from the
// function's mode-register defaults.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  // Any denormal mode other than preserve-sign counts as "has denormals".
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
304
hasBranchDivergence(const Function * F) const305 bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306 return !F || !ST->isSingleLaneExecution(*F);
307 }
308
// Number of registers reported to the vectorizer / interleaver. The argument
// is ignored and the result is always 4.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}
319
320 TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const321 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322 switch (K) {
323 case TargetTransformInfo::RGK_Scalar:
324 return TypeSize::getFixed(32);
325 case TargetTransformInfo::RGK_FixedWidthVector:
326 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
327 case TargetTransformInfo::RGK_ScalableVector:
328 return TypeSize::getScalable(0);
329 }
330 llvm_unreachable("Unsupported register kind");
331 }
332
// Minimum vector register width reported to clients: always 32 bits.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
336
getMaximumVF(unsigned ElemWidth,unsigned Opcode) const337 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339 return 32 * 4 / ElemWidth;
340 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342 : 1;
343 }
344
getLoadVectorFactor(unsigned VF,unsigned LoadSize,unsigned ChainSizeInBytes,VectorType * VecTy) const345 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346 unsigned ChainSizeInBytes,
347 VectorType *VecTy) const {
348 unsigned VecRegBitWidth = VF * LoadSize;
349 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350 // TODO: Support element-size less than 32bit?
351 return 128 / LoadSize;
352
353 return VF;
354 }
355
getStoreVectorFactor(unsigned VF,unsigned StoreSize,unsigned ChainSizeInBytes,VectorType * VecTy) const356 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357 unsigned ChainSizeInBytes,
358 VectorType *VecTy) const {
359 unsigned VecRegBitWidth = VF * StoreSize;
360 if (VecRegBitWidth > 128)
361 return 128 / StoreSize;
362
363 return VF;
364 }
365
getLoadStoreVecRegBitWidth(unsigned AddrSpace) const366 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
372 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
373 return 512;
374 }
375
376 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
377 return 8 * ST->getMaxPrivateElementSize();
378
379 // Common to flat, global, local and region. Assume for unknown addrspace.
380 return 128;
381 }
382
isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,Align Alignment,unsigned AddrSpace) const383 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
384 Align Alignment,
385 unsigned AddrSpace) const {
386 // We allow vectorization of flat stores, even though we may need to decompose
387 // them later if they may access private memory. We don't have enough context
388 // here, and legalization can handle it.
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
391 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392 }
393 return true;
394 }
395
// Load chains are subject to the same legality rule as any memory chain;
// forwards to isLegalToVectorizeMemChain().
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
401
// Store chains are subject to the same legality rule as any memory chain;
// forwards to isLegalToVectorizeMemChain().
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
407
// Size threshold reported for inline expansion of memory intrinsics. Always
// 1024, the same value AMDGPUTTIImpl reports; presumably bytes -- confirm
// against the TTI interface.
int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
411
412 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
413 // iteration. Should we report a larger size and let it legalize?
414 //
415 // FIXME: Should we use narrower types for local/region, or account for when
416 // unaligned access is legal?
417 //
418 // FIXME: This could use fine tuning and microbenchmarks.
getMemcpyLoopLoweringType(LLVMContext & Context,Value * Length,unsigned SrcAddrSpace,unsigned DestAddrSpace,unsigned SrcAlign,unsigned DestAlign,std::optional<uint32_t> AtomicElementSize) const419 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
420 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
421 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
422 std::optional<uint32_t> AtomicElementSize) const {
423
424 if (AtomicElementSize)
425 return Type::getIntNTy(Context, *AtomicElementSize * 8);
426
427 unsigned MinAlign = std::min(SrcAlign, DestAlign);
428
429 // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
430 // hardware into byte accesses. If you assume all alignments are equally
431 // probable, it's more efficient on average to use short accesses for this
432 // case.
433 if (MinAlign == 2)
434 return Type::getInt16Ty(Context);
435
436 // Not all subtargets have 128-bit DS instructions, and we currently don't
437 // form them by default.
438 if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
439 SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
440 DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
441 DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
442 return FixedVectorType::get(Type::getInt32Ty(Context), 2);
443 }
444
445 // Global memory works best with 16-byte accesses. Private memory will also
446 // hit this, although they'll be decomposed.
447 return FixedVectorType::get(Type::getInt32Ty(Context), 4);
448 }
449
getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type * > & OpsOut,LLVMContext & Context,unsigned RemainingBytes,unsigned SrcAddrSpace,unsigned DestAddrSpace,unsigned SrcAlign,unsigned DestAlign,std::optional<uint32_t> AtomicCpySize) const450 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
451 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
452 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
453 unsigned SrcAlign, unsigned DestAlign,
454 std::optional<uint32_t> AtomicCpySize) const {
455 assert(RemainingBytes < 16);
456
457 if (AtomicCpySize)
458 BaseT::getMemcpyLoopResidualLoweringType(
459 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
460 DestAlign, AtomicCpySize);
461
462 unsigned MinAlign = std::min(SrcAlign, DestAlign);
463
464 if (MinAlign != 2) {
465 Type *I64Ty = Type::getInt64Ty(Context);
466 while (RemainingBytes >= 8) {
467 OpsOut.push_back(I64Ty);
468 RemainingBytes -= 8;
469 }
470
471 Type *I32Ty = Type::getInt32Ty(Context);
472 while (RemainingBytes >= 4) {
473 OpsOut.push_back(I32Ty);
474 RemainingBytes -= 4;
475 }
476 }
477
478 Type *I16Ty = Type::getInt16Ty(Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(I8Ty);
487 --RemainingBytes;
488 }
489 }
490
getMaxInterleaveFactor(ElementCount VF)491 unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498 }
499
getTgtMemIntrinsic(IntrinsicInst * Inst,MemIntrinsicInfo & Info) const500 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
501 MemIntrinsicInfo &Info) const {
502 switch (Inst->getIntrinsicID()) {
503 case Intrinsic::amdgcn_ds_ordered_add:
504 case Intrinsic::amdgcn_ds_ordered_swap: {
505 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
506 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
507 if (!Ordering || !Volatile)
508 return false; // Invalid.
509
510 unsigned OrderingVal = Ordering->getZExtValue();
511 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
512 return false;
513
514 Info.PtrVal = Inst->getArgOperand(0);
515 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
516 Info.ReadMem = true;
517 Info.WriteMem = true;
518 Info.IsVolatile = !Volatile->isZero();
519 return true;
520 }
521 default:
522 return false;
523 }
524 }
525
// Cost of an arithmetic instruction with IR opcode Opcode on type Ty.
//
// The type is legalized first. The cost is then the legalization factor
// (LT.first) times the number of legalized elements (halved, rounding up,
// where a packed form is available) times a per-instruction rate cost chosen
// by ISD opcode and scalar type. Combinations not handled here fall through
// to the base implementation.
//
// Args[0] is only consulted to detect a reciprocal (1.0 / x) for FDIV/FREM.
// CxtI, when given, is used to detect an fmul that can fuse into its single
// fadd/fsub user and to read fast-math flags.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Two i16 elements are counted as one operation when 16-bit
    // instructions exist.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      // Modelled as four quarter-rate plus four full-rate instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    // Not fusable: an fmul is costed like fadd/fsub.
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    // Other scalar types are left to the base implementation.
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Reciprocal (1.0 / x): costed as a single quarter-rate instruction.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
684
685 // Return true if there's a potential benefit from using v2f16/v2i16
686 // instructions for an intrinsic, even if it requires nontrivial legalization.
intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)687 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
688 switch (ID) {
689 case Intrinsic::fma: // TODO: fmuladd
690 // There's a small benefit to using vector ops in the legalized code.
691 case Intrinsic::round:
692 case Intrinsic::uadd_sat:
693 case Intrinsic::usub_sat:
694 case Intrinsic::sadd_sat:
695 case Intrinsic::ssub_sat:
696 return true;
697 default:
698 return false;
699 }
700 }
701
702 InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)703 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
704 TTI::TargetCostKind CostKind) {
705 if (ICA.getID() == Intrinsic::fabs)
706 return 0;
707
708 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
709 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
710
711 Type *RetTy = ICA.getReturnType();
712
713 // Legalize the type.
714 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
715
716 unsigned NElts = LT.second.isVector() ?
717 LT.second.getVectorNumElements() : 1;
718
719 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
720
721 if (SLT == MVT::f64)
722 return LT.first * NElts * get64BitInstrCost(CostKind);
723
724 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
725 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
726 NElts = (NElts + 1) / 2;
727
728 // TODO: Get more refined intrinsic costs?
729 unsigned InstRate = getQuarterRateInstrCost(CostKind);
730
731 switch (ICA.getID()) {
732 case Intrinsic::fma:
733 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
734 : getQuarterRateInstrCost(CostKind);
735 break;
736 case Intrinsic::uadd_sat:
737 case Intrinsic::usub_sat:
738 case Intrinsic::sadd_sat:
739 case Intrinsic::ssub_sat:
740 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
741 if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; }))
742 NElts = 1;
743 break;
744 }
745
746 return LT.first * NElts * InstRate;
747 }
748
getCFInstrCost(unsigned Opcode,TTI::TargetCostKind CostKind,const Instruction * I)749 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
750 TTI::TargetCostKind CostKind,
751 const Instruction *I) {
752 assert((I == nullptr || I->getOpcode() == Opcode) &&
753 "Opcode should reflect passed instruction.");
754 const bool SCost =
755 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
756 const int CBrCost = SCost ? 5 : 7;
757 switch (Opcode) {
758 case Instruction::Br: {
759 // Branch instruction takes about 4 slots on gfx900.
760 auto BI = dyn_cast_or_null<BranchInst>(I);
761 if (BI && BI->isUnconditional())
762 return SCost ? 1 : 4;
763 // Suppose conditional branch takes additional 3 exec manipulations
764 // instructions in average.
765 return CBrCost;
766 }
767 case Instruction::Switch: {
768 auto SI = dyn_cast_or_null<SwitchInst>(I);
769 // Each case (including default) takes 1 cmp + 1 cbr instructions in
770 // average.
771 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
772 }
773 case Instruction::Ret:
774 return SCost ? 1 : 10;
775 }
776 return BaseT::getCFInstrCost(Opcode, CostKind, I);
777 }
778
779 InstructionCost
getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,std::optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)780 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
781 std::optional<FastMathFlags> FMF,
782 TTI::TargetCostKind CostKind) {
783 if (TTI::requiresOrderedReduction(FMF))
784 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
785
786 EVT OrigTy = TLI->getValueType(DL, Ty);
787
788 // Computes cost on targets that have packed math instructions(which support
789 // 16-bit types only).
790 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
791 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
792
793 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
794 return LT.first * getFullRateInstrCost();
795 }
796
797 InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID,VectorType * Ty,FastMathFlags FMF,TTI::TargetCostKind CostKind)798 GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
799 FastMathFlags FMF,
800 TTI::TargetCostKind CostKind) {
801 EVT OrigTy = TLI->getValueType(DL, Ty);
802
803 // Computes cost on targets that have packed math instructions(which support
804 // 16-bit types only).
805 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
806 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
807
808 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
809 return LT.first * getHalfRateInstrCost(CostKind);
810 }
811
getVectorInstrCost(unsigned Opcode,Type * ValTy,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)812 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
813 TTI::TargetCostKind CostKind,
814 unsigned Index, Value *Op0,
815 Value *Op1) {
816 switch (Opcode) {
817 case Instruction::ExtractElement:
818 case Instruction::InsertElement: {
819 unsigned EltSize
820 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
821 if (EltSize < 32) {
822 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
823 return 0;
824 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
825 Op1);
826 }
827
828 // Extracts are just reads of a subregister, so are free. Inserts are
829 // considered free because we don't want to have any cost for scalarizing
830 // operations, and we don't have to copy into a different register class.
831
832 // Dynamic indexing isn't free and is best avoided.
833 return Index == ~0u ? 2 : 0;
834 }
835 default:
836 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
837 }
838 }
839
840 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
841 /// this is analyzing the collective result of all output registers. Otherwise,
842 /// this is only querying a specific result index if this returns multiple
843 /// registers in a struct.
isInlineAsmSourceOfDivergence(const CallInst * CI,ArrayRef<unsigned> Indices) const844 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
845 const CallInst *CI, ArrayRef<unsigned> Indices) const {
846 // TODO: Handle complex extract indices
847 if (Indices.size() > 1)
848 return true;
849
850 const DataLayout &DL = CI->getDataLayout();
851 const SIRegisterInfo *TRI = ST->getRegisterInfo();
852 TargetLowering::AsmOperandInfoVector TargetConstraints =
853 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
854
855 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
856
857 int OutputIdx = 0;
858 for (auto &TC : TargetConstraints) {
859 if (TC.Type != InlineAsm::isOutput)
860 continue;
861
862 // Skip outputs we don't care about.
863 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
864 continue;
865
866 TLI->ComputeConstraintToUse(TC, SDValue());
867
868 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
869 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
870
871 // For AGPR constraints null is returned on subtargets without AGPRs, so
872 // assume divergent for null.
873 if (!RC || !TRI->isSGPRClass(RC))
874 return true;
875 }
876
877 return false;
878 }
879
isReadRegisterSourceOfDivergence(const IntrinsicInst * ReadReg) const880 bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
881 const IntrinsicInst *ReadReg) const {
882 Metadata *MD =
883 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
884 StringRef RegName =
885 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
886
887 // Special case registers that look like VCC.
888 MVT VT = MVT::getVT(ReadReg->getType());
889 if (VT == MVT::i1)
890 return true;
891
892 // Special case scalar registers that start with 'v'.
893 if (RegName.starts_with("vcc") || RegName.empty())
894 return false;
895
896 // VGPR or AGPR is divergent. There aren't any specially named vector
897 // registers.
898 return RegName[0] == 'v' || RegName[0] == 'a';
899 }
900
901 /// \returns true if the result of the value could potentially be
902 /// different across workitems in a wavefront.
isSourceOfDivergence(const Value * V) const903 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
904 if (const Argument *A = dyn_cast<Argument>(V))
905 return !AMDGPU::isArgPassedInSGPR(A);
906
907 // Loads from the private and flat address spaces are divergent, because
908 // threads can execute the load instruction with the same inputs and get
909 // different results.
910 //
911 // All other loads are not divergent, because if threads issue loads with the
912 // same arguments, they will always get the same result.
913 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
914 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
915 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
916
917 // Atomics are divergent because they are executed sequentially: when an
918 // atomic operation refers to the same address in each thread, then each
919 // thread after the first sees the value written by the previous thread as
920 // original value.
921 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
922 return true;
923
924 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
925 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
926 return isReadRegisterSourceOfDivergence(Intrinsic);
927
928 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
929 }
930
931 // Assume all function calls are a source of divergence.
932 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
933 if (CI->isInlineAsm())
934 return isInlineAsmSourceOfDivergence(CI);
935 return true;
936 }
937
938 // Assume all function calls are a source of divergence.
939 if (isa<InvokeInst>(V))
940 return true;
941
942 return false;
943 }
944
isAlwaysUniform(const Value * V) const945 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
946 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
947 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
948
949 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
950 if (CI->isInlineAsm())
951 return !isInlineAsmSourceOfDivergence(CI);
952 return false;
953 }
954
955 // In most cases TID / wavefrontsize is uniform.
956 //
957 // However, if a kernel has uneven dimesions we can have a value of
958 // workitem-id-x divided by the wavefrontsize non-uniform. For example
959 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
960 // packed into a same wave which gives 1 and 0 after the division by 64
961 // respectively.
962 //
963 // FIXME: limit it to 1D kernels only, although that shall be possible
964 // to perform this optimization is the size of the X dimension is a power
965 // of 2, we just do not currently have infrastructure to query it.
966 using namespace llvm::PatternMatch;
967 uint64_t C;
968 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
969 m_ConstantInt(C))) ||
970 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
971 m_ConstantInt(C)))) {
972 const Function *F = cast<Instruction>(V)->getFunction();
973 return C >= ST->getWavefrontSizeLog2() &&
974 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
975 }
976
977 Value *Mask;
978 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
979 m_Value(Mask)))) {
980 const Function *F = cast<Instruction>(V)->getFunction();
981 const DataLayout &DL = F->getDataLayout();
982 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
983 ST->getWavefrontSizeLog2() &&
984 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
985 }
986
987 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
988 if (!ExtValue)
989 return false;
990
991 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
992 if (!CI)
993 return false;
994
995 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
996 switch (Intrinsic->getIntrinsicID()) {
997 default:
998 return false;
999 case Intrinsic::amdgcn_if:
1000 case Intrinsic::amdgcn_else: {
1001 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1002 return Indices.size() == 1 && Indices[0] == 1;
1003 }
1004 }
1005 }
1006
1007 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1008 // divergent for the overall struct return. We need to override it in the
1009 // case we're extracting an SGPR component here.
1010 if (CI->isInlineAsm())
1011 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1012
1013 return false;
1014 }
1015
collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID) const1016 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1017 Intrinsic::ID IID) const {
1018 switch (IID) {
1019 case Intrinsic::amdgcn_is_shared:
1020 case Intrinsic::amdgcn_is_private:
1021 case Intrinsic::amdgcn_flat_atomic_fadd:
1022 case Intrinsic::amdgcn_flat_atomic_fmax:
1023 case Intrinsic::amdgcn_flat_atomic_fmin:
1024 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1025 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1026 OpIndexes.push_back(0);
1027 return true;
1028 default:
1029 return false;
1030 }
1031 }
1032
rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV) const1033 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1034 Value *OldV,
1035 Value *NewV) const {
1036 auto IntrID = II->getIntrinsicID();
1037 switch (IntrID) {
1038 case Intrinsic::amdgcn_is_shared:
1039 case Intrinsic::amdgcn_is_private: {
1040 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1041 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1042 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1043 LLVMContext &Ctx = NewV->getType()->getContext();
1044 ConstantInt *NewVal = (TrueAS == NewAS) ?
1045 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1046 return NewVal;
1047 }
1048 case Intrinsic::ptrmask: {
1049 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1050 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1051 Value *MaskOp = II->getArgOperand(1);
1052 Type *MaskTy = MaskOp->getType();
1053
1054 bool DoTruncate = false;
1055
1056 const GCNTargetMachine &TM =
1057 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1058 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1059 // All valid 64-bit to 32-bit casts work by chopping off the high
1060 // bits. Any masking only clearing the low bits will also apply in the new
1061 // address space.
1062 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1063 DL.getPointerSizeInBits(NewAS) != 32)
1064 return nullptr;
1065
1066 // TODO: Do we need to thread more context in here?
1067 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1068 if (Known.countMinLeadingOnes() < 32)
1069 return nullptr;
1070
1071 DoTruncate = true;
1072 }
1073
1074 IRBuilder<> B(II);
1075 if (DoTruncate) {
1076 MaskTy = B.getInt32Ty();
1077 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1078 }
1079
1080 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1081 {NewV, MaskOp});
1082 }
1083 case Intrinsic::amdgcn_flat_atomic_fadd:
1084 case Intrinsic::amdgcn_flat_atomic_fmax:
1085 case Intrinsic::amdgcn_flat_atomic_fmin:
1086 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1087 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1088 Type *DestTy = II->getType();
1089 Type *SrcTy = NewV->getType();
1090 unsigned NewAS = SrcTy->getPointerAddressSpace();
1091 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1092 return nullptr;
1093 Module *M = II->getModule();
1094 Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1095 {DestTy, SrcTy, DestTy});
1096 II->setArgOperand(0, NewV);
1097 II->setCalledFunction(NewDecl);
1098 return II;
1099 }
1100 default:
1101 return nullptr;
1102 }
1103 }
1104
getShuffleCost(TTI::ShuffleKind Kind,VectorType * VT,ArrayRef<int> Mask,TTI::TargetCostKind CostKind,int Index,VectorType * SubTp,ArrayRef<const Value * > Args,const Instruction * CxtI)1105 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1106 VectorType *VT, ArrayRef<int> Mask,
1107 TTI::TargetCostKind CostKind,
1108 int Index, VectorType *SubTp,
1109 ArrayRef<const Value *> Args,
1110 const Instruction *CxtI) {
1111 if (!isa<FixedVectorType>(VT))
1112 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1113
1114 Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1115
1116 // Larger vector widths may require additional instructions, but are
1117 // typically cheaper than scalarized versions.
1118 unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1119 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1120 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1121 bool HasVOP3P = ST->hasVOP3PInsts();
1122 unsigned RequestedElts =
1123 count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1124 if (RequestedElts == 0)
1125 return 0;
1126 switch (Kind) {
1127 case TTI::SK_Broadcast:
1128 case TTI::SK_Reverse:
1129 case TTI::SK_PermuteSingleSrc: {
1130 // With op_sel VOP3P instructions freely can access the low half or high
1131 // half of a register, so any swizzle of two elements is free.
1132 if (HasVOP3P && NumVectorElts == 2)
1133 return 0;
1134 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1135 // SK_Broadcast just reuses the same mask
1136 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1137 return NumPerms + NumPermMasks;
1138 }
1139 case TTI::SK_ExtractSubvector:
1140 case TTI::SK_InsertSubvector: {
1141 // Even aligned accesses are free
1142 if (!(Index % 2))
1143 return 0;
1144 // Insert/extract subvectors only require shifts / extract code to get the
1145 // relevant bits
1146 return alignTo(RequestedElts, 2) / 2;
1147 }
1148 case TTI::SK_PermuteTwoSrc:
1149 case TTI::SK_Splice:
1150 case TTI::SK_Select: {
1151 unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1152 // SK_Select just reuses the same mask
1153 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1154 return NumPerms + NumPermMasks;
1155 }
1156
1157 default:
1158 break;
1159 }
1160 }
1161
1162 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1163 }
1164
areInlineCompatible(const Function * Caller,const Function * Callee) const1165 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1166 const Function *Callee) const {
1167 const TargetMachine &TM = getTLI()->getTargetMachine();
1168 const GCNSubtarget *CallerST
1169 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1170 const GCNSubtarget *CalleeST
1171 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1172
1173 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1174 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1175
1176 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1177 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1178 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1179 return false;
1180
1181 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1182 // no way to support merge for backend defined attributes.
1183 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1184 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1185 if (!CallerMode.isInlineCompatible(CalleeMode))
1186 return false;
1187
1188 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1189 Callee->hasFnAttribute(Attribute::InlineHint))
1190 return true;
1191
1192 // Hack to make compile times reasonable.
1193 if (InlineMaxBB) {
1194 // Single BB does not increase total BB amount.
1195 if (Callee->size() == 1)
1196 return true;
1197 size_t BBSize = Caller->size() + Callee->size() - 1;
1198 return BBSize <= InlineMaxBB;
1199 }
1200
1201 return true;
1202 }
1203
adjustInliningThresholdUsingCallee(const CallBase * CB,const SITargetLowering * TLI,const GCNTTIImpl * TTIImpl)1204 static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1205 const SITargetLowering *TLI,
1206 const GCNTTIImpl *TTIImpl) {
1207 const int NrOfSGPRUntilSpill = 26;
1208 const int NrOfVGPRUntilSpill = 32;
1209
1210 const DataLayout &DL = TTIImpl->getDataLayout();
1211
1212 unsigned adjustThreshold = 0;
1213 int SGPRsInUse = 0;
1214 int VGPRsInUse = 0;
1215 for (const Use &A : CB->args()) {
1216 SmallVector<EVT, 4> ValueVTs;
1217 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1218 for (auto ArgVT : ValueVTs) {
1219 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1220 CB->getContext(), CB->getCallingConv(), ArgVT);
1221 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1222 SGPRsInUse += CCRegNum;
1223 else
1224 VGPRsInUse += CCRegNum;
1225 }
1226 }
1227
1228 // The cost of passing function arguments through the stack:
1229 // 1 instruction to put a function argument on the stack in the caller.
1230 // 1 instruction to take a function argument from the stack in callee.
1231 // 1 instruction is explicitly take care of data dependencies in callee
1232 // function.
1233 InstructionCost ArgStackCost(1);
1234 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1235 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1236 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1237 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1238 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1239 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1240
1241 // The penalty cost is computed relative to the cost of instructions and does
1242 // not model any storage costs.
1243 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1244 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1245 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1246 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1247 return adjustThreshold;
1248 }
1249
getCallArgsTotalAllocaSize(const CallBase * CB,const DataLayout & DL)1250 static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1251 const DataLayout &DL) {
1252 // If we have a pointer to a private array passed into a function
1253 // it will not be optimized out, leaving scratch usage.
1254 // This function calculates the total size in bytes of the memory that would
1255 // end in scratch if the call was not inlined.
1256 unsigned AllocaSize = 0;
1257 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1258 for (Value *PtrArg : CB->args()) {
1259 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1260 if (!Ty)
1261 continue;
1262
1263 unsigned AddrSpace = Ty->getAddressSpace();
1264 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1265 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1266 continue;
1267
1268 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1269 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1270 continue;
1271
1272 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1273 }
1274 return AllocaSize;
1275 }
1276
adjustInliningThreshold(const CallBase * CB) const1277 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1278 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1279
1280 // Private object passed as arguments may end up in scratch usage if the call
1281 // is not inlined. Increase the inline threshold to promote inlining.
1282 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1283 if (AllocaSize > 0)
1284 Threshold += ArgAllocaCost;
1285 return Threshold;
1286 }
1287
getCallerAllocaCost(const CallBase * CB,const AllocaInst * AI) const1288 unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1289 const AllocaInst *AI) const {
1290
1291 // Below the cutoff, assume that the private memory objects would be
1292 // optimized
1293 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1294 if (AllocaSize <= ArgAllocaCutoff)
1295 return 0;
1296
1297 // Above the cutoff, we give a cost to each private memory object
1298 // depending its size. If the array can be optimized by SROA this cost is not
1299 // added to the total-cost in the inliner cost analysis.
1300 //
1301 // We choose the total cost of the alloca such that their sum cancels the
1302 // bonus given in the threshold (ArgAllocaCost).
1303 //
1304 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1305 //
1306 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1307 // the single-bb bonus and the vector-bonus.
1308 //
1309 // We compensate the first two multipliers, by repeating logic from the
1310 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1311 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1312 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1313
1314 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1315 return BB.getTerminator()->getNumSuccessors() > 1;
1316 });
1317 if (SingleBB) {
1318 Threshold += Threshold / 2;
1319 }
1320
1321 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1322
1323 // Attribute the bonus proportionally to the alloca size
1324 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1325
1326 return AllocaThresholdBonus;
1327 }
1328
getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)1329 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1330 TTI::UnrollingPreferences &UP,
1331 OptimizationRemarkEmitter *ORE) {
1332 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1333 }
1334
getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)1335 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1336 TTI::PeelingPreferences &PP) {
1337 CommonTTI.getPeelingPreferences(L, SE, PP);
1338 }
1339
get64BitInstrCost(TTI::TargetCostKind CostKind) const1340 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1341 return ST->hasFullRate64Ops()
1342 ? getFullRateInstrCost()
1343 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1344 : getQuarterRateInstrCost(CostKind);
1345 }
1346
1347 std::pair<InstructionCost, MVT>
getTypeLegalizationCost(Type * Ty) const1348 GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1349 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1350 auto Size = DL.getTypeSizeInBits(Ty);
1351 // Maximum load or store can handle 8 dwords for scalar and 4 for
1352 // vector ALU. Let's assume anything above 8 dwords is expensive
1353 // even if legal.
1354 if (Size <= 256)
1355 return Cost;
1356
1357 Cost.first += (Size + 255) / 256;
1358 return Cost;
1359 }
1360
getPrefetchDistance() const1361 unsigned GCNTTIImpl::getPrefetchDistance() const {
1362 return ST->hasPrefetch() ? 128 : 0;
1363 }
1364
shouldPrefetchAddressSpace(unsigned AS) const1365 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1366 return AMDGPU::isFlatGlobalAddrSpace(AS);
1367 }
1368