1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIModeRegisterDefaults.h"
21 #include "llvm/Analysis/InlineCost.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/PatternMatch.h"
28 #include "llvm/Support/KnownBits.h"
29 #include <optional>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "AMDGPUtti"
34
35 static cl::opt<unsigned> UnrollThresholdPrivate(
36 "amdgpu-unroll-threshold-private",
37 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
38 cl::init(2700), cl::Hidden);
39
40 static cl::opt<unsigned> UnrollThresholdLocal(
41 "amdgpu-unroll-threshold-local",
42 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
43 cl::init(1000), cl::Hidden);
44
45 static cl::opt<unsigned> UnrollThresholdIf(
46 "amdgpu-unroll-threshold-if",
47 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48 cl::init(200), cl::Hidden);
49
50 static cl::opt<bool> UnrollRuntimeLocal(
51 "amdgpu-unroll-runtime-local",
52 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53 cl::init(true), cl::Hidden);
54
55 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58 cl::init(32), cl::Hidden);
59
60 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61 cl::Hidden, cl::init(4000),
62 cl::desc("Cost of alloca argument"));
63
64 // If the amount of scratch memory to eliminate exceeds our ability to allocate
65 // it into registers we gain nothing by aggressively inlining functions for that
66 // heuristic.
67 static cl::opt<unsigned>
68 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69 cl::init(256),
70 cl::desc("Maximum alloca size to use for inline cost"));
71
72 // Inliner constraint to achieve reasonable compilation time.
73 static cl::opt<size_t> InlineMaxBB(
74 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75 cl::desc("Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
77
78 // This default unroll factor is based on microbenchmarks on gfx1030.
79 static cl::opt<unsigned> MemcpyLoopUnroll(
80 "amdgpu-memcpy-loop-unroll",
81 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
82 "operations when lowering memcpy as a loop"),
83 cl::init(16), cl::Hidden);
84
85 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
86 unsigned Depth = 0) {
87 const Instruction *I = dyn_cast<Instruction>(Cond);
88 if (!I)
89 return false;
90
91 for (const Value *V : I->operand_values()) {
92 if (!L->contains(I))
93 continue;
94 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
95 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
96 return SubLoop->contains(PHI); }))
97 return true;
98 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
99 return true;
100 }
101 return false;
102 }
103
104 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
105 : BaseT(TM, F.getDataLayout()),
106 TargetTriple(TM->getTargetTriple()),
107 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
108 TLI(ST->getTargetLowering()) {}
109
110 void AMDGPUTTIImpl::getUnrollingPreferences(
111 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
112 OptimizationRemarkEmitter *ORE) const {
113 const Function &F = *L->getHeader()->getParent();
114 UP.Threshold =
115 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
116 UP.MaxCount = std::numeric_limits<unsigned>::max();
117 UP.Partial = true;
118
119   // A conditional branch in a loop back edge needs 3 additional exec
120   // manipulations on average.
121 UP.BEInsns += 3;
122
123 // We want to run unroll even for the loops which have been vectorized.
124 UP.UnrollVectorizedLoop = true;
125
126 // TODO: Do we want runtime unrolling?
127
128   // Maximum alloca size that can fit in registers. Reserve 16 registers.
129 const unsigned MaxAlloca = (256 - 16) * 4;
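  // 256 VGPRs per lane minus 16 reserved, at 4 bytes per 32-bit register,
  // i.e. (256 - 16) * 4 = 960 bytes of private memory that can plausibly be
  // kept in registers.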
130 unsigned ThresholdPrivate = UnrollThresholdPrivate;
131 unsigned ThresholdLocal = UnrollThresholdLocal;
132
133 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
134 // provided threshold value as the default for Threshold
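  // A minimal illustrative sketch of that loop metadata in IR (the value 100
  // is just an example):
  //   br i1 %cond, label %exit, label %loop, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}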
135 if (MDNode *LoopUnrollThreshold =
136 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
137 if (LoopUnrollThreshold->getNumOperands() == 2) {
138 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
139 LoopUnrollThreshold->getOperand(1));
140 if (MetaThresholdValue) {
141 // We will also use the supplied value for PartialThreshold for now.
142 // We may introduce additional metadata if it becomes necessary in the
143 // future.
144 UP.Threshold = MetaThresholdValue->getSExtValue();
145 UP.PartialThreshold = UP.Threshold;
146 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
147 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
148 }
149 }
150 }
151
152 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
153 for (const BasicBlock *BB : L->getBlocks()) {
154 const DataLayout &DL = BB->getDataLayout();
155 unsigned LocalGEPsSeen = 0;
156
157 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
158 return SubLoop->contains(BB); }))
159 continue; // Block belongs to an inner loop.
160
161 for (const Instruction &I : *BB) {
162       // Unroll a loop which contains an "if" statement whose condition is
163       // defined by a PHI belonging to the loop. This may help to eliminate
164       // the if region and potentially even the PHI itself, saving on both
165       // divergence and the registers used for the PHI.
166       // Add a small bonus for each such "if" statement.
167 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
168 if (UP.Threshold < MaxBoost && Br->isConditional()) {
169 BasicBlock *Succ0 = Br->getSuccessor(0);
170 BasicBlock *Succ1 = Br->getSuccessor(1);
171 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
172 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
173 continue;
174 if (dependsOnLocalPhi(L, Br->getCondition())) {
175 UP.Threshold += UnrollThresholdIf;
176 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
177 << " for loop:\n"
178 << *L << " due to " << *Br << '\n');
179 if (UP.Threshold >= MaxBoost)
180 return;
181 }
182 }
183 continue;
184 }
185
186 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
187 if (!GEP)
188 continue;
189
190 unsigned AS = GEP->getAddressSpace();
191 unsigned Threshold = 0;
192 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
193 Threshold = ThresholdPrivate;
194 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
195 Threshold = ThresholdLocal;
196 else
197 continue;
198
199 if (UP.Threshold >= Threshold)
200 continue;
201
202 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
203 const Value *Ptr = GEP->getPointerOperand();
204 const AllocaInst *Alloca =
205 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
206 if (!Alloca || !Alloca->isStaticAlloca())
207 continue;
208 Type *Ty = Alloca->getAllocatedType();
209 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
210 if (AllocaSize > MaxAlloca)
211 continue;
212 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
213 AS == AMDGPUAS::REGION_ADDRESS) {
214 LocalGEPsSeen++;
215         // Inhibit unrolling for local memory if we have seen addressing not
216         // to a variable; most likely we will be unable to combine it.
217         // Do not unroll too-deep inner loops for local memory, to give an
218         // outer loop a chance to be unrolled for a more important reason.
219 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
220 continue;
221
222 const Value *V = getUnderlyingObject(GEP->getPointerOperand());
223 if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
224 continue;
225
226 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
227 << *L << " due to LDS use.\n");
228 UP.Runtime = UnrollRuntimeLocal;
229 }
230
231 // Check if GEP depends on a value defined by this loop itself.
232 bool HasLoopDef = false;
233 for (const Value *Op : GEP->operands()) {
234 const Instruction *Inst = dyn_cast<Instruction>(Op);
235 if (!Inst || L->isLoopInvariant(Op))
236 continue;
237
238 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
239 return SubLoop->contains(Inst); }))
240 continue;
241 HasLoopDef = true;
242 break;
243 }
244 if (!HasLoopDef)
245 continue;
246
247 // We want to do whatever we can to limit the number of alloca
248 // instructions that make it through to the code generator. allocas
249 // require us to use indirect addressing, which is slow and prone to
250 // compiler bugs. If this loop does an address calculation on an
251 // alloca ptr, then we want to use a higher than normal loop unroll
252 // threshold. This will give SROA a better chance to eliminate these
253 // allocas.
254 //
255 // We also want to have more unrolling for local memory to let ds
256 // instructions with different offsets combine.
257 //
258 // Don't use the maximum allowed value here as it will make some
259 // programs way too big.
260 UP.Threshold = Threshold;
261 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
262 << " for loop:\n"
263 << *L << " due to " << *GEP << '\n');
264 if (UP.Threshold >= MaxBoost)
265 return;
266 }
267
268     // If we got a GEP in a small BB from an inner loop, then increase the max
269     // trip count to analyze for a better cost estimation in unrolling.
270 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
271 UP.MaxIterationsCountToAnalyze = 32;
272 }
273 }
274
275 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
276 TTI::PeelingPreferences &PP) const {
277 BaseT::getPeelingPreferences(L, SE, PP);
278 }
279
280 uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
281 return 1024;
282 }
283
284 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
285 // Codegen control options which don't matter.
286 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
287 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
288 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
289 AMDGPU::FeatureUnalignedAccessMode,
290
291 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
292
293 // Property of the kernel/environment which can't actually differ.
294 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
295 AMDGPU::FeatureTrapHandler,
296
297     // The default assumption needs to be that ECC is enabled, but no directly
298     // exposed operations depend on it, so it can be safely inlined.
299 AMDGPU::FeatureSRAMECC,
300
301 // Perf-tuning features
302 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
303
304 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
305 : BaseT(TM, F.getDataLayout()),
306 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
307 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
308 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
309 SIModeRegisterDefaults Mode(F, *ST);
310 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
311 HasFP64FP16Denormals =
312 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
313 }
314
315 bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
316 return !F || !ST->isSingleLaneExecution(*F);
317 }
318
319 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
320 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
321 // registers. See getRegisterClassForType for the implementation.
322 // In this case vector registers are not vector in terms of
323 // VGPRs, but those which can hold multiple values.
324
325 // This is really the number of registers to fill when vectorizing /
326 // interleaving loops, so we lie to avoid trying to use all registers.
327 return 4;
328 }
329
330 TypeSize
331 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
332 switch (K) {
333 case TargetTransformInfo::RGK_Scalar:
334 return TypeSize::getFixed(32);
335 case TargetTransformInfo::RGK_FixedWidthVector:
336 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
337 case TargetTransformInfo::RGK_ScalableVector:
338 return TypeSize::getScalable(0);
339 }
340 llvm_unreachable("Unsupported register kind");
341 }
342
343 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344 return 32;
345 }
346
347 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
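  // Loads and stores are capped at a 128-bit access; e.g. 32 * 4 / 8 = 16
  // elements for 8-bit element types.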
348 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
349 return 32 * 4 / ElemWidth;
350   // For a given width return the max number of elements that can be combined
351 // into a wider bit value:
352 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
353 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
354 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
355 : 1;
356 }
357
358 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
359 unsigned ChainSizeInBytes,
360 VectorType *VecTy) const {
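  // e.g. a chain of sixteen i16 loads (VF = 16, LoadSize = 16) exceeds 128
  // bits and is capped below at 128 / 16 = 8 elements.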
361 unsigned VecRegBitWidth = VF * LoadSize;
362 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
363 // TODO: Support element-size less than 32bit?
364 return 128 / LoadSize;
365
366 return VF;
367 }
368
369 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
370 unsigned ChainSizeInBytes,
371 VectorType *VecTy) const {
372 unsigned VecRegBitWidth = VF * StoreSize;
373 if (VecRegBitWidth > 128)
374 return 128 / StoreSize;
375
376 return VF;
377 }
378
379 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
380 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
381 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
382 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
383 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
384 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
385 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
386 return 512;
387 }
388
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
390 return 8 * ST->getMaxPrivateElementSize();
391
392 // Common to flat, global, local and region. Assume for unknown addrspace.
393 return 128;
394 }
395
396 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
397 Align Alignment,
398 unsigned AddrSpace) const {
399 // We allow vectorization of flat stores, even though we may need to decompose
400 // them later if they may access private memory. We don't have enough context
401 // here, and legalization can handle it.
402 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
403 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
404 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
405 }
406 return true;
407 }
408
409 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
410 Align Alignment,
411 unsigned AddrSpace) const {
412 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
413 }
414
415 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
416 Align Alignment,
417 unsigned AddrSpace) const {
418 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
419 }
420
421 uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
422 return 1024;
423 }
424
425 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
426 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
427 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
428 std::optional<uint32_t> AtomicElementSize) const {
429
430 if (AtomicElementSize)
431 return Type::getIntNTy(Context, *AtomicElementSize * 8);
432
433 // 16-byte accesses achieve the highest copy throughput.
434 // If the operation has a fixed known length that is large enough, it is
435 // worthwhile to return an even wider type and let legalization lower it into
436 // multiple accesses, effectively unrolling the memcpy loop.
437 // We also rely on legalization to decompose into smaller accesses for
438 // subtargets and address spaces where it is necessary.
439 //
440 // Don't unroll if Length is not a constant, since unrolling leads to worse
441 // performance for length values that are smaller or slightly larger than the
442 // total size of the type returned here. Mitigating that would require a more
443 // complex lowering for variable-length memcpy and memmove.
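  // With the default MemcpyLoopUnroll of 16 and a constant length this returns
  // <64 x i32>, i.e. 256 bytes per loop iteration; otherwise it returns
  // <4 x i32> (16 bytes per iteration).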
444 unsigned I32EltsInVector = 4;
445 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
446 return FixedVectorType::get(Type::getInt32Ty(Context),
447 MemcpyLoopUnroll * I32EltsInVector);
448
449 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
450 }
451
452 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
453 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
454 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
455 Align SrcAlign, Align DestAlign,
456 std::optional<uint32_t> AtomicCpySize) const {
457
458 if (AtomicCpySize)
459 BaseT::getMemcpyLoopResidualLoweringType(
460 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
461 DestAlign, AtomicCpySize);
462
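  // Greedily cover the residue with the widest type that still fits; e.g. 23
  // remaining bytes become <4 x i32> + i32 + i16 + i8.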
463 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
464 while (RemainingBytes >= 16) {
465 OpsOut.push_back(I32x4Ty);
466 RemainingBytes -= 16;
467 }
468
469 Type *I64Ty = Type::getInt64Ty(Context);
470 while (RemainingBytes >= 8) {
471 OpsOut.push_back(I64Ty);
472 RemainingBytes -= 8;
473 }
474
475 Type *I32Ty = Type::getInt32Ty(Context);
476 while (RemainingBytes >= 4) {
477 OpsOut.push_back(I32Ty);
478 RemainingBytes -= 4;
479 }
480
481 Type *I16Ty = Type::getInt16Ty(Context);
482 while (RemainingBytes >= 2) {
483 OpsOut.push_back(I16Ty);
484 RemainingBytes -= 2;
485 }
486
487 Type *I8Ty = Type::getInt8Ty(Context);
488 while (RemainingBytes) {
489 OpsOut.push_back(I8Ty);
490 --RemainingBytes;
491 }
492 }
493
494 unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
495 // Disable unrolling if the loop is not vectorized.
496 // TODO: Enable this again.
497 if (VF.isScalar())
498 return 1;
499
500 return 8;
501 }
502
503 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
504 MemIntrinsicInfo &Info) const {
505 switch (Inst->getIntrinsicID()) {
506 case Intrinsic::amdgcn_ds_ordered_add:
507 case Intrinsic::amdgcn_ds_ordered_swap: {
508 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
509 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
510 if (!Ordering || !Volatile)
511 return false; // Invalid.
512
513 unsigned OrderingVal = Ordering->getZExtValue();
514 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
515 return false;
516
517 Info.PtrVal = Inst->getArgOperand(0);
518 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
519 Info.ReadMem = true;
520 Info.WriteMem = true;
521 Info.IsVolatile = !Volatile->isZero();
522 return true;
523 }
524 default:
525 return false;
526 }
527 }
528
529 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
530 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
531 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
532 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
533
534 // Legalize the type.
535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
536 int ISD = TLI->InstructionOpcodeToISD(Opcode);
537
538   // Because we don't have any legal vector operations, only legal vector
539   // types, we need to account for split vectors.
540 unsigned NElts = LT.second.isVector() ?
541 LT.second.getVectorNumElements() : 1;
542
543 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
544
545 switch (ISD) {
546 case ISD::SHL:
547 case ISD::SRL:
548 case ISD::SRA:
549 if (SLT == MVT::i64)
550 return get64BitInstrCost(CostKind) * LT.first * NElts;
551
552 if (ST->has16BitInsts() && SLT == MVT::i16)
553 NElts = (NElts + 1) / 2;
554
555 // i32
556 return getFullRateInstrCost() * LT.first * NElts;
557 case ISD::ADD:
558 case ISD::SUB:
559 case ISD::AND:
560 case ISD::OR:
561 case ISD::XOR:
562 if (SLT == MVT::i64) {
563 // and, or and xor are typically split into 2 VALU instructions.
564 return 2 * getFullRateInstrCost() * LT.first * NElts;
565 }
566
567 if (ST->has16BitInsts() && SLT == MVT::i16)
568 NElts = (NElts + 1) / 2;
569
570 return LT.first * NElts * getFullRateInstrCost();
571 case ISD::MUL: {
572 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
573 if (SLT == MVT::i64) {
574 const int FullRateCost = getFullRateInstrCost();
575 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
576 }
577
578 if (ST->has16BitInsts() && SLT == MVT::i16)
579 NElts = (NElts + 1) / 2;
580
581 // i32
582 return QuarterRateCost * NElts * LT.first;
583 }
584 case ISD::FMUL:
585     // Check for a possible fusion of {fadd|fsub}(a,fmul(b,c)) and return zero
586     // cost for the fmul(b,c), assuming the fadd|fsub will be charged the
587     // estimated cost of the whole fused operation.
588 if (CxtI && CxtI->hasOneUse())
589 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
590 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
591 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
592 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
593 return TargetTransformInfo::TCC_Free;
594 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
595 return TargetTransformInfo::TCC_Free;
596
597           // Assume all types may be fused when contract/unsafe flags are set.
598 const TargetOptions &Options = TLI->getTargetMachine().Options;
599 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
600 Options.UnsafeFPMath ||
601 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602 return TargetTransformInfo::TCC_Free;
603 }
604 }
605 [[fallthrough]];
606 case ISD::FADD:
607 case ISD::FSUB:
608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609 NElts = (NElts + 1) / 2;
610 if (SLT == MVT::f64)
611 return LT.first * NElts * get64BitInstrCost(CostKind);
612
613 if (ST->has16BitInsts() && SLT == MVT::f16)
614 NElts = (NElts + 1) / 2;
615
616 if (SLT == MVT::f32 || SLT == MVT::f16)
617 return LT.first * NElts * getFullRateInstrCost();
618 break;
619 case ISD::FDIV:
620 case ISD::FREM:
621 // FIXME: frem should be handled separately. The fdiv in it is most of it,
622 // but the current lowering is also not entirely correct.
623 if (SLT == MVT::f64) {
624 int Cost = 7 * get64BitInstrCost(CostKind) +
625 getQuarterRateInstrCost(CostKind) +
626 3 * getHalfRateInstrCost(CostKind);
627 // Add cost of workaround.
628 if (!ST->hasUsableDivScaleConditionOutput())
629 Cost += 3 * getFullRateInstrCost();
630
631 return LT.first * Cost * NElts;
632 }
633
634 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
635 // TODO: This is more complicated, unsafe flags etc.
636 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
637 (SLT == MVT::f16 && ST->has16BitInsts())) {
638 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
639 }
640 }
641
642 if (SLT == MVT::f16 && ST->has16BitInsts()) {
643 // 2 x v_cvt_f32_f16
644 // f32 rcp
645 // f32 fmul
646 // v_cvt_f16_f32
647 // f16 div_fixup
648 int Cost =
649 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
650 return LT.first * Cost * NElts;
651 }
652
653 if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
654 TLI->getTargetMachine().Options.UnsafeFPMath)) {
655 // Fast unsafe fdiv lowering:
656 // f32 rcp
657 // f32 fmul
658 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
659 return LT.first * Cost * NElts;
660 }
661
662 if (SLT == MVT::f32 || SLT == MVT::f16) {
663 // 4 more v_cvt_* insts without f16 insts support
664 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
665 1 * getQuarterRateInstrCost(CostKind);
666
667 if (!HasFP32Denormals) {
668 // FP mode switches.
669 Cost += 2 * getFullRateInstrCost();
670 }
671
672 return LT.first * NElts * Cost;
673 }
674 break;
675 case ISD::FNEG:
676     // Use the backend's estimation. If fneg is not free, each element will
677     // cost one additional instruction.
678 return TLI->isFNegFree(SLT) ? 0 : NElts;
679 default:
680 break;
681 }
682
683 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
684 Args, CxtI);
685 }
686
687 // Return true if there's a potential benefit from using v2f16/v2i16
688 // instructions for an intrinsic, even if it requires nontrivial legalization.
689 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
690 switch (ID) {
691 case Intrinsic::fma:
692 case Intrinsic::fmuladd:
693 case Intrinsic::copysign:
694 case Intrinsic::minimumnum:
695 case Intrinsic::maximumnum:
696 case Intrinsic::canonicalize:
697 // There's a small benefit to using vector ops in the legalized code.
698 case Intrinsic::round:
699 case Intrinsic::uadd_sat:
700 case Intrinsic::usub_sat:
701 case Intrinsic::sadd_sat:
702 case Intrinsic::ssub_sat:
703 case Intrinsic::abs:
704 return true;
705 default:
706 return false;
707 }
708 }
709
710 InstructionCost
711 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
712 TTI::TargetCostKind CostKind) const {
713 switch (ICA.getID()) {
714 case Intrinsic::fabs:
715 // Free source modifier in the common case.
716 return 0;
717 case Intrinsic::amdgcn_workitem_id_x:
718 case Intrinsic::amdgcn_workitem_id_y:
719 case Intrinsic::amdgcn_workitem_id_z:
720 // TODO: If hasPackedTID, or if the calling context is not an entry point
721 // there may be a bit instruction.
722 return 0;
723 case Intrinsic::amdgcn_workgroup_id_x:
724 case Intrinsic::amdgcn_workgroup_id_y:
725 case Intrinsic::amdgcn_workgroup_id_z:
726 case Intrinsic::amdgcn_lds_kernel_id:
727 case Intrinsic::amdgcn_dispatch_ptr:
728 case Intrinsic::amdgcn_dispatch_id:
729 case Intrinsic::amdgcn_implicitarg_ptr:
730 case Intrinsic::amdgcn_queue_ptr:
731 // Read from an argument register.
732 return 0;
733 default:
734 break;
735 }
736
737 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
738 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
739
740 Type *RetTy = ICA.getReturnType();
741
742 // Legalize the type.
743 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
744
745 unsigned NElts = LT.second.isVector() ?
746 LT.second.getVectorNumElements() : 1;
747
748 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
749
750 if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
751 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
752 NElts = (NElts + 1) / 2;
753
754 // TODO: Get more refined intrinsic costs?
755 unsigned InstRate = getQuarterRateInstrCost(CostKind);
756
757 switch (ICA.getID()) {
758 case Intrinsic::fma:
759 case Intrinsic::fmuladd:
760 if (SLT == MVT::f64) {
761 InstRate = get64BitInstrCost(CostKind);
762 break;
763 }
764
765 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
766 InstRate = getFullRateInstrCost();
767 else {
768 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
769 : getQuarterRateInstrCost(CostKind);
770 }
771 break;
772 case Intrinsic::copysign:
773 return NElts * getFullRateInstrCost();
774 case Intrinsic::minimumnum:
775 case Intrinsic::maximumnum: {
776     // Instruction + 2 canonicalizes. For cases that need type promotion, the
777     // promotion takes the place of the canonicalize.
778 unsigned NumOps = 3;
779 if (const IntrinsicInst *II = ICA.getInst()) {
780 // Directly legal with ieee=0
781 // TODO: Not directly legal with strictfp
782 if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
783 NumOps = 1;
784 }
785
786 unsigned BaseRate =
787 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
788 InstRate = BaseRate * NumOps;
789 break;
790 }
791 case Intrinsic::canonicalize: {
792 InstRate =
793 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
794 break;
795 }
796 case Intrinsic::uadd_sat:
797 case Intrinsic::usub_sat:
798 case Intrinsic::sadd_sat:
799 case Intrinsic::ssub_sat: {
800 if (SLT == MVT::i16 || SLT == MVT::i32)
801 InstRate = getFullRateInstrCost();
802
803 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
804     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
805 NElts = 1;
806 break;
807 }
808 case Intrinsic::abs:
809 // Expansion takes 2 instructions for VALU
810 if (SLT == MVT::i16 || SLT == MVT::i32)
811 InstRate = 2 * getFullRateInstrCost();
812 break;
813 default:
814 break;
815 }
816
817 return LT.first * NElts * InstRate;
818 }
819
820 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
821 TTI::TargetCostKind CostKind,
822 const Instruction *I) const {
823 assert((I == nullptr || I->getOpcode() == Opcode) &&
824 "Opcode should reflect passed instruction.");
825 const bool SCost =
826 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
827 const int CBrCost = SCost ? 5 : 7;
828 switch (Opcode) {
829 case Instruction::Br: {
830 // Branch instruction takes about 4 slots on gfx900.
831 const auto *BI = dyn_cast_or_null<BranchInst>(I);
832 if (BI && BI->isUnconditional())
833 return SCost ? 1 : 4;
834     // Assume a conditional branch takes an additional 3 exec-manipulation
835     // instructions on average.
836 return CBrCost;
837 }
838 case Instruction::Switch: {
839 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
840     // Each case (including the default) takes 1 cmp + 1 cbr instruction on
841     // average.
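    // e.g. a switch with 3 cases plus the default is modelled as
    // (3 + 1) * (7 + 1) = 32 for the latency/throughput cost kinds
    // (illustrative numbers derived from the formula below).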
842 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
843 }
844 case Instruction::Ret:
845 return SCost ? 1 : 10;
846 }
847 return BaseT::getCFInstrCost(Opcode, CostKind, I);
848 }
849
850 InstructionCost
851 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
852 std::optional<FastMathFlags> FMF,
853 TTI::TargetCostKind CostKind) const {
854 if (TTI::requiresOrderedReduction(FMF))
855 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
856
857 EVT OrigTy = TLI->getValueType(DL, Ty);
858
859   // Compute the cost on targets that have packed math instructions (which
860   // support 16-bit types only).
861 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
862 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
863
864 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
865 return LT.first * getFullRateInstrCost();
866 }
867
868 InstructionCost
869 GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
870 FastMathFlags FMF,
871 TTI::TargetCostKind CostKind) const {
872 EVT OrigTy = TLI->getValueType(DL, Ty);
873
874   // Compute the cost on targets that have packed math instructions (which
875   // support 16-bit types only).
876 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
877 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
878
879 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
880 return LT.first * getHalfRateInstrCost(CostKind);
881 }
882
883 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
884 TTI::TargetCostKind CostKind,
885 unsigned Index, const Value *Op0,
886 const Value *Op1) const {
887 switch (Opcode) {
888 case Instruction::ExtractElement:
889 case Instruction::InsertElement: {
890 unsigned EltSize
891 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
892 if (EltSize < 32) {
893 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
894 return 0;
895 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
896 Op1);
897 }
898
899 // Extracts are just reads of a subregister, so are free. Inserts are
900 // considered free because we don't want to have any cost for scalarizing
901 // operations, and we don't have to copy into a different register class.
902
903 // Dynamic indexing isn't free and is best avoided.
904 return Index == ~0u ? 2 : 0;
905 }
906 default:
907 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
908 }
909 }
910
911 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
912 /// this is analyzing the collective result of all output registers. Otherwise,
913 /// this is only querying a specific result index if this returns multiple
914 /// registers in a struct.
915 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
916 const CallInst *CI, ArrayRef<unsigned> Indices) const {
917 // TODO: Handle complex extract indices
918 if (Indices.size() > 1)
919 return true;
920
921 const DataLayout &DL = CI->getDataLayout();
922 const SIRegisterInfo *TRI = ST->getRegisterInfo();
923 TargetLowering::AsmOperandInfoVector TargetConstraints =
924 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
925
926 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
927
928 int OutputIdx = 0;
929 for (auto &TC : TargetConstraints) {
930 if (TC.Type != InlineAsm::isOutput)
931 continue;
932
933 // Skip outputs we don't care about.
934 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
935 continue;
936
937 TLI->ComputeConstraintToUse(TC, SDValue());
938
939 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
940 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
941
942 // For AGPR constraints null is returned on subtargets without AGPRs, so
943 // assume divergent for null.
944 if (!RC || !TRI->isSGPRClass(RC))
945 return true;
946 }
947
948 return false;
949 }
950
951 bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
952 const IntrinsicInst *ReadReg) const {
953 Metadata *MD =
954 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
955 StringRef RegName =
956 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
957
958 // Special case registers that look like VCC.
959 MVT VT = MVT::getVT(ReadReg->getType());
960 if (VT == MVT::i1)
961 return true;
962
963 // Special case scalar registers that start with 'v'.
964 if (RegName.starts_with("vcc") || RegName.empty())
965 return false;
966
967 // VGPR or AGPR is divergent. There aren't any specially named vector
968 // registers.
969 return RegName[0] == 'v' || RegName[0] == 'a';
970 }
971
972 /// \returns true if the result of the value could potentially be
973 /// different across workitems in a wavefront.
974 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
975 if (const Argument *A = dyn_cast<Argument>(V))
976 return !AMDGPU::isArgPassedInSGPR(A);
977
978 // Loads from the private and flat address spaces are divergent, because
979 // threads can execute the load instruction with the same inputs and get
980 // different results.
981 //
982 // All other loads are not divergent, because if threads issue loads with the
983 // same arguments, they will always get the same result.
984 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
985 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
986 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
987
988 // Atomics are divergent because they are executed sequentially: when an
989 // atomic operation refers to the same address in each thread, then each
990 // thread after the first sees the value written by the previous thread as
991   // the original value.
992 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))
993 return true;
994
995 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
996 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
997 return isReadRegisterSourceOfDivergence(Intrinsic);
998
999 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
1000 }
1001
1002 // Assume all function calls are a source of divergence.
1003 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1004 if (CI->isInlineAsm())
1005 return isInlineAsmSourceOfDivergence(CI);
1006 return true;
1007 }
1008
1009 // Assume all function calls are a source of divergence.
1010 if (isa<InvokeInst>(V))
1011 return true;
1012
1013 return false;
1014 }
1015
1016 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1017 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1018 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1019
1020 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1021 if (CI->isInlineAsm())
1022 return !isInlineAsmSourceOfDivergence(CI);
1023 return false;
1024 }
1025
1026 // In most cases TID / wavefrontsize is uniform.
1027 //
1028   // However, if a kernel has uneven dimensions, the value of workitem-id-x
1029   // divided by the wavefrontsize can be non-uniform. For example,
1030   // dimensions (65, 2) will have workitems with addresses (64, 0) and (0, 1)
1031   // packed into the same wave, which gives 1 and 0 after the division by 64,
1032   // respectively.
1033   //
1034   // FIXME: limit it to 1D kernels only, although it should be possible
1035   // to perform this optimization if the size of the X dimension is a power
1036   // of 2; we just do not currently have the infrastructure to query it.
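  // An illustrative sketch of the first pattern matched below (wave64, so the
  // shift amount equals the wavefront size log2 of 6):
  //   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
  //   %wave = lshr i32 %tid, 6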
1037 using namespace llvm::PatternMatch;
1038 uint64_t C;
1039 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1040 m_ConstantInt(C))) ||
1041 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1042 m_ConstantInt(C)))) {
1043 const Function *F = cast<Instruction>(V)->getFunction();
1044 return C >= ST->getWavefrontSizeLog2() &&
1045 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1046 }
1047
1048 Value *Mask;
1049 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1050 m_Value(Mask)))) {
1051 const Function *F = cast<Instruction>(V)->getFunction();
1052 const DataLayout &DL = F->getDataLayout();
1053 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1054 ST->getWavefrontSizeLog2() &&
1055 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1056 }
1057
1058 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1059 if (!ExtValue)
1060 return false;
1061
1062 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1063 if (!CI)
1064 return false;
1065
1066 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1067 switch (Intrinsic->getIntrinsicID()) {
1068 default:
1069 return false;
1070 case Intrinsic::amdgcn_if:
1071 case Intrinsic::amdgcn_else: {
1072 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1073 return Indices.size() == 1 && Indices[0] == 1;
1074 }
1075 }
1076 }
1077
1078 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1079 // divergent for the overall struct return. We need to override it in the
1080 // case we're extracting an SGPR component here.
1081 if (CI->isInlineAsm())
1082 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1083
1084 return false;
1085 }
1086
1087 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1088 Intrinsic::ID IID) const {
1089 switch (IID) {
1090 case Intrinsic::amdgcn_is_shared:
1091 case Intrinsic::amdgcn_is_private:
1092 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1093 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1094 case Intrinsic::amdgcn_load_to_lds:
1095 case Intrinsic::amdgcn_make_buffer_rsrc:
1096 OpIndexes.push_back(0);
1097 return true;
1098 default:
1099 return false;
1100 }
1101 }
1102
1103 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1104 Value *OldV,
1105 Value *NewV) const {
1106 auto IntrID = II->getIntrinsicID();
1107 switch (IntrID) {
1108 case Intrinsic::amdgcn_is_shared:
1109 case Intrinsic::amdgcn_is_private: {
1110 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1111 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1112 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1113 LLVMContext &Ctx = NewV->getType()->getContext();
1114 ConstantInt *NewVal = (TrueAS == NewAS) ?
1115 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1116 return NewVal;
1117 }
1118 case Intrinsic::ptrmask: {
1119 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1120 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1121 Value *MaskOp = II->getArgOperand(1);
1122 Type *MaskTy = MaskOp->getType();
1123
1124 bool DoTruncate = false;
1125
1126 const GCNTargetMachine &TM =
1127 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1128 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1129 // All valid 64-bit to 32-bit casts work by chopping off the high
1130 // bits. Any masking only clearing the low bits will also apply in the new
1131 // address space.
1132 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1133 DL.getPointerSizeInBits(NewAS) != 32)
1134 return nullptr;
1135
1136 // TODO: Do we need to thread more context in here?
1137 KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);
1138 if (Known.countMinLeadingOnes() < 32)
1139 return nullptr;
1140
1141 DoTruncate = true;
1142 }
1143
1144 IRBuilder<> B(II);
1145 if (DoTruncate) {
1146 MaskTy = B.getInt32Ty();
1147 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1148 }
1149
1150 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1151 {NewV, MaskOp});
1152 }
1153 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1154 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1155 Type *DestTy = II->getType();
1156 Type *SrcTy = NewV->getType();
1157 unsigned NewAS = SrcTy->getPointerAddressSpace();
1158 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1159 return nullptr;
1160 Module *M = II->getModule();
1161 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1162 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1163 II->setArgOperand(0, NewV);
1164 II->setCalledFunction(NewDecl);
1165 return II;
1166 }
1167 case Intrinsic::amdgcn_load_to_lds: {
1168 Type *SrcTy = NewV->getType();
1169 Module *M = II->getModule();
1170 Function *NewDecl =
1171 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1172 II->setArgOperand(0, NewV);
1173 II->setCalledFunction(NewDecl);
1174 return II;
1175 }
1176 case Intrinsic::amdgcn_make_buffer_rsrc: {
1177 Type *SrcTy = NewV->getType();
1178 Type *DstTy = II->getType();
1179 Module *M = II->getModule();
1180 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1181 M, II->getIntrinsicID(), {DstTy, SrcTy});
1182 II->setArgOperand(0, NewV);
1183 II->setCalledFunction(NewDecl);
1184 return II;
1185 }
1186 default:
1187 return nullptr;
1188 }
1189 }
1190
1191 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1192 VectorType *DstTy, VectorType *SrcTy,
1193 ArrayRef<int> Mask,
1194 TTI::TargetCostKind CostKind,
1195 int Index, VectorType *SubTp,
1196 ArrayRef<const Value *> Args,
1197 const Instruction *CxtI) const {
1198 if (!isa<FixedVectorType>(SrcTy))
1199 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1200 SubTp);
1201
1202 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1203
1204 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1205 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1206 (ScalarSize == 16 || ScalarSize == 8)) {
1207 // Larger vector widths may require additional instructions, but are
1208 // typically cheaper than scalarized versions.
1209 unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1210 unsigned RequestedElts =
1211 count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1212 unsigned EltsPerReg = 32 / ScalarSize;
1213 if (RequestedElts == 0)
1214 return 0;
1215 switch (Kind) {
1216 case TTI::SK_Broadcast:
1217 case TTI::SK_Reverse:
1218 case TTI::SK_PermuteSingleSrc: {
1219       // With op_sel, VOP3P instructions can freely access the low half or
1220       // high half of a register, so any swizzle of two elements is free.
1221 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
1222 return 0;
1223 unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1224 // SK_Broadcast just reuses the same mask
1225 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1226 return NumPerms + NumPermMasks;
1227 }
1228 case TTI::SK_ExtractSubvector:
1229 case TTI::SK_InsertSubvector: {
1230 // Even aligned accesses are free
1231 if (!(Index % 2))
1232 return 0;
1233 // Insert/extract subvectors only require shifts / extract code to get the
1234 // relevant bits
1235 return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1236 }
1237 case TTI::SK_PermuteTwoSrc:
1238 case TTI::SK_Splice:
1239 case TTI::SK_Select: {
1240 unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
1241 // SK_Select just reuses the same mask
1242 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1243 return NumPerms + NumPermMasks;
1244 }
1245
1246 default:
1247 break;
1248 }
1249 }
1250
1251 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1252 SubTp);
1253 }
1254
1255 /// Whether it is profitable to sink the operands of an
1256 /// Instruction I to the basic block of I.
1257 /// This helps using several modifiers (like abs and neg) more often.
1258 bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1259 SmallVectorImpl<Use *> &Ops) const {
1260 using namespace PatternMatch;
1261
1262 for (auto &Op : I->operands()) {
1263 // Ensure we are not already sinking this operand.
1264 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1265 continue;
1266
1267 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
1268 Ops.push_back(&Op);
1269 }
1270
1271 return !Ops.empty();
1272 }
1273
1274 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1275 const Function *Callee) const {
1276 const TargetMachine &TM = getTLI()->getTargetMachine();
1277 const GCNSubtarget *CallerST
1278 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1279 const GCNSubtarget *CalleeST
1280 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1281
1282 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1283 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1284
1285 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1286 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1287 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1288 return false;
1289
1290 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1291 // no way to support merge for backend defined attributes.
1292 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1293 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1294 if (!CallerMode.isInlineCompatible(CalleeMode))
1295 return false;
1296
1297 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1298 Callee->hasFnAttribute(Attribute::InlineHint))
1299 return true;
1300
1301 // Hack to make compile times reasonable.
1302 if (InlineMaxBB) {
1303 // Single BB does not increase total BB amount.
1304 if (Callee->size() == 1)
1305 return true;
1306 size_t BBSize = Caller->size() + Callee->size() - 1;
1307 return BBSize <= InlineMaxBB;
1308 }
1309
1310 return true;
1311 }
1312
1313 static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1314 const SITargetLowering *TLI,
1315 const GCNTTIImpl *TTIImpl) {
1316 const int NrOfSGPRUntilSpill = 26;
1317 const int NrOfVGPRUntilSpill = 32;
1318
1319 const DataLayout &DL = TTIImpl->getDataLayout();
1320
1321 unsigned adjustThreshold = 0;
1322 int SGPRsInUse = 0;
1323 int VGPRsInUse = 0;
1324 for (const Use &A : CB->args()) {
1325 SmallVector<EVT, 4> ValueVTs;
1326 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1327 for (auto ArgVT : ValueVTs) {
1328 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1329 CB->getContext(), CB->getCallingConv(), ArgVT);
1330 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1331 SGPRsInUse += CCRegNum;
1332 else
1333 VGPRsInUse += CCRegNum;
1334 }
1335 }
1336
1337   // The cost of passing function arguments through the stack:
1338   // 1 instruction to put a function argument on the stack in the caller.
1339   // 1 instruction to take a function argument from the stack in the callee.
1340   // 1 instruction to explicitly take care of data dependencies in the callee
1341   // function.
1342 InstructionCost ArgStackCost(1);
1343 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1344 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1345 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1346 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1347 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1348 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1349
1350 // The penalty cost is computed relative to the cost of instructions and does
1351 // not model any storage costs.
1352 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1353 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1354 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1355 ArgStackCost.getValue() * InlineConstants::getInstrCost();
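  // e.g. a call needing 40 VGPRs for its arguments exceeds the budget of 32
  // and adds (40 - 32) * ArgStackCost * InlineConstants::getInstrCost() to the
  // threshold, while 30 VGPRs would add nothing.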
1356 return adjustThreshold;
1357 }
1358
1359 static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1360 const DataLayout &DL) {
1361 // If we have a pointer to a private array passed into a function
1362 // it will not be optimized out, leaving scratch usage.
1363 // This function calculates the total size in bytes of the memory that would
1364 // end in scratch if the call was not inlined.
1365 unsigned AllocaSize = 0;
1366 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1367 for (Value *PtrArg : CB->args()) {
1368 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1369 if (!Ty)
1370 continue;
1371
1372 unsigned AddrSpace = Ty->getAddressSpace();
1373 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1374 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1375 continue;
1376
1377 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1378 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1379 continue;
1380
1381 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1382 }
1383 return AllocaSize;
1384 }
1385
1386 int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1387 return BaseT::getInliningLastCallToStaticBonus() *
1388 getInliningThresholdMultiplier();
1389 }
1390
1391 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1392 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1393
1394   // Private objects passed as arguments may end up in scratch usage if the
1395   // call is not inlined. Increase the inline threshold to promote inlining.
1396 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1397 if (AllocaSize > 0)
1398 Threshold += ArgAllocaCost;
1399 return Threshold;
1400 }
1401
1402 unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1403 const AllocaInst *AI) const {
1404
1405   // Below the cutoff, assume that the private memory objects would be
1406   // optimized away.
1407 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1408 if (AllocaSize <= ArgAllocaCutoff)
1409 return 0;
1410
1411 // Above the cutoff, we give a cost to each private memory object
1412   // depending on its size. If the array can be optimized by SROA this cost is not
1413 // added to the total-cost in the inliner cost analysis.
1414 //
1415 // We choose the total cost of the alloca such that their sum cancels the
1416 // bonus given in the threshold (ArgAllocaCost).
1417 //
1418 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1419 //
1420 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1421 // the single-bb bonus and the vector-bonus.
1422 //
1423 // We compensate the first two multipliers, by repeating logic from the
1424 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1425 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1426 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1427
1428 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1429 return BB.getTerminator()->getNumSuccessors() > 1;
1430 });
1431 if (SingleBB) {
1432 Threshold += Threshold / 2;
1433 }
1434
1435 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1436
1437 // Attribute the bonus proportionally to the alloca size
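  // e.g. two equally sized static allocas passed to the callee each receive
  // half of the scaled bonus, so the per-alloca costs sum back to the bonus
  // granted in adjustInliningThreshold.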
1438 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1439
1440 return AllocaThresholdBonus;
1441 }
1442
1443 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1444 TTI::UnrollingPreferences &UP,
1445 OptimizationRemarkEmitter *ORE) const {
1446 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1447 }
1448
1449 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1450 TTI::PeelingPreferences &PP) const {
1451 CommonTTI.getPeelingPreferences(L, SE, PP);
1452 }
1453
1454 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1455 return ST->hasFullRate64Ops()
1456 ? getFullRateInstrCost()
1457 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1458 : getQuarterRateInstrCost(CostKind);
1459 }
1460
1461 std::pair<InstructionCost, MVT>
1462 GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1463 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1464 auto Size = DL.getTypeSizeInBits(Ty);
1465   // The maximum load or store can handle 8 dwords for the scalar ALU and 4
1466   // for the vector ALU. Let's assume anything above 8 dwords is expensive
1467   // even if legal.
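  // e.g. a 1024-bit vector adds (1024 + 255) / 256 = 4 on top of the base
  // legalization cost.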
1468 if (Size <= 256)
1469 return Cost;
1470
1471 Cost.first += (Size + 255) / 256;
1472 return Cost;
1473 }
1474
1475 unsigned GCNTTIImpl::getPrefetchDistance() const {
1476 return ST->hasPrefetch() ? 128 : 0;
1477 }
1478
1479 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1480 return AMDGPU::isFlatGlobalAddrSpace(AS);
1481 }
1482
1483 void GCNTTIImpl::collectKernelLaunchBounds(
1484 const Function &F,
1485 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1486 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1487 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1488 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1489 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1490 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1491 ST->getFlatWorkGroupSizes(F);
1492 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1493 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1494 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1495 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1496 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1497 }
1498
1499 GCNTTIImpl::KnownIEEEMode
1500 GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1501 if (!ST->hasIEEEMode()) // Only mode on gfx12
1502 return KnownIEEEMode::On;
1503
1504 const Function *F = I.getFunction();
1505 if (!F)
1506 return KnownIEEEMode::Unknown;
1507
1508 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1509 if (IEEEAttr.isValid())
1510 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1511
1512 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1513 : KnownIEEEMode::On;
1514 }
1515
1516 InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1517 Align Alignment,
1518 unsigned AddressSpace,
1519 TTI::TargetCostKind CostKind,
1520 TTI::OperandValueInfo OpInfo,
1521 const Instruction *I) const {
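  // i8 vectors are costed by how many full vector registers the access spans;
  // e.g. a <16 x i8> (128-bit) global load is divideCeil(127, 512) = 1.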
1522 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1523 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1524 VecTy->getElementType()->isIntegerTy(8)) {
1525 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1526 getLoadStoreVecRegBitWidth(AddressSpace));
1527 }
1528 }
1529 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1530 OpInfo, I);
1531 }
1532
1533 unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1534 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1535 if (VecTy->getElementType()->isIntegerTy(8)) {
1536 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
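      // e.g. <16 x i8> gives divideCeil(16 - 1, 4) = 4 parts, matching 32-bit
      // packing of the i8 elements.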
1537 return divideCeil(ElementCount - 1, 4);
1538 }
1539 }
1540 return BaseT::getNumberOfParts(Tp);
1541 }
1542