//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "Threshold controlling whether an indirect call will be specialized"),
    cl::init(3));
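// For example, passing -amdgpu-indirect-call-specialization-threshold=0
// disables specialization entirely: the IndirectCalleeSpecializationCallback
// configured in runImpl() below compares the number of assumed callees of an
// indirect call site against this threshold.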

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};
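// For illustration, assuming AMDGPUAttributes.def contains an entry such as
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
// the three expansions above produce, roughly:
//   enum ImplicitArgumentPositions { ..., DISPATCH_PTR_POS, ... };
//   enum ImplicitArgumentMask { DISPATCH_PTR = 1 << DISPATCH_PTR_POS, ... };
//   ImplicitAttrs[] = { ..., {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, ... };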

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
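//
// For example, a function calling llvm.amdgcn.workitem.id.y maps to
// WORKITEM_ID_Y below; updateImpl() then clears the corresponding assumed bit
// so that "amdgpu-no-workitem-id-y" is not manifested for it.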
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for its interaction with the value to be used for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
  }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
                                     getLDSSize(F));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
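  /// For example, a constant expression like
  ///   addrspacecast (ptr addrspace(3) @lds to ptr)
  /// sets ADDR_SPACE_CAST_LOCAL_TO_FLAT, while the same cast from
  /// ptr addrspace(5) sets ADDR_SPACE_CAST_PRIVATE_TO_FLAT.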
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Returns the minimum amount of LDS space used by a workgroup running
  /// function \p F.
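  /// For example, a function carrying "amdgpu-lds-size"="1024" reports 1024
  /// here; without the attribute, the default minimum of 0 is returned.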
  static unsigned getLDSSize(const Function &F) {
    return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                           {0, UINT32_MAX}, true)
        .first;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all instructions
    // ourselves, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is unused only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
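    //
    // For example (a sketch): given
    //   %p = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    //   %v = load i64, ptr addrspace(4) %p
    // the load covers bytes [0, 8) of the implicit argument area, so it
    // interferes with any queried Range overlapping that interval.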
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., the no-flat-scratch-init
  // attribute is not to be set.
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrspacecast from PRIVATE_ADDRESS in constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is checked for each callee; returning false means the caller needs
    // FlatScratchInit and must not get no-flat-scratch-init.
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If function has indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any callee is false (i.e. need FlatScratchInit),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default after clamping.
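  /// For example, with Default = {1, 1024} and an assumed range of [32, 257)
  /// (i.e. 32 to 256 inclusive), this manifests "32,256"; an assumed range of
  /// [1, 1025) equals the default after clamping, so nothing is emitted.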
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only honor the attribute when it is not the maximum range, because
      // the frontend unfortunately always emits it, and sometimes it emits the
      // maximum range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');
      (void)Func;

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

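// Returns true if the inline asm references an AGPR: register constraint
// codes are checked after stripping a leading '{', so e.g. constraints such
// as "a" or "{a0}" count as AGPR uses.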
static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

// TODO: Migrate to range merge of amdgpu-agpr-alloc.
// FIXME: Why is this using Attribute::NoUnwind?
struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (MinNumAGPR == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.

    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->isValidState() &&
             CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
  }

  StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
/// with intermediate values during the attributor run. We defer the
/// finalization of waves-per-eu until after the flat-workgroup-size is
/// finalized.
/// TODO: Remove this and move similar logic back into the attributor run once
/// we have a better representation for waves-per-eu.
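///
/// For example (a sketch with illustrative subtarget numbers): on a target
/// supporting 4-10 waves per EU, a function whose flat workgroup size
/// finalized to "64,64" might get {8, 10} back from getWavesPerEU; starting
/// from the "best" range {4, 4}, the bounds below widen to {8, 10} and
/// "amdgpu-waves-per-eu"="8,10" is emitted.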
static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
  bool Changed = false;

  LLVMContext &Ctx = M.getContext();

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

    std::optional<std::pair<unsigned, std::optional<unsigned>>>
        FlatWgrpSizeAttr =
            AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");

    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();

    unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
    unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
    if (FlatWgrpSizeAttr.has_value()) {
      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
    }

    // Start with the "best" range.
    unsigned Min = MinWavesPerEU;
    unsigned Max = MinWavesPerEU;

    // Compute the range from flat workgroup size. `getWavesPerEU` will also
    // account for the 'amdgpu-waves-per-eu' attribute.
    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
        ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});

    // For the lower bound, we have to "tighten" it.
    Min = std::max(Min, MinFromFlatWgrpSize);
    // For the upper bound, we have to "extend" it.
    Max = std::max(Max, MaxFromFlatWgrpSize);

    // Clamp the range to the max range.
    Min = std::max(Min, MinWavesPerEU);
    Max = std::min(Max, MaxWavesPerEU);

    // Update the attribute if it is not the max.
    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
      SmallString<10> Buffer;
      raw_svector_ostream OS(Buffer);
      OS << Min << ',' << Max;
      Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
      Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
      F.addFnAttr(NewAttr);
      Changed |= OldAttr != NewAttr;
    }
  }

  return Changed;
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
       &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
      }
    }
  }

  bool Changed = A.run() == ChangeStatus::CHANGED;

  Changed |= updateWavesPerEU(M, TM);

  return Changed;
}
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}