//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"
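// Debug output from the LLVM_DEBUG statements below can be enabled in debug
// builds with -debug-only=amdgpu-attributor.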

using namespace llvm;

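// This threshold is settable on the command line, e.g. via
// -mllvm -amdgpu-indirect-call-specialization-threshold=<N> from a frontend
// driver, or directly as an option to opt/llc.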
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "A threshold to control whether an indirect call will be specialized"),
    cl::init(3));

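// AMDGPUAttributes.def is an X-macro file; each entry has the form
// AMDGPU_ATTRIBUTE(Name, Str), for example (illustrative)
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
// The expansions below derive a bit position, a one-hot mask, and a lookup
// table entry for every attribute.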
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};
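
// ImplicitAttrs maps each mask bit to its attribute string (the "amdgpu-no-*"
// names defined in AMDGPUAttributes.def); manifest() below emits the string
// for every bit that survives the fixpoint iteration.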

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }

  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
  }

  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
                                     getLDSSize(F));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }

private:
  /// Check if the ConstantExpr \p CE uses an addrspacecast from private or
  /// local to flat. These casts may require the queue pointer.
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }

  /// Returns the minimum amount of LDS space used by a workgroup running
  /// function \p F.
  static unsigned getLDSSize(const Function &F) {
    return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                           {0, UINT32_MAX}, true)
        .first;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }

  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

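// Note on the lattice direction of AAAMDAttributes: the bitmask state starts
// at the optimistic best state (all bits set, i.e. no implicit input is
// needed). updateImpl() removes bits as evidence of uses accumulates, so the
// state only moves toward "needed" and a fixpoint is guaranteed.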
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

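    // "uniform-work-group-size" propagates from callers to callees: the
    // callee may keep the optimistic value only while every known call site
    // provides it.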
    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }

  // Returns true if FlatScratchInit is needed, i.e., the
  // "amdgpu-no-flat-scratch-init" attribute should not be set.
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // Only called if the bit is still set.

    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
    // there is a cast from PRIVATE_ADDRESS.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for addrspacecast from PRIVATE_ADDRESS in constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally check callees.

    // This is called on each callee; false means the callee shouldn't have
    // no-flat-scratch-init.
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();

      // Callee == 0 for inline asm or an indirect call with known callees.
      // In the latter case, updateImpl() already checked the callees and we
      // know their FLAT_SCRATCH_INIT bit is set.
      // If the function has an indirect call with unknown callees, the bit is
      // already removed in updateImpl() and execution won't reach here.
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    // If any callee is false (i.e. needs FlatScratchInit),
    // checkForAllCallLikeInstructions returns false, in which case this
    // function returns true.
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Clamp the assumed range to the default value ([Min, Max]) and emit the
  /// attribute if it is not the same as the default.
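  /// For example (illustrative values): with Default = {1, 1024} and an
  /// assumed range of [32, 257), this manifests AttrName="32,256".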
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    // Clamp the range to the default value.
    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    // No manifest if the value is invalid or same as default after clamp.
    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // We only consider an attribute that is not the max range because the
      // front end always emits the attribute, unfortunately, and sometimes it
      // emits the max range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // We don't want to directly clamp the state if it's the max range because
    // that is basically the worst state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

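/// Holds one decreasing-integer state per workgroup dimension (x, y, z); the
/// meet operator (^=) below merges component-wise, with each component only
/// allowed to move downward.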
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // If the attribute exists, we will honor it if it is not the default.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');
      (void)Func;

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

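// Conservatively treat any inline-asm constraint code beginning with 'a'
// (e.g. "a" or "{a5}") as a use of AGPRs.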
inlineAsmUsesAGPRs(const InlineAsm * IA)1204 static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
1205   for (const auto &CI : IA->ParseConstraints()) {
1206     for (StringRef Code : CI.Codes) {
1207       Code.consume_front("{");
1208       if (Code.starts_with("a"))
1209         return true;
1210     }
1211   }
1212 
1213   return false;
1214 }
1215 
1216 // TODO: Migrate to range merge of amdgpu-agpr-alloc.
1217 // FIXME: Why is this using Attribute::NoUnwind?
1218 struct AAAMDGPUNoAGPR
1219     : public IRAttribute<Attribute::NoUnwind,
1220                          StateWrapper<BooleanState, AbstractAttribute>,
1221                          AAAMDGPUNoAGPR> {
AAAMDGPUNoAGPR__anon93435ae10111::AAAMDGPUNoAGPR1222   AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1223 
createForPosition__anon93435ae10111::AAAMDGPUNoAGPR1224   static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
1225                                            Attributor &A) {
1226     if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
1227       return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
1228     llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
1229   }
1230 
initialize__anon93435ae10111::AAAMDGPUNoAGPR1231   void initialize(Attributor &A) override {
1232     Function *F = getAssociatedFunction();
1233     auto [MinNumAGPR, MaxNumAGPR] =
1234         AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
1235                                         /*OnlyFirstRequired=*/true);
1236     if (MinNumAGPR == 0)
1237       indicateOptimisticFixpoint();
1238   }
1239 
getAsStr__anon93435ae10111::AAAMDGPUNoAGPR1240   const std::string getAsStr(Attributor *A) const override {
1241     return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
1242   }
1243 
trackStatistics__anon93435ae10111::AAAMDGPUNoAGPR1244   void trackStatistics() const override {}
1245 
updateImpl__anon93435ae10111::AAAMDGPUNoAGPR1246   ChangeStatus updateImpl(Attributor &A) override {
1247     // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
1248 
1249     auto CheckForNoAGPRs = [&](Instruction &I) {
1250       const auto &CB = cast<CallBase>(I);
1251       const Value *CalleeOp = CB.getCalledOperand();
1252       const Function *Callee = dyn_cast<Function>(CalleeOp);
1253       if (!Callee) {
1254         if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
1255           return !inlineAsmUsesAGPRs(IA);
1256         return false;
1257       }
1258 
1259       // Some intrinsics may use AGPRs, but if we have a choice, we are not
1260       // required to use AGPRs.
1261       if (Callee->isIntrinsic())
1262         return true;
1263 
1264       // TODO: Handle callsite attributes
1265       const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
1266           *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
1267       return CalleeInfo && CalleeInfo->isValidState() &&
1268              CalleeInfo->getAssumed();
1269     };
1270 
1271     bool UsedAssumedInformation = false;
1272     if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
1273                                            UsedAssumedInformation))
1274       return indicatePessimisticFixpoint();
1275     return ChangeStatus::UNCHANGED;
1276   }
1277 
manifest__anon93435ae10111::AAAMDGPUNoAGPR1278   ChangeStatus manifest(Attributor &A) override {
1279     if (!getAssumed())
1280       return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
  }

  StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
/// with intermediate values during the attributor run. We defer the
/// finalization of waves-per-eu until after the flat-workgroup-size is
/// finalized.
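/// For example (illustrative numbers): with a subtarget range of 1-10 waves
/// per EU and a finalized flat workgroup size implying at least 8 waves per
/// EU, the function ends up with "amdgpu-waves-per-eu"="8,10"; if the computed
/// range equals the full subtarget range, nothing is written back.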
/// TODO: Remove this and move similar logic back into the attributor run once
/// we have a better representation for waves-per-eu.
static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
  bool Changed = false;

  LLVMContext &Ctx = M.getContext();

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

    std::optional<std::pair<unsigned, std::optional<unsigned>>>
        FlatWgrpSizeAttr =
            AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");

    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();

    unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
    unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
    if (FlatWgrpSizeAttr.has_value()) {
      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
    }

    // Start with the "best" range.
    unsigned Min = MinWavesPerEU;
    unsigned Max = MinWavesPerEU;

    // Compute the range from flat workgroup size. `getWavesPerEU` will also
    // account for the 'amdgpu-waves-per-eu' attribute.
    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
        ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});

    // For the lower bound, we have to "tighten" it.
    Min = std::max(Min, MinFromFlatWgrpSize);
    // For the upper bound, we have to "extend" it.
    Max = std::max(Max, MaxFromFlatWgrpSize);

    // Clamp the range to the subtarget's supported range.
    Min = std::max(Min, MinWavesPerEU);
    Max = std::min(Max, MaxWavesPerEU);

    // Update the attribute only if the computed range differs from the full
    // subtarget range.
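    // The attribute value is serialized as "Min,Max", e.g. "8,10".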
    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
      SmallString<10> Buffer;
      raw_svector_ostream OS(Buffer);
      OS << Min << ',' << Max;
      Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
      Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
      F.addFnAttr(NewAttr);
      Changed |= OldAttr != NewAttr;
    }
  }

  return Changed;
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
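  // Restrict the Attributor to the AA kinds seeded and queried below; all
  // other AAs are filtered out to keep the fixed-point iteration focused.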
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
       &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
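  // Specialize an indirect call site for an assumed callee only if the callee
  // is not an entry point and the number of assumed callees stays within the
  // specialization threshold.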
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
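  // Only AMDGPU kernels are treated as IPO-amendable.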
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
    CallingConv::ID CC = F->getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

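    // Seed address-space deduction for the pointer operand of every memory
    // access in the function (loads, stores, and atomics).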
    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
        A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
      }
    }
  }

  bool Changed = A.run() == ChangeStatus::CHANGED;

  Changed |= updateWavesPerEU(M, TM);

  return Changed;
}
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}
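// A minimal way to exercise this pass in isolation (assuming the registered
// pass name "amdgpu-attributor"):
//
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S in.ll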