//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
}

using namespace llvm;

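// AMDGPUAttributes.def is expanded three times below with different
// AMDGPU_ATTRIBUTE definitions (an X-macro): first to give each implicit
// argument a bit position, then a bit mask, and finally a mask-to-attribute
// string table.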
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

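// Table of (mask bit, IR attribute string) pairs, one entry per implicit
// argument attribute.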
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
 #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support is implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

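/// Returns true if an addrspacecast from address space \p SrcAS (local or
/// private) requires the queue pointer.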
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

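  /// Flags cached per Constant: whether it refers to an LDS (DS) global and
  /// whether it contains an addrspacecast from the local or private address
  /// space.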
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

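  // Kernels seed the state from their existing "uniform-work-group-size"
  // attribute; for every other function the value is deduced from its callers
  // in updateImpl.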
  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

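    // Walk every callee known from the optimistic call edges: propagate the
    // deduced state of regular callees and map intrinsic calls to the implicit
    // inputs they require.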
    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

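  // Materialize the attribute string for every implicit-argument bit that the
  // fixpoint iteration has proven (known bits).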
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions; try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

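    // Also scan constant operands: addrspacecast constant expressions and DS
    // globals are not covered by the instruction check above.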
    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition();
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

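  // Only code object V5 exposes the heap pointer and the queue pointer through
  // the implicit kernel arguments.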
  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is unused only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

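  // Seed the range with the subtarget's flat work-group size bounds for this
  // function. Entry points are not updated from callers, so fix their state
  // right away.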
  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG(this);
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
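    // Restrict the Attributor to the abstract attributes this pass actually
    // uses.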
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
         &AAPointerInfo::ID, &AAPotentialConstantValues::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

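    // Seed the deduction: every non-intrinsic function gets the AMDGPU
    // attribute and uniform-work-group-size AAs; the flat work-group size is
    // only propagated into non-entry functions.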
    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                    false)