//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
}

using namespace llvm;

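// The AMDGPU_ATTRIBUTE X-macro is expanded three times against
// AMDGPUAttributes.def: first to give each implicit argument a bit position,
// then to turn those positions into single-bit masks, and finally to build the
// table pairing each mask with its function attribute string.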
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
 #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
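//
// \p NonKernelOnly is set for inputs that kernels always have initialized, so
// the returned attribute only matters for callable functions. \p NeedsImplicit
// is set when, under code object V5, the requested input must additionally be
// reached through implicitarg_ptr.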
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR :
                                                      QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT :
                                                        QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
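/// Target-aware InformationCache shared by the AMDGPU abstract attributes: it
/// answers GCN subtarget queries (aperture registers, GetDoorbellID, flat work
/// group sizes, waves per EU) and caches which constants require the queue
/// pointer.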
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const {
    return CodeObjectVersion;
  }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

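/// Bit-set abstract attribute over the implicit kernel arguments. Assumed bits
/// correspond to implicit inputs the function is assumed not to need; bits are
/// cleared as uses are discovered, and the attribute string of every bit that
/// survives as known is manifested on the function.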
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

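/// Forwards "uniform-work-group-size" from callers to callees: kernels fix the
/// value from their own attribute, while a callable function may only keep
/// "true" if every known call site comes from a caller whose state allows it.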
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD)
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
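  /// Returns true if lowering this function's address space casts (or the
  /// constants it references) will require the queue pointer, given whether
  /// the subtarget exposes aperture registers.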
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

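  // Each helper below checks whether any access through implicitarg_ptr may
  // overlap the 8-byte slot holding the given implicit kernel argument at its
  // offset for the current code object version.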
  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic that is used
    // to retrieve the implicit kernel argument covered by \p Range. The
    // argument is unused only if every use of the implicitarg_ptr is a load
    // that clearly does not retrieve any byte within \p Range. We check this
    // by tracing all the uses of the initial call to the implicitarg_ptr
    // intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
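/// The assumed integer range is narrowed from call sites and, when it differs
/// from the implied default, manifested as a "min,max" string attribute.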
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
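/// Entry points have their range fixed up front from the subtarget; only
/// callable functions have the range narrowed from their call sites.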
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
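/// The initial range is derived from the deduced flat work group size via the
/// subtarget, and call-site propagation goes through getEffectiveWavesPerEU so
/// both limits stay consistent with each other.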
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

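/// Legacy module pass that drives the deduction: it seeds AAAMDAttributes and
/// AAUniformWorkGroupSize for every non-intrinsic function (plus the size-range
/// attributes for callable functions), restricts the Attributor to a small
/// allow-list of AAs, and reports whether anything changed.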
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG(this);
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
         &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
         &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;
    AC.IPOAmendableCB = [](const Function &F) {
      return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
    };

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
          A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
                    false)