//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
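// As a purely illustrative sketch (simplified, not lifted from a real test),
// promotion to vector rewrites private (addrspace(5)) accesses such as:
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
//   store i32 4, ptr addrspace(5) %gep
//   %reload = load i32, ptr addrspace(5) %gep
//
// into SSA operations on a <4 x i32> value, eliminating the alloca entirely:
//
//   %vec.next = insertelement <4 x i32> %vec, i32 4, i32 %idx
//   %reload = extractelement <4 x i32> %vec.next, i32 %idx
//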
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(16));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
                               std::vector<Value *> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  bool tryPromoteAllocaToVector(AllocaInst &I);
  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);

  void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);

  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  bool run(Function &F, bool PromoteToLDS);
};

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};

static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller-preserved registers. Do not
  // promote an alloca that would force spilling unless we know the function
  // will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for an
// accurate estimate of available LDS.
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> WorkList({&Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    for (auto &U : Cur->uses()) {
      Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser()))
        WorkList.push_back(cast<Instruction>(U.getUser()));
    }
  }
}

void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) {
  DenseMap<AllocaInst *, unsigned> Scores;

  for (auto *Alloca : Allocas) {
    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
    unsigned &Score = Scores[Alloca];
    // Increment score by one for each user + a bonus for users within loops.
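    // For instance, with the default LoopUserWeight of 4, a user at loop
    // depth 2 contributes 1 + 4 * 2 = 9 to the score, while a user outside
    // any loop contributes just 1.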
    SmallVector<Use *, 8> Uses;
    collectAllocaUses(*Alloca, Uses);
    for (auto *U : Uses) {
      Instruction *Inst = cast<Instruction>(U->getUser());
      if (isa<GetElementPtrInst>(Inst))
        continue;
      unsigned UserScore =
          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
      Score += UserScore;
    }
    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
  }

  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
    return Scores.at(A) > Scores.at(B);
  });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (auto *A: Allocas)
      dbgs() << "  " << *A << "\n";
  );
  // clang-format on
}

void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // Load per function limits, overriding with global options where appropriate.
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

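  // The budget below is measured in bits. As a worked example (not tied to
  // any particular target): with MaxVGPRs = 256 and the default VGPR ratio of
  // 4, the budget is (256 * 32) / 4 = 2048 bits, i.e. 64 VGPRs' worth.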
  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;
      Allocas.push_back(AI);
    }
  }

  sortAllocasToPromote(Allocas);

  bool Changed = false;
  for (AllocaInst *AI : Allocas) {
    const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
    // First, check if we have enough budget to vectorize this alloca.
    if (AllocaCost <= VectorizationBudget) {
      // If we do, attempt vectorization, otherwise, fall through and try
      // promoting to LDS instead.
      if (tryPromoteAllocaToVector(*AI)) {
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      }
    } else {
      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                        << AllocaCost << ", budget:" << VectorizationBudget
                        << "): " << *AI << "\n");
    }

    if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
      Changed = true;
  }

  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}

struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};

// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
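// For example, given "%a = alloca [4 x i32]" (16 bytes), a full-size
// "memset(%a, 0, 16)" is supported, whereas a partial "memset(%a, 0, 8)" is
// rejected here.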
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) {
  using namespace PatternMatch;
  // For now we only care about non-volatile memsets that affect the whole type
  // (start at index 0 and fill the whole alloca).
  //
  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
  // (except maybe volatile ones?) - we just need to use shufflevector if it
  // only affects a subset of the vector.
  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
  return I->getOperand(0) == AI &&
         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}

static Value *calculateVectorIndex(
    Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
  if (!GEP)
    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  auto I = GEPIdx.find(GEP);
  assert(I != GEPIdx.end() && "Must have entry for GEP!");

  Value *IndexValue = I->second;
  assert(IndexValue && "index value missing from GEP index map");
  return IndexValue;
}

static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                               Type *VecElemTy, const DataLayout &DL,
                               SmallVector<Instruction *> &NewInsts) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested vector GEP chains can be lowered in one step.
  //
  // Given this IR fragment as input:
  //
  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
  //   %3 = load i32, ptr addrspace(5) %2, align 4
  //
  // Combine both GEP operations in a single pass, producing:
  //   BasePtr      = %0
  //   ConstOffset  = 4
  //   VarOffsets   = { %j -> element_size(<2 x i32>) }
  //
  // That lets us emit a single buffer_load directly into a VGPR, without ever
  // allocating scratch memory for the intermediate pointer.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return nullptr;

    // Move to the next outer pointer.
    CurPtr = CurGEP->getPointerOperand();
  }

  assert(CurPtr == Alloca && "GEP not based on alloca");

  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return nullptr;

  APInt IndexQuot;
  uint64_t Rem;
  APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
  if (Rem != 0)
    return nullptr;
  if (VarOffsets.size() == 0)
    return ConstantInt::get(GEP->getContext(), IndexQuot);

  IRBuilder<> Builder(GEP);

  const auto &VarOffset = VarOffsets.front();
  APInt OffsetQuot;
  APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
  if (Rem != 0 || OffsetQuot.isZero())
    return nullptr;

  Value *Offset = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
  if (!OffsetType)
    return nullptr;

  if (!OffsetQuot.isOne()) {
    ConstantInt *ConstMul =
        ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
    Offset = Builder.CreateMul(Offset, ConstMul);
    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
      NewInsts.push_back(NewInst);
  }
  if (ConstOffset.isZero())
    return Offset;

  ConstantInt *ConstIndex =
      ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
    NewInsts.push_back(NewInst);
  return IndexAdd;
}

/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst           Instruction to be promoted.
/// \param DL             Module Data Layout.
/// \param VectorTy       Vectorized Type.
/// \param VecStoreSize   Size of \p VectorTy in bytes.
/// \param ElementSize    Size of \p VectorTy element type in bytes.
/// \param TransferInfo   MemTransferInst info map.
/// \param GEPVectorIdx   GEP -> VectorIdx cache.
/// \param CurVal         Current value of the vector (e.g. last stored value)
/// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
///              be promoted now. This happens when promoting requires \p
///              CurVal, but \p CurVal is nullptr.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(
    Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
    unsigned VecStoreSize, unsigned ElementSize,
    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
    SmallVectorImpl<LoadInst *> &DeferredLoads) {
  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
  // to do more folding, especially in the case of vector splats.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
    if (CurVal)
      return CurVal;

    // If the current value is not known, insert a dummy load and lower it on
    // the second pass.
    LoadInst *Dummy =
        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
                           "promotealloca.dummyload");
    DeferredLoads.push_back(Dummy);
    return Dummy;
  };

  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                   Type *PtrTy) -> Value * {
    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
    if (!PtrTy->isVectorTy())
      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
    return Builder.CreateBitOrPointerCast(
        Val, FixedVectorType::get(EltTy, NumPtrElts));
  };

  Type *VecEltTy = VectorTy->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    // Loads can only be lowered if the value is known.
    if (!CurVal) {
      DeferredLoads.push_back(cast<LoadInst>(Inst));
      return nullptr;
    }

    Value *Index = calculateVectorIndex(
        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
        else if (CurVal->getType()->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
        Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
        Inst->replaceAllUsesWith(NewVal);
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Value *SubVec = PoisonValue::get(SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        SubVec = Builder.CreateInsertElement(
            SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
      }

      if (AccessTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
      else if (SubVecTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);

      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
      Inst->replaceAllUsesWith(SubVec);
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    // For stores, it's a bit trickier and it depends on whether we're storing
    // the full vector or not. If we're storing the full vector, we don't need
    // to know the current value. If this is a store of a single element, we
    // need to know the value.
    StoreInst *SI = cast<StoreInst>(Inst);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
    Value *Val = SI->getValueOperand();

    // We're storing the full vector, we can handle this without knowing CurVal.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, AccessTy);
        else if (VectorTy->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, VectorTy);
        return Builder.CreateBitOrPointerCast(Val, VectorTy);
      }
    }

    // Storing a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      const unsigned NumVecElts = VectorTy->getNumElements();
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      if (SubVecTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, SubVecTy);
      else if (AccessTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AccessTy);

      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);

      Value *CurVec = GetOrLoadCurrentVectorValue();
      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        CurVec = Builder.CreateInsertElement(
            CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
      }
      return CurVec;
    }

    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
                                       Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
      // For memcpy, we need to know curval.
      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

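      // Build a shuffle mask that copies the moved range and keeps every
      // other lane in place. As a worked example: with 8 elements,
      // SrcBegin = 1, DestBegin = 3 and NumCopied = 2, the mask is
      // {0, 1, 2, 1, 2, 5, 6, 7}.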
      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(SrcBegin < VectorTy->getNumElements()
                             ? SrcBegin++
                             : PoisonMaskElem);
        } else {
          Mask.push_back(Idx);
        }
      }

      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      // For memset, we don't need to know the previous value because we
      // currently only allow memsets that cover the whole alloca.
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
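      // Widen the i8 memset value to one vector element by splatting the
      // byte and bitcasting. Illustrative example: a memset of 0xAB over a
      // <4 x i32> vector first becomes the i32 value 0xABABABAB below, which
      // is then splatted across all four lanes.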
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);

        // If the element type of the vector is a pointer, we need to first cast
        // to an integer, then use a PtrCast.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        } else
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
      }

      return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        Intr->replaceAllUsesWith(
            Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                            DL.getTypeAllocSize(VectorTy)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Inconsistency in instructions promotable to vector");
  }

  llvm_unreachable("Did not return after promoting instruction!");
}

static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) {
  // Access as a vector type can work if the size of the access vector is a
  // multiple of the size of the alloca's vector element type.
  //
  // Examples:
  //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
  //        - 3*32 is not a multiple of 64
  //
  // We could handle more complicated cases, but it'd make things a lot more
  // complicated.
  if (isa<FixedVectorType>(AccessTy)) {
    TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
    // If the type size and the store size don't match, we would need to do
    // more than just bitcast to translate between extracted/insertable
    // subvectors and the accessed value.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
      return false;
    TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
    return AccTS.isKnownMultipleOf(VecTS);
  }

  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
                                              DL);
}

/// Iterates over an instruction worklist that may contain multiple instructions
/// from the same basic block, but in a different order.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // Bucket up uses of the alloca by the block they occur in.
  // This is important because we have to handle multiple defs/uses in a block
  // ourselves: SSAUpdater is purely for cross-block references.
  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
  for (Instruction *User : WorkList)
    UsesByBlock[User->getParent()].insert(User);

  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block, directly process it.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block, do a linear scan to see users in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(&Inst))
        continue;

      Fn(&Inst);
    }

    // Clear the block so we know it's been processed.
    BlockUses.clear();
  }
}

/// Find an insert point after an alloca, after all other allocas clustered at
/// the start of the block.
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
                                                    BasicBlock::iterator I) {
  for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
    ;
  return I;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca.getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElemTy, NumElems);
      }
    }
  }

  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

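  // The element count is capped by the register budget. As a worked example:
  // with the default of 16 32-bit vector registers and an i32 element type,
  // at most (16 * 32) / 32 = 16 elements can be promoted.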
  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return false;
  }

  std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
  SmallVector<Instruction *> WorkList;
  SmallVector<Instruction *> UsersToRemove;
  SmallVector<Instruction *> DeferredInsts;
  SmallVector<Instruction *> NewGEPInsts;
  DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    for (auto *Inst : reverse(NewGEPInsts))
      Inst->eraseFromParent();
    return false;
  };

  SmallVector<Use *, 8> Uses;
  collectAllocaUses(Alloca, Uses);

  LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return false;
  }
  unsigned ElementSize = ElementSizeInBits / 8;
  assert(ElementSize > 0);
  for (auto *U : Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      // This is a store of the pointer, not to the pointer.
      if (isa<StoreInst>(Inst) &&
          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return RejectUser(Inst, "pointer is being stored");

      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
                                DL->getTypeStoreSize(AccessTy)) {
        WorkList.push_back(Inst);
        continue;
      }

      if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      WorkList.push_back(Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      GEPVectorIdx[GEP] = Index;
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
        MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
      WorkList.push_back(Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      if (TransferInfo.try_emplace(TransferInst).second) {
        DeferredInsts.push_back(Inst);
        WorkList.push_back(Inst);
      }

      auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
        GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
        if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
          return nullptr;

        return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
      };

      unsigned OpNum = U->getOperandNo();
      MemTransferInfo *TI = &TransferInfo[TransferInst];
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getPointerIndexOfAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getPointerIndexOfAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        WorkList.push_back(Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  while (!DeferredInsts.empty()) {
    Instruction *Inst = DeferredInsts.pop_back_val();
    MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
    // TODO: Support the case if the pointers are from different alloca or
    // from different address spaces.
    MemTransferInfo &Info = TransferInfo[TransferInst];
    if (!Info.SrcIndex || !Info.DestIndex)
      return RejectUser(
          Inst, "mem transfer inst is missing constant src and/or dst index");
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(VectorTy, "promotealloca");

  BasicBlock *EntryBB = Alloca.getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
  // Alloca memory is undefined to begin with, not poison.
  Value *AllocaInitValue =
      new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
  AllocaInitValue->takeName(&Alloca);

  Updater.AddAvailableValue(EntryBB, AllocaInitValue);

  // First handle the initial worklist.
  SmallVector<LoadInst *, 4> DeferredLoads;
  forEachWorkListItem(WorkList, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    // On the first pass, we only take values that are trivially known, i.e.
    // where AddAvailableValue was already called in this block.
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.FindValueForBlock(BB), DeferredLoads);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
  });

  // Then handle deferred loads.
  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
    SmallVector<LoadInst *, 0> NewDLs;
    BasicBlock *BB = I->getParent();
    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
    // get a value, inserting PHIs as needed.
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
    assert(NewDLs.empty() && "No more deferred loads should be queued!");
  });

  // Delete all instructions. On the first pass, new dummy loads may have been
  // added so we need to collect them too.
  DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
  InstsToDelete.insert_range(DeferredLoads);
  for (Instruction *I : InstsToDelete) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removable.
  for (Instruction *I : reverse(UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(Alloca.use_empty());
  Alloca.eraseFromParent();
  return true;
}

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
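  // Byte offsets into the packet (derived from the struct layout above): i32
  // GEP index 1 covers bytes 4-8, packing workgroup_size_x in the low 16 bits
  // and workgroup_size_y in the high 16 bits; index 2 covers bytes 8-12,
  // packing workgroup_size_z below reserved0.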
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), {});
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::pair(Y, LoadZU);
}

Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
    return true;

  // TODO: getUnderlyingObject will not work on a vector getelementptr
  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
      continue;
    }

    // Only promote an icmp if we know that the other icmp operand is from
    // another pointer that will also be promoted.
1294     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1295       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
1296         return false;
1297 
1298       // May need to rewrite constant operands.
1299       WorkList.push_back(ICmp);
1300       continue;
1301     }
1302 
1303     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1304       // Be conservative if an address could be computed outside the bounds of
1305       // the alloca.
1306       if (!GEP->isInBounds())
1307         return false;
1308     } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
1309       // Only promote a select if we know that the other select operand is from
1310       // another pointer that will also be promoted.
1311       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
1312         return false;
1313     } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
1314       // Repeat for phis.
1315 
1316       // TODO: Handle more complex cases. We should be able to replace loops
1317       // over arrays.
1318       switch (Phi->getNumIncomingValues()) {
1319       case 1:
1320         break;
1321       case 2:
1322         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
1323           return false;
1324         break;
1325       default:
1326         return false;
1327       }
1328     } else if (!isa<ExtractElementInst>(User)) {
1329       // Do not promote vector/aggregate type instructions. It is hard to track
1330       // their users.
1331 
1332       // Do not promote addrspacecast.
1333       //
1334       // TODO: If we know the address is only observed through flat pointers, we
1335       // could still promote.
1336       return false;
1337     }
1338 
1339     WorkList.push_back(User);
1340     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
1341       return false;
1342   }
1343 
1344   return true;
1345 }
1346 
hasSufficientLocalMem(const Function & F)1347 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

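  // Returns true as soon as some user of Val is an instruction inside F;
  // otherwise it queues Val's not-yet-visited Constant users on Stack so the
  // search can continue through ConstantExpr chains.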
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users
    // until we reach an instruction inside this function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding.
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead.
  llvm::sort(AllocatedSizes, llvm::less_second());
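  // llvm::less_second() orders the (size, alignment) pairs by alignment, so
  // the estimate below lays out the lowest-aligned globals first.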

  // Check how much local memory is being used by global objects.
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and addresses
  // are currently determined from the inverse order of uses in the function
  // when legalizing, which could also potentially change. We try to estimate
  // the worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }
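  // Illustrative example: allocations of (size, align) = (6, 4) and (4, 8),
  // in that order, give alignTo(0, 4) + 6 = 6, then alignTo(6, 8) + 4 = 12,
  // i.e. 2 bytes of estimated padding.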

  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
                                                    bool SufficientLDS) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');

  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return false;
  }

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions, as the
  // workitem ID intrinsics are not supported for those calling conventions.
  // Furthermore, not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
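  // getFlatWorkGroupSizes() yields a (min, max) pair; use the maximum flat
  // workgroup size since every workitem needs its own copy of the alloca.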

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the
  // memory, we could end up using more than the maximum due to alignment
  // padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
  NewSize += AllocSize;
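  // The alloca is replicated once per workitem, so its LDS footprint is
  // WorkGroupSize copies of the allocated type; e.g. (illustrative only) a
  // 16-byte alloca with a maximum workgroup size of 256 costs 4096 bytes,
  // plus whatever alignment padding precedes it.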

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

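  // Materialize the replacement: an internal LDS array with one slot per
  // workitem, initialized to poison since the alloca's memory was likewise
  // uninitialized.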
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

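  // Linearize the 3D workitem ID in row-major order so each workitem indexes
  // a unique slot: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.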
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();
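  // The alloca itself is gone now; what remains is to retype the collected
  // users, which still carry the old address space in their pointer types.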

  SmallVector<IntrinsicInst *> DeferredIntrs;

  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

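  // Rewrite each collected user in place: null/zero pointer constants are
  // re-created in the LDS address space, pointer-typed instructions are
  // mutated to the new pointer type, and intrinsic calls are rebuilt below.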
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
          CI->setOperand(0, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
          CI->setOperand(1, Constant::getNullValue(NewTy));

        continue;
      }

      // For an addrspacecast only the pointer operand needs retyping; the
      // cast's result type stays the same, so leave the cast and its users
      // alone.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
          SI->setOperand(1, Constant::getNullValue(NewTy));

        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
          SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull, ConstantAggregateZero>(
                  Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have two pointer operands. If the second pointer also needs to
      // be replaced, we defer processing of these intrinsics until all other
      // values have been processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
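    // The invariant intrinsics below are overloaded on their pointer operand,
    // so the call must be re-created with a declaration that matches the new
    // LDS pointer type.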
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *F = Intrinsic::getOrInsertDeclaration(
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);

      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
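    // Preserve the dereferenceable guarantees for both pointer operands
    // (0 = dest, 1 = src); the newly created call only carries the alignment
    // and volatility over, not the parameter attributes.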

    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
1704