xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (revision c203bd70b5957f85616424b6fa374479372d06e3)
1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass eliminates allocas by either converting them into vectors or
10 // by migrating them to local address space.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/ADT/APInt.h"
18 #include "llvm/ADT/None.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Triple.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/Analysis/CaptureTracking.h"
24 #include "llvm/Analysis/ValueTracking.h"
25 #include "llvm/CodeGen/TargetPassConfig.h"
26 #include "llvm/IR/Attributes.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Constant.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/GlobalValue.h"
34 #include "llvm/IR/GlobalVariable.h"
35 #include "llvm/IR/IRBuilder.h"
36 #include "llvm/IR/Instruction.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/IntrinsicsAMDGPU.h"
41 #include "llvm/IR/IntrinsicsR600.h"
42 #include "llvm/IR/LLVMContext.h"
43 #include "llvm/IR/Metadata.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/IR/User.h"
47 #include "llvm/IR/Value.h"
48 #include "llvm/Pass.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Target/TargetMachine.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <map>
59 #include <tuple>
60 #include <utility>
61 #include <vector>
62 
63 #define DEBUG_TYPE "amdgpu-promote-alloca"
64 
65 using namespace llvm;
66 
67 namespace {
68 
69 static cl::opt<bool> DisablePromoteAllocaToVector(
70   "disable-promote-alloca-to-vector",
71   cl::desc("Disable promote alloca to vector"),
72   cl::init(false));
73 
74 static cl::opt<bool> DisablePromoteAllocaToLDS(
75   "disable-promote-alloca-to-lds",
76   cl::desc("Disable promote alloca to LDS"),
77   cl::init(false));
78 
79 static cl::opt<unsigned> PromoteAllocaToVectorLimit(
80   "amdgpu-promote-alloca-to-vector-limit",
81   cl::desc("Maximum byte size to consider promote alloca to vector"),
82   cl::init(0));
83 
84 // FIXME: This can create globals so should be a module pass.
85 class AMDGPUPromoteAlloca : public FunctionPass {
86 private:
87   const TargetMachine *TM;
88   Module *Mod = nullptr;
89   const DataLayout *DL = nullptr;
90 
91   // FIXME: This should be per-kernel.
92   uint32_t LocalMemLimit = 0;
93   uint32_t CurrentLocalMemUsage = 0;
94   unsigned MaxVGPRs;
95 
96   bool IsAMDGCN = false;
97   bool IsAMDHSA = false;
98 
99   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
100   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
101 
102   /// BaseAlloca is the alloca root the search started from.
103   /// Val may be that alloca or a recursive user of it.
104   bool collectUsesWithPtrTypes(Value *BaseAlloca,
105                                Value *Val,
106                                std::vector<Value*> &WorkList) const;
107 
108   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
109   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
110   /// Returns true if both operands are derived from the same alloca. Val should
111   /// be the same value as one of the input operands of UseInst.
112   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
113                                        Instruction *UseInst,
114                                        int OpIdx0, int OpIdx1) const;
115 
116   /// Check whether we have enough local memory for promotion.
117   bool hasSufficientLocalMem(const Function &F);
118 
119 public:
120   static char ID;
121 
122   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
123 
124   bool doInitialization(Module &M) override;
125   bool runOnFunction(Function &F) override;
126 
127   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
128 
129   bool handleAlloca(AllocaInst &I, bool SufficientLDS);
130 
131   void getAnalysisUsage(AnalysisUsage &AU) const override {
132     AU.setPreservesCFG();
133     FunctionPass::getAnalysisUsage(AU);
134   }
135 };
136 
137 class AMDGPUPromoteAllocaToVector : public FunctionPass {
138 private:
139   unsigned MaxVGPRs;
140 
141 public:
142   static char ID;
143 
144   AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
145 
146   bool runOnFunction(Function &F) override;
147 
148   StringRef getPassName() const override {
149     return "AMDGPU Promote Alloca to vector";
150   }
151 
152   bool handleAlloca(AllocaInst &I);
153 
154   void getAnalysisUsage(AnalysisUsage &AU) const override {
155     AU.setPreservesCFG();
156     FunctionPass::getAnalysisUsage(AU);
157   }
158 };
159 
160 } // end anonymous namespace
161 
162 char AMDGPUPromoteAlloca::ID = 0;
163 char AMDGPUPromoteAllocaToVector::ID = 0;
164 
165 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
166                 "AMDGPU promote alloca to vector or LDS", false, false)
167 
168 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
169                 "AMDGPU promote alloca to vector", false, false)
170 
171 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
172 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
173 
174 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
175   Mod = &M;
176   DL = &Mod->getDataLayout();
177 
178   return false;
179 }
180 
181 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
182   if (skipFunction(F))
183     return false;
184 
185   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
186     TM = &TPC->getTM<TargetMachine>();
187   else
188     return false;
189 
190   const Triple &TT = TM->getTargetTriple();
191   IsAMDGCN = TT.getArch() == Triple::amdgcn;
192   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
193 
194   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
195   if (!ST.isPromoteAllocaEnabled())
196     return false;
197 
198   if (IsAMDGCN) {
199     const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
200     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
201   } else {
202     MaxVGPRs = 128;
203   }
204 
205   bool SufficientLDS = hasSufficientLocalMem(F);
206   bool Changed = false;
207   BasicBlock &EntryBB = *F.begin();
208 
209   SmallVector<AllocaInst *, 16> Allocas;
210   for (Instruction &I : EntryBB) {
211     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
212       Allocas.push_back(AI);
213   }
214 
215   for (AllocaInst *AI : Allocas) {
216     if (handleAlloca(*AI, SufficientLDS))
217       Changed = true;
218   }
219 
220   return Changed;
221 }
222 
223 std::pair<Value *, Value *>
224 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
225   const Function &F = *Builder.GetInsertBlock()->getParent();
226   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
227 
228   if (!IsAMDHSA) {
229     Function *LocalSizeYFn
230       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
231     Function *LocalSizeZFn
232       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
233 
234     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
235     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
236 
237     ST.makeLIDRangeMetadata(LocalSizeY);
238     ST.makeLIDRangeMetadata(LocalSizeZ);
239 
240     return std::make_pair(LocalSizeY, LocalSizeZ);
241   }
242 
243   // We must read the size out of the dispatch pointer.
244   assert(IsAMDGCN);
245 
246   // We are indexing into this struct, and want to extract the workgroup_size_*
247   // fields.
248   //
249   //   typedef struct hsa_kernel_dispatch_packet_s {
250   //     uint16_t header;
251   //     uint16_t setup;
252   //     uint16_t workgroup_size_x ;
253   //     uint16_t workgroup_size_y;
254   //     uint16_t workgroup_size_z;
255   //     uint16_t reserved0;
256   //     uint32_t grid_size_x ;
257   //     uint32_t grid_size_y ;
258   //     uint32_t grid_size_z;
259   //
260   //     uint32_t private_segment_size;
261   //     uint32_t group_segment_size;
262   //     uint64_t kernel_object;
263   //
264   // #ifdef HSA_LARGE_MODEL
265   //     void *kernarg_address;
266   // #elif defined HSA_LITTLE_ENDIAN
267   //     void *kernarg_address;
268   //     uint32_t reserved1;
269   // #else
270   //     uint32_t reserved1;
271   //     void *kernarg_address;
272   // #endif
273   //     uint64_t reserved2;
274   //     hsa_signal_t completion_signal; // uint64_t wrapper
275   //   } hsa_kernel_dispatch_packet_t
276   //
277   Function *DispatchPtrFn
278     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
279 
280   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
281   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
282   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
283 
284   // Size of the dispatch packet struct.
285   DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
286 
287   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
288   Value *CastDispatchPtr = Builder.CreateBitCast(
289     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
290 
291   // We could do a single 64-bit load here, but it's likely that the basic
292   // 32-bit and extract sequence is already present, and it is probably easier
293   // to CSE this. The loads should be mergable later anyway.
294   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
295   LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
296 
297   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
298   LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
299 
300   MDNode *MD = MDNode::get(Mod->getContext(), None);
301   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
302   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
303   ST.makeLIDRangeMetadata(LoadZU);
304 
305   // Extract y component. Upper half of LoadZU should be zero already.
306   Value *Y = Builder.CreateLShr(LoadXY, 16);
307 
308   return std::make_pair(Y, LoadZU);
309 }
310 
311 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
312   const AMDGPUSubtarget &ST =
313       AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
314   Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
315 
316   switch (N) {
317   case 0:
318     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
319                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
320     break;
321   case 1:
322     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
323                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
324     break;
325 
326   case 2:
327     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
328                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
329     break;
330   default:
331     llvm_unreachable("invalid dimension");
332   }
333 
334   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
335   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
336   ST.makeLIDRangeMetadata(CI);
337 
338   return CI;
339 }
340 
341 static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
342   return FixedVectorType::get(ArrayTy->getElementType(),
343                               ArrayTy->getNumElements());
344 }
345 
346 static Value *stripBitcasts(Value *V) {
347   while (Instruction *I = dyn_cast<Instruction>(V)) {
348     if (I->getOpcode() != Instruction::BitCast)
349       break;
350     V = I->getOperand(0);
351   }
352   return V;
353 }
354 
355 static Value *
356 calculateVectorIndex(Value *Ptr,
357                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
358   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
359   if (!GEP)
360     return nullptr;
361 
362   auto I = GEPIdx.find(GEP);
363   return I == GEPIdx.end() ? nullptr : I->second;
364 }
365 
366 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
367   // FIXME we only support simple cases
368   if (GEP->getNumOperands() != 3)
369     return nullptr;
370 
371   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
372   if (!I0 || !I0->isZero())
373     return nullptr;
374 
375   return GEP->getOperand(2);
376 }
377 
378 // Not an instruction handled below to turn into a vector.
379 //
380 // TODO: Check isTriviallyVectorizable for calls and handle other
381 // instructions.
382 static bool canVectorizeInst(Instruction *Inst, User *User,
383                              const DataLayout &DL) {
384   switch (Inst->getOpcode()) {
385   case Instruction::Load: {
386     // Currently only handle the case where the Pointer Operand is a GEP.
387     // Also we could not vectorize volatile or atomic loads.
388     LoadInst *LI = cast<LoadInst>(Inst);
389     if (isa<AllocaInst>(User) &&
390         LI->getPointerOperandType() == User->getType() &&
391         isa<VectorType>(LI->getType()))
392       return true;
393 
394     Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
395     if (!PtrInst)
396       return false;
397 
398     return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
399             PtrInst->getOpcode() == Instruction::BitCast) &&
400            LI->isSimple();
401   }
402   case Instruction::BitCast:
403     return true;
404   case Instruction::Store: {
405     // Must be the stored pointer operand, not a stored value, plus
406     // since it should be canonical form, the User should be a GEP.
407     // Also we could not vectorize volatile or atomic stores.
408     StoreInst *SI = cast<StoreInst>(Inst);
409     if (isa<AllocaInst>(User) &&
410         SI->getPointerOperandType() == User->getType() &&
411         isa<VectorType>(SI->getValueOperand()->getType()))
412       return true;
413 
414     Instruction *UserInst = dyn_cast<Instruction>(User);
415     if (!UserInst)
416       return false;
417 
418     return (SI->getPointerOperand() == User) &&
419            (UserInst->getOpcode() == Instruction::GetElementPtr ||
420             UserInst->getOpcode() == Instruction::BitCast) &&
421            SI->isSimple();
422   }
423   default:
424     return false;
425   }
426 }
427 
428 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
429                                      unsigned MaxVGPRs) {
430 
431   if (DisablePromoteAllocaToVector) {
432     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
433     return false;
434   }
435 
436   Type *AllocaTy = Alloca->getAllocatedType();
437   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
438   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
439     if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
440         ArrayTy->getNumElements() > 0)
441       VectorTy = arrayTypeToVecType(ArrayTy);
442   }
443 
444   // Use up to 1/4 of available register budget for vectorization.
445   unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
446                                               : (MaxVGPRs * 32);
447 
448   if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
449     LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
450                       << MaxVGPRs << " registers available\n");
451     return false;
452   }
453 
454   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
455 
456   // FIXME: There is no reason why we can't support larger arrays, we
457   // are just being conservative for now.
458   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
459   // could also be promoted but we don't currently handle this case
460   if (!VectorTy || VectorTy->getNumElements() > 16 ||
461       VectorTy->getNumElements() < 2) {
462     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
463     return false;
464   }
465 
466   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
467   std::vector<Value *> WorkList;
468   SmallVector<User *, 8> Users(Alloca->users());
469   SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
470   Type *VecEltTy = VectorTy->getElementType();
471   while (!Users.empty()) {
472     User *AllocaUser = Users.pop_back_val();
473     User *UseUser = UseUsers.pop_back_val();
474     Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
475 
476     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
477     if (!GEP) {
478       if (!canVectorizeInst(Inst, UseUser, DL))
479         return false;
480 
481       if (Inst->getOpcode() == Instruction::BitCast) {
482         Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
483         Type *ToTy = Inst->getType()->getPointerElementType();
484         if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
485             DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
486           continue;
487 
488         for (User *CastUser : Inst->users()) {
489           if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
490             continue;
491           Users.push_back(CastUser);
492           UseUsers.push_back(Inst);
493         }
494 
495         continue;
496       }
497 
498       WorkList.push_back(AllocaUser);
499       continue;
500     }
501 
502     Value *Index = GEPToVectorIndex(GEP);
503 
504     // If we can't compute a vector index from this GEP, then we can't
505     // promote this alloca to vector.
506     if (!Index) {
507       LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
508                         << '\n');
509       return false;
510     }
511 
512     GEPVectorIdx[GEP] = Index;
513     Users.append(GEP->user_begin(), GEP->user_end());
514     UseUsers.append(GEP->getNumUses(), GEP);
515   }
516 
517   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
518                     << *VectorTy << '\n');
519 
520   for (Value *V : WorkList) {
521     Instruction *Inst = cast<Instruction>(V);
522     IRBuilder<> Builder(Inst);
523     switch (Inst->getOpcode()) {
524     case Instruction::Load: {
525       if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
526         break;
527 
528       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
529       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
530       if (!Index)
531         break;
532 
533       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
534       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
535       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
536       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
537       if (Inst->getType() != VecEltTy)
538         ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
539       Inst->replaceAllUsesWith(ExtractElement);
540       Inst->eraseFromParent();
541       break;
542     }
543     case Instruction::Store: {
544       StoreInst *SI = cast<StoreInst>(Inst);
545       if (SI->getValueOperand()->getType() == AllocaTy ||
546           SI->getValueOperand()->getType()->isVectorTy())
547         break;
548 
549       Value *Ptr = SI->getPointerOperand();
550       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
551       if (!Index)
552         break;
553 
554       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
555       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
556       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
557       Value *Elt = SI->getValueOperand();
558       if (Elt->getType() != VecEltTy)
559         Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
560       Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
561       Builder.CreateStore(NewVecValue, BitCast);
562       Inst->eraseFromParent();
563       break;
564     }
565 
566     default:
567       llvm_unreachable("Inconsistency in instructions promotable to vector");
568     }
569   }
570   return true;
571 }
572 
573 static bool isCallPromotable(CallInst *CI) {
574   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
575   if (!II)
576     return false;
577 
578   switch (II->getIntrinsicID()) {
579   case Intrinsic::memcpy:
580   case Intrinsic::memmove:
581   case Intrinsic::memset:
582   case Intrinsic::lifetime_start:
583   case Intrinsic::lifetime_end:
584   case Intrinsic::invariant_start:
585   case Intrinsic::invariant_end:
586   case Intrinsic::launder_invariant_group:
587   case Intrinsic::strip_invariant_group:
588   case Intrinsic::objectsize:
589     return true;
590   default:
591     return false;
592   }
593 }
594 
595 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
596                                                           Value *Val,
597                                                           Instruction *Inst,
598                                                           int OpIdx0,
599                                                           int OpIdx1) const {
600   // Figure out which operand is the one we might not be promoting.
601   Value *OtherOp = Inst->getOperand(OpIdx0);
602   if (Val == OtherOp)
603     OtherOp = Inst->getOperand(OpIdx1);
604 
605   if (isa<ConstantPointerNull>(OtherOp))
606     return true;
607 
608   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
609   if (!isa<AllocaInst>(OtherObj))
610     return false;
611 
612   // TODO: We should be able to replace undefs with the right pointer type.
613 
614   // TODO: If we know the other base object is another promotable
615   // alloca, not necessarily this alloca, we can do this. The
616   // important part is both must have the same address space at
617   // the end.
618   if (OtherObj != BaseAlloca) {
619     LLVM_DEBUG(
620         dbgs() << "Found a binary instruction with another alloca object\n");
621     return false;
622   }
623 
624   return true;
625 }
626 
627 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
628   Value *BaseAlloca,
629   Value *Val,
630   std::vector<Value*> &WorkList) const {
631 
632   for (User *User : Val->users()) {
633     if (is_contained(WorkList, User))
634       continue;
635 
636     if (CallInst *CI = dyn_cast<CallInst>(User)) {
637       if (!isCallPromotable(CI))
638         return false;
639 
640       WorkList.push_back(User);
641       continue;
642     }
643 
644     Instruction *UseInst = cast<Instruction>(User);
645     if (UseInst->getOpcode() == Instruction::PtrToInt)
646       return false;
647 
648     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
649       if (LI->isVolatile())
650         return false;
651 
652       continue;
653     }
654 
655     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
656       if (SI->isVolatile())
657         return false;
658 
659       // Reject if the stored value is not the pointer operand.
660       if (SI->getPointerOperand() != Val)
661         return false;
662     } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
663       if (RMW->isVolatile())
664         return false;
665     } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
666       if (CAS->isVolatile())
667         return false;
668     }
669 
670     // Only promote a select if we know that the other select operand
671     // is from another pointer that will also be promoted.
672     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
673       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
674         return false;
675 
676       // May need to rewrite constant operands.
677       WorkList.push_back(ICmp);
678     }
679 
680     if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
681       // Give up if the pointer may be captured.
682       if (PointerMayBeCaptured(UseInst, true, true))
683         return false;
684       // Don't collect the users of this.
685       WorkList.push_back(User);
686       continue;
687     }
688 
689     if (!User->getType()->isPointerTy())
690       continue;
691 
692     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
693       // Be conservative if an address could be computed outside the bounds of
694       // the alloca.
695       if (!GEP->isInBounds())
696         return false;
697     }
698 
699     // Only promote a select if we know that the other select operand is from
700     // another pointer that will also be promoted.
701     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
702       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
703         return false;
704     }
705 
706     // Repeat for phis.
707     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
708       // TODO: Handle more complex cases. We should be able to replace loops
709       // over arrays.
710       switch (Phi->getNumIncomingValues()) {
711       case 1:
712         break;
713       case 2:
714         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
715           return false;
716         break;
717       default:
718         return false;
719       }
720     }
721 
722     WorkList.push_back(User);
723     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
724       return false;
725   }
726 
727   return true;
728 }
729 
730 bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
731 
732   FunctionType *FTy = F.getFunctionType();
733   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
734 
735   // If the function has any arguments in the local address space, then it's
736   // possible these arguments require the entire local memory space, so
737   // we cannot use local memory in the pass.
738   for (Type *ParamTy : FTy->params()) {
739     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
740     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
741       LocalMemLimit = 0;
742       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
743                            "local memory disabled.\n");
744       return false;
745     }
746   }
747 
748   LocalMemLimit = ST.getLocalMemorySize();
749   if (LocalMemLimit == 0)
750     return false;
751 
752   const DataLayout &DL = Mod->getDataLayout();
753 
754   // Check how much local memory is being used by global objects
755   CurrentLocalMemUsage = 0;
756   for (GlobalVariable &GV : Mod->globals()) {
757     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
758       continue;
759 
760     for (const User *U : GV.users()) {
761       const Instruction *Use = dyn_cast<Instruction>(U);
762       if (!Use)
763         continue;
764 
765       if (Use->getParent()->getParent() == &F) {
766         Align Alignment =
767             DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
768 
769         // FIXME: Try to account for padding here. The padding is currently
770         // determined from the inverse order of uses in the function. I'm not
771         // sure if the use list order is in any way connected to this, so the
772         // total reported size is likely incorrect.
773         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
774         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
775         CurrentLocalMemUsage += AllocSize;
776         break;
777       }
778     }
779   }
780 
781   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
782                                                           F);
783 
784   // Restrict local memory usage so that we don't drastically reduce occupancy,
785   // unless it is already significantly reduced.
786 
787   // TODO: Have some sort of hint or other heuristics to guess occupancy based
788   // on other factors..
789   unsigned OccupancyHint = ST.getWavesPerEU(F).second;
790   if (OccupancyHint == 0)
791     OccupancyHint = 7;
792 
793   // Clamp to max value.
794   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
795 
796   // Check the hint but ignore it if it's obviously wrong from the existing LDS
797   // usage.
798   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
799 
800 
801   // Round up to the next tier of usage.
802   unsigned MaxSizeWithWaveCount
803     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
804 
805   // Program is possibly broken by using more local mem than available.
806   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
807     return false;
808 
809   LocalMemLimit = MaxSizeWithWaveCount;
810 
811   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
812                     << " bytes of LDS\n"
813                     << "  Rounding size to " << MaxSizeWithWaveCount
814                     << " with a maximum occupancy of " << MaxOccupancy << '\n'
815                     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
816                     << " available for promotion\n");
817 
818   return true;
819 }
820 
821 // FIXME: Should try to pick the most likely to be profitable allocas first.
822 bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
823   // Array allocations are probably not worth handling, since an allocation of
824   // the array type is the canonical form.
825   if (!I.isStaticAlloca() || I.isArrayAllocation())
826     return false;
827 
828   const DataLayout &DL = Mod->getDataLayout();
829   IRBuilder<> Builder(&I);
830 
831   // First try to replace the alloca with a vector
832   Type *AllocaTy = I.getAllocatedType();
833 
834   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
835 
836   if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
837     return true; // Promoted to vector.
838 
839   if (DisablePromoteAllocaToLDS)
840     return false;
841 
842   const Function &ContainingFunction = *I.getParent()->getParent();
843   CallingConv::ID CC = ContainingFunction.getCallingConv();
844 
845   // Don't promote the alloca to LDS for shader calling conventions as the work
846   // item ID intrinsics are not supported for these calling conventions.
847   // Furthermore not all LDS is available for some of the stages.
848   switch (CC) {
849   case CallingConv::AMDGPU_KERNEL:
850   case CallingConv::SPIR_KERNEL:
851     break;
852   default:
853     LLVM_DEBUG(
854         dbgs()
855         << " promote alloca to LDS not supported with calling convention.\n");
856     return false;
857   }
858 
859   // Not likely to have sufficient local memory for promotion.
860   if (!SufficientLDS)
861     return false;
862 
863   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
864   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
865 
866   Align Alignment =
867       DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
868 
869   // FIXME: This computed padding is likely wrong since it depends on inverse
870   // usage order.
871   //
872   // FIXME: It is also possible that if we're allowed to use all of the memory
873   // could could end up using more than the maximum due to alignment padding.
874 
875   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
876   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
877   NewSize += AllocSize;
878 
879   if (NewSize > LocalMemLimit) {
880     LLVM_DEBUG(dbgs() << "  " << AllocSize
881                       << " bytes of local memory not available to promote\n");
882     return false;
883   }
884 
885   CurrentLocalMemUsage = NewSize;
886 
887   std::vector<Value*> WorkList;
888 
889   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
890     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
891     return false;
892   }
893 
894   LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
895 
896   Function *F = I.getParent()->getParent();
897 
898   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
899   GlobalVariable *GV = new GlobalVariable(
900       *Mod, GVTy, false, GlobalValue::InternalLinkage,
901       UndefValue::get(GVTy),
902       Twine(F->getName()) + Twine('.') + I.getName(),
903       nullptr,
904       GlobalVariable::NotThreadLocal,
905       AMDGPUAS::LOCAL_ADDRESS);
906   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
907   GV->setAlignment(MaybeAlign(I.getAlignment()));
908 
909   Value *TCntY, *TCntZ;
910 
911   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
912   Value *TIdX = getWorkitemID(Builder, 0);
913   Value *TIdY = getWorkitemID(Builder, 1);
914   Value *TIdZ = getWorkitemID(Builder, 2);
915 
916   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
917   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
918   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
919   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
920   TID = Builder.CreateAdd(TID, TIdZ);
921 
922   Value *Indices[] = {
923     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
924     TID
925   };
926 
927   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
928   I.mutateType(Offset->getType());
929   I.replaceAllUsesWith(Offset);
930   I.eraseFromParent();
931 
932   for (Value *V : WorkList) {
933     CallInst *Call = dyn_cast<CallInst>(V);
934     if (!Call) {
935       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
936         Value *Src0 = CI->getOperand(0);
937         Type *EltTy = Src0->getType()->getPointerElementType();
938         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
939 
940         if (isa<ConstantPointerNull>(CI->getOperand(0)))
941           CI->setOperand(0, ConstantPointerNull::get(NewTy));
942 
943         if (isa<ConstantPointerNull>(CI->getOperand(1)))
944           CI->setOperand(1, ConstantPointerNull::get(NewTy));
945 
946         continue;
947       }
948 
949       // The operand's value should be corrected on its own and we don't want to
950       // touch the users.
951       if (isa<AddrSpaceCastInst>(V))
952         continue;
953 
954       Type *EltTy = V->getType()->getPointerElementType();
955       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
956 
957       // FIXME: It doesn't really make sense to try to do this for all
958       // instructions.
959       V->mutateType(NewTy);
960 
961       // Adjust the types of any constant operands.
962       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
963         if (isa<ConstantPointerNull>(SI->getOperand(1)))
964           SI->setOperand(1, ConstantPointerNull::get(NewTy));
965 
966         if (isa<ConstantPointerNull>(SI->getOperand(2)))
967           SI->setOperand(2, ConstantPointerNull::get(NewTy));
968       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
969         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
970           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
971             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
972         }
973       }
974 
975       continue;
976     }
977 
978     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
979     Builder.SetInsertPoint(Intr);
980     switch (Intr->getIntrinsicID()) {
981     case Intrinsic::lifetime_start:
982     case Intrinsic::lifetime_end:
983       // These intrinsics are for address space 0 only
984       Intr->eraseFromParent();
985       continue;
986     case Intrinsic::memcpy: {
987       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
988       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
989                            MemCpy->getRawSource(), MemCpy->getSourceAlign(),
990                            MemCpy->getLength(), MemCpy->isVolatile());
991       Intr->eraseFromParent();
992       continue;
993     }
994     case Intrinsic::memmove: {
995       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
996       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
997                             MemMove->getRawSource(), MemMove->getSourceAlign(),
998                             MemMove->getLength(), MemMove->isVolatile());
999       Intr->eraseFromParent();
1000       continue;
1001     }
1002     case Intrinsic::memset: {
1003       MemSetInst *MemSet = cast<MemSetInst>(Intr);
1004       Builder.CreateMemSet(
1005           MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
1006           MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
1007       Intr->eraseFromParent();
1008       continue;
1009     }
1010     case Intrinsic::invariant_start:
1011     case Intrinsic::invariant_end:
1012     case Intrinsic::launder_invariant_group:
1013     case Intrinsic::strip_invariant_group:
1014       Intr->eraseFromParent();
1015       // FIXME: I think the invariant marker should still theoretically apply,
1016       // but the intrinsics need to be changed to accept pointers with any
1017       // address space.
1018       continue;
1019     case Intrinsic::objectsize: {
1020       Value *Src = Intr->getOperand(0);
1021       Type *SrcTy = Src->getType()->getPointerElementType();
1022       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
1023         Intrinsic::objectsize,
1024         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
1025       );
1026 
1027       CallInst *NewCall = Builder.CreateCall(
1028           ObjectSize,
1029           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1030       Intr->replaceAllUsesWith(NewCall);
1031       Intr->eraseFromParent();
1032       continue;
1033     }
1034     default:
1035       Intr->print(errs());
1036       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1037     }
1038   }
1039   return true;
1040 }
1041 
1042 bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
1043   if (skipFunction(F) || DisablePromoteAllocaToVector)
1044     return false;
1045 
1046   const TargetMachine *TM;
1047   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
1048     TM = &TPC->getTM<TargetMachine>();
1049   else
1050     return false;
1051 
1052   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
1053   if (!ST.isPromoteAllocaEnabled())
1054     return false;
1055 
1056   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1057     const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
1058     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
1059   } else {
1060     MaxVGPRs = 128;
1061   }
1062 
1063   bool Changed = false;
1064   BasicBlock &EntryBB = *F.begin();
1065 
1066   SmallVector<AllocaInst *, 16> Allocas;
1067   for (Instruction &I : EntryBB) {
1068     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
1069       Allocas.push_back(AI);
1070   }
1071 
1072   for (AllocaInst *AI : Allocas) {
1073     if (handleAlloca(*AI))
1074       Changed = true;
1075   }
1076 
1077   return Changed;
1078 }
1079 
1080 bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
1081   // Array allocations are probably not worth handling, since an allocation of
1082   // the array type is the canonical form.
1083   if (!I.isStaticAlloca() || I.isArrayAllocation())
1084     return false;
1085 
1086   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
1087 
1088   Module *Mod = I.getParent()->getParent()->getParent();
1089   return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
1090 }
1091 
1092 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
1093   return new AMDGPUPromoteAlloca();
1094 }
1095 
1096 FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
1097   return new AMDGPUPromoteAllocaToVector();
1098 }
1099