xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
10b57cec5SDimitry Andric //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
9*06c3fb27SDimitry Andric // Eliminates allocas by either converting them into vectors or by migrating
10*06c3fb27SDimitry Andric // them to local address space.
11*06c3fb27SDimitry Andric //
12*06c3fb27SDimitry Andric // Two passes are exposed by this file:
13*06c3fb27SDimitry Andric //    - "promote-alloca-to-vector", which runs early in the pipeline and only
14*06c3fb27SDimitry Andric //      promotes to vector. Promotion to vector is almost always profitable
15*06c3fb27SDimitry Andric //      except when the alloca is too big and the promotion would result in
16*06c3fb27SDimitry Andric //      very high register pressure.
17*06c3fb27SDimitry Andric //    - "promote-alloca", which does both promotion to vector and LDS and runs
18*06c3fb27SDimitry Andric //      much later in the pipeline. This runs after SROA because promoting to
19*06c3fb27SDimitry Andric //      LDS is of course less profitable than getting rid of the alloca or
20*06c3fb27SDimitry Andric //      vectorizing it, thus we only want to do it when the only alternative is
21*06c3fb27SDimitry Andric //      lowering the alloca to stack.
22*06c3fb27SDimitry Andric //
23*06c3fb27SDimitry Andric // Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
250b57cec5SDimitry Andric //
260b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
270b57cec5SDimitry Andric 
280b57cec5SDimitry Andric #include "AMDGPU.h"
29e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
301fd87a68SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
31*06c3fb27SDimitry Andric #include "llvm/ADT/STLExtras.h"
320b57cec5SDimitry Andric #include "llvm/Analysis/CaptureTracking.h"
33*06c3fb27SDimitry Andric #include "llvm/Analysis/InstSimplifyFolder.h"
34*06c3fb27SDimitry Andric #include "llvm/Analysis/InstructionSimplify.h"
350b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
360b57cec5SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
370b57cec5SDimitry Andric #include "llvm/IR/IRBuilder.h"
381fd87a68SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
39480093f4SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
40480093f4SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
41*06c3fb27SDimitry Andric #include "llvm/IR/PatternMatch.h"
420b57cec5SDimitry Andric #include "llvm/Pass.h"
430b57cec5SDimitry Andric #include "llvm/Target/TargetMachine.h"
44*06c3fb27SDimitry Andric #include "llvm/Transforms/Utils/SSAUpdater.h"
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-promote-alloca"
470b57cec5SDimitry Andric 
480b57cec5SDimitry Andric using namespace llvm;
490b57cec5SDimitry Andric 
500b57cec5SDimitry Andric namespace {
510b57cec5SDimitry Andric 
// Debugging knob: turns off the promote-to-vector transformation entirely,
// in both the "promote-alloca" and "promote-alloca-to-vector" passes.
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

// Debugging knob: turns off the promote-to-LDS (local address space)
// transformation. Only the full "promote-alloca" pass is affected, as the
// to-vector-only pass never promotes to LDS.
static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

// Upper bound (in bytes) on the size of an alloca considered for vector
// promotion. The default of 0 means "no explicit override"; see the uses of
// this option for how the limit is derived in that case.
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));
665ffd83dbSDimitry Andric 
// Shared implementation which can do both promotion to vector and to LDS.
// Both the legacy PM passes and the new PM passes delegate to run() on an
// instance of this class.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  // Module of the function currently being processed; set by run().
  Module *Mod = nullptr;
  // DataLayout of Mod; set by run().
  const DataLayout *DL = nullptr;

  // Local (LDS) memory budget used when promoting allocas to LDS.
  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  // VGPR budget that bounds how large a vector promotion may be; computed
  // per-function in run().
  unsigned MaxVGPRs;

  // Target flavor flags, derived from the target triple in the constructor.
  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  // Helpers used by the LDS promotion path to materialize workgroup size and
  // workitem id values.
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
                               std::vector<Value *> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  // The two promotion strategies; each returns true if it rewrote the alloca.
  bool tryPromoteAllocaToVector(AllocaInst &I);
  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.getArch() == Triple::amdgcn;
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  /// Entry point. Processes allocas in the entry block of \p F; when
  /// \p PromoteToLDS is false, only vector promotion is attempted.
  /// Returns true if the function was changed.
  bool run(Function &F, bool PromoteToLDS);
};
113*06c3fb27SDimitry Andric 
114*06c3fb27SDimitry Andric // FIXME: This can create globals so should be a module pass.
115*06c3fb27SDimitry Andric class AMDGPUPromoteAlloca : public FunctionPass {
116*06c3fb27SDimitry Andric public:
117*06c3fb27SDimitry Andric   static char ID;
118*06c3fb27SDimitry Andric 
119*06c3fb27SDimitry Andric   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
120*06c3fb27SDimitry Andric 
121*06c3fb27SDimitry Andric   bool runOnFunction(Function &F) override {
122*06c3fb27SDimitry Andric     if (skipFunction(F))
123*06c3fb27SDimitry Andric       return false;
124*06c3fb27SDimitry Andric     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
125*06c3fb27SDimitry Andric       return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
126*06c3fb27SDimitry Andric           .run(F, /*PromoteToLDS*/ true);
127*06c3fb27SDimitry Andric     return false;
128*06c3fb27SDimitry Andric   }
129*06c3fb27SDimitry Andric 
130*06c3fb27SDimitry Andric   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
131*06c3fb27SDimitry Andric 
132*06c3fb27SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
133*06c3fb27SDimitry Andric     AU.setPreservesCFG();
134*06c3fb27SDimitry Andric     FunctionPass::getAnalysisUsage(AU);
135*06c3fb27SDimitry Andric   }
1360b57cec5SDimitry Andric };
1370b57cec5SDimitry Andric 
1385ffd83dbSDimitry Andric class AMDGPUPromoteAllocaToVector : public FunctionPass {
1395ffd83dbSDimitry Andric public:
1405ffd83dbSDimitry Andric   static char ID;
1415ffd83dbSDimitry Andric 
1425ffd83dbSDimitry Andric   AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
1435ffd83dbSDimitry Andric 
144*06c3fb27SDimitry Andric   bool runOnFunction(Function &F) override {
145*06c3fb27SDimitry Andric     if (skipFunction(F))
146*06c3fb27SDimitry Andric       return false;
147*06c3fb27SDimitry Andric     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
148*06c3fb27SDimitry Andric       return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
149*06c3fb27SDimitry Andric           .run(F, /*PromoteToLDS*/ false);
150*06c3fb27SDimitry Andric     return false;
151*06c3fb27SDimitry Andric   }
1525ffd83dbSDimitry Andric 
1535ffd83dbSDimitry Andric   StringRef getPassName() const override {
1545ffd83dbSDimitry Andric     return "AMDGPU Promote Alloca to vector";
1555ffd83dbSDimitry Andric   }
1565ffd83dbSDimitry Andric 
1575ffd83dbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
1585ffd83dbSDimitry Andric     AU.setPreservesCFG();
1595ffd83dbSDimitry Andric     FunctionPass::getAnalysisUsage(AU);
1605ffd83dbSDimitry Andric   }
1615ffd83dbSDimitry Andric };
1625ffd83dbSDimitry Andric 
163*06c3fb27SDimitry Andric unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
164*06c3fb27SDimitry Andric   if (!TM.getTargetTriple().isAMDGCN())
165*06c3fb27SDimitry Andric     return 128;
166*06c3fb27SDimitry Andric 
167*06c3fb27SDimitry Andric   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
168*06c3fb27SDimitry Andric   unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
169*06c3fb27SDimitry Andric 
170*06c3fb27SDimitry Andric   // A non-entry function has only 32 caller preserved registers.
171*06c3fb27SDimitry Andric   // Do not promote alloca which will force spilling unless we know the function
172*06c3fb27SDimitry Andric   // will be inlined.
173*06c3fb27SDimitry Andric   if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
174*06c3fb27SDimitry Andric       !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
175*06c3fb27SDimitry Andric     MaxVGPRs = std::min(MaxVGPRs, 32u);
176*06c3fb27SDimitry Andric   return MaxVGPRs;
177*06c3fb27SDimitry Andric }
178*06c3fb27SDimitry Andric 
1790b57cec5SDimitry Andric } // end anonymous namespace
1800b57cec5SDimitry Andric 
// Static IDs used by the legacy PM to identify the two passes.
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

// Register the full pass (vector + LDS) with the legacy pass registry.
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

// Register the vector-only pass under "<DEBUG_TYPE>-to-vector".
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

// Externally visible pass IDs declared in AMDGPU.h.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
1970b57cec5SDimitry Andric 
198e8d8bef9SDimitry Andric PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
199e8d8bef9SDimitry Andric                                                FunctionAnalysisManager &AM) {
200*06c3fb27SDimitry Andric   bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
201e8d8bef9SDimitry Andric   if (Changed) {
202e8d8bef9SDimitry Andric     PreservedAnalyses PA;
203e8d8bef9SDimitry Andric     PA.preserveSet<CFGAnalyses>();
204e8d8bef9SDimitry Andric     return PA;
205e8d8bef9SDimitry Andric   }
206e8d8bef9SDimitry Andric   return PreservedAnalyses::all();
207e8d8bef9SDimitry Andric }
208e8d8bef9SDimitry Andric 
209*06c3fb27SDimitry Andric PreservedAnalyses
210*06c3fb27SDimitry Andric AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
211*06c3fb27SDimitry Andric   bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
212*06c3fb27SDimitry Andric   if (Changed) {
213*06c3fb27SDimitry Andric     PreservedAnalyses PA;
214*06c3fb27SDimitry Andric     PA.preserveSet<CFGAnalyses>();
215*06c3fb27SDimitry Andric     return PA;
216*06c3fb27SDimitry Andric   }
217*06c3fb27SDimitry Andric   return PreservedAnalyses::all();
218*06c3fb27SDimitry Andric }
219*06c3fb27SDimitry Andric 
// Factory for the legacy PM full pass (vector + LDS promotion).
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}
223*06c3fb27SDimitry Andric 
// Factory for the legacy PM vector-only pass.
FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
  return new AMDGPUPromoteAllocaToVector();
}
227*06c3fb27SDimitry Andric 
228*06c3fb27SDimitry Andric bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
229e8d8bef9SDimitry Andric   Mod = F.getParent();
230e8d8bef9SDimitry Andric   DL = &Mod->getDataLayout();
231e8d8bef9SDimitry Andric 
232e8d8bef9SDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
2330b57cec5SDimitry Andric   if (!ST.isPromoteAllocaEnabled())
2340b57cec5SDimitry Andric     return false;
2350b57cec5SDimitry Andric 
236*06c3fb27SDimitry Andric   MaxVGPRs = getMaxVGPRs(TM, F);
2375ffd83dbSDimitry Andric 
238*06c3fb27SDimitry Andric   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
2390b57cec5SDimitry Andric 
2400b57cec5SDimitry Andric   SmallVector<AllocaInst *, 16> Allocas;
241*06c3fb27SDimitry Andric   for (Instruction &I : F.getEntryBlock()) {
242*06c3fb27SDimitry Andric     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
243*06c3fb27SDimitry Andric       // Array allocations are probably not worth handling, since an allocation
244*06c3fb27SDimitry Andric       // of the array type is the canonical form.
245*06c3fb27SDimitry Andric       if (!AI->isStaticAlloca() || AI->isArrayAllocation())
246*06c3fb27SDimitry Andric         continue;
2470b57cec5SDimitry Andric       Allocas.push_back(AI);
2480b57cec5SDimitry Andric     }
249*06c3fb27SDimitry Andric   }
2500b57cec5SDimitry Andric 
251*06c3fb27SDimitry Andric   bool Changed = false;
2520b57cec5SDimitry Andric   for (AllocaInst *AI : Allocas) {
253*06c3fb27SDimitry Andric     if (tryPromoteAllocaToVector(*AI))
254*06c3fb27SDimitry Andric       Changed = true;
255*06c3fb27SDimitry Andric     else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
2560b57cec5SDimitry Andric       Changed = true;
2570b57cec5SDimitry Andric   }
2580b57cec5SDimitry Andric 
259*06c3fb27SDimitry Andric   // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
260*06c3fb27SDimitry Andric   // dangling pointers. If we want to reuse it past this point, the loop above
261*06c3fb27SDimitry Andric   // would need to be updated to remove successfully promoted allocas.
262*06c3fb27SDimitry Andric 
2630b57cec5SDimitry Andric   return Changed;
2640b57cec5SDimitry Andric }
2650b57cec5SDimitry Andric 
// Constant element indices of a memtransfer (memcpy/memmove) source and
// destination within a promotable alloca's vector. nullptr means the index
// has not been resolved (yet).
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};
270*06c3fb27SDimitry Andric 
271*06c3fb27SDimitry Andric // Checks if the instruction I is a memset user of the alloca AI that we can
272*06c3fb27SDimitry Andric // deal with. Currently, only non-volatile memsets that affect the whole alloca
273*06c3fb27SDimitry Andric // are handled.
274*06c3fb27SDimitry Andric static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
275*06c3fb27SDimitry Andric                               const DataLayout &DL) {
276*06c3fb27SDimitry Andric   using namespace PatternMatch;
277*06c3fb27SDimitry Andric   // For now we only care about non-volatile memsets that affect the whole type
278*06c3fb27SDimitry Andric   // (start at index 0 and fill the whole alloca).
279*06c3fb27SDimitry Andric   //
280*06c3fb27SDimitry Andric   // TODO: Now that we moved to PromoteAlloca we could handle any memsets
281*06c3fb27SDimitry Andric   // (except maybe volatile ones?) - we just need to use shufflevector if it
282*06c3fb27SDimitry Andric   // only affects a subset of the vector.
283*06c3fb27SDimitry Andric   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
284*06c3fb27SDimitry Andric   return I->getOperand(0) == AI &&
285*06c3fb27SDimitry Andric          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
286*06c3fb27SDimitry Andric }
287*06c3fb27SDimitry Andric 
288*06c3fb27SDimitry Andric static Value *
289*06c3fb27SDimitry Andric calculateVectorIndex(Value *Ptr,
290*06c3fb27SDimitry Andric                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
291*06c3fb27SDimitry Andric   auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
292*06c3fb27SDimitry Andric   if (!GEP)
293*06c3fb27SDimitry Andric     return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
294*06c3fb27SDimitry Andric 
295*06c3fb27SDimitry Andric   auto I = GEPIdx.find(GEP);
296*06c3fb27SDimitry Andric   assert(I != GEPIdx.end() && "Must have entry for GEP!");
297*06c3fb27SDimitry Andric   return I->second;
298*06c3fb27SDimitry Andric }
299*06c3fb27SDimitry Andric 
300*06c3fb27SDimitry Andric static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
301*06c3fb27SDimitry Andric                                Type *VecElemTy, const DataLayout &DL) {
302*06c3fb27SDimitry Andric   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
303*06c3fb27SDimitry Andric   // helper.
304*06c3fb27SDimitry Andric   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
305*06c3fb27SDimitry Andric   MapVector<Value *, APInt> VarOffsets;
306*06c3fb27SDimitry Andric   APInt ConstOffset(BW, 0);
307*06c3fb27SDimitry Andric   if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
308*06c3fb27SDimitry Andric       !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
309*06c3fb27SDimitry Andric     return nullptr;
310*06c3fb27SDimitry Andric 
311*06c3fb27SDimitry Andric   unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
312*06c3fb27SDimitry Andric   if (VarOffsets.size() > 1)
313*06c3fb27SDimitry Andric     return nullptr;
314*06c3fb27SDimitry Andric 
315*06c3fb27SDimitry Andric   if (VarOffsets.size() == 1) {
316*06c3fb27SDimitry Andric     // Only handle cases where we don't need to insert extra arithmetic
317*06c3fb27SDimitry Andric     // instructions.
318*06c3fb27SDimitry Andric     const auto &VarOffset = VarOffsets.front();
319*06c3fb27SDimitry Andric     if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
320*06c3fb27SDimitry Andric       return nullptr;
321*06c3fb27SDimitry Andric     return VarOffset.first;
322*06c3fb27SDimitry Andric   }
323*06c3fb27SDimitry Andric 
324*06c3fb27SDimitry Andric   APInt Quot;
325*06c3fb27SDimitry Andric   uint64_t Rem;
326*06c3fb27SDimitry Andric   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
327*06c3fb27SDimitry Andric   if (Rem != 0)
328*06c3fb27SDimitry Andric     return nullptr;
329*06c3fb27SDimitry Andric 
330*06c3fb27SDimitry Andric   return ConstantInt::get(GEP->getContext(), Quot);
331*06c3fb27SDimitry Andric }
332*06c3fb27SDimitry Andric 
333*06c3fb27SDimitry Andric /// Promotes a single user of the alloca to a vector form.
334*06c3fb27SDimitry Andric ///
335*06c3fb27SDimitry Andric /// \param Inst           Instruction to be promoted.
336*06c3fb27SDimitry Andric /// \param DL             Module Data Layout.
337*06c3fb27SDimitry Andric /// \param VectorTy       Vectorized Type.
338*06c3fb27SDimitry Andric /// \param VecStoreSize   Size of \p VectorTy in bytes.
339*06c3fb27SDimitry Andric /// \param ElementSize    Size of \p VectorTy element type in bytes.
340*06c3fb27SDimitry Andric /// \param TransferInfo   MemTransferInst info map.
341*06c3fb27SDimitry Andric /// \param GEPVectorIdx   GEP -> VectorIdx cache.
342*06c3fb27SDimitry Andric /// \param CurVal         Current value of the vector (e.g. last stored value)
343*06c3fb27SDimitry Andric /// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
344*06c3fb27SDimitry Andric ///              be promoted now. This happens when promoting requires \p
345*06c3fb27SDimitry Andric ///              CurVal, but \p CurVal is nullptr.
346*06c3fb27SDimitry Andric /// \return the stored value if \p Inst would have written to the alloca, or
347*06c3fb27SDimitry Andric ///         nullptr otherwise.
348*06c3fb27SDimitry Andric static Value *promoteAllocaUserToVector(
349*06c3fb27SDimitry Andric     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
350*06c3fb27SDimitry Andric     unsigned VecStoreSize, unsigned ElementSize,
351*06c3fb27SDimitry Andric     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
352*06c3fb27SDimitry Andric     std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
353*06c3fb27SDimitry Andric     SmallVectorImpl<LoadInst *> &DeferredLoads) {
354*06c3fb27SDimitry Andric   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
355*06c3fb27SDimitry Andric   // to do more folding, especially in the case of vector splats.
356*06c3fb27SDimitry Andric   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
357*06c3fb27SDimitry Andric                                         InstSimplifyFolder(DL));
358*06c3fb27SDimitry Andric   Builder.SetInsertPoint(Inst);
359*06c3fb27SDimitry Andric 
360*06c3fb27SDimitry Andric   const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
361*06c3fb27SDimitry Andric     if (CurVal)
362*06c3fb27SDimitry Andric       return CurVal;
363*06c3fb27SDimitry Andric 
364*06c3fb27SDimitry Andric     // If the current value is not known, insert a dummy load and lower it on
365*06c3fb27SDimitry Andric     // the second pass.
366*06c3fb27SDimitry Andric     LoadInst *Dummy =
367*06c3fb27SDimitry Andric         Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
368*06c3fb27SDimitry Andric                            "promotealloca.dummyload");
369*06c3fb27SDimitry Andric     DeferredLoads.push_back(Dummy);
370*06c3fb27SDimitry Andric     return Dummy;
371*06c3fb27SDimitry Andric   };
372*06c3fb27SDimitry Andric 
373*06c3fb27SDimitry Andric   const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
374*06c3fb27SDimitry Andric                                                    Type *PtrTy) -> Value * {
375*06c3fb27SDimitry Andric     assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
376*06c3fb27SDimitry Andric     const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
377*06c3fb27SDimitry Andric     if (!PtrTy->isVectorTy())
378*06c3fb27SDimitry Andric       return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
379*06c3fb27SDimitry Andric     const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
380*06c3fb27SDimitry Andric     // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
381*06c3fb27SDimitry Andric     // first cast the ptr vector to <2 x i64>.
382*06c3fb27SDimitry Andric     assert((Size % NumPtrElts == 0) && "Vector size not divisble");
383*06c3fb27SDimitry Andric     Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
384*06c3fb27SDimitry Andric     return Builder.CreateBitOrPointerCast(
385*06c3fb27SDimitry Andric         Val, FixedVectorType::get(EltTy, NumPtrElts));
386*06c3fb27SDimitry Andric   };
387*06c3fb27SDimitry Andric 
388*06c3fb27SDimitry Andric   Type *VecEltTy = VectorTy->getElementType();
389*06c3fb27SDimitry Andric   switch (Inst->getOpcode()) {
390*06c3fb27SDimitry Andric   case Instruction::Load: {
391*06c3fb27SDimitry Andric     // Loads can only be lowered if the value is known.
392*06c3fb27SDimitry Andric     if (!CurVal) {
393*06c3fb27SDimitry Andric       DeferredLoads.push_back(cast<LoadInst>(Inst));
394*06c3fb27SDimitry Andric       return nullptr;
395*06c3fb27SDimitry Andric     }
396*06c3fb27SDimitry Andric 
397*06c3fb27SDimitry Andric     Value *Index = calculateVectorIndex(
398*06c3fb27SDimitry Andric         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
399*06c3fb27SDimitry Andric 
400*06c3fb27SDimitry Andric     // We're loading the full vector.
401*06c3fb27SDimitry Andric     Type *AccessTy = Inst->getType();
402*06c3fb27SDimitry Andric     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
403*06c3fb27SDimitry Andric     if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
404*06c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
405*06c3fb27SDimitry Andric         CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
406*06c3fb27SDimitry Andric       else if (CurVal->getType()->isPtrOrPtrVectorTy())
407*06c3fb27SDimitry Andric         CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
408*06c3fb27SDimitry Andric       Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
409*06c3fb27SDimitry Andric       Inst->replaceAllUsesWith(NewVal);
410*06c3fb27SDimitry Andric       return nullptr;
411*06c3fb27SDimitry Andric     }
412*06c3fb27SDimitry Andric 
413*06c3fb27SDimitry Andric     // Loading a subvector.
414*06c3fb27SDimitry Andric     if (isa<FixedVectorType>(AccessTy)) {
415*06c3fb27SDimitry Andric       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
416*06c3fb27SDimitry Andric       const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
417*06c3fb27SDimitry Andric       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
418*06c3fb27SDimitry Andric       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
419*06c3fb27SDimitry Andric 
420*06c3fb27SDimitry Andric       unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
421*06c3fb27SDimitry Andric       Value *SubVec = PoisonValue::get(SubVecTy);
422*06c3fb27SDimitry Andric       for (unsigned K = 0; K < NumElts; ++K) {
423*06c3fb27SDimitry Andric         SubVec = Builder.CreateInsertElement(
424*06c3fb27SDimitry Andric             SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K);
425*06c3fb27SDimitry Andric       }
426*06c3fb27SDimitry Andric 
427*06c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
428*06c3fb27SDimitry Andric         SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
429*06c3fb27SDimitry Andric       else if (SubVecTy->isPtrOrPtrVectorTy())
430*06c3fb27SDimitry Andric         SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
431*06c3fb27SDimitry Andric 
432*06c3fb27SDimitry Andric       SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
433*06c3fb27SDimitry Andric       Inst->replaceAllUsesWith(SubVec);
434*06c3fb27SDimitry Andric       return nullptr;
435*06c3fb27SDimitry Andric     }
436*06c3fb27SDimitry Andric 
437*06c3fb27SDimitry Andric     // We're loading one element.
438*06c3fb27SDimitry Andric     Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
439*06c3fb27SDimitry Andric     if (AccessTy != VecEltTy)
440*06c3fb27SDimitry Andric       ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
441*06c3fb27SDimitry Andric 
442*06c3fb27SDimitry Andric     Inst->replaceAllUsesWith(ExtractElement);
443*06c3fb27SDimitry Andric     return nullptr;
444*06c3fb27SDimitry Andric   }
445*06c3fb27SDimitry Andric   case Instruction::Store: {
446*06c3fb27SDimitry Andric     // For stores, it's a bit trickier and it depends on whether we're storing
447*06c3fb27SDimitry Andric     // the full vector or not. If we're storing the full vector, we don't need
448*06c3fb27SDimitry Andric     // to know the current value. If this is a store of a single element, we
449*06c3fb27SDimitry Andric     // need to know the value.
450*06c3fb27SDimitry Andric     StoreInst *SI = cast<StoreInst>(Inst);
451*06c3fb27SDimitry Andric     Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
452*06c3fb27SDimitry Andric     Value *Val = SI->getValueOperand();
453*06c3fb27SDimitry Andric 
454*06c3fb27SDimitry Andric     // We're storing the full vector, we can handle this without knowing CurVal.
455*06c3fb27SDimitry Andric     Type *AccessTy = Val->getType();
456*06c3fb27SDimitry Andric     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
457*06c3fb27SDimitry Andric     if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
458*06c3fb27SDimitry Andric       if (AccessTy->isPtrOrPtrVectorTy())
459*06c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, AccessTy);
460*06c3fb27SDimitry Andric       else if (VectorTy->isPtrOrPtrVectorTy())
461*06c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, VectorTy);
462*06c3fb27SDimitry Andric       return Builder.CreateBitOrPointerCast(Val, VectorTy);
463*06c3fb27SDimitry Andric     }
464*06c3fb27SDimitry Andric 
465*06c3fb27SDimitry Andric     // Storing a subvector.
466*06c3fb27SDimitry Andric     if (isa<FixedVectorType>(AccessTy)) {
467*06c3fb27SDimitry Andric       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
468*06c3fb27SDimitry Andric       const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
469*06c3fb27SDimitry Andric       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
470*06c3fb27SDimitry Andric       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
471*06c3fb27SDimitry Andric 
472*06c3fb27SDimitry Andric       if (SubVecTy->isPtrOrPtrVectorTy())
473*06c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, SubVecTy);
474*06c3fb27SDimitry Andric       else if (AccessTy->isPtrOrPtrVectorTy())
475*06c3fb27SDimitry Andric         Val = CreateTempPtrIntCast(Val, AccessTy);
476*06c3fb27SDimitry Andric 
477*06c3fb27SDimitry Andric       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
478*06c3fb27SDimitry Andric 
479*06c3fb27SDimitry Andric       unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
480*06c3fb27SDimitry Andric       Value *CurVec = GetOrLoadCurrentVectorValue();
481*06c3fb27SDimitry Andric       for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) {
482*06c3fb27SDimitry Andric         CurVec = Builder.CreateInsertElement(
483*06c3fb27SDimitry Andric             CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K);
484*06c3fb27SDimitry Andric       }
485*06c3fb27SDimitry Andric       return CurVec;
486*06c3fb27SDimitry Andric     }
487*06c3fb27SDimitry Andric 
488*06c3fb27SDimitry Andric     if (Val->getType() != VecEltTy)
489*06c3fb27SDimitry Andric       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
490*06c3fb27SDimitry Andric     return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
491*06c3fb27SDimitry Andric                                        Index);
492*06c3fb27SDimitry Andric   }
493*06c3fb27SDimitry Andric   case Instruction::Call: {
494*06c3fb27SDimitry Andric     if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
495*06c3fb27SDimitry Andric       // For memcpy, we need to know curval.
496*06c3fb27SDimitry Andric       ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
497*06c3fb27SDimitry Andric       unsigned NumCopied = Length->getZExtValue() / ElementSize;
498*06c3fb27SDimitry Andric       MemTransferInfo *TI = &TransferInfo[MTI];
499*06c3fb27SDimitry Andric       unsigned SrcBegin = TI->SrcIndex->getZExtValue();
500*06c3fb27SDimitry Andric       unsigned DestBegin = TI->DestIndex->getZExtValue();
501*06c3fb27SDimitry Andric 
502*06c3fb27SDimitry Andric       SmallVector<int> Mask;
503*06c3fb27SDimitry Andric       for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
504*06c3fb27SDimitry Andric         if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
505*06c3fb27SDimitry Andric           Mask.push_back(SrcBegin++);
506*06c3fb27SDimitry Andric         } else {
507*06c3fb27SDimitry Andric           Mask.push_back(Idx);
508*06c3fb27SDimitry Andric         }
509*06c3fb27SDimitry Andric       }
510*06c3fb27SDimitry Andric 
511*06c3fb27SDimitry Andric       return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
512*06c3fb27SDimitry Andric     }
513*06c3fb27SDimitry Andric 
514*06c3fb27SDimitry Andric     if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
515*06c3fb27SDimitry Andric       // For memset, we don't need to know the previous value because we
516*06c3fb27SDimitry Andric       // currently only allow memsets that cover the whole alloca.
517*06c3fb27SDimitry Andric       Value *Elt = MSI->getOperand(1);
518*06c3fb27SDimitry Andric       if (DL.getTypeStoreSize(VecEltTy) > 1) {
519*06c3fb27SDimitry Andric         Value *EltBytes =
520*06c3fb27SDimitry Andric             Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
521*06c3fb27SDimitry Andric         Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
522*06c3fb27SDimitry Andric       }
523*06c3fb27SDimitry Andric 
524*06c3fb27SDimitry Andric       return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
525*06c3fb27SDimitry Andric     }
526*06c3fb27SDimitry Andric 
527*06c3fb27SDimitry Andric     llvm_unreachable("Unsupported call when promoting alloca to vector");
528*06c3fb27SDimitry Andric   }
529*06c3fb27SDimitry Andric 
530*06c3fb27SDimitry Andric   default:
531*06c3fb27SDimitry Andric     llvm_unreachable("Inconsistency in instructions promotable to vector");
532*06c3fb27SDimitry Andric   }
533*06c3fb27SDimitry Andric 
534*06c3fb27SDimitry Andric   llvm_unreachable("Did not return after promoting instruction!");
535*06c3fb27SDimitry Andric }
536*06c3fb27SDimitry Andric 
537*06c3fb27SDimitry Andric static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
538*06c3fb27SDimitry Andric                                   const DataLayout &DL) {
539*06c3fb27SDimitry Andric   // Access as a vector type can work if the size of the access vector is a
540*06c3fb27SDimitry Andric   // multiple of the size of the alloca's vector element type.
541*06c3fb27SDimitry Andric   //
542*06c3fb27SDimitry Andric   // Examples:
543*06c3fb27SDimitry Andric   //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
544*06c3fb27SDimitry Andric   //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
545*06c3fb27SDimitry Andric   //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
546*06c3fb27SDimitry Andric   //        - 3*32 is not a multiple of 64
547*06c3fb27SDimitry Andric   //
548*06c3fb27SDimitry Andric   // We could handle more complicated cases, but it'd make things a lot more
549*06c3fb27SDimitry Andric   // complicated.
550*06c3fb27SDimitry Andric   if (isa<FixedVectorType>(AccessTy)) {
551*06c3fb27SDimitry Andric     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
552*06c3fb27SDimitry Andric     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
553*06c3fb27SDimitry Andric     return AccTS.isKnownMultipleOf(VecTS);
554*06c3fb27SDimitry Andric   }
555*06c3fb27SDimitry Andric 
556*06c3fb27SDimitry Andric   return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
557*06c3fb27SDimitry Andric                                               DL);
558*06c3fb27SDimitry Andric }
559*06c3fb27SDimitry Andric 
560*06c3fb27SDimitry Andric /// Iterates over an instruction worklist that may contain multiple instructions
561*06c3fb27SDimitry Andric /// from the same basic block, but in a different order.
562*06c3fb27SDimitry Andric template <typename InstContainer>
563*06c3fb27SDimitry Andric static void forEachWorkListItem(const InstContainer &WorkList,
564*06c3fb27SDimitry Andric                                 std::function<void(Instruction *)> Fn) {
565*06c3fb27SDimitry Andric   // Bucket up uses of the alloca by the block they occur in.
566*06c3fb27SDimitry Andric   // This is important because we have to handle multiple defs/uses in a block
567*06c3fb27SDimitry Andric   // ourselves: SSAUpdater is purely for cross-block references.
568*06c3fb27SDimitry Andric   DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
569*06c3fb27SDimitry Andric   for (Instruction *User : WorkList)
570*06c3fb27SDimitry Andric     UsesByBlock[User->getParent()].insert(User);
571*06c3fb27SDimitry Andric 
572*06c3fb27SDimitry Andric   for (Instruction *User : WorkList) {
573*06c3fb27SDimitry Andric     BasicBlock *BB = User->getParent();
574*06c3fb27SDimitry Andric     auto &BlockUses = UsesByBlock[BB];
575*06c3fb27SDimitry Andric 
576*06c3fb27SDimitry Andric     // Already processed, skip.
577*06c3fb27SDimitry Andric     if (BlockUses.empty())
578*06c3fb27SDimitry Andric       continue;
579*06c3fb27SDimitry Andric 
580*06c3fb27SDimitry Andric     // Only user in the block, directly process it.
581*06c3fb27SDimitry Andric     if (BlockUses.size() == 1) {
582*06c3fb27SDimitry Andric       Fn(User);
583*06c3fb27SDimitry Andric       continue;
584*06c3fb27SDimitry Andric     }
585*06c3fb27SDimitry Andric 
586*06c3fb27SDimitry Andric     // Multiple users in the block, do a linear scan to see users in order.
587*06c3fb27SDimitry Andric     for (Instruction &Inst : *BB) {
588*06c3fb27SDimitry Andric       if (!BlockUses.contains(&Inst))
589*06c3fb27SDimitry Andric         continue;
590*06c3fb27SDimitry Andric 
591*06c3fb27SDimitry Andric       Fn(&Inst);
592*06c3fb27SDimitry Andric     }
593*06c3fb27SDimitry Andric 
594*06c3fb27SDimitry Andric     // Clear the block so we know it's been processed.
595*06c3fb27SDimitry Andric     BlockUses.clear();
596*06c3fb27SDimitry Andric   }
597*06c3fb27SDimitry Andric }
598*06c3fb27SDimitry Andric 
// FIXME: Should try to pick the most likely to be profitable allocas first.
/// Tries to rewrite \p Alloca (a fixed-vector alloca, or an array alloca
/// representable as one) into SSA vector values, replacing all of its users.
/// On success the alloca and its users are erased and true is returned; on
/// failure the IR is left untouched and false is returned.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
    return false;
  }

  // Map a one-dimensional array alloca onto an equivalent vector type when
  // its element type is a valid vector element type.
  Type *AllocaTy = Alloca.getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
                                      ArrayTy->getNumElements());
  }

  // Use up to 1/4 of available register budget for vectorization.
  // NOTE(review): Limit appears to be in bits — the option value is scaled by
  // 8 (presumably bytes -> bits) and MaxVGPRs by 32 (bits per VGPR); confirm
  // against the option's definition.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);

  if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with " << MaxVGPRs
                      << " registers available\n");
    return false;
  }

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted but we don't currently
  // handle this case
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return false;
  }

  // GEP -> the vector element index it resolves to.
  std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
  // Users to rewrite against the vector value.
  SmallVector<Instruction *> WorkList;
  // Pass-through users (bitcasts, GEPs, assumes) deleted after rewriting.
  SmallVector<Instruction *> UsersToRemove;
  // Mem transfers whose src/dest indices must be re-checked after all uses
  // have been visited.
  SmallVector<Instruction *> DeferredInsts;
  SmallVector<Use *, 8> Uses;
  DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

  // Helper to emit a uniform debug message and bail out of the promotion.
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    return false;
  };

  for (Use &U : Alloca.uses())
    Uses.push_back(&U);

  LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
  // Walk all (transitive, through bitcasts/GEPs) uses of the alloca and
  // classify each user. Any user we cannot handle aborts the promotion.
  while (!Uses.empty()) {
    Use *U = Uses.pop_back_val();
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      // This is a store of the pointer, not to the pointer.
      if (isa<StoreInst>(Inst) &&
          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return RejectUser(Inst, "pointer is being stored");

      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
                                DL->getTypeStoreSize(AccessTy)) {
        WorkList.push_back(Inst);
        continue;
      }

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");
      if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      WorkList.push_back(Inst);
      continue;
    }

    if (isa<BitCastInst>(Inst)) {
      // Look through bitcasts.
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      GEPVectorIdx[GEP] = Index;
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
        MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
      WorkList.push_back(Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      // A mem transfer can be visited once per pointer operand; only queue it
      // the first time it is seen.
      if (!TransferInfo.count(TransferInst)) {
        DeferredInsts.push_back(Inst);
        WorkList.push_back(Inst);
        TransferInfo[TransferInst] = MemTransferInfo();
      }

      // Resolves a src/dest pointer of this transfer to a constant vector
      // index, or nullptr if it is not the alloca / a known GEP of it.
      auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
        GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
        if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
          return nullptr;

        return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
      };

      unsigned OpNum = U->getOperandNo();
      MemTransferInfo *TI = &TransferInfo[TransferInst];
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getPointerIndexOfAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getPointerIndexOfAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst)) {
      UsersToRemove.push_back(Inst);
      continue;
    }

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // A mem transfer is only promotable once BOTH its src and dest indices are
  // known constants; verify that now that every use has been seen.
  while (!DeferredInsts.empty()) {
    Instruction *Inst = DeferredInsts.pop_back_val();
    MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
    // TODO: Support the case if the pointers are from different alloca or
    // from different address spaces.
    MemTransferInfo &Info = TransferInfo[TransferInst];
    if (!Info.SrcIndex || !Info.DestIndex)
      return RejectUser(
          Inst, "mem transfer inst is missing constant src and/or dst index");
  }

  // Past this point promotion is committed; no more bail-outs.
  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(VectorTy, "promotealloca");
  Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));

  // First handle the initial worklist.
  SmallVector<LoadInst *, 4> DeferredLoads;
  forEachWorkListItem(WorkList, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    // On the first pass, we only take values that are trivially known, i.e.
    // where AddAvailableValue was already called in this block.
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.FindValueForBlock(BB), DeferredLoads);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
  });

  // Then handle deferred loads.
  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
    SmallVector<LoadInst *, 0> NewDLs;
    BasicBlock *BB = I->getParent();
    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
    // get a value, inserting PHIs as needed.
    Value *Result = promoteAllocaUserToVector(
        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
    assert(NewDLs.empty() && "No more deferred loads should be queued!");
  });

  // Delete all instructions. On the first pass, new dummy loads may have been
  // added so we need to collect them too.
  DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
  InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end());
  for (Instruction *I : InstsToDelete) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removeable.
  for (Instruction *I : reverse(UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(Alloca.use_empty());
  Alloca.eraseFromParent();
  return true;
}
854*06c3fb27SDimitry Andric 
/// Materializes the workgroup local size in the Y and Z dimensions at the
/// builder's current insertion point and returns them as a (Y, Z) pair of
/// i32 values.
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // Non-HSA targets expose the local size directly through intrinsics.
  if (!IsAMDHSA) {
    Function *LocalSizeYFn =
        Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn =
        Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    // Annotate the calls with the known local-ID/size value range.
    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x ;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x ;
  //     uint32_t grid_size_y ;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn =
      Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
  // The intrinsic is now used, so the "no dispatch ptr" attribute no longer
  // holds for this function.
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
      DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  // Per the struct above: i32 word 1 (bytes 4-7) packs workgroup_size_x (low
  // half) and workgroup_size_y (high half); i32 word 2 (bytes 8-11) packs
  // workgroup_size_z (low half) with reserved0 above it.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  // The dispatch packet does not change during kernel execution.
  MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::pair(Y, LoadZU);
}
9430b57cec5SDimitry Andric 
944e8d8bef9SDimitry Andric Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
945e8d8bef9SDimitry Andric                                               unsigned N) {
946349cc55cSDimitry Andric   Function *F = Builder.GetInsertBlock()->getParent();
947349cc55cSDimitry Andric   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
948480093f4SDimitry Andric   Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
949349cc55cSDimitry Andric   StringRef AttrName;
9500b57cec5SDimitry Andric 
9510b57cec5SDimitry Andric   switch (N) {
9520b57cec5SDimitry Andric   case 0:
953480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
954480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
955349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-x";
9560b57cec5SDimitry Andric     break;
9570b57cec5SDimitry Andric   case 1:
958480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
959480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
960349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-y";
9610b57cec5SDimitry Andric     break;
9620b57cec5SDimitry Andric 
9630b57cec5SDimitry Andric   case 2:
964480093f4SDimitry Andric     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
965480093f4SDimitry Andric                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
966349cc55cSDimitry Andric     AttrName = "amdgpu-no-workitem-id-z";
9670b57cec5SDimitry Andric     break;
9680b57cec5SDimitry Andric   default:
9690b57cec5SDimitry Andric     llvm_unreachable("invalid dimension");
9700b57cec5SDimitry Andric   }
9710b57cec5SDimitry Andric 
9720b57cec5SDimitry Andric   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
9730b57cec5SDimitry Andric   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
9740b57cec5SDimitry Andric   ST.makeLIDRangeMetadata(CI);
975349cc55cSDimitry Andric   F->removeFnAttr(AttrName);
9760b57cec5SDimitry Andric 
9770b57cec5SDimitry Andric   return CI;
9780b57cec5SDimitry Andric }
9790b57cec5SDimitry Andric 
9800b57cec5SDimitry Andric static bool isCallPromotable(CallInst *CI) {
9810b57cec5SDimitry Andric   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
9820b57cec5SDimitry Andric   if (!II)
9830b57cec5SDimitry Andric     return false;
9840b57cec5SDimitry Andric 
9850b57cec5SDimitry Andric   switch (II->getIntrinsicID()) {
9860b57cec5SDimitry Andric   case Intrinsic::memcpy:
9870b57cec5SDimitry Andric   case Intrinsic::memmove:
9880b57cec5SDimitry Andric   case Intrinsic::memset:
9890b57cec5SDimitry Andric   case Intrinsic::lifetime_start:
9900b57cec5SDimitry Andric   case Intrinsic::lifetime_end:
9910b57cec5SDimitry Andric   case Intrinsic::invariant_start:
9920b57cec5SDimitry Andric   case Intrinsic::invariant_end:
9930b57cec5SDimitry Andric   case Intrinsic::launder_invariant_group:
9940b57cec5SDimitry Andric   case Intrinsic::strip_invariant_group:
9950b57cec5SDimitry Andric   case Intrinsic::objectsize:
9960b57cec5SDimitry Andric     return true;
9970b57cec5SDimitry Andric   default:
9980b57cec5SDimitry Andric     return false;
9990b57cec5SDimitry Andric   }
10000b57cec5SDimitry Andric }
10010b57cec5SDimitry Andric 
1002e8d8bef9SDimitry Andric bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1003e8d8bef9SDimitry Andric     Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
10040b57cec5SDimitry Andric     int OpIdx1) const {
10050b57cec5SDimitry Andric   // Figure out which operand is the one we might not be promoting.
10060b57cec5SDimitry Andric   Value *OtherOp = Inst->getOperand(OpIdx0);
10070b57cec5SDimitry Andric   if (Val == OtherOp)
10080b57cec5SDimitry Andric     OtherOp = Inst->getOperand(OpIdx1);
10090b57cec5SDimitry Andric 
10100b57cec5SDimitry Andric   if (isa<ConstantPointerNull>(OtherOp))
10110b57cec5SDimitry Andric     return true;
10120b57cec5SDimitry Andric 
1013e8d8bef9SDimitry Andric   Value *OtherObj = getUnderlyingObject(OtherOp);
10140b57cec5SDimitry Andric   if (!isa<AllocaInst>(OtherObj))
10150b57cec5SDimitry Andric     return false;
10160b57cec5SDimitry Andric 
10170b57cec5SDimitry Andric   // TODO: We should be able to replace undefs with the right pointer type.
10180b57cec5SDimitry Andric 
10190b57cec5SDimitry Andric   // TODO: If we know the other base object is another promotable
10200b57cec5SDimitry Andric   // alloca, not necessarily this alloca, we can do this. The
10210b57cec5SDimitry Andric   // important part is both must have the same address space at
10220b57cec5SDimitry Andric   // the end.
10230b57cec5SDimitry Andric   if (OtherObj != BaseAlloca) {
10240b57cec5SDimitry Andric     LLVM_DEBUG(
10250b57cec5SDimitry Andric         dbgs() << "Found a binary instruction with another alloca object\n");
10260b57cec5SDimitry Andric     return false;
10270b57cec5SDimitry Andric   }
10280b57cec5SDimitry Andric 
10290b57cec5SDimitry Andric   return true;
10300b57cec5SDimitry Andric }
10310b57cec5SDimitry Andric 
// Collects (into \p WorkList) every transitive user of \p Val that must be
// rewritten when the base alloca is promoted to the LDS address space.
// Returns false as soon as any use is found that we do not know how to
// rewrite — volatile accesses, ptrtoint, potentially-captured addrspacecasts,
// non-inbounds GEPs, vector/aggregate inserts, or icmp/select/phi operands
// derived from a different object — in which case promotion must be
// abandoned. Note the ordering of checks below is significant: some users
// are queued and skipped (continue) before the later generic checks run.
bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    // Skip users that an earlier recursion level already queued.
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      // Only the whitelisted intrinsics (mem*, lifetime, invariant-group,
      // objectsize) are rewritable for the new address space.
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    // ptrtoint escapes the address as an integer, which would still encode
    // the old private-address-space pointer after promotion.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      // The loaded value is not derived from the alloca's address, so there
      // is nothing further to collect from this user.
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    // Do not promote vector/aggregate type instructions. It is hard to track
    // their users.
    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
      return false;

    // Non-pointer results (e.g. the i1 produced by icmp above) cannot carry
    // the alloca's address onward; stop here.
    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    // This user yields a pointer derived from the alloca: queue it for
    // rewriting and recurse into its own users.
    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
11370b57cec5SDimitry Andric 
// Decides whether \p F plausibly has spare LDS (local memory) available for
// alloca promotion. Side effects: sets LocalMemLimit to the usable LDS budget
// (0 disables promotion entirely) and CurrentLocalMemUsage to an estimate of
// LDS already consumed by global variables reachable from F. Returns true if
// promotion to LDS should be attempted at all.
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  // Worklist/visited set for walking ConstantExpr users; shared across all
  // globals and cleared whenever a use in F is found.
  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Returns true if any user of Val is an instruction inside F; otherwise
  // pushes unvisited constant users onto Stack for later inspection.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  // Collect all LDS globals that F (transitively, through constant
  // expressions) actually references.
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory.  In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more optimal
  // value instead
  llvm::sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors..
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
12790b57cec5SDimitry Andric 
// FIXME: Should try to pick the most likely to be profitable allocas first.
// Attempts to replace alloca \p I with a per-workitem slice of a new LDS
// (local address space) global sized WorkGroupSize * sizeof(alloca type).
// On success every collected use is rewritten for the new address space and
// the alloca is erased; returns true. Returns false (leaving the function
// unchanged) if promotion is disabled, the calling convention is not a
// kernel, \p SufficientLDS is false, the LDS budget would be exceeded, or
// some use cannot be rewritten.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
                                                    bool SufficientLDS) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');

  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return false;
  }

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  // Each workitem needs its own copy, so size by the maximum flat workgroup
  // size.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  // Commit the LDS budget before rewriting uses.
  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

  // Gather every use that must be rewritten; bail if any is unsupported.
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  // One array element per workitem in the workgroup.
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

  Value *TCntY, *TCntZ;

  // Compute the linearized workitem id:
  //   TID = (TIdX * TCntY * TCntZ) + (TIdY * TCntZ) + TIdZ
  // used to index each workitem's private slice of the array.
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

  // Replace the alloca itself with &GV[TID]. mutateType keeps existing users
  // type-consistent until they are individually rewritten below.
  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  // memcpy/memmove are processed last since both of their pointer operands
  // may need rewriting first.
  SmallVector<IntrinsicInst *> DeferredIntrs;

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

        // Rewrite null-pointer comparison operands to null in the local
        // address space; the icmp result type itself is unchanged.
        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    // Calls in the worklist are guaranteed to be promotable intrinsics
    // (isCallPromotable filtered them during collection).
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case if second pointer also needs
      // to be replaced we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      // Re-emit with the (now local-address-space) destination pointer.
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      // Re-declare objectsize with a local-address-space pointer argument.
      Function *ObjectSize = Intrinsic::getDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(),
           PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  // Now that every pointer value has its final type, re-emit the deferred
  // memcpy/memmove calls, preserving dereferenceable attributes on both
  // pointer parameters.
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
1497