Lines Matching +full:module +full:- +full:instance
1 //===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass eliminates local data store, LDS, uses from non-kernel functions.
16 // kernels this is straightforward - assign an integer to the kernel for the
27 // - memory is limited and exceeding it halts compilation
28 // - a global accessed by one kernel exists independent of other kernels
29 // - a global exists independent of simultaneous execution of the same kernel
30 // - the address of the global may be different from different kernels as they
32 // - if the address is allowed to differ, functions need help to find it
34 // Uses from kernels are implemented here by grouping them in a per-kernel
35 // struct instance. This duplicates the variables, accurately modelling their
48 // LDS variables with constant annotation or non-undef initializer are passed
50 // Non-undef initializers are not yet implemented for LDS.
61 // --------+--------------------+-------------------+-----------------+
62 // Module | No | Yes | Yes |
67 // "Module" spends LDS memory to save cycles. "Table" spends cycles and global
73 // The "module" lowering implemented here finds LDS variables which are used by
74 // non-kernel functions and creates a new struct with a field for each of those
86 // Third, uses from non-kernel functions are replaced with a table lookup using
91 // can be lowered to ConstantExpr address of a struct instance specific to that
98 // number of kernels using the module strategy as that is free for the first
99 // variable. Any further variables that can be lowered with the module strategy
104 // - No heuristics or user controlled magic numbers, hybrid is the right choice
105 // - Kernels that don't use functions (or have had them all inlined) are not
107 // - Kernels that don't make indirect function calls are not affected by those
109 // - Variables which are used by lots of kernels, e.g. those injected by a
111 // - Implementations that instantiate templates per-kernel where those templates
113 // - The runtime properties impose a cost in compiler implementation complexity
128 // A single LDS global variable represents an instance per kernel that can reach
132 // may need amending when implementing non-undef initialisers.
145 // allocate the fixed-address variables immediately upon starting the function
147 // the function to the variables that it allocates. For the module scope lds,
177 //===----------------------------------------------------------------------===//
212 #define DEBUG_TYPE "amdgpu-lower-module-lds"
220 "amdgpu-super-align-lds-globals",
224 enum class LoweringKind { module, table, kernel, hybrid }; enumerator
226 "amdgpu-lower-module-lds-strategy",
231 clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
240 return L->getName() < R->getName(); in sortByName()
249 removeLocalVarsFromUsedLists(Module &M, in removeLocalVarsFromUsedLists()
255 LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts())); in removeLocalVarsFromUsedLists()
261 LocalVar->removeDeadConstantUsers(); in removeLocalVarsFromUsedLists()
265 // The llvm.amdgcn.module.lds instance is implicitly used by all kernels in markUsedByKernel()
268 // in the module. This implicit use is redefined as an explicit use here so in markUsedByKernel()
276 // llvm.donothing that takes a pointer to the instance and is lowered to a in markUsedByKernel()
277 // no-op after LDS is allocated, but that is not presently necessary. in markUsedByKernel()
285 BasicBlock *Entry = &Func->getEntryBlock(); in markUsedByKernel()
286 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); in markUsedByKernel()
289 Func->getParent(), Intrinsic::donothing, {}); in markUsedByKernel()
292 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; in markUsedByKernel()
327 auto *elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32); in getAddressesOfVariablesInKernel()
337 Module &M, ArrayRef<GlobalVariable *> Variables, in buildLookupTable()
362 Ctx, Variables, Replacement->second.LDSVarsToConstantGEP); in buildLookupTable()
374 void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, in replaceUseWithTableLookup()
383 Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction()); in replaceUseWithTableLookup()
386 BasicBlock *BB = Phi->getIncomingBlock(U); in replaceUseWithTableLookup()
387 Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); in replaceUseWithTableLookup()
400 LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); in replaceUseWithTableLookup()
405 Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); in replaceUseWithTableLookup()
411 Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, in replaceUsesInInstructionsWithTableLookup()
421 for (Use &U : make_early_inc_range(GV->uses())) { in replaceUsesInInstructionsWithTableLookup()
433 Module &M, LDSUsesInfoTy &LDSUsesInfo, in kernelsThatIndirectlyAccessAnyOfPassedVariables()
471 // Fewer users makes module scope variable less attractive in chooseBestVariableForModuleStrategy()
479 // Bigger makes module scope variable less attractive in chooseBestVariableForModuleStrategy()
489 return GV->getName() < Other.GV->getName(); in chooseBestVariableForModuleStrategy()
504 DL.getTypeAllocSize(GV->getValueType()).getFixedValue()); in chooseBestVariableForModuleStrategy()
512 static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, in recordLDSAbsoluteAddress()
516 LLVMContext &Ctx = M->getContext(); in recordLDSAbsoluteAddress()
518 M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); in recordLDSAbsoluteAddress()
521 GV->setMetadata(LLVMContext::MD_absolute_symbol, in recordLDSAbsoluteAddress()
526 Value *getTableLookupKernelIndex(Module &M, Function *F) { in getTableLookupKernelIndex()
532 auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); in getTableLookupKernelIndex()
535 It->second = Builder.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}); in getTableLookupKernelIndex()
538 return It->second; in getTableLookupKernelIndex()
542 Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, in assignLDSKernelIDToEachKernel()
553 for (Function &Func : M->functions()) { in assignLDSKernelIDToEachKernel()
570 LLVMContext &Ctx = M->getContext(); in assignLDSKernelIDToEachKernel()
582 OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", in assignLDSKernelIDToEachKernel()
590 Module &M, LDSUsesInfoTy const &LDSUsesInfo, in partitionVariablesIntoIndirectStrategies()
623 case LoweringKind::module: in partitionVariablesIntoIndirectStrategies()
637 "cannot lower LDS '" + GV->getName() + in partitionVariablesIntoIndirectStrategies()
666 Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables, in lowerModuleScopeStructVariables()
669 // Replace all uses of those variables from non-kernel functions with the in lowerModuleScopeStructVariables()
670 // new struct instance. Replace only the uses from kernel functions that will in lowerModuleScopeStructVariables()
671 // allocate this instance. That is a space optimisation - kernels that use a in lowerModuleScopeStructVariables()
672 // subset of the module scope struct and do not need to allocate it for in lowerModuleScopeStructVariables()
674 // of the per-kernel lowering). in lowerModuleScopeStructVariables()
682 createLDSVariableReplacement(M, "llvm.amdgcn.module.lds", in lowerModuleScopeStructVariables()
690 // module.lds will be allocated at zero in any kernel that allocates it in lowerModuleScopeStructVariables()
696 // Replace all uses of module scope variable from non-kernel functions in lowerModuleScopeStructVariables()
703 Function *F = I->getFunction(); in lowerModuleScopeStructVariables()
707 // Replace uses of module scope variable from kernel functions that in lowerModuleScopeStructVariables()
708 // allocate the module scope variable, otherwise leave them unchanged in lowerModuleScopeStructVariables()
709 // Record on each kernel whether the module scope global is used by it in lowerModuleScopeStructVariables()
722 Function *F = I->getFunction(); in lowerModuleScopeStructVariables()
735 Module &M, LDSUsesInfoTy &LDSUsesInfo, in lowerKernelScopeStructVariables()
740 // Create a struct for each kernel for the non-module-scope variables. in lowerKernelScopeStructVariables()
757 // this struct instance can find them from nested functions. in lowerKernelScopeStructVariables()
764 // Variables allocated in module lds must all resolve to that struct, in lowerKernelScopeStructVariables()
765 // not to the per-kernel instance. in lowerKernelScopeStructVariables()
773 // Either used no LDS, or the LDS it used was all in the module struct in lowerKernelScopeStructVariables()
796 // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll in lowerKernelScopeStructVariables()
799 !Accesses->second.empty()) in lowerKernelScopeStructVariables()
810 return I && I->getFunction() == &Func; in lowerKernelScopeStructVariables()
817 buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, in buildRepresentativeDynamicLDSInstance()
850 assert(func->hasName()); // Checked by caller in buildRepresentativeDynamicLDSInstance()
854 …Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr, GlobalValue::NotThreadLocal, AMDGPUA… in buildRepresentativeDynamicLDSInstance()
856 N->setAlignment(MaxDynamicAlignment); in buildRepresentativeDynamicLDSInstance()
863 Module &M, LDSUsesInfoTy &LDSUsesInfo, in lowerDynamicLDSVariables()
880 if (!func->hasName()) { in lowerDynamicLDSVariables()
909 for (Use &U : make_early_inc_range(GV->uses())) { in lowerDynamicLDSVariables()
913 if (isKernelLDS(I->getFunction())) in lowerDynamicLDSVariables()
923 static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, in uniquifyGVPerKernel()
926 for (Use &U : GV->uses()) { in uniquifyGVPerKernel()
928 Function *F = I->getFunction(); in uniquifyGVPerKernel()
939 M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), in uniquifyGVPerKernel()
940 GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, in uniquifyGVPerKernel()
941 GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); in uniquifyGVPerKernel()
942 NewGV->copyAttributesFrom(GV); in uniquifyGVPerKernel()
943 for (Use &U : make_early_inc_range(GV->uses())) { in uniquifyGVPerKernel()
945 Function *F = I->getFunction(); in uniquifyGVPerKernel()
947 U.getUser()->replaceUsesOfWith(GV, NewGV); in uniquifyGVPerKernel()
955 Module &M, LDSUsesInfoTy &LDSUsesInfo, in lowerSpecialLDSVariables()
958 // The 1st round: give module-absolute assignments in lowerSpecialLDSVariables()
965 // give a module-absolute assignment if it is indirectly accessed by in lowerSpecialLDSVariables()
971 // leave it to the 2nd round, which will give a kernel-relative in lowerSpecialLDSVariables()
988 // The 2nd round: give a kernel-relative assignment for GV that in lowerSpecialLDSVariables()
1006 if (GV->isAbsoluteSymbolRef()) { in lowerSpecialLDSVariables()
1040 bool runOnModule(Module &M) { in runOnModule()
1079 // module instance through a call then that kernel needs to allocate the in runOnModule()
1080 // module instance in runOnModule()
1141 // Strip amdgpu-no-lds-kernel-id from all functions reachable from the in runOnModule()
1147 removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); in runOnModule()
1162 // module.lds in runOnModule()
1164 // kernel instance in runOnModule()
1185 Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); in runOnModule()
1189 GlobalVariable *KernelStruct = Replacement->second.SGV; in runOnModule()
1192 Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); in runOnModule()
1212 // using special case metadata, annotate with min-lds == max-lds, i.e. in runOnModule()
1220 Func.addFnAttr("amdgpu-lds-size", Buffer); in runOnModule()
1239 static bool superAlignLDSGlobals(Module &M) { in superAlignLDSGlobals()
1247 if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { in superAlignLDSGlobals()
1287 Module &M, std::string VarName, in createLDSVariableReplacement()
1289 // Create a struct instance containing LDSVarsToTransform and map from those in createLDSVariableReplacement()
1309 DL.getTypeAllocSize(GV->getValueType()), in createLDSVariableReplacement()
1330 uint64_t Padding = DataAlignV - Rem; in createLDSVariableReplacement()
1333 // Note (o + (a - (o % a)) ) % a == 0 in createLDSVariableReplacement()
1355 [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); in createLDSVariableReplacement()
1365 SGV->setAlignment(StructAlign); in createLDSVariableReplacement()
1374 assert(GV->use_empty()); in createLDSVariableReplacement()
1375 GV->eraseFromParent(); in createLDSVariableReplacement()
1386 Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg, in replaceLDSVariablesWithStruct()
1414 // field of the instance that will be allocated by AMDGPUMachineFunction in replaceLDSVariablesWithStruct()
1419 GV->replaceUsesWithIf(GEP, Predicate); in replaceLDSVariablesWithStruct()
1421 APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0); in replaceLDSVariablesWithStruct()
1422 GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff); in replaceLDSVariablesWithStruct()
1426 commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset); in replaceLDSVariablesWithStruct()
1429 NoAliasList[I - 1] = AliasScopes[I - 1]; in replaceLDSVariablesWithStruct()
1447 for (User *U : Ptr->users()) { in refineUsesAlignmentAndAA()
1449 if (AliasScope && I->mayReadOrWriteMemory()) { in refineUsesAlignmentAndAA()
1450 MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope); in refineUsesAlignmentAndAA()
1453 I->setMetadata(LLVMContext::MD_alias_scope, AS); in refineUsesAlignmentAndAA()
1455 MDNode *NA = I->getMetadata(LLVMContext::MD_noalias); in refineUsesAlignmentAndAA()
1484 I->setMetadata(LLVMContext::MD_noalias, NA); in refineUsesAlignmentAndAA()
1489 LI->setAlignment(std::max(A, LI->getAlign())); in refineUsesAlignmentAndAA()
1493 if (SI->getPointerOperand() == Ptr) in refineUsesAlignmentAndAA()
1494 SI->setAlignment(std::max(A, SI->getAlign())); in refineUsesAlignmentAndAA()
1500 if (AI->getPointerOperand() == Ptr) in refineUsesAlignmentAndAA()
1501 AI->setAlignment(std::max(A, AI->getAlign())); in refineUsesAlignmentAndAA()
1505 if (AI->getPointerOperand() == Ptr) in refineUsesAlignmentAndAA()
1506 AI->setAlignment(std::max(A, AI->getAlign())); in refineUsesAlignmentAndAA()
1510 unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); in refineUsesAlignmentAndAA()
1512 if (GEP->getPointerOperand() == Ptr) { in refineUsesAlignmentAndAA()
1514 if (GEP->accumulateConstantOffset(DL, Off)) in refineUsesAlignmentAndAA()
1517 MaxDepth - 1); in refineUsesAlignmentAndAA()
1522 if (I->getOpcode() == Instruction::BitCast || in refineUsesAlignmentAndAA()
1523 I->getOpcode() == Instruction::AddrSpaceCast) in refineUsesAlignmentAndAA()
1524 refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1); in refineUsesAlignmentAndAA()
1543 bool runOnModule(Module &M) override { in runOnModule()
1559 "Lower uses of LDS variables from non-kernel functions",
1563 "Lower uses of LDS variables from non-kernel functions", in INITIALIZE_PASS_DEPENDENCY()
1571 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, in run()