//===- AMDGPUSplitModule.cpp ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file Implements a module splitting algorithm designed to support the /// FullLTO --lto-partitions option for parallel codegen. This is completely /// different from the common SplitModule pass, as this system is designed with /// AMDGPU in mind. /// /// The basic idea of this module splitting implementation is the same as /// SplitModule: load-balance the module's functions across a set of N /// partitions to allow parallel codegen. However, it does it very /// differently than the target-agnostic variant: /// - The module has "split roots", which are kernels in the vast // majority of cases. /// - Each root has a set of dependencies, and when a root and its /// dependencies is considered "big", we try to put it in a partition where /// most dependencies are already imported, to avoid duplicating large /// amounts of code. /// - There's special care for indirect calls in order to ensure /// AMDGPUResourceUsageAnalysis can work correctly. /// /// This file also includes a more elaborate logging system to enable /// users to easily generate logs that (if desired) do not include any value /// names, in order to not leak information about the source file. /// Such logs are very helpful to understand and fix potential issues with /// module splitting. #include "AMDGPUSplitModule.h" #include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/SHA256.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "amdgpu-split-module" namespace { static cl::opt LargeFnFactor( "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f), cl::Hidden, cl::desc( "consider a function as large and needing special treatment when the " "cost of importing it into a partition" "exceeds the average cost of a partition by this factor; e;g. 2.0 " "means if the function and its dependencies is 2 times bigger than " "an average partition; 0 disables large functions handling entirely")); static cl::opt LargeFnOverlapForMerge( "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f), cl::Hidden, cl::desc( "defines how much overlap between two large function's dependencies " "is needed to put them in the same partition")); static cl::opt NoExternalizeGlobals( "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, cl::desc("disables externalization of global variable with local linkage; " "may cause globals to be duplicated which increases binary size")); static cl::opt LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden, cl::desc("output directory for AMDGPU module splitting logs")); static cl::opt LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden, cl::desc("hash value names before printing them in the AMDGPU " "module splitting logs")); using CostType = InstructionCost::CostType; using PartitionID = unsigned; using GetTTIFn = function_ref; static bool isEntryPoint(const Function *F) { return AMDGPU::isEntryFunctionCC(F->getCallingConv()); } static std::string getName(const Value &V) { static bool HideNames; static llvm::once_flag HideNameInitFlag; llvm::call_once(HideNameInitFlag, [&]() { if (LogPrivate.getNumOccurrences()) HideNames = LogPrivate; else { const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE"); HideNames = (EV.value_or("0") != "0"); } }); if (!HideNames) return V.getName().str(); return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())), /*LowerCase=*/true); } /// Main logging helper. /// /// Logging can be configured by the following environment variable. /// AMD_SPLIT_MODULE_LOG_DIR= /// If set, uses as the directory to write logfiles to /// each time module splitting is used. /// AMD_SPLIT_MODULE_LOG_PRIVATE /// If set to anything other than zero, all names are hidden. /// /// Both environment variables have corresponding CL options which /// takes priority over them. /// /// Any output printed to the log files is also printed to dbgs() when -debug is /// used and LLVM_DEBUG is defined. /// /// This approach has a small disadvantage over LLVM_DEBUG though: logging logic /// cannot be removed from the code (by building without debug). This probably /// has a small performance cost because if some computation/formatting is /// needed for logging purpose, it may be done everytime only to be ignored /// by the logger. /// /// As this pass only runs once and is not doing anything computationally /// expensive, this is likely a reasonable trade-off. /// /// If some computation should really be avoided when unused, users of the class /// can check whether any logging will occur by using the bool operator. /// /// \code /// if (SML) { /// // Executes only if logging to a file or if -debug is available and /// used. /// } /// \endcode class SplitModuleLogger { public: SplitModuleLogger(const Module &M) { std::string LogDir = LogDirOpt; if (LogDir.empty()) LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or(""); // No log dir specified means we don't need to log to a file. // We may still log to dbgs(), though. if (LogDir.empty()) return; // If a log directory is specified, create a new file with a unique name in // that directory. int Fd; SmallString<0> PathTemplate; SmallString<0> RealPath; sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt"); if (auto Err = sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) { report_fatal_error("Failed to create log file at '" + Twine(LogDir) + "': " + Err.message(), /*CrashDiag=*/false); } FileOS = std::make_unique(Fd, /*shouldClose=*/true); } bool hasLogFile() const { return FileOS != nullptr; } raw_ostream &logfile() { assert(FileOS && "no logfile!"); return *FileOS; } /// \returns true if this SML will log anything either to a file or dbgs(). /// Can be used to avoid expensive computations that are ignored when logging /// is disabled. operator bool() const { return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE)); } private: std::unique_ptr FileOS; }; template static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { static_assert( !std::is_same_v, "do not print values to logs directly, use handleName instead!"); LLVM_DEBUG(dbgs() << Val); if (SML.hasLogFile()) SML.logfile() << Val; return SML; } /// Calculate the cost of each function in \p M /// \param SML Log Helper /// \param GetTTI Abstract getter for TargetTransformInfo. /// \param M Module to analyze. /// \param CostMap[out] Resulting Function -> Cost map. /// \return The module's total cost. static CostType calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M, DenseMap &CostMap) { CostType ModuleCost = 0; CostType KernelCost = 0; for (auto &Fn : M) { if (Fn.isDeclaration()) continue; CostType FnCost = 0; const auto &TTI = GetTTI(Fn); for (const auto &BB : Fn) { for (const auto &I : BB) { auto Cost = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); assert(Cost != InstructionCost::getMax()); // Assume expensive if we can't tell the cost of an instruction. CostType CostVal = Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive); assert((FnCost + CostVal) >= FnCost && "Overflow!"); FnCost += CostVal; } } assert(FnCost != 0); CostMap[&Fn] = FnCost; assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!"); ModuleCost += FnCost; if (isEntryPoint(&Fn)) KernelCost += FnCost; } CostType FnCost = (ModuleCost - KernelCost); CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1; SML << "=> Total Module Cost: " << ModuleCost << '\n' << " => KernelCost: " << KernelCost << " (" << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n" << " => FnsCost: " << FnCost << " (" << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n"; return ModuleCost; } static bool canBeIndirectlyCalled(const Function &F) { if (F.isDeclaration() || isEntryPoint(&F)) return false; return !F.hasLocalLinkage() || F.hasAddressTaken(/*PutOffender=*/nullptr, /*IgnoreCallbackUses=*/false, /*IgnoreAssumeLikeCalls=*/true, /*IgnoreLLVMUsed=*/true, /*IgnoreARCAttachedCall=*/false, /*IgnoreCastedDirectCall=*/true); } /// When a function or any of its callees performs an indirect call, this /// takes over \ref addAllDependencies and adds all potentially callable /// functions to \p Fns so they can be counted as dependencies of the function. /// /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the /// presence of an indirect call, the function's resource usage is the same as /// the most expensive function in the module. /// \param M The module. /// \param Fns[out] Resulting list of functions. static void addAllIndirectCallDependencies(const Module &M, DenseSet &Fns) { for (const auto &Fn : M) { if (canBeIndirectlyCalled(Fn)) Fns.insert(&Fn); } } /// Adds the functions that \p Fn may call to \p Fns, then recurses into each /// callee until all reachable functions have been gathered. /// /// \param SML Log Helper /// \param CG Call graph for \p Fn's module. /// \param Fn Current function to look at. /// \param Fns[out] Resulting list of functions. /// \param OnlyDirect Whether to only consider direct callees. /// \param HadIndirectCall[out] Set to true if an indirect call was seen at some /// point, either in \p Fn or in one of the function it calls. When that /// happens, we fall back to adding all callable functions inside \p Fn's module /// to \p Fns. static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, const Function &Fn, DenseSet &Fns, bool OnlyDirect, bool &HadIndirectCall) { assert(!Fn.isDeclaration()); const Module &M = *Fn.getParent(); SmallVector WorkList({&Fn}); while (!WorkList.empty()) { const auto &CurFn = *WorkList.pop_back_val(); assert(!CurFn.isDeclaration()); // Scan for an indirect call. If such a call is found, we have to // conservatively assume this can call all non-entrypoint functions in the // module. for (auto &CGEntry : *CG[&CurFn]) { auto *CGNode = CGEntry.second; auto *Callee = CGNode->getFunction(); if (!Callee) { if (OnlyDirect) continue; // Functions have an edge towards CallsExternalNode if they're external // declarations, or if they do an indirect call. As we only process // definitions here, we know this means the function has an indirect // call. We then have to conservatively assume this can call all // non-entrypoint functions in the module. if (CGNode != CG.getCallsExternalNode()) continue; // this is another function-less node we don't care about. SML << "Indirect call detected in " << getName(CurFn) << " - treating all non-entrypoint functions as " "potential dependencies\n"; // TODO: Print an ORE as well ? addAllIndirectCallDependencies(M, Fns); HadIndirectCall = true; continue; } if (Callee->isDeclaration()) continue; auto [It, Inserted] = Fns.insert(Callee); if (Inserted) WorkList.push_back(Callee); } } } /// Contains information about a function and its dependencies. /// This is a splitting root. The splitting algorithm works by /// assigning these to partitions. struct FunctionWithDependencies { FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG, const DenseMap &FnCosts, const Function *Fn) : Fn(Fn) { // When Fn is not a kernel, we don't need to collect indirect callees. // Resource usage analysis is only performed on kernels, and we collect // indirect callees for resource usage analysis. addAllDependencies(SML, CG, *Fn, Dependencies, /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall); TotalCost = FnCosts.at(Fn); for (const auto *Dep : Dependencies) { TotalCost += FnCosts.at(Dep); // We cannot duplicate functions with external linkage, or functions that // may be overriden at runtime. HasNonDuplicatableDependecy |= (Dep->hasExternalLinkage() || !Dep->isDefinitionExact()); } } const Function *Fn = nullptr; DenseSet Dependencies; /// Whether \p Fn or any of its \ref Dependencies contains an indirect call. bool HasIndirectCall = false; /// Whether any of \p Fn's dependencies cannot be duplicated. bool HasNonDuplicatableDependecy = false; CostType TotalCost = 0; /// \returns true if this function and its dependencies can be considered /// large according to \p Threshold. bool isLarge(CostType Threshold) const { return TotalCost > Threshold && !Dependencies.empty(); } }; /// Calculates how much overlap there is between \p A and \p B. /// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A /// and B have no shared elements. Kernels do not count in overlap calculation. static float calculateOverlap(const DenseSet &A, const DenseSet &B) { DenseSet Total; for (const auto *F : A) { if (!isEntryPoint(F)) Total.insert(F); } if (Total.empty()) return 0.0f; unsigned NumCommon = 0; for (const auto *F : B) { if (isEntryPoint(F)) continue; auto [It, Inserted] = Total.insert(F); if (!Inserted) ++NumCommon; } return static_cast(NumCommon) / Total.size(); } /// Performs all of the partitioning work on \p M. /// \param SML Log Helper /// \param M Module to partition. /// \param NumParts Number of partitions to create. /// \param ModuleCost Total cost of all functions in \p M. /// \param FnCosts Map of Function -> Cost /// \param WorkList Functions and their dependencies to process in order. /// \returns The created partitions (a vector of size \p NumParts ) static std::vector> doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, CostType ModuleCost, const DenseMap &FnCosts, const SmallVector &WorkList) { SML << "\n--Partitioning Starts--\n"; // Calculate a "large function threshold". When more than one function's total // import cost exceeds this value, we will try to assign it to an existing // partition to reduce the amount of duplication needed. // // e.g. let two functions X and Y have a import cost of ~10% of the module, we // assign X to a partition as usual, but when we get to Y, we check if it's // worth also putting it in Y's partition. const CostType LargeFnThreshold = LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor)) : std::numeric_limits::max(); std::vector> Partitions; Partitions.resize(NumParts); // Assign functions to partitions, and try to keep the partitions more or // less balanced. We do that through a priority queue sorted in reverse, so we // can always look at the partition with the least content. // // There are some cases where we will be deliberately unbalanced though. // - Large functions: we try to merge with existing partitions to reduce code // duplication. // - Functions with indirect or external calls always go in the first // partition (P0). auto ComparePartitions = [](const std::pair &a, const std::pair &b) { // When two partitions have the same cost, assign to the one with the // biggest ID first. This allows us to put things in P0 last, because P0 may // have other stuff added later. if (a.second == b.second) return a.first < b.first; return a.second > b.second; }; // We can't use priority_queue here because we need to be able to access any // element. This makes this a bit inefficient as we need to sort it again // everytime we change it, but it's a very small array anyway (likely under 64 // partitions) so it's a cheap operation. std::vector> BalancingQueue; for (unsigned I = 0; I < NumParts; ++I) BalancingQueue.emplace_back(I, 0); // Helper function to handle assigning a function to a partition. This takes // care of updating the balancing queue. const auto AssignToPartition = [&](PartitionID PID, const FunctionWithDependencies &FWD) { auto &FnsInPart = Partitions[PID]; FnsInPart.insert(FWD.Fn); FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> "; if (!FWD.Dependencies.empty()) { SML << FWD.Dependencies.size() << " dependencies added\n"; }; // Update the balancing queue. we scan backwards because in the common case // the partition is at the end. for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { if (QueuePID == PID) { CostType NewCost = 0; for (auto *Fn : Partitions[PID]) NewCost += FnCosts.at(Fn); SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; if (Cost) { SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) << "% increase)"; } SML << '\n'; Cost = NewCost; } } sort(BalancingQueue, ComparePartitions); }; for (auto &CurFn : WorkList) { // When a function has indirect calls, it must stay in the first partition // alongside every reachable non-entry function. This is a nightmare case // for splitting as it severely limits what we can do. if (CurFn.HasIndirectCall) { SML << "Function with indirect call(s): " << getName(*CurFn.Fn) << " defaulting to P0\n"; AssignToPartition(0, CurFn); continue; } // When a function has non duplicatable dependencies, we have to keep it in // the first partition as well. This is a conservative approach, a // finer-grained approach could keep track of which dependencies are // non-duplicatable exactly and just make sure they're grouped together. if (CurFn.HasNonDuplicatableDependecy) { SML << "Function with externally visible dependency " << getName(*CurFn.Fn) << " defaulting to P0\n"; AssignToPartition(0, CurFn); continue; } // Be smart with large functions to avoid duplicating their dependencies. if (CurFn.isLarge(LargeFnThreshold)) { assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f); SML << "Large Function: " << getName(*CurFn.Fn) << " - looking for partition with at least " << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n"; bool Assigned = false; for (const auto &[PID, Fns] : enumerate(Partitions)) { float Overlap = calculateOverlap(CurFn.Dependencies, Fns); SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" << PID << '\n'; if (Overlap > LargeFnOverlapForMerge) { SML << " selecting P" << PID << '\n'; AssignToPartition(PID, CurFn); Assigned = true; } } if (Assigned) continue; } // Normal "load-balancing", assign to partition with least pressure. auto [PID, CurCost] = BalancingQueue.back(); AssignToPartition(PID, CurFn); } if (SML) { for (const auto &[Idx, Part] : enumerate(Partitions)) { CostType Cost = 0; for (auto *Fn : Part) Cost += FnCosts.at(Fn); SML << "P" << Idx << " has a total cost of " << Cost << " (" << format("%0.2f", (float(Cost) / ModuleCost) * 100) << "% of source module)\n"; } SML << "--Partitioning Done--\n\n"; } // Check no functions were missed. #ifndef NDEBUG DenseSet AllFunctions; for (const auto &Part : Partitions) AllFunctions.insert(Part.begin(), Part.end()); for (auto &Fn : M) { if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { assert(AllFunctions.contains(&Fn) && "Missed a function?!"); } } #endif return Partitions; } static void externalize(GlobalValue &GV) { if (GV.hasLocalLinkage()) { GV.setLinkage(GlobalValue::ExternalLinkage); GV.setVisibility(GlobalValue::HiddenVisibility); } // Unnamed entities must be named consistently between modules. setName will // give a distinct name to each such entity. if (!GV.hasName()) GV.setName("__llvmsplit_unnamed"); } static bool hasDirectCaller(const Function &Fn) { for (auto &U : Fn.uses()) { if (auto *CB = dyn_cast(U.getUser()); CB && CB->isCallee(&U)) return true; } return false; } static void splitAMDGPUModule( GetTTIFn GetTTI, Module &M, unsigned N, function_ref MPart)> ModuleCallback) { SplitModuleLogger SML(M); CallGraph CG(M); // Externalize functions whose address are taken. // // This is needed because partitioning is purely based on calls, but sometimes // a kernel/function may just look at the address of another local function // and not do anything (no calls). After partitioning, that local function may // end up in a different module (so it's just a declaration in the module // where its address is taken), which emits a "undefined hidden symbol" linker // error. // // Additionally, it guides partitioning to not duplicate this function if it's // called directly at some point. for (auto &Fn : M) { if (Fn.hasAddressTaken()) { if (Fn.hasLocalLinkage()) { SML << "[externalize] " << Fn.getName() << " because its address is taken\n"; } externalize(Fn); } } // Externalize local GVs, which avoids duplicating their initializers, which // in turns helps keep code size in check. if (!NoExternalizeGlobals) { for (auto &GV : M.globals()) { if (GV.hasLocalLinkage()) SML << "[externalize] GV " << GV.getName() << '\n'; externalize(GV); } } // Start by calculating the cost of every function in the module, as well as // the module's overall cost. DenseMap FnCosts; const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts); // First, gather ever kernel into the worklist. SmallVector WorkList; for (auto &Fn : M) { if (isEntryPoint(&Fn) && !Fn.isDeclaration()) WorkList.emplace_back(SML, CG, FnCosts, &Fn); } // Then, find missing functions that need to be considered as additional // roots. These can't be called in theory, but in practice we still have to // handle them to avoid linker errors. { DenseSet SeenFunctions; for (const auto &FWD : WorkList) { SeenFunctions.insert(FWD.Fn); SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); } for (auto &Fn : M) { // If this function is not part of any kernel's dependencies and isn't // directly called, consider it as a root. if (!Fn.isDeclaration() && !isEntryPoint(&Fn) && !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) { WorkList.emplace_back(SML, CG, FnCosts, &Fn); } } } // Sort the worklist so the most expensive roots are seen first. sort(WorkList, [&](auto &A, auto &B) { // Sort by total cost, and if the total cost is identical, sort // alphabetically. if (A.TotalCost == B.TotalCost) return A.Fn->getName() < B.Fn->getName(); return A.TotalCost > B.TotalCost; }); if (SML) { SML << "Worklist\n"; for (const auto &FWD : WorkList) { SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost << " indirect:" << FWD.HasIndirectCall << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy << ")\n"; // Sort function names before printing to ensure determinism. SmallVector SortedDepNames; SortedDepNames.reserve(FWD.Dependencies.size()); for (const auto *Dep : FWD.Dependencies) SortedDepNames.push_back(getName(*Dep)); sort(SortedDepNames); for (const auto &Name : SortedDepNames) SML << " [dependency] " << Name << '\n'; } } // This performs all of the partitioning work. auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList); assert(Partitions.size() == N); // If we didn't externalize GVs, then local GVs need to be conservatively // imported into every module (including their initializers), and then cleaned // up afterwards. const auto NeedsConservativeImport = [&](const GlobalValue *GV) { // We conservatively import private/internal GVs into every module and clean // them up afterwards. const auto *Var = dyn_cast(GV); return Var && Var->hasLocalLinkage(); }; SML << "Creating " << N << " modules...\n"; unsigned TotalFnImpls = 0; for (unsigned I = 0; I < N; ++I) { const auto &FnsInPart = Partitions[I]; ValueToValueMapTy VMap; std::unique_ptr MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. if (const auto *Fn = dyn_cast(GV)) return FnsInPart.contains(Fn); if (NeedsConservativeImport(GV)) return true; // Everything else goes in the first partition. return I == 0; })); // Clean-up conservatively imported GVs without any users. for (auto &GV : make_early_inc_range(MPart->globals())) { if (NeedsConservativeImport(&GV) && GV.use_empty()) GV.eraseFromParent(); } unsigned NumAllFns = 0, NumKernels = 0; for (auto &Cur : *MPart) { if (!Cur.isDeclaration()) { ++NumAllFns; if (isEntryPoint(&Cur)) ++NumKernels; } } TotalFnImpls += NumAllFns; SML << " - Module " << I << " with " << NumAllFns << " functions (" << NumKernels << " kernels)\n"; ModuleCallback(std::move(MPart)); } SML << TotalFnImpls << " function definitions across all modules (" << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) << "% of original module)\n"; } } // namespace PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, ModuleAnalysisManager &MAM) { FunctionAnalysisManager &FAM = MAM.getResult(M).getManager(); const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { return FAM.getResult(F); }; splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); // We don't change the original module. return PreservedAnalyses::all(); }