//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements two passes that enable HIP C++ Standard Parallelism
// Support:
//
// 1. AcceleratorCodeSelection (required): Given that only algorithms are
//    accelerated, and that the accelerated implementation exists in the form of
//    a compute kernel, we assume that only the kernel, and all functions
//    reachable from it, constitute code that the user expects the accelerator
//    to execute. Thus, we identify the set of all functions reachable from
//    kernels, and then remove all unreachable ones. This last part is necessary
//    because it is possible for code that the user did not expect to execute on
//    an accelerator to contain constructs that cannot be handled by the target
//    BE, which cannot be provably demonstrated to be dead code in general, and
//    thus can lead to mis-compilation. The degenerate case of this is when a
//    Module contains no kernels (the parent TU had no algorithm invocations fit
//    for acceleration), which we handle by completely emptying said module.
//    **NOTE**: The above does not handle indirectly reachable functions i.e.
//              it is possible to obtain a case where the target of an indirect
//              call is otherwise unreachable and thus is removed; this
//              restriction is aligned with the current `-hipstdpar` limitations
//              and will be relaxed in the future.
//
// 2. AllocationInterposition (required only when on-demand paging is
//    unsupported): Some accelerators or operating systems might not support
//    transparent on-demand paging. Thus, they would only be able to access
//    memory that is allocated by an accelerator-aware mechanism.
For such cases // the user can opt into enabling allocation / deallocation interposition, // whereby we replace calls to known allocation / deallocation functions with // calls to runtime implemented equivalents that forward the requests to // accelerator-aware interfaces. We also support freeing system allocated // memory that ends up in one of the runtime equivalents, since this can // happen if e.g. a library that was compiled without interposition returns // an allocation that can be validly passed to `free`. //===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include using namespace llvm; template static inline void eraseFromModule(T &ToErase) { ToErase.replaceAllUsesWith(PoisonValue::get(ToErase.getType())); ToErase.eraseFromParent(); } static inline bool checkIfSupported(GlobalVariable &G) { if (!G.isThreadLocal()) return true; G.dropDroppableUses(); if (!G.isConstantUsed()) return true; std::string W; raw_string_ostream OS(W); OS << "Accelerator does not support the thread_local variable " << G.getName(); Instruction *I = nullptr; SmallVector Tmp(G.user_begin(), G.user_end()); SmallPtrSet Visited; do { auto U = std::move(Tmp.back()); Tmp.pop_back(); if (Visited.contains(U)) continue; if (isa(U)) I = cast(U); else Tmp.insert(Tmp.end(), U->user_begin(), U->user_end()); Visited.insert(U); } while (!I && !Tmp.empty()); assert(I && "thread_local global should have at least one non-constant use."); G.getContext().diagnose( DiagnosticInfoUnsupported(*I->getParent()->getParent(), W, I->getDebugLoc(), DS_Error)); 
return false; } static inline void clearModule(Module &M) { // TODO: simplify. while (!M.functions().empty()) eraseFromModule(*M.begin()); while (!M.globals().empty()) eraseFromModule(*M.globals().begin()); while (!M.aliases().empty()) eraseFromModule(*M.aliases().begin()); while (!M.ifuncs().empty()) eraseFromModule(*M.ifuncs().begin()); } static inline void maybeHandleGlobals(Module &M) { unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); for (auto &&G : M.globals()) { // TODO: should we handle these in the FE? if (!checkIfSupported(G)) return clearModule(M); if (G.isThreadLocal()) continue; if (G.isConstant()) continue; if (G.getAddressSpace() != GlobAS) continue; if (G.getLinkage() != GlobalVariable::ExternalLinkage) continue; G.setLinkage(GlobalVariable::ExternalWeakLinkage); G.setExternallyInitialized(true); } } template static inline void removeUnreachableFunctions( const SmallPtrSet& Reachable, Module &M) { removeFromUsedLists(M, [&](Constant *C) { if (auto F = dyn_cast(C)) return !Reachable.contains(F); return false; }); SmallVector> ToRemove; copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) { return !F.isIntrinsic() && !Reachable.contains(&F); }); for_each(ToRemove, eraseFromModule); } static inline bool isAcceleratorExecutionRoot(const Function *F) { if (!F) return false; return F->getCallingConv() == CallingConv::AMDGPU_KERNEL; } static inline bool checkIfSupported(const Function *F, const CallBase *CB) { const auto Dx = F->getName().rfind("__hipstdpar_unsupported"); if (Dx == StringRef::npos) return true; const auto N = F->getName().substr(0, Dx); std::string W; raw_string_ostream OS(W); if (N == "__ASM") OS << "Accelerator does not support the ASM block:\n" << cast(CB->getArgOperand(0))->getAsCString(); else OS << "Accelerator does not support the " << N << " function."; auto Caller = CB->getParent()->getParent(); Caller->getContext().diagnose( DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error)); return false; } 
PreservedAnalyses HipStdParAcceleratorCodeSelectionPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &CGA = MAM.getResult(M); SmallPtrSet Reachable; for (auto &&CGN : CGA) { if (!isAcceleratorExecutionRoot(CGN.first)) continue; Reachable.insert(CGN.first); SmallVector Tmp({CGN.first}); do { auto F = std::move(Tmp.back()); Tmp.pop_back(); for (auto &&N : *CGA[F]) { if (!N.second) continue; if (!N.second->getFunction()) continue; if (Reachable.contains(N.second->getFunction())) continue; if (!checkIfSupported(N.second->getFunction(), dyn_cast(*N.first))) return PreservedAnalyses::none(); Reachable.insert(N.second->getFunction()); Tmp.push_back(N.second->getFunction()); } } while (!std::empty(Tmp)); } if (std::empty(Reachable)) clearModule(M); else removeUnreachableFunctions(Reachable, M); maybeHandleGlobals(M); return PreservedAnalyses::none(); } static constexpr std::pair ReplaceMap[]{ {"aligned_alloc", "__hipstdpar_aligned_alloc"}, {"calloc", "__hipstdpar_calloc"}, {"free", "__hipstdpar_free"}, {"malloc", "__hipstdpar_malloc"}, {"memalign", "__hipstdpar_aligned_alloc"}, {"posix_memalign", "__hipstdpar_posix_aligned_alloc"}, {"realloc", "__hipstdpar_realloc"}, {"reallocarray", "__hipstdpar_realloc_array"}, {"_ZdaPv", "__hipstdpar_operator_delete"}, {"_ZdaPvm", "__hipstdpar_operator_delete_sized"}, {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, {"_ZdlPv", "__hipstdpar_operator_delete"}, {"_ZdlPvm", "__hipstdpar_operator_delete_sized"}, {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, {"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, {"_Znam", "__hipstdpar_operator_new"}, {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"}, {"_ZnamSt11align_val_tRKSt9nothrow_t", "__hipstdpar_operator_new_aligned_nothrow"}, {"_Znwm", "__hipstdpar_operator_new"}, 
{"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"}, {"_ZnwmSt11align_val_tRKSt9nothrow_t", "__hipstdpar_operator_new_aligned_nothrow"}, {"__builtin_calloc", "__hipstdpar_calloc"}, {"__builtin_free", "__hipstdpar_free"}, {"__builtin_malloc", "__hipstdpar_malloc"}, {"__builtin_operator_delete", "__hipstdpar_operator_delete"}, {"__builtin_operator_new", "__hipstdpar_operator_new"}, {"__builtin_realloc", "__hipstdpar_realloc"}, {"__libc_calloc", "__hipstdpar_calloc"}, {"__libc_free", "__hipstdpar_free"}, {"__libc_malloc", "__hipstdpar_malloc"}, {"__libc_memalign", "__hipstdpar_aligned_alloc"}, {"__libc_realloc", "__hipstdpar_realloc"} }; PreservedAnalyses HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { SmallDenseMap AllocReplacements(std::cbegin(ReplaceMap), std::cend(ReplaceMap)); for (auto &&F : M) { if (!F.hasName()) continue; if (!AllocReplacements.contains(F.getName())) continue; if (auto R = M.getFunction(AllocReplacements[F.getName()])) { F.replaceAllUsesWith(R); } else { std::string W; raw_string_ostream OS(W); OS << "cannot be interposed, missing: " << AllocReplacements[F.getName()] << ". Tried to run the allocation interposition pass without the " << "replacement functions available."; F.getContext().diagnose(DiagnosticInfoUnsupported(F, W, F.getSubprogram(), DS_Warning)); } } if (auto F = M.getFunction("__hipstdpar_hidden_free")) { auto LibcFree = M.getOrInsertFunction("__libc_free", F->getFunctionType(), F->getAttributes()); F->replaceAllUsesWith(LibcFree.getCallee()); eraseFromModule(*F); } return PreservedAnalyses::none(); }