//===- BalancedPartitioning.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements BalancedPartitioning, a recursive balanced graph
// partitioning algorithm.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ThreadPool.h"

using namespace llvm;
#define DEBUG_TYPE "balanced-partitioning"

void BPFunctionNode::dump(raw_ostream &OS) const {
  OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id,
                make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket);
}

template <typename Func>
void BalancedPartitioning::BPThreadPool::async(Func &&F) {
#if LLVM_ENABLE_THREADS
  // This new thread could spawn more threads, so mark it as active
  ++NumActiveThreads;
  TheThreadPool.async([=]() {
    // Run the task
    F();

    // This thread will no longer spawn new threads, so mark it as inactive
    if (--NumActiveThreads == 0) {
      // There are no more active threads, so mark as finished and notify
      {
        std::unique_lock<std::mutex> lock(mtx);
        assert(!IsFinishedSpawning);
        IsFinishedSpawning = true;
      }
      cv.notify_one();
    }
  });
#else
  llvm_unreachable("threads are disabled");
#endif
}

void BalancedPartitioning::BPThreadPool::wait() {
#if LLVM_ENABLE_THREADS
  // TODO: We could remove the mutex and condition variable and use
  // std::atomic::wait() instead, but that isn't available until C++20
  {
    std::unique_lock<std::mutex> lock(mtx);
    cv.wait(lock, [&]() { return IsFinishedSpawning; });
    assert(IsFinishedSpawning && NumActiveThreads == 0);
  }
  // Now we can call ThreadPool::wait() since all tasks have been submitted
  TheThreadPool.wait();
#else
  llvm_unreachable("threads are disabled");
#endif
}

BalancedPartitioning::BalancedPartitioning(
    const BalancedPartitioningConfig &Config)
    : Config(Config) {
  // Pre-compute log2 values for small inputs
  Log2Cache[0] = 0.0;
  for (unsigned I = 1; I < LOG_CACHE_SIZE; I++)
    Log2Cache[I] = std::log2(I);
}

void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
  LLVM_DEBUG(
      dbgs() << format(
          "Partitioning %d nodes using depth %d and %d iterations per split\n",
          Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit));
  std::optional<BPThreadPool> TP;
#if LLVM_ENABLE_THREADS
  ThreadPool TheThreadPool;
  if (Config.TaskSplitDepth > 1)
    TP.emplace(TheThreadPool);
#endif

  // Record the input order
  for (unsigned I = 0; I < Nodes.size(); I++)
    Nodes[I].InputOrderIndex = I;

  auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
  auto BisectTask = [=, &TP]() {
    bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
  };
  if (TP) {
    TP->async(std::move(BisectTask));
    TP->wait();
  } else {
    BisectTask();
  }

  llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
    return L.Bucket < R.Bucket;
  });

  LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
}

void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
                                  unsigned RecDepth, unsigned RootBucket,
                                  unsigned Offset,
                                  std::optional<BPThreadPool> &TP) const {
  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
  if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
    // We've reached the lowest level of the recursion tree. Fall back to the
    // original order and assign to buckets.
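    // (Leaf buckets are assigned consecutive values starting at Offset, so the
    // final stable_sort by Bucket in run() reproduces each leaf's order.)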
    llvm::sort(Nodes, [](const auto &L, const auto &R) {
      return L.InputOrderIndex < R.InputOrderIndex;
    });
    for (auto &N : Nodes)
      N.Bucket = Offset++;
    return;
  }

  LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n",
                              NumNodes, RootBucket));

  std::mt19937 RNG(RootBucket);

  unsigned LeftBucket = 2 * RootBucket;
  unsigned RightBucket = 2 * RootBucket + 1;

  // Split into two and assign to the left and right buckets
  split(Nodes, LeftBucket);

  runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);

  // Partition nodes with respect to the resulting buckets
  auto NodesMid =
      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
  unsigned MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);

  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());

  auto LeftRecTask = [=, &TP]() {
    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
  };
  auto RightRecTask = [=, &TP]() {
    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
  };

  // Spawn parallel subtasks only up to the configured depth and when the
  // subproblem is large enough to be worth a task
  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
    TP->async(std::move(LeftRecTask));
    TP->async(std::move(RightRecTask));
  } else {
    LeftRecTask();
    RightRecTask();
  }
}

void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
                                         unsigned RecDepth, unsigned LeftBucket,
                                         unsigned RightBucket,
                                         std::mt19937 &RNG) const {
  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
  DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex;
  for (auto &N : Nodes)
    for (auto &UN : N.UtilityNodes)
      ++UtilityNodeIndex[UN];
  // Remove utility nodes if they have just one edge or are connected to all
  // functions
  for (auto &N : Nodes)
    llvm::erase_if(N.UtilityNodes, [&](auto &UN) {
      return UtilityNodeIndex[UN] == 1 || UtilityNodeIndex[UN] == NumNodes;
    });

  // Renumber utility nodes so they can be used to index into Signatures
  UtilityNodeIndex.clear();
  for (auto &N : Nodes)
    for (auto &UN : N.UtilityNodes)
      UN = UtilityNodeIndex.insert({UN, UtilityNodeIndex.size()}).first->second;

  // Initialize signatures
  SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size());
  for (auto &N : Nodes) {
    for (auto &UN : N.UtilityNodes) {
      assert(UN < Signatures.size());
      if (N.Bucket == LeftBucket) {
        Signatures[UN].LeftCount++;
      } else {
        Signatures[UN].RightCount++;
      }
    }
  }

  for (unsigned I = 0; I < Config.IterationsPerSplit; I++) {
    unsigned NumMovedNodes =
        runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
    if (NumMovedNodes == 0)
      break;
  }
}

unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
                                            unsigned LeftBucket,
                                            unsigned RightBucket,
                                            SignaturesT &Signatures,
                                            std::mt19937 &RNG) const {
  // Init signature cost caches
  for (auto &Signature : Signatures) {
    if (Signature.CachedGainIsValid)
      continue;
    unsigned L = Signature.LeftCount;
    unsigned R = Signature.RightCount;
    assert((L > 0 || R > 0) && "incorrect signature");
    float Cost = logCost(L, R);
    Signature.CachedGainLR = 0.f;
    Signature.CachedGainRL = 0.f;
    if (L > 0)
      Signature.CachedGainLR = Cost - logCost(L - 1, R + 1);
    if (R > 0)
      Signature.CachedGainRL = Cost - logCost(L + 1, R - 1);
    Signature.CachedGainIsValid = true;
  }

  // Compute move gains
  typedef std::pair<float, BPFunctionNode *> GainPair;
  std::vector<GainPair> Gains;
  for (auto &N : Nodes) {
    bool FromLeftToRight = (N.Bucket == LeftBucket);
    float Gain = moveGain(N, FromLeftToRight, Signatures);
    Gains.push_back(std::make_pair(Gain, &N));
  }

  // Collect left and right gains
  auto LeftEnd = llvm::partition(
      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
  auto LeftRange = llvm::make_range(Gains.begin(), LeftEnd);
  auto RightRange = llvm::make_range(LeftEnd, Gains.end());
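
  // Note: move gains are computed once per iteration, above, against the
  // current signatures. The exchange loop below pairs the best left candidate
  // with the best right candidate, so bucket sizes stay balanced; since
  // signatures are updated as nodes move, gains used later in the loop can be
  // slightly stale, which is tolerated as a heuristic.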

  // Sort gains in descending order
  auto LargerGain = [](const auto &L, const auto &R) {
    return L.first > R.first;
  };
  llvm::stable_sort(LeftRange, LargerGain);
  llvm::stable_sort(RightRange, LargerGain);

  unsigned NumMovedDataVertices = 0;
  for (auto [LeftPair, RightPair] : llvm::zip(LeftRange, RightRange)) {
    auto &[LeftGain, LeftNode] = LeftPair;
    auto &[RightGain, RightNode] = RightPair;
    // Stop when the gain is no longer beneficial
    if (LeftGain + RightGain <= 0.f)
      break;
    // Try to exchange the nodes between buckets
    if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket, Signatures, RNG))
      ++NumMovedDataVertices;
    if (moveFunctionNode(*RightNode, LeftBucket, RightBucket, Signatures, RNG))
      ++NumMovedDataVertices;
  }
  return NumMovedDataVertices;
}

bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
                                            unsigned LeftBucket,
                                            unsigned RightBucket,
                                            SignaturesT &Signatures,
                                            std::mt19937 &RNG) const {
  // Sometimes we skip the move. This helps to escape local optima
  if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <=
      Config.SkipProbability)
    return false;

  bool FromLeftToRight = (N.Bucket == LeftBucket);
  // Update the current bucket
  N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);

  // Update signatures and invalidate gain cache
  if (FromLeftToRight) {
    for (auto &UN : N.UtilityNodes) {
      auto &Signature = Signatures[UN];
      Signature.LeftCount--;
      Signature.RightCount++;
      Signature.CachedGainIsValid = false;
    }
  } else {
    for (auto &UN : N.UtilityNodes) {
      auto &Signature = Signatures[UN];
      Signature.LeftCount++;
      Signature.RightCount--;
      Signature.CachedGainIsValid = false;
    }
  }
  return true;
}

void BalancedPartitioning::split(const FunctionNodeRange Nodes,
                                 unsigned StartBucket) const {
  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
  auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;

  std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) {
    return L.InputOrderIndex < R.InputOrderIndex;
  });

  for (auto &N : llvm::make_range(Nodes.begin(), NodesMid))
    N.Bucket = StartBucket;
  for (auto &N : llvm::make_range(NodesMid, Nodes.end()))
    N.Bucket = StartBucket + 1;
}

float BalancedPartitioning::moveGain(const BPFunctionNode &N,
                                     bool FromLeftToRight,
                                     const SignaturesT &Signatures) {
  // Sum the cached per-utility-node gains for moving N to the other bucket
  float Gain = 0.f;
  for (auto &UN : N.UtilityNodes)
    Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR
                             : Signatures[UN].CachedGainRL);
  return Gain;
}

float BalancedPartitioning::logCost(unsigned X, unsigned Y) const {
  return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
}

float BalancedPartitioning::log2Cached(unsigned i) const {
  // Use the precomputed table for small values, computing directly otherwise
  return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
}
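
//===----------------------------------------------------------------------===//
// Usage sketch (illustration only, not part of the implementation). A minimal,
// hypothetical driver showing how a client might invoke the algorithm; it
// assumes BPFunctionNode's (Id, UtilityNodes) constructor declared in
// llvm/include/llvm/Support/BalancedPartitioning.h, and the node and utility
// node ids below are made up.
//
//   BalancedPartitioningConfig Config;
//   BalancedPartitioning BP(Config);
//   // Functions sharing utility node 2 tend to land in nearby buckets.
//   std::vector<BPFunctionNode> Nodes = {
//       BPFunctionNode(/*Id=*/0, /*UtilityNodes=*/{1, 2}),
//       BPFunctionNode(/*Id=*/1, /*UtilityNodes=*/{2, 3}),
//   };
//   BP.run(Nodes); // Reorders Nodes in place by their final buckets.
//===----------------------------------------------------------------------===//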