1 //===- BalancedPartitioning.h ---------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements BalancedPartitioning, a recursive balanced graph 10 // partitioning algorithm. 11 // 12 // The algorithm is used to find an ordering of FunctionNodes while optimizing 13 // a specified objective. The algorithm uses recursive bisection; it starts 14 // with a collection of unordered FunctionNodes and tries to split them into 15 // two sets (buckets) of equal cardinality. Each bisection step is comprised of 16 // iterations that greedily swap the FunctionNodes between the two buckets while 17 // there is an improvement of the objective. Once the process converges, the 18 // problem is divided into two sub-problems of half the size, which are 19 // recursively applied for the two buckets. The final ordering of the 20 // FunctionNodes is obtained by concatenating the two (recursively computed) 21 // orderings. 22 // 23 // In order to speed up the computation, we limit the depth of the recursive 24 // tree by a specified constant (SplitDepth) and apply at most a constant 25 // number of greedy iterations per split (IterationsPerSplit). The worst-case 26 // time complexity of the implementation is bounded by O(M*log^2 N), where 27 // N is the number of FunctionNodes and M is the number of 28 // FunctionNode-UtilityNode edges; (assuming that any collection of D 29 // FunctionNodes contains O(D) UtilityNodes). Notice that the two different 30 // recursive sub-problems are independent and thus can be efficiently processed 31 // in parallel. 32 // 33 // Reference: 34 // * Optimizing Function Layout for Mobile Applications, 35 // https://arxiv.org/abs/2211.09285 36 // 37 //===----------------------------------------------------------------------===// 38 39 #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H 40 #define LLVM_SUPPORT_BALANCED_PARTITIONING_H 41 42 #include "raw_ostream.h" 43 #include "llvm/ADT/ArrayRef.h" 44 45 #include <atomic> 46 #include <condition_variable> 47 #include <mutex> 48 #include <random> 49 #include <vector> 50 51 namespace llvm { 52 53 class ThreadPoolInterface; 54 /// A function with a set of utility nodes where it is beneficial to order two 55 /// functions close together if they have similar utility nodes 56 class BPFunctionNode { 57 friend class BalancedPartitioning; 58 59 public: 60 using IDT = uint64_t; 61 using UtilityNodeT = uint32_t; 62 63 /// \param UtilityNodes the set of utility nodes (must be unique'd) BPFunctionNode(IDT Id,ArrayRef<UtilityNodeT> UtilityNodes)64 BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes) 65 : Id(Id), UtilityNodes(UtilityNodes) {} 66 67 /// The ID of this node 68 IDT Id; 69 70 void dump(raw_ostream &OS) const; 71 72 protected: 73 /// The list of utility nodes associated with this node 74 SmallVector<UtilityNodeT, 4> UtilityNodes; 75 /// The bucket assigned by balanced partitioning 76 std::optional<unsigned> Bucket; 77 /// The index of the input order of the FunctionNodes 78 uint64_t InputOrderIndex = 0; 79 80 friend class BPFunctionNodeTest_Basic_Test; 81 friend class BalancedPartitioningTest_Basic_Test; 82 friend class BalancedPartitioningTest_Large_Test; 83 }; 84 85 /// Algorithm parameters; default values are tuned on real-world binaries 86 struct BalancedPartitioningConfig { 87 /// The depth of the recursive bisection 88 unsigned SplitDepth = 18; 89 /// The maximum number of bp iterations per split 90 unsigned IterationsPerSplit = 40; 91 /// The probability for a vertex to skip a move from its current bucket to 92 /// another bucket; it often helps to escape from a local optima 93 float SkipProbability = 0.1f; 94 /// Recursive subtasks up to the given depth are added to the queue and 95 /// distributed among threads by ThreadPool; all subsequent calls are executed 96 /// on the same thread 97 unsigned TaskSplitDepth = 9; 98 }; 99 100 class BalancedPartitioning { 101 public: 102 BalancedPartitioning(const BalancedPartitioningConfig &Config); 103 104 /// Run recursive graph partitioning that optimizes a given objective. 105 void run(std::vector<BPFunctionNode> &Nodes) const; 106 107 private: 108 struct UtilitySignature; 109 using SignaturesT = SmallVector<UtilitySignature, 4>; 110 using FunctionNodeRange = 111 iterator_range<std::vector<BPFunctionNode>::iterator>; 112 113 /// A special ThreadPool that allows for spawning new tasks after blocking on 114 /// wait(). BalancedPartitioning recursively spawns new threads inside other 115 /// threads, so we need to track how many active threads that could spawn more 116 /// threads. 117 struct BPThreadPool { 118 ThreadPoolInterface &TheThreadPool; 119 std::mutex mtx; 120 std::condition_variable cv; 121 /// The number of threads that could spawn more threads 122 std::atomic<int> NumActiveThreads = 0; 123 /// Only true when all threads are down spawning new threads 124 bool IsFinishedSpawning = false; 125 /// Asynchronous submission of the task to the pool 126 template <typename Func> void async(Func &&F); 127 /// Blocking wait for all threads to complete. Unlike ThreadPool, it is 128 /// acceptable for other threads to add more tasks while blocking on this 129 /// call. 130 void wait(); BPThreadPoolBPThreadPool131 BPThreadPool(ThreadPoolInterface &TheThreadPool) 132 : TheThreadPool(TheThreadPool) {} 133 }; 134 135 /// Run a recursive bisection of a given list of FunctionNodes 136 /// \param RecDepth the current depth of recursion 137 /// \param RootBucket the initial bucket of the dataVertices 138 /// \param Offset the assigned buckets are the range [Offset, Offset + 139 /// Nodes.size()] 140 void bisect(const FunctionNodeRange Nodes, unsigned RecDepth, 141 unsigned RootBucket, unsigned Offset, 142 std::optional<BPThreadPool> &TP) const; 143 144 /// Run bisection iterations 145 void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket, 146 unsigned RightBucket, std::mt19937 &RNG) const; 147 148 /// Run a bisection iteration to improve the optimization goal 149 /// \returns the total number of moved FunctionNodes 150 unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket, 151 unsigned RightBucket, SignaturesT &Signatures, 152 std::mt19937 &RNG) const; 153 154 /// Try to move \p N from one bucket to another 155 /// \returns true iff \p N is moved 156 bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket, 157 unsigned RightBucket, SignaturesT &Signatures, 158 std::mt19937 &RNG) const; 159 160 /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket + 161 /// 1 The method is used for an initial assignment before a bisection step 162 void split(const FunctionNodeRange Nodes, unsigned StartBucket) const; 163 164 /// The cost of the uniform log-gap cost, assuming a utility node has \p X 165 /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one. 166 float logCost(unsigned X, unsigned Y) const; 167 168 float log2Cached(unsigned i) const; 169 170 const BalancedPartitioningConfig &Config; 171 172 /// Precomputed values of log2(x). Table size is small enough to fit in cache. 173 static constexpr unsigned LOG_CACHE_SIZE = 16384; 174 float Log2Cache[LOG_CACHE_SIZE]; 175 176 /// The signature of a particular utility node used for the bisection step, 177 /// i.e., the number of \p FunctionNodes in each of the two buckets 178 struct UtilitySignature { 179 /// The number of \p FunctionNodes in the left bucket 180 unsigned LeftCount = 0; 181 /// The number of \p FunctionNodes in the right bucket 182 unsigned RightCount = 0; 183 /// The cached gain of moving a \p FunctionNode from the left bucket to the 184 /// right bucket 185 float CachedGainLR; 186 /// The cached gain of moving a \p FunctionNode from the right bucket to the 187 /// left bucket 188 float CachedGainRL; 189 /// Whether \p CachedGainLR and \p CachedGainRL are valid 190 bool CachedGainIsValid = false; 191 }; 192 193 protected: 194 /// Compute the move gain for uniform log-gap cost 195 static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, 196 const SignaturesT &Signatures); 197 friend class BalancedPartitioningTest_MoveGain_Test; 198 }; 199 200 } // end namespace llvm 201 202 #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H 203