xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Support/BalancedPartitioning.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- BalancedPartitioning.h ---------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements BalancedPartitioning, a recursive balanced graph
10 // partitioning algorithm.
11 //
12 // The algorithm is used to find an ordering of FunctionNodes while optimizing
13 // a specified objective. The algorithm uses recursive bisection; it starts
14 // with a collection of unordered FunctionNodes and tries to split them into
15 // two sets (buckets) of equal cardinality. Each bisection step is comprised of
16 // iterations that greedily swap the FunctionNodes between the two buckets while
17 // there is an improvement of the objective. Once the process converges, the
18 // problem is divided into two sub-problems of half the size, which are
19 // recursively applied for the two buckets. The final ordering of the
20 // FunctionNodes is obtained by concatenating the two (recursively computed)
21 // orderings.
22 //
23 // In order to speed up the computation, we limit the depth of the recursive
24 // tree by a specified constant (SplitDepth) and apply at most a constant
25 // number of greedy iterations per split (IterationsPerSplit). The worst-case
26 // time complexity of the implementation is bounded by O(M*log^2 N), where
27 // N is the number of FunctionNodes and M is the number of
28 // FunctionNode-UtilityNode edges; (assuming that any collection of D
29 // FunctionNodes contains O(D) UtilityNodes). Notice that the two different
30 // recursive sub-problems are independent and thus can be efficiently processed
31 // in parallel.
32 //
33 // Reference:
34 //   * Optimizing Function Layout for Mobile Applications,
35 //     https://arxiv.org/abs/2211.09285
36 //
37 //===----------------------------------------------------------------------===//
38 
39 #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
40 #define LLVM_SUPPORT_BALANCED_PARTITIONING_H
41 
42 #include "raw_ostream.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/Support/Compiler.h"
45 
46 #include <atomic>
47 #include <condition_variable>
48 #include <mutex>
49 #include <random>
50 #include <vector>
51 
52 namespace llvm {
53 
54 class ThreadPoolInterface;
55 /// A function with a set of utility nodes where it is beneficial to order two
56 /// functions close together if they have similar utility nodes
57 class BPFunctionNode {
58   friend class BalancedPartitioning;
59 
60 public:
61   using IDT = uint64_t;
62   using UtilityNodeT = uint32_t;
63 
64   /// \param UtilityNodes the set of utility nodes (must be unique'd)
BPFunctionNode(IDT Id,ArrayRef<UtilityNodeT> UtilityNodes)65   BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
66       : Id(Id), UtilityNodes(UtilityNodes) {}
67 
68   /// The ID of this node
69   IDT Id;
70 
71   LLVM_ABI void dump(raw_ostream &OS) const;
72 
73 protected:
74   /// The list of utility nodes associated with this node
75   SmallVector<UtilityNodeT, 4> UtilityNodes;
76   /// The bucket assigned by balanced partitioning
77   std::optional<unsigned> Bucket;
78   /// The index of the input order of the FunctionNodes
79   uint64_t InputOrderIndex = 0;
80 
81   friend class BPFunctionNodeTest_Basic_Test;
82   friend class BalancedPartitioningTest_Basic_Test;
83   friend class BalancedPartitioningTest_Large_Test;
84 };
85 
86 /// Algorithm parameters; default values are tuned on real-world binaries
87 struct BalancedPartitioningConfig {
88   /// The depth of the recursive bisection
89   unsigned SplitDepth = 18;
90   /// The maximum number of bp iterations per split
91   unsigned IterationsPerSplit = 40;
92   /// The probability for a vertex to skip a move from its current bucket to
93   /// another bucket; it often helps to escape from a local optima
94   float SkipProbability = 0.1f;
95   /// Recursive subtasks up to the given depth are added to the queue and
96   /// distributed among threads by ThreadPool; all subsequent calls are executed
97   /// on the same thread
98   unsigned TaskSplitDepth = 9;
99 };
100 
101 class BalancedPartitioning {
102 public:
103   LLVM_ABI BalancedPartitioning(const BalancedPartitioningConfig &Config);
104 
105   /// Run recursive graph partitioning that optimizes a given objective.
106   LLVM_ABI void run(std::vector<BPFunctionNode> &Nodes) const;
107 
108 private:
109   struct UtilitySignature;
110   using SignaturesT = SmallVector<UtilitySignature, 4>;
111   using FunctionNodeRange =
112       iterator_range<std::vector<BPFunctionNode>::iterator>;
113 
114   /// A special ThreadPool that allows for spawning new tasks after blocking on
115   /// wait(). BalancedPartitioning recursively spawns new threads inside other
116   /// threads, so we need to track how many active threads that could spawn more
117   /// threads.
118   struct BPThreadPool {
119     ThreadPoolInterface &TheThreadPool;
120     std::mutex mtx;
121     std::condition_variable cv;
122     /// The number of threads that could spawn more threads
123     std::atomic<int> NumActiveThreads = 0;
124     /// Only true when all threads are down spawning new threads
125     bool IsFinishedSpawning = false;
126     /// Asynchronous submission of the task to the pool
127     template <typename Func> void async(Func &&F);
128     /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
129     /// acceptable for other threads to add more tasks while blocking on this
130     /// call.
131     LLVM_ABI void wait();
BPThreadPoolBPThreadPool132     BPThreadPool(ThreadPoolInterface &TheThreadPool)
133         : TheThreadPool(TheThreadPool) {}
134   };
135 
136   /// Run a recursive bisection of a given list of FunctionNodes
137   /// \param RecDepth the current depth of recursion
138   /// \param RootBucket the initial bucket of the dataVertices
139   /// \param Offset the assigned buckets are the range [Offset, Offset +
140   /// Nodes.size()]
141   void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
142               unsigned RootBucket, unsigned Offset,
143               std::optional<BPThreadPool> &TP) const;
144 
145   /// Run bisection iterations
146   void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket,
147                      unsigned RightBucket, std::mt19937 &RNG) const;
148 
149   /// Run a bisection iteration to improve the optimization goal
150   /// \returns the total number of moved FunctionNodes
151   unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
152                         unsigned RightBucket, SignaturesT &Signatures,
153                         std::mt19937 &RNG) const;
154 
155   /// Try to move \p N from one bucket to another
156   /// \returns true iff \p N is moved
157   bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
158                         unsigned RightBucket, SignaturesT &Signatures,
159                         std::mt19937 &RNG) const;
160 
161   /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
162   /// 1 The method is used for an initial assignment before a bisection step
163   void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
164 
165   /// The cost of the uniform log-gap cost, assuming a utility node has \p X
166   /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
167   float logCost(unsigned X, unsigned Y) const;
168 
169   float log2Cached(unsigned i) const;
170 
171   const BalancedPartitioningConfig &Config;
172 
173   /// Precomputed values of log2(x). Table size is small enough to fit in cache.
174   static constexpr unsigned LOG_CACHE_SIZE = 16384;
175   float Log2Cache[LOG_CACHE_SIZE];
176 
177   /// The signature of a particular utility node used for the bisection step,
178   /// i.e., the number of \p FunctionNodes in each of the two buckets
179   struct UtilitySignature {
180     /// The number of \p FunctionNodes in the left bucket
181     unsigned LeftCount = 0;
182     /// The number of \p FunctionNodes in the right bucket
183     unsigned RightCount = 0;
184     /// The cached gain of moving a \p FunctionNode from the left bucket to the
185     /// right bucket
186     float CachedGainLR;
187     /// The cached gain of moving a \p FunctionNode from the right bucket to the
188     /// left bucket
189     float CachedGainRL;
190     /// Whether \p CachedGainLR and \p CachedGainRL are valid
191     bool CachedGainIsValid = false;
192   };
193 
194 protected:
195   /// Compute the move gain for uniform log-gap cost
196   LLVM_ABI static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
197                                  const SignaturesT &Signatures);
198   friend class BalancedPartitioningTest_MoveGain_Test;
199 };
200 
201 } // end namespace llvm
202 
203 #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
204