xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a profile inference algorithm. Given an incomplete and
10 // possibly imprecise block counts, the algorithm reconstructs realistic block
11 // and edge counts that satisfy flow conservation rules, while minimally modify
12 // input block counts.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/Transforms/Utils/SampleProfileInference.h"
17 #include "llvm/ADT/BitVector.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
20 #include <queue>
21 #include <set>
22 #include <stack>
23 #include <unordered_set>
24 
25 using namespace llvm;
26 #define DEBUG_TYPE "sample-profile-inference"
27 
namespace {

// Command-line knobs controlling the profile-inference ("profi") algorithm.
// All are hidden developer options; the defaults are tuned for typical
// sample profiles.

static cl::opt<bool> SampleProfileEvenFlowDistribution(
    "sample-profile-even-flow-distribution", cl::init(true), cl::Hidden,
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."));

static cl::opt<bool> SampleProfileRebalanceUnknown(
    "sample-profile-rebalance-unknown", cl::init(true), cl::Hidden,
    cl::desc("Evenly re-distribute flow among unknown subgraphs."));

static cl::opt<bool> SampleProfileJoinIslands(
    "sample-profile-join-islands", cl::init(true), cl::Hidden,
    cl::desc("Join isolated components having positive flow."));

static cl::opt<unsigned> SampleProfileProfiCostBlockInc(
    "sample-profile-profi-cost-block-inc", cl::init(10), cl::Hidden,
    cl::desc("The cost of increasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockDec(
    "sample-profile-profi-cost-block-dec", cl::init(20), cl::Hidden,
    cl::desc("The cost of decreasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryInc(
    "sample-profile-profi-cost-block-entry-inc", cl::init(40), cl::Hidden,
    cl::desc("The cost of increasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryDec(
    "sample-profile-profi-cost-block-entry-dec", cl::init(10), cl::Hidden,
    cl::desc("The cost of decreasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockZeroInc(
    "sample-profile-profi-cost-block-zero-inc", cl::init(11), cl::Hidden,
    cl::desc("The cost of increasing a count of zero-weight block by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockUnknownInc(
    "sample-profile-profi-cost-block-unknown-inc", cl::init(0), cl::Hidden,
    cl::desc("The cost of increasing an unknown block's count by one."));

/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution.
static constexpr int64_t INF = ((int64_t)1) << 50;
71 
72 /// The minimum-cost maximum flow algorithm.
73 ///
74 /// The algorithm finds the maximum flow of minimum cost on a given (directed)
75 /// network using a modified version of the classical Moore-Bellman-Ford
76 /// approach. The algorithm applies a number of augmentation iterations in which
77 /// flow is sent along paths of positive capacity from the source to the sink.
78 /// The worst-case time complexity of the implementation is O(v(f)*m*n),
79 /// where m is the number of edges, n is the number of vertices, and v(f) is the
80 /// value of the maximum flow. However, the observed running time on typical
81 /// instances is sub-quadratic, that is, o(n^2).
82 ///
83 /// The input is a set of edges with specified costs and capacities, and a pair
84 /// of nodes (source and sink). The output is the flow along each edge of the
85 /// minimum total cost respecting the given edge capacities.
class MinCostMaxFlow {
public:
  MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {}

  // Initialize algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
    Source = SourceNode;
    Target = SinkNode;

    Nodes = std::vector<Node>(NodeCount);
    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
    // Augmenting-DAG machinery is only needed for even flow distribution.
    if (Params.EvenFlowDistribution)
      AugmentingEdges =
          std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
  }

  // Run the algorithm. Returns the total cost of the computed flow.
  int64_t run() {
    LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n");

    // Iteratively find an augmentation path/dag in the network and send the
    // flow along its edges
    size_t AugmentationIters = applyFlowAugmentation();

    // Compute the total flow and its cost. Only forward edges carry positive
    // flow; edges leaving Source account for the total flow value.
    int64_t TotalCost = 0;
    int64_t TotalFlow = 0;
    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
      for (auto &Edge : Edges[Src]) {
        if (Edge.Flow > 0) {
          TotalCost += Edge.Cost * Edge.Flow;
          if (Src == Source)
            TotalFlow += Edge.Flow;
        }
      }
    }
    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
                      << " iterations with " << TotalFlow << " total flow"
                      << " of " << TotalCost << " cost\n");
    (void)TotalFlow;
    (void)AugmentationIters;
    return TotalCost;
  }

  /// Adding an edge to the network with a specified capacity and a cost.
  /// Multiple edges between a pair of nodes are allowed but self-edges
  /// are not supported. A residual (reverse) edge of zero capacity and
  /// negated cost is added alongside each forward edge.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
    assert(Capacity > 0 && "adding an edge of zero capacity");
    assert(Src != Dst && "loop edge are not supported");

    Edge SrcEdge;
    SrcEdge.Dst = Dst;
    SrcEdge.Cost = Cost;
    SrcEdge.Capacity = Capacity;
    SrcEdge.Flow = 0;
    SrcEdge.RevEdgeIndex = Edges[Dst].size();

    Edge DstEdge;
    DstEdge.Dst = Src;
    DstEdge.Cost = -Cost;
    DstEdge.Capacity = 0;
    DstEdge.Flow = 0;
    DstEdge.RevEdgeIndex = Edges[Src].size();

    Edges[Src].push_back(SrcEdge);
    Edges[Dst].push_back(DstEdge);
  }

  /// Adding an edge to the network of infinite capacity and a given cost.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
    addEdge(Src, Dst, INF, Cost);
  }

  /// Get the total flow from a given source node.
  /// Returns a list of pairs (target node, amount of flow to the target).
  std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
    std::vector<std::pair<uint64_t, int64_t>> Flow;
    for (const auto &Edge : Edges[Src]) {
      if (Edge.Flow > 0)
        Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
    }
    return Flow;
  }

  /// Get the total flow between a pair of nodes (summed over parallel edges).
  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
    int64_t Flow = 0;
    for (const auto &Edge : Edges[Src]) {
      if (Edge.Dst == Dst) {
        Flow += Edge.Flow;
      }
    }
    return Flow;
  }

private:
  /// Iteratively find an augmentation path/dag in the network and send the
  /// flow along its edges. The method returns the number of applied iterations.
  size_t applyFlowAugmentation() {
    size_t AugmentationIters = 0;
    while (findAugmentingPath()) {
      uint64_t PathCapacity = computeAugmentingPathCapacity();
      while (PathCapacity > 0) {
        bool Progress = false;
        if (Params.EvenFlowDistribution) {
          // Identify node/edge candidates for augmentation
          identifyShortestEdges(PathCapacity);

          // Find an augmenting DAG
          auto AugmentingOrder = findAugmentingDAG();

          // Apply the DAG augmentation
          Progress = augmentFlowAlongDAG(AugmentingOrder);
          PathCapacity = computeAugmentingPathCapacity();
        }

        // Fall back to single-path augmentation when the DAG step made no
        // progress (or even flow distribution is disabled).
        if (!Progress) {
          augmentFlowAlongPath(PathCapacity);
          PathCapacity = 0;
        }

        AugmentationIters++;
      }
    }
    return AugmentationIters;
  }

  /// Compute the capacity of the canonical augmenting path. If the path is
  /// saturated (that is, no flow can be sent along the path), then return 0.
  uint64_t computeAugmentingPathCapacity() {
    uint64_t PathCapacity = INF;
    uint64_t Now = Target;
    // Walk the parent links from Target back to Source; the path capacity is
    // the minimum residual capacity along the way.
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];

      assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
      uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
      PathCapacity = std::min(PathCapacity, EdgeCapacity);

      Now = Pred;
    }
    return PathCapacity;
  }

  /// Check for existence of an augmenting path with a positive capacity.
  bool findAugmentingPath() {
    // Initialize data structures
    for (auto &Node : Nodes) {
      Node.Distance = INF;
      Node.ParentNode = uint64_t(-1);
      Node.ParentEdgeIndex = uint64_t(-1);
      Node.Taken = false;
    }

    std::queue<uint64_t> Queue;
    Queue.push(Source);
    Nodes[Source].Distance = 0;
    Nodes[Source].Taken = true;
    while (!Queue.empty()) {
      uint64_t Src = Queue.front();
      Queue.pop();
      Nodes[Src].Taken = false;
      // Although the residual network contains edges with negative costs
      // (in particular, backward edges), it can be shown that there are no
      // negative-weight cycles and the following two invariants are maintained:
      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
      // where Dist is the length of the shortest path between two nodes. This
      // allows to prune the search-space of the path-finding algorithm using
      // the following early-stop criteria:
      // -- If we find a path with zero-distance from Source to Target, stop the
      //    search, as the path is the shortest since Dist[Source, Target] >= 0;
      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
      //    process node V, as it is guaranteed _not_ to be on a shortest path
      //    from Source to Target; it follows from inequalities
      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
      //                         >= Dist[Source, V]
      if (!Params.EvenFlowDistribution && Nodes[Target].Distance == 0)
        break;
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      // Process adjacent edges
      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
        auto &Edge = Edges[Src][EdgeIdx];
        if (Edge.Flow < Edge.Capacity) {
          uint64_t Dst = Edge.Dst;
          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
          if (Nodes[Dst].Distance > NewDistance) {
            // Update the distance and the parent node/edge
            Nodes[Dst].Distance = NewDistance;
            Nodes[Dst].ParentNode = Src;
            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
            // Add the node to the queue, if it is not there yet
            if (!Nodes[Dst].Taken) {
              Queue.push(Dst);
              Nodes[Dst].Taken = true;
            }
          }
        }
      }
    }

    return Nodes[Target].Distance != INF;
  }

  /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting path");
    uint64_t Now = Target;
    // Push PathCapacity units along every edge of the path, adjusting the
    // residual (reverse) edges accordingly.
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];

      Edge.Flow += PathCapacity;
      RevEdge.Flow -= PathCapacity;

      Now = Pred;
    }
  }

  /// Find an Augmenting DAG order using a modified version of DFS in which we
  /// can visit a node multiple times. In the DFS search, when scanning each
  /// edge out of a node, continue search at Edge.Dst endpoint if it has not
  /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
  /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
  /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
  /// that starts with Source and ends with Target.
  std::vector<uint64_t> findAugmentingDAG() {
    // We use a stack based implementation of DFS to avoid recursion.
    // Defining DFS data structures:
    // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
    //  - we are currently visiting Nodes[NodeIdx] and
    //  - the next edge to scan is Edges[NodeIdx][EdgeIdx]
    typedef std::pair<uint64_t, uint64_t> StackItemType;
    std::stack<StackItemType> Stack;
    std::vector<uint64_t> AugmentingOrder;

    // Phase 0: Initialize Node attributes and Time for DFS run
    for (auto &Node : Nodes) {
      Node.Discovery = 0;
      Node.Finish = 0;
      Node.NumCalls = 0;
      Node.Taken = false;
    }
    uint64_t Time = 0;
    // Mark Target as Taken
    // Taken attribute will be propagated backwards from Target towards Source
    Nodes[Target].Taken = true;

    // Phase 1: Start DFS traversal from Source
    Stack.emplace(Source, 0);
    Nodes[Source].Discovery = ++Time;
    while (!Stack.empty()) {
      auto NodeIdx = Stack.top().first;
      auto EdgeIdx = Stack.top().second;

      // If we haven't scanned all edges out of NodeIdx, continue scanning
      if (EdgeIdx < Edges[NodeIdx].size()) {
        auto &Edge = Edges[NodeIdx][EdgeIdx];
        auto &Dst = Nodes[Edge.Dst];
        Stack.top().second++;

        if (Edge.OnShortestPath) {
          // If we haven't seen Edge.Dst so far, continue DFS search there
          if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) {
            Dst.Discovery = ++Time;
            Stack.emplace(Edge.Dst, 0);
            Dst.NumCalls++;
          } else if (Dst.Taken && Dst.Finish != 0) {
            // Else, if Edge.Dst already has a (finished) path to Target,
            // then NodeIdx reaches Target as well
            Nodes[NodeIdx].Taken = true;
          }
        }
      } else {
        // If we are done scanning all edge out of NodeIdx
        Stack.pop();
        // If we haven't found a path from NodeIdx to Target, forget about it
        if (!Nodes[NodeIdx].Taken) {
          Nodes[NodeIdx].Discovery = 0;
        } else {
          // If we have found a path from NodeIdx to Target, then finish NodeIdx
          // and propagate Taken flag to DFS parent unless at the Source
          Nodes[NodeIdx].Finish = ++Time;
          // NodeIdx == Source if and only if the stack is empty
          if (NodeIdx != Source) {
            assert(!Stack.empty() && "empty stack while running dfs");
            Nodes[Stack.top().first].Taken = true;
          }
          AugmentingOrder.push_back(NodeIdx);
        }
      }
    }
    // Nodes are collected decreasing Finish time, so the order is reversed
    std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());

    // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
    for (size_t Src : AugmentingOrder) {
      AugmentingEdges[Src].clear();
      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
            Nodes[Dst].Finish < Nodes[Src].Finish) {
          AugmentingEdges[Src].push_back(&Edge);
        }
      }
      assert((Src == Target || !AugmentingEdges[Src].empty()) &&
             "incorrectly constructed augmenting edges");
    }

    return AugmentingOrder;
  }

  /// Update the current flow along the given (acyclic) subgraph specified by
  /// the vertex order, AugmentingOrder. The objective is to send as much flow
  /// as possible while evenly distributing flow among successors of each node.
  /// After the update at least one edge is saturated.
  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
    // Phase 0: Initialization
    for (uint64_t Src : AugmentingOrder) {
      Nodes[Src].FracFlow = 0;
      Nodes[Src].IntFlow = 0;
      for (auto &Edge : AugmentingEdges[Src]) {
        Edge->AugmentedFlow = 0;
      }
    }

    // Phase 1: Send a unit of fractional flow along the DAG
    uint64_t MaxFlowAmount = INF;
    Nodes[Source].FracFlow = 1.0;
    for (uint64_t Src : AugmentingOrder) {
      assert((Src == Target || Nodes[Src].FracFlow > 0.0) &&
             "incorrectly computed fractional flow");
      // Distribute flow evenly among successors of Src
      uint64_t Degree = AugmentingEdges[Src].size();
      for (auto &Edge : AugmentingEdges[Src]) {
        double EdgeFlow = Nodes[Src].FracFlow / Degree;
        Nodes[Edge->Dst].FracFlow += EdgeFlow;
        if (Edge->Capacity == INF)
          continue;
        uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
        MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
      }
    }
    // Stop early if we cannot send any (integral) flow from Source to Target
    if (MaxFlowAmount == 0)
      return false;

    // Phase 2: Send an integral flow of MaxFlowAmount
    Nodes[Source].IntFlow = MaxFlowAmount;
    for (uint64_t Src : AugmentingOrder) {
      if (Src == Target)
        break;
      // Distribute flow evenly among successors of Src, rounding up to make
      // sure all flow is sent
      uint64_t Degree = AugmentingEdges[Src].size();
      // We are guaranteed that Node[Src].IntFlow <= SuccFlow * Degree
      uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
        EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
        Nodes[Dst].IntFlow += EdgeFlow;
        Nodes[Src].IntFlow -= EdgeFlow;
        Edge->AugmentedFlow += EdgeFlow;
      }
    }
    assert(Nodes[Target].IntFlow <= MaxFlowAmount);
    Nodes[Target].IntFlow = 0;

    // Phase 3: Send excess flow back traversing the nodes backwards.
    // Because of rounding, not all flow can be sent along the edges of Src.
    // Hence, sending the remaining flow back to maintain flow conservation
    for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
      uint64_t Src = AugmentingOrder[Idx - 1];
      // Try to send excess flow back along each edge.
      // Make sure we only send back flow we just augmented (AugmentedFlow).
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        if (Nodes[Dst].IntFlow == 0)
          continue;
        uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
        Nodes[Dst].IntFlow -= EdgeFlow;
        Nodes[Src].IntFlow += EdgeFlow;
        Edge->AugmentedFlow -= EdgeFlow;
      }
    }

    // Phase 4: Update flow values along all edges
    bool HasSaturatedEdges = false;
    for (uint64_t Src : AugmentingOrder) {
      // Verify that we have sent all the excess flow from the node
      assert(Src == Source || Nodes[Src].IntFlow == 0);
      for (auto &Edge : AugmentingEdges[Src]) {
        assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
        // Update flow values along the edge and its reverse copy
        auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
        Edge->Flow += Edge->AugmentedFlow;
        RevEdge.Flow -= Edge->AugmentedFlow;
        if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
          HasSaturatedEdges = true;
      }
    }

    // The augmentation is successful iff at least one edge becomes saturated
    return HasSaturatedEdges;
  }

  /// Identify candidate (shortest) edges for augmentation.
  void identifyShortestEdges(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
    // To make sure the augmentation DAG contains only edges with large residual
    // capacity, we prune all edges whose capacity is below a fraction of
    // the capacity of the augmented path.
    // (All edges of the path itself are always in the DAG)
    uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));

    // Decide which edges are on a shortest path from Source to Target
    for (size_t Src = 0; Src < Nodes.size(); Src++) {
      // An edge cannot be augmenting if the endpoint has large distance
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        Edge.OnShortestPath =
            Src != Target && Dst != Source &&
            Nodes[Dst].Distance <= Nodes[Target].Distance &&
            Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
            Edge.Capacity > Edge.Flow &&
            uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
      }
    }
  }

  /// Maximum number of DFS iterations for DAG finding.
  static constexpr uint64_t MaxDfsCalls = 10;

  /// A node in a flow network.
  struct Node {
    /// The cost of the cheapest path from the source to the current node.
    int64_t Distance;
    /// The node preceding the current one in the path.
    uint64_t ParentNode;
    /// The index of the edge between ParentNode and the current node.
    uint64_t ParentEdgeIndex;
    /// An indicator of whether the current node is in a queue.
    bool Taken;

    /// Data fields utilized in DAG-augmentation:
    /// Fractional flow.
    double FracFlow;
    /// Integral flow.
    uint64_t IntFlow;
    /// Discovery time.
    uint64_t Discovery;
    /// Finish time.
    uint64_t Finish;
    /// NumCalls.
    uint64_t NumCalls;
  };

  /// An edge in a flow network.
  struct Edge {
    /// The cost of the edge.
    int64_t Cost;
    /// The capacity of the edge.
    int64_t Capacity;
    /// The current flow on the edge.
    int64_t Flow;
    /// The destination node of the edge.
    uint64_t Dst;
    /// The index of the reverse edge between Dst and the current node.
    uint64_t RevEdgeIndex;

    /// Data fields utilized in DAG-augmentation:
    /// Whether the edge is currently on a shortest path from Source to Target.
    bool OnShortestPath;
    /// Extra flow along the edge.
    uint64_t AugmentedFlow;
  };

  /// The set of network nodes.
  std::vector<Node> Nodes;
  /// The set of network edges.
  std::vector<std::vector<Edge>> Edges;
  /// Source node of the flow.
  uint64_t Source;
  /// Target (sink) node of the flow.
  uint64_t Target;
  /// Augmenting edges.
  std::vector<std::vector<Edge *>> AugmentingEdges;
  /// Params for flow computation.
  const ProfiParams &Params;
};
583 
584 /// A post-processing adjustment of the control flow. It applies two steps by
585 /// rerouting some flow and making it more realistic:
586 ///
587 /// - First, it removes all isolated components ("islands") with a positive flow
588 ///   that are unreachable from the entry block. For every such component, we
589 ///   find the shortest path from the entry to an exit passing through the component,
590 ///   and increase the flow by one unit along the path.
591 ///
592 /// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
593 ///   with no sampled counts. Then it rebalances the flow that goes through such
594 ///   a subgraph so that each branch is taken with probability 50%.
595 ///   An unknown subgraph is such that for every two nodes u and v:
596 ///     - u dominates v and u is not unknown;
597 ///     - v post-dominates u; and
598 ///     - all inner-nodes of all (u,v)-paths are unknown.
599 ///
600 class FlowAdjuster {
601 public:
  /// Create an adjuster for the given function, driven by the given params.
  FlowAdjuster(const ProfiParams &Params, FlowFunction &Func)
      : Params(Params), Func(Func) {}
604 
  /// Apply the post-processing: first join isolated components, then
  /// rebalance unknown subgraphs (each step gated by its parameter).
  void run() {
    if (Params.JoinIslands) {
      // Adjust the flow to get rid of isolated components
      joinIsolatedComponents();
    }

    if (Params.RebalanceUnknown) {
      // Rebalance the flow inside unknown subgraphs
      rebalanceUnknownSubgraphs();
    }
  }
617 
618 private:
  /// Attach isolated flow components ("islands") to the main component by
  /// routing one extra unit of flow from the entry, through each island, to
  /// an exit block.
  void joinIsolatedComponents() {
    // Find blocks that are reachable from the source
    auto Visited = BitVector(NumBlocks(), false);
    findReachable(Func.Entry, Visited);

    // Iterate over all non-reachable blocks and adjust their weights
    for (uint64_t I = 0; I < NumBlocks(); I++) {
      auto &Block = Func.Blocks[I];
      if (Block.Flow > 0 && !Visited[I]) {
        // Find a path from the entry to an exit passing through the block I
        auto Path = findShortestPath(I);
        // Increase the flow along the path
        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
               "incorrectly computed path adjusting control flow");
        Func.Blocks[Func.Entry].Flow += 1;
        for (auto &Jump : Path) {
          Jump->Flow += 1;
          Func.Blocks[Jump->Target].Flow += 1;
          // Update reachability so later islands on this path are not
          // processed again
          findReachable(Jump->Target, Visited);
        }
      }
    }
  }
643 
644   /// Run BFS from a given block along the jumps with a positive flow and mark
645   /// all reachable blocks.
findReachable(uint64_t Src,BitVector & Visited)646   void findReachable(uint64_t Src, BitVector &Visited) {
647     if (Visited[Src])
648       return;
649     std::queue<uint64_t> Queue;
650     Queue.push(Src);
651     Visited[Src] = true;
652     while (!Queue.empty()) {
653       Src = Queue.front();
654       Queue.pop();
655       for (auto *Jump : Func.Blocks[Src].SuccJumps) {
656         uint64_t Dst = Jump->Target;
657         if (Jump->Flow > 0 && !Visited[Dst]) {
658           Queue.push(Dst);
659           Visited[Dst] = true;
660         }
661       }
662     }
663   }
664 
665   /// Find the shortest path from the entry block to an exit block passing
666   /// through a given block.
findShortestPath(uint64_t BlockIdx)667   std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
668     // A path from the entry block to BlockIdx
669     auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
670     // A path from BlockIdx to an exit block
671     auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);
672 
673     // Concatenate the two paths
674     std::vector<FlowJump *> Result;
675     Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
676     Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
677     return Result;
678   }
679 
  /// Apply the Dijkstra algorithm to find the shortest path from a given
  /// Source to a given Target block.
  /// If Target == -1, then the path ends at an exit block.
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
    // Quit early, if possible
    if (Source == Target)
      return std::vector<FlowJump *>();
    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
      return std::vector<FlowJump *>();

    // Initialize data structures
    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
    Distance[Source] = 0;
    // std::set of (distance, block) pairs acts as a priority queue that
    // supports the decrease-key operation via erase + insert.
    std::set<std::pair<uint64_t, uint64_t>> Queue;
    Queue.insert(std::make_pair(Distance[Source], Source));

    // Run the Dijkstra algorithm
    while (!Queue.empty()) {
      uint64_t Src = Queue.begin()->second;
      Queue.erase(Queue.begin());
      // If we found a solution, quit early
      if (Src == Target ||
          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
        break;

      for (auto *Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        int64_t JumpDist = jumpDistance(Jump);
        if (Distance[Dst] > Distance[Src] + JumpDist) {
          // Re-insert Dst with the smaller key (decrease-key)
          Queue.erase(std::make_pair(Distance[Dst], Dst));

          Distance[Dst] = Distance[Src] + JumpDist;
          Parent[Dst] = Jump;

          Queue.insert(std::make_pair(Distance[Dst], Dst));
        }
      }
    }
    // If Target is not provided, find the closest exit block
    if (Target == AnyExitBlock) {
      for (uint64_t I = 0; I < NumBlocks(); I++) {
        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
            Target = I;
          }
        }
      }
    }
    assert(Parent[Target] != nullptr && "a path does not exist");

    // Extract the constructed path
    std::vector<FlowJump *> Result;
    uint64_t Now = Target;
    while (Now != Source) {
      assert(Now == Parent[Now]->Target && "incorrect parent jump");
      Result.push_back(Parent[Now]);
      Now = Parent[Now]->Source;
    }
    // Reverse the path, since it is extracted from Target to Source
    std::reverse(Result.begin(), Result.end());
    return Result;
  }
743 
744   /// A distance of a path for a given jump.
745   /// In order to incite the path to use blocks/jumps with large positive flow,
746   /// and avoid changing branch probability of outgoing edges drastically,
747   /// set the jump distance so as:
748   ///   - to minimize the number of unlikely jumps used and subject to that,
749   ///   - to minimize the number of Flow == 0 jumps used and subject to that,
750   ///   - minimizes total multiplicative Flow increase for the remaining edges.
751   /// To capture this objective with integer distances, we round off fractional
752   /// parts to a multiple of 1 / BaseDistance.
jumpDistance(FlowJump * Jump) const753   int64_t jumpDistance(FlowJump *Jump) const {
754     if (Jump->IsUnlikely)
755       return Params.CostUnlikely;
756     uint64_t BaseDistance =
757         std::max(FlowAdjuster::MinBaseDistance,
758                  std::min(Func.Blocks[Func.Entry].Flow,
759                           Params.CostUnlikely / (2 * (NumBlocks() + 1))));
760     if (Jump->Flow > 0)
761       return BaseDistance + BaseDistance / Jump->Flow;
762     return 2 * BaseDistance * (NumBlocks() + 1);
763   };
764 
  /// The number of blocks in the function.
  uint64_t NumBlocks() const { return Func.Blocks.size(); }
766 
767   /// Rebalance unknown subgraphs so that the flow is split evenly across the
768   /// outgoing branches of every block of the subgraph. The method iterates over
769   /// blocks with known weight and identifies unknown subgraphs rooted at the
770   /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
rebalanceUnknownSubgraphs()771   void rebalanceUnknownSubgraphs() {
772     // Try to find unknown subgraphs from each block
773     for (const FlowBlock &SrcBlock : Func.Blocks) {
774       // Verify if rebalancing rooted at SrcBlock is feasible
775       if (!canRebalanceAtRoot(&SrcBlock))
776         continue;
777 
778       // Find an unknown subgraphs starting at SrcBlock. Along the way,
779       // fill in known destinations and intermediate unknown blocks.
780       std::vector<FlowBlock *> UnknownBlocks;
781       std::vector<FlowBlock *> KnownDstBlocks;
782       findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks);
783 
784       // Verify if rebalancing of the subgraph is feasible. If the search is
785       // successful, find the unique destination block (which can be null)
786       FlowBlock *DstBlock = nullptr;
787       if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks,
788                                 DstBlock))
789         continue;
790 
791       // We cannot rebalance subgraphs containing cycles among unknown blocks
792       if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks))
793         continue;
794 
795       // Rebalance the flow
796       rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks);
797     }
798   }
799 
800   /// Verify if rebalancing rooted at a given block is possible.
canRebalanceAtRoot(const FlowBlock * SrcBlock)801   bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
802     // Do not attempt to find unknown subgraphs from an unknown or a
803     // zero-flow block
804     if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0)
805       return false;
806 
807     // Do not attempt to process subgraphs from a block w/o unknown sucessors
808     bool HasUnknownSuccs = false;
809     for (auto *Jump : SrcBlock->SuccJumps) {
810       if (Func.Blocks[Jump->Target].HasUnknownWeight) {
811         HasUnknownSuccs = true;
812         break;
813       }
814     }
815     if (!HasUnknownSuccs)
816       return false;
817 
818     return true;
819   }
820 
821   /// Find an unknown subgraph starting at block SrcBlock. The method sets
822   /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
findUnknownSubgraph(const FlowBlock * SrcBlock,std::vector<FlowBlock * > & KnownDstBlocks,std::vector<FlowBlock * > & UnknownBlocks)823   void findUnknownSubgraph(const FlowBlock *SrcBlock,
824                            std::vector<FlowBlock *> &KnownDstBlocks,
825                            std::vector<FlowBlock *> &UnknownBlocks) {
826     // Run BFS from SrcBlock and make sure all paths are going through unknown
827     // blocks and end at a known DstBlock
828     auto Visited = BitVector(NumBlocks(), false);
829     std::queue<uint64_t> Queue;
830 
831     Queue.push(SrcBlock->Index);
832     Visited[SrcBlock->Index] = true;
833     while (!Queue.empty()) {
834       auto &Block = Func.Blocks[Queue.front()];
835       Queue.pop();
836       // Process blocks reachable from Block
837       for (auto *Jump : Block.SuccJumps) {
838         // If Jump can be ignored, skip it
839         if (ignoreJump(SrcBlock, nullptr, Jump))
840           continue;
841 
842         uint64_t Dst = Jump->Target;
843         // If Dst has been visited, skip Jump
844         if (Visited[Dst])
845           continue;
846         // Process block Dst
847         Visited[Dst] = true;
848         if (!Func.Blocks[Dst].HasUnknownWeight) {
849           KnownDstBlocks.push_back(&Func.Blocks[Dst]);
850         } else {
851           Queue.push(Dst);
852           UnknownBlocks.push_back(&Func.Blocks[Dst]);
853         }
854       }
855     }
856   }
857 
858   /// Verify if rebalancing of the subgraph is feasible. If the checks are
859   /// successful, set the unique destination block, DstBlock (can be null).
canRebalanceSubgraph(const FlowBlock * SrcBlock,const std::vector<FlowBlock * > & KnownDstBlocks,const std::vector<FlowBlock * > & UnknownBlocks,FlowBlock * & DstBlock)860   bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
861                             const std::vector<FlowBlock *> &KnownDstBlocks,
862                             const std::vector<FlowBlock *> &UnknownBlocks,
863                             FlowBlock *&DstBlock) {
864     // If the list of unknown blocks is empty, we don't need rebalancing
865     if (UnknownBlocks.empty())
866       return false;
867 
868     // If there are multiple known sinks, we can't rebalance
869     if (KnownDstBlocks.size() > 1)
870       return false;
871     DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
872 
873     // Verify sinks of the subgraph
874     for (auto *Block : UnknownBlocks) {
875       if (Block->SuccJumps.empty()) {
876         // If there are multiple (known and unknown) sinks, we can't rebalance
877         if (DstBlock != nullptr)
878           return false;
879         continue;
880       }
881       size_t NumIgnoredJumps = 0;
882       for (auto *Jump : Block->SuccJumps) {
883         if (ignoreJump(SrcBlock, DstBlock, Jump))
884           NumIgnoredJumps++;
885       }
886       // If there is a non-sink block in UnknownBlocks with all jumps ignored,
887       // then we can't rebalance
888       if (NumIgnoredJumps == Block->SuccJumps.size())
889         return false;
890     }
891 
892     return true;
893   }
894 
895   /// Decide whether the Jump is ignored while processing an unknown subgraphs
896   /// rooted at basic block SrcBlock with the destination block, DstBlock.
ignoreJump(const FlowBlock * SrcBlock,const FlowBlock * DstBlock,const FlowJump * Jump)897   bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
898                   const FlowJump *Jump) {
899     // Ignore unlikely jumps with zero flow
900     if (Jump->IsUnlikely && Jump->Flow == 0)
901       return true;
902 
903     auto JumpSource = &Func.Blocks[Jump->Source];
904     auto JumpTarget = &Func.Blocks[Jump->Target];
905 
906     // Do not ignore jumps coming into DstBlock
907     if (DstBlock != nullptr && JumpTarget == DstBlock)
908       return false;
909 
910     // Ignore jumps out of SrcBlock to known blocks
911     if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock)
912       return true;
913 
914     // Ignore jumps to known blocks with zero flow
915     if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0)
916       return true;
917 
918     return false;
919   }
920 
  /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
  /// UnknownBlocks in the topological order (so that all jumps are "forward").
  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                         std::vector<FlowBlock *> &UnknownBlocks) {
    // Extract local in-degrees in the considered subgraph, counting only
    // non-ignored jumps out of SrcBlock and the unknown blocks
    auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
    auto fillInDegree = [&](const FlowBlock *Block) {
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        LocalInDegree[Jump->Target]++;
      }
    };
    fillInDegree(SrcBlock);
    for (auto *Block : UnknownBlocks) {
      fillInDegree(Block);
    }
    // A loop containing SrcBlock
    if (LocalInDegree[SrcBlock->Index] > 0)
      return false;

    // In-degree-based (Kahn-style) topological sort: pop blocks whose local
    // in-degree has dropped to zero; blocks on a cycle never reach zero
    std::vector<FlowBlock *> AcyclicOrder;
    std::queue<uint64_t> Queue;
    Queue.push(SrcBlock->Index);
    while (!Queue.empty()) {
      FlowBlock *Block = &Func.Blocks[Queue.front()];
      Queue.pop();
      // Stop propagation once we reach DstBlock, if any
      if (DstBlock != nullptr && Block == DstBlock)
        break;

      // Keep an acyclic order of unknown blocks
      if (Block->HasUnknownWeight && Block != SrcBlock)
        AcyclicOrder.push_back(Block);

      // Add to the queue all successors with zero local in-degree
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        uint64_t Dst = Jump->Target;
        LocalInDegree[Dst]--;
        if (LocalInDegree[Dst] == 0) {
          Queue.push(Dst);
        }
      }
    }

    // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
    // of all blocks
    if (UnknownBlocks.size() != AcyclicOrder.size())
      return false;
    UnknownBlocks = AcyclicOrder;
    return true;
  }
975 
  /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
  /// having UnknownBlocks intermediate blocks.
  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
                                const FlowBlock *DstBlock,
                                const std::vector<FlowBlock *> &UnknownBlocks) {
    assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");

    // Distribute flow from the source block
    uint64_t BlockFlow = 0;
    // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
    for (auto *Jump : SrcBlock->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockFlow += Jump->Flow;
    }
    rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);

    // Distribute flow from the remaining (unknown) blocks, in the topological
    // order established by isAcyclicSubgraph
    for (auto *Block : UnknownBlocks) {
      assert(Block->HasUnknownWeight && "incorrect unknown subgraph");
      uint64_t BlockFlow = 0;
      // Block's flow is the sum of incoming flows
      for (auto *Jump : Block->PredJumps) {
        BlockFlow += Jump->Flow;
      }
      Block->Flow = BlockFlow;
      rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
    }
  }
1005 
  /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
  /// and ending at DstBlock.
  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                      const FlowBlock *Block, uint64_t BlockFlow) {
    // Count the successor jumps that participate in the rebalancing
    size_t BlockDegree = 0;
    for (auto *Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockDegree++;
    }
    // If all successor jumps of the block are ignored, skip it
    if (DstBlock == nullptr && BlockDegree == 0)
      return;
    assert(BlockDegree > 0 && "all outgoing jumps are ignored");

    // Each of the Block's successors gets the following amount of flow.
    // Rounding the value up so that all flow is propagated
    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
    for (auto *Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      // Cap by the remaining flow so the total assigned equals BlockFlow
      uint64_t Flow = std::min(SuccFlow, BlockFlow);
      Jump->Flow = Flow;
      BlockFlow -= Flow;
    }
    assert(BlockFlow == 0 && "not all flow is propagated");
  }
1034 
  /// A constant indicating an arbitrary exit block of a function.
  static constexpr uint64_t AnyExitBlock = uint64_t(-1);
  /// Minimum BaseDistance for the jump distance values in island joining.
  static constexpr uint64_t MinBaseDistance = 10000;

  /// Params for flow computation.
  const ProfiParams &Params;
  /// The function being adjusted; block/jump flow values are modified in place.
  FlowFunction &Func;
1044 };
1045 
1046 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
1047                                              const FlowBlock &Block);
1048 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
1049                                             const FlowJump &Jump);
1050 
/// Initializing flow network for a given function.
///
/// Every block is split into two nodes that are responsible for (i) an
/// incoming flow, (ii) an outgoing flow; they penalize an increase or a
/// reduction of the block weight.
void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network,
                       FlowFunction &Func) {
  uint64_t NumBlocks = Func.Blocks.size();
  assert(NumBlocks > 1 && "Too few blocks in a function");
  uint64_t NumJumps = Func.Jumps.size();
  assert(NumJumps > 0 && "Too few jumps in a function");

  // Introducing dummy source/sink pairs to allow flow circulation.
  // The nodes corresponding to blocks of the function have indices in
  // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the
  // next four values.
  uint64_t S = 2 * NumBlocks;
  uint64_t T = S + 1;
  uint64_t S1 = S + 2;
  uint64_t T1 = S + 3;

  // S1/T1 serve as the network's source/sink pair
  Network.initialize(2 * NumBlocks + 4, S1, T1);

  // Initialize nodes of the flow network
  for (uint64_t B = 0; B < NumBlocks; B++) {
    auto &Block = Func.Blocks[B];

    // Split every block into two auxiliary nodes to allow
    // increase/reduction of the block count.
    uint64_t Bin = 2 * B;
    uint64_t Bout = 2 * B + 1;

    // Edges from S and to T
    if (Block.isEntry()) {
      Network.addEdge(S, Bin, 0);
    } else if (Block.isExit()) {
      Network.addEdge(Bout, T, 0);
    }

    // Assign costs for increasing/decreasing the block counts
    auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params, Block);

    // Add the corresponding edges to the network: a forward edge penalizing
    // count increase and, for observed weights, a capacitated backward edge
    // penalizing count decrease plus supply/demand edges carrying the weight
    Network.addEdge(Bin, Bout, AuxCostInc);
    if (Block.Weight > 0) {
      Network.addEdge(Bout, Bin, Block.Weight, AuxCostDec);
      Network.addEdge(S1, Bout, Block.Weight, 0);
      Network.addEdge(Bin, T1, Block.Weight, 0);
    }
  }

  // Initialize edges of the flow network
  for (uint64_t J = 0; J < NumJumps; J++) {
    auto &Jump = Func.Jumps[J];

    // Get the endpoints corresponding to the jump: from the source block's
    // "out" node to the target block's "in" node
    uint64_t Jin = 2 * Jump.Source + 1;
    uint64_t Jout = 2 * Jump.Target;

    // Assign costs for increasing/decreasing the jump counts
    auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump);

    // Add the corresponding edges to the network
    Network.addEdge(Jin, Jout, AuxCostInc);
    if (Jump.Weight > 0) {
      Network.addEdge(Jout, Jin, Jump.Weight, AuxCostDec);
      Network.addEdge(S1, Jout, Jump.Weight, 0);
      Network.addEdge(Jin, T1, Jump.Weight, 0);
    }
  }

  // Make sure we have a valid flow circulation
  Network.addEdge(T, S, 0);
}
1125 
1126 /// Assign costs for increasing/decreasing the block counts.
assignBlockCosts(const ProfiParams & Params,const FlowBlock & Block)1127 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
1128                                              const FlowBlock &Block) {
1129   // Modifying the weight of an unlikely block is expensive
1130   if (Block.IsUnlikely)
1131     return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1132 
1133   // Assign default values for the costs
1134   int64_t CostInc = Params.CostBlockInc;
1135   int64_t CostDec = Params.CostBlockDec;
1136   // Update the costs depending on the block metadata
1137   if (Block.HasUnknownWeight) {
1138     CostInc = Params.CostBlockUnknownInc;
1139     CostDec = 0;
1140   } else {
1141     // Increasing the count for "cold" blocks with zero initial count is more
1142     // expensive than for "hot" ones
1143     if (Block.Weight == 0)
1144       CostInc = Params.CostBlockZeroInc;
1145     // Modifying the count of the entry block is expensive
1146     if (Block.isEntry()) {
1147       CostInc = Params.CostBlockEntryInc;
1148       CostDec = Params.CostBlockEntryDec;
1149     }
1150   }
1151   return std::make_pair(CostInc, CostDec);
1152 }
1153 
1154 /// Assign costs for increasing/decreasing the jump counts.
assignJumpCosts(const ProfiParams & Params,const FlowJump & Jump)1155 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
1156                                             const FlowJump &Jump) {
1157   // Modifying the weight of an unlikely jump is expensive
1158   if (Jump.IsUnlikely)
1159     return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1160 
1161   // Assign default values for the costs
1162   int64_t CostInc = Params.CostJumpInc;
1163   int64_t CostDec = Params.CostJumpDec;
1164   // Update the costs depending on the block metadata
1165   if (Jump.Source + 1 == Jump.Target) {
1166     // Adjusting the fall-through branch
1167     CostInc = Params.CostJumpFTInc;
1168     CostDec = Params.CostJumpFTDec;
1169   }
1170   if (Jump.HasUnknownWeight) {
1171     // The cost is different for fall-through and non-fall-through branches
1172     if (Jump.Source + 1 == Jump.Target)
1173       CostInc = Params.CostJumpUnknownFTInc;
1174     else
1175       CostInc = Params.CostJumpUnknownInc;
1176     CostDec = 0;
1177   } else {
1178     assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight");
1179   }
1180   return std::make_pair(CostInc, CostDec);
1181 }
1182 
1183 /// Extract resulting block and edge counts from the flow network.
extractWeights(const ProfiParams & Params,MinCostMaxFlow & Network,FlowFunction & Func)1184 void extractWeights(const ProfiParams &Params, MinCostMaxFlow &Network,
1185                     FlowFunction &Func) {
1186   uint64_t NumBlocks = Func.Blocks.size();
1187   uint64_t NumJumps = Func.Jumps.size();
1188 
1189   // Extract resulting jump counts
1190   for (uint64_t J = 0; J < NumJumps; J++) {
1191     auto &Jump = Func.Jumps[J];
1192     uint64_t SrcOut = 2 * Jump.Source + 1;
1193     uint64_t DstIn = 2 * Jump.Target;
1194 
1195     int64_t Flow = 0;
1196     int64_t AuxFlow = Network.getFlow(SrcOut, DstIn);
1197     if (Jump.Source != Jump.Target)
1198       Flow = int64_t(Jump.Weight) + AuxFlow;
1199     else
1200       Flow = int64_t(Jump.Weight) + (AuxFlow > 0 ? AuxFlow : 0);
1201 
1202     Jump.Flow = Flow;
1203     assert(Flow >= 0 && "negative jump flow");
1204   }
1205 
1206   // Extract resulting block counts
1207   auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1208   auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1209   for (auto &Jump : Func.Jumps) {
1210     InFlow[Jump.Target] += Jump.Flow;
1211     OutFlow[Jump.Source] += Jump.Flow;
1212   }
1213   for (uint64_t B = 0; B < NumBlocks; B++) {
1214     auto &Block = Func.Blocks[B];
1215     Block.Flow = std::max(OutFlow[B], InFlow[B]);
1216   }
1217 }
1218 
1219 #ifndef NDEBUG
/// Verify that the provided block/jump weights are as expected.
void verifyInput(const FlowFunction &Func) {
  // Verify entry and exit blocks: block 0 is the unique entry, and at least
  // one other block is an exit
  assert(Func.Entry == 0 && Func.Blocks[0].isEntry());
  size_t NumExitBlocks = 0;
  for (size_t I = 1; I < Func.Blocks.size(); I++) {
    assert(!Func.Blocks[I].isEntry() && "multiple entry blocks");
    if (Func.Blocks[I].isExit())
      NumExitBlocks++;
  }
  assert(NumExitBlocks > 0 && "cannot find exit blocks");

  // Verify that there are no parallel edges
  for (auto &Block : Func.Blocks) {
    std::unordered_set<uint64_t> UniqueSuccs;
    for (auto &Jump : Block.SuccJumps) {
      auto It = UniqueSuccs.insert(Jump->Target);
      assert(It.second && "input CFG contains parallel edges");
    }
  }
  // Verify that no block is simultaneously an entry and an exit
  for (auto &Block : Func.Blocks) {
    assert((!Block.isEntry() || !Block.isExit()) &&
           "a block cannot be an entry and an exit");
  }
  // Verify input block weights: unknown-weight blocks carry zero weight,
  // except possibly the entry
  for (auto &Block : Func.Blocks) {
    assert((!Block.HasUnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
           "non-zero weight of a block w/o weight except for an entry");
  }
  // Verify input jump weights: unknown-weight jumps carry zero weight
  for (auto &Jump : Func.Jumps) {
    assert((!Jump.HasUnknownWeight || Jump.Weight == 0) &&
           "non-zero weight of a jump w/o weight");
  }
}
1256 
/// Verify that the computed flow values satisfy flow conservation rules.
void verifyOutput(const FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();
  // Accumulate per-block incoming and outgoing jump flow
  auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
  auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
  for (const auto &Jump : Func.Jumps) {
    InFlow[Jump.Target] += Jump.Flow;
    OutFlow[Jump.Source] += Jump.Flow;
  }

  // Check conservation: entry blocks match their outgoing flow, exit blocks
  // match their incoming flow, interior blocks match both
  uint64_t TotalInFlow = 0;
  uint64_t TotalOutFlow = 0;
  for (uint64_t I = 0; I < NumBlocks; I++) {
    auto &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      TotalInFlow += Block.Flow;
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
    } else if (Block.isExit()) {
      TotalOutFlow += Block.Flow;
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    } else {
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    }
  }
  assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");

  // Verify that there are no isolated flow components
  // One could modify FlowFunction to hold edges indexed by the sources, which
  // will avoid a creation of the object
  auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
  for (const auto &Jump : Func.Jumps) {
    if (Jump.Flow > 0) {
      PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
    }
  }

  // Run BFS from the source along edges with positive flow
  std::queue<uint64_t> Queue;
  auto Visited = BitVector(NumBlocks, false);
  Queue.push(Func.Entry);
  Visited[Func.Entry] = true;
  while (!Queue.empty()) {
    uint64_t Src = Queue.front();
    Queue.pop();
    for (uint64_t Dst : PositiveFlowEdges[Src]) {
      if (!Visited[Dst]) {
        Queue.push(Dst);
        Visited[Dst] = true;
      }
    }
  }

  // Verify that every block that has a positive flow is reached from the source
  // along edges with a positive flow
  for (uint64_t I = 0; I < NumBlocks; I++) {
    auto &Block = Func.Blocks[I];
    assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
  }
}
1317 #endif
1318 
1319 } // end of anonymous namespace
1320 
1321 /// Apply the profile inference algorithm for a given function and provided
1322 /// profi options
applyFlowInference(const ProfiParams & Params,FlowFunction & Func)1323 void llvm::applyFlowInference(const ProfiParams &Params, FlowFunction &Func) {
1324   // Check if the function has samples and assign initial flow values
1325   bool HasSamples = false;
1326   for (FlowBlock &Block : Func.Blocks) {
1327     if (Block.Weight > 0)
1328       HasSamples = true;
1329     Block.Flow = Block.Weight;
1330   }
1331   for (FlowJump &Jump : Func.Jumps) {
1332     if (Jump.Weight > 0)
1333       HasSamples = true;
1334     Jump.Flow = Jump.Weight;
1335   }
1336 
1337   // Quit early for functions with a single block or ones w/o samples
1338   if (Func.Blocks.size() <= 1 || !HasSamples)
1339     return;
1340 
1341 #ifndef NDEBUG
1342   // Verify the input data
1343   verifyInput(Func);
1344 #endif
1345 
1346   // Create and apply an inference network model
1347   auto InferenceNetwork = MinCostMaxFlow(Params);
1348   initializeNetwork(Params, InferenceNetwork, Func);
1349   InferenceNetwork.run();
1350 
1351   // Extract flow values for every block and every edge
1352   extractWeights(Params, InferenceNetwork, Func);
1353 
1354   // Post-processing adjustments to the flow
1355   auto Adjuster = FlowAdjuster(Params, Func);
1356   Adjuster.run();
1357 
1358 #ifndef NDEBUG
1359   // Verify the result
1360   verifyOutput(Func);
1361 #endif
1362 }
1363 
/// Apply the profile inference algorithm for a given flow function,
/// configuring the params from the command-line flags.
void llvm::applyFlowInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  // NOTE(review): only the block-related costs are taken from the flags here;
  // the jump-related costs (CostJump*) keep their ProfiParams defaults --
  // confirm this is intended.
  Params.EvenFlowDistribution = SampleProfileEvenFlowDistribution;
  Params.RebalanceUnknown = SampleProfileRebalanceUnknown;
  Params.JoinIslands = SampleProfileJoinIslands;
  Params.CostBlockInc = SampleProfileProfiCostBlockInc;
  Params.CostBlockDec = SampleProfileProfiCostBlockDec;
  Params.CostBlockEntryInc = SampleProfileProfiCostBlockEntryInc;
  Params.CostBlockEntryDec = SampleProfileProfiCostBlockEntryDec;
  Params.CostBlockZeroInc = SampleProfileProfiCostBlockZeroInc;
  Params.CostBlockUnknownInc = SampleProfileProfiCostBlockUnknownInc;

  applyFlowInference(Params, Func);
}
1380