//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a profile inference algorithm. Given incomplete and
// possibly imprecise block counts, the algorithm reconstructs realistic block
// and edge counts that satisfy flow conservation rules, while minimally
// modifying the input block counts.
//
//===----------------------------------------------------------------------===//
15
16 #include "llvm/Transforms/Utils/SampleProfileInference.h"
17 #include "llvm/ADT/BitVector.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
20 #include <queue>
21 #include <set>
22 #include <stack>
23 #include <unordered_set>
24
25 using namespace llvm;
26 #define DEBUG_TYPE "sample-profile-inference"
27
namespace {

// Tuning knobs for the profile-inference ("profi") algorithm. All options are
// cl::Hidden: they exist for debugging and experimentation, not end users.

// Whether to spread augmented flow among all equally-shortest alternatives
// instead of committing it to a single augmenting path.
static cl::opt<bool> SampleProfileEvenFlowDistribution(
    "sample-profile-even-flow-distribution", cl::init(true), cl::Hidden,
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."));

// Whether to post-process "unknown subgraphs" (blocks with no samples) so
// their branches split the incoming flow evenly.
static cl::opt<bool> SampleProfileRebalanceUnknown(
    "sample-profile-rebalance-unknown", cl::init(true), cl::Hidden,
    cl::desc("Evenly re-distribute flow among unknown subgraphs."));

// Whether to connect positive-flow components unreachable from the entry.
static cl::opt<bool> SampleProfileJoinIslands(
    "sample-profile-join-islands", cl::init(true), cl::Hidden,
    cl::desc("Join isolated components having positive flow."));

// Per-unit costs used to build the min-cost flow network. Increasing a count
// is cheaper than decreasing one (10 vs 20 by default), and the entry block
// is the most expensive to inflate (40).
static cl::opt<unsigned> SampleProfileProfiCostBlockInc(
    "sample-profile-profi-cost-block-inc", cl::init(10), cl::Hidden,
    cl::desc("The cost of increasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockDec(
    "sample-profile-profi-cost-block-dec", cl::init(20), cl::Hidden,
    cl::desc("The cost of decreasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryInc(
    "sample-profile-profi-cost-block-entry-inc", cl::init(40), cl::Hidden,
    cl::desc("The cost of increasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryDec(
    "sample-profile-profi-cost-block-entry-dec", cl::init(10), cl::Hidden,
    cl::desc("The cost of decreasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockZeroInc(
    "sample-profile-profi-cost-block-zero-inc", cl::init(11), cl::Hidden,
    cl::desc("The cost of increasing a count of zero-weight block by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockUnknownInc(
    "sample-profile-profi-cost-block-unknown-inc", cl::init(0), cl::Hidden,
    cl::desc("The cost of increasing an unknown block's count by one."));

/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution.
static constexpr int64_t INF = ((int64_t)1) << 50;
71
/// The minimum-cost maximum flow algorithm.
///
/// The algorithm finds the maximum flow of minimum cost on a given (directed)
/// network using a modified version of the classical Moore-Bellman-Ford
/// approach. The algorithm applies a number of augmentation iterations in which
/// flow is sent along paths of positive capacity from the source to the sink.
/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
/// m is the number of edges, n is the number of vertices, and v(f) is the
/// value of the maximum flow. However, the observed running time on typical
/// instances is sub-quadratic, that is, o(n^2).
///
/// The input is a set of edges with specified costs and capacities, and a pair
/// of nodes (source and sink). The output is the flow along each edge of the
/// minimum total cost respecting the given edge capacities.
class MinCostMaxFlow {
public:
  MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {}

  // Initialize algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
    Source = SourceNode;
    Target = SinkNode;

    Nodes = std::vector<Node>(NodeCount);
    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
    // The per-node augmenting-edge lists are needed only for the even flow
    // distribution (DAG augmentation) mode.
    if (Params.EvenFlowDistribution)
      AugmentingEdges =
          std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
  }

  // Run the algorithm; returns the total cost of the computed min-cost flow.
  int64_t run() {
    LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n");

    // Iteratively find an augmentation path/dag in the network and send the
    // flow along its edges
    size_t AugmentationIters = applyFlowAugmentation();

    // Compute the total flow and its cost
    int64_t TotalCost = 0;
    int64_t TotalFlow = 0;
    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
      for (auto &Edge : Edges[Src]) {
        if (Edge.Flow > 0) {
          TotalCost += Edge.Cost * Edge.Flow;
          if (Src == Source)
            TotalFlow += Edge.Flow;
        }
      }
    }
    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
                      << " iterations with " << TotalFlow << " total flow"
                      << " of " << TotalCost << " cost\n");
    // TotalFlow and AugmentationIters are only used in the debug output above.
    (void)TotalFlow;
    (void)AugmentationIters;
    return TotalCost;
  }

  /// Adding an edge to the network with a specified capacity and a cost.
  /// Multiple edges between a pair of nodes are allowed but self-edges
  /// are not supported.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
    assert(Capacity > 0 && "adding an edge of zero capacity");
    assert(Src != Dst && "loop edge are not supported");

    Edge SrcEdge;
    SrcEdge.Dst = Dst;
    SrcEdge.Cost = Cost;
    SrcEdge.Capacity = Capacity;
    SrcEdge.Flow = 0;
    SrcEdge.RevEdgeIndex = Edges[Dst].size();

    // The residual (reverse) edge carries zero capacity and negated cost;
    // pushing flow on SrcEdge frees up capacity on DstEdge and vice versa.
    Edge DstEdge;
    DstEdge.Dst = Src;
    DstEdge.Cost = -Cost;
    DstEdge.Capacity = 0;
    DstEdge.Flow = 0;
    DstEdge.RevEdgeIndex = Edges[Src].size();

    Edges[Src].push_back(SrcEdge);
    Edges[Dst].push_back(DstEdge);
  }

  /// Adding an edge to the network of infinite capacity and a given cost.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
    addEdge(Src, Dst, INF, Cost);
  }

  /// Get the total flow from a given source node.
  /// Returns a list of pairs (target node, amount of flow to the target).
  std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
    std::vector<std::pair<uint64_t, int64_t>> Flow;
    for (const auto &Edge : Edges[Src]) {
      if (Edge.Flow > 0)
        Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
    }
    return Flow;
  }

  /// Get the total flow between a pair of nodes.
  /// Sums over all (parallel) edges from Src to Dst.
  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
    int64_t Flow = 0;
    for (const auto &Edge : Edges[Src]) {
      if (Edge.Dst == Dst) {
        Flow += Edge.Flow;
      }
    }
    return Flow;
  }

private:
  /// Iteratively find an augmentation path/dag in the network and send the
  /// flow along its edges. The method returns the number of applied iterations.
  size_t applyFlowAugmentation() {
    size_t AugmentationIters = 0;
    while (findAugmentingPath()) {
      uint64_t PathCapacity = computeAugmentingPathCapacity();
      while (PathCapacity > 0) {
        bool Progress = false;
        if (Params.EvenFlowDistribution) {
          // Identify node/edge candidates for augmentation
          identifyShortestEdges(PathCapacity);

          // Find an augmenting DAG
          auto AugmentingOrder = findAugmentingDAG();

          // Apply the DAG augmentation
          Progress = augmentFlowAlongDAG(AugmentingOrder);
          PathCapacity = computeAugmentingPathCapacity();
        }

        // Fall back to plain single-path augmentation when the DAG step
        // failed to saturate any edge (or even distribution is disabled).
        if (!Progress) {
          augmentFlowAlongPath(PathCapacity);
          PathCapacity = 0;
        }

        AugmentationIters++;
      }
    }
    return AugmentationIters;
  }

  /// Compute the capacity of the canonical augmenting path. If the path is
  /// saturated (that is, no flow can be sent along the path), then return 0.
  uint64_t computeAugmentingPathCapacity() {
    uint64_t PathCapacity = INF;
    uint64_t Now = Target;
    // Walk the parent pointers from Target back to Source, tracking the
    // minimum residual capacity along the path.
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];

      assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
      uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
      PathCapacity = std::min(PathCapacity, EdgeCapacity);

      Now = Pred;
    }
    return PathCapacity;
  }

  /// Check for existence of an augmenting path with a positive capacity.
  /// On success, Nodes[*].ParentNode/ParentEdgeIndex encode the path.
  bool findAugmentingPath() {
    // Initialize data structures
    for (auto &Node : Nodes) {
      Node.Distance = INF;
      Node.ParentNode = uint64_t(-1);
      Node.ParentEdgeIndex = uint64_t(-1);
      Node.Taken = false;
    }

    std::queue<uint64_t> Queue;
    Queue.push(Source);
    Nodes[Source].Distance = 0;
    Nodes[Source].Taken = true;
    while (!Queue.empty()) {
      uint64_t Src = Queue.front();
      Queue.pop();
      Nodes[Src].Taken = false;
      // Although the residual network contains edges with negative costs
      // (in particular, backward edges), it can be shown that there are no
      // negative-weight cycles and the following two invariants are maintained:
      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
      // where Dist is the length of the shortest path between two nodes. This
      // allows to prune the search-space of the path-finding algorithm using
      // the following early-stop criteria:
      // -- If we find a path with zero-distance from Source to Target, stop the
      //    search, as the path is the shortest since Dist[Source, Target] >= 0;
      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
      //    process node V, as it is guaranteed _not_ to be on a shortest path
      //    from Source to Target; it follows from inequalities
      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
      //                         >= Dist[Source, V]
      if (!Params.EvenFlowDistribution && Nodes[Target].Distance == 0)
        break;
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      // Process adjacent edges
      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
        auto &Edge = Edges[Src][EdgeIdx];
        // Only edges with remaining residual capacity can be relaxed.
        if (Edge.Flow < Edge.Capacity) {
          uint64_t Dst = Edge.Dst;
          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
          if (Nodes[Dst].Distance > NewDistance) {
            // Update the distance and the parent node/edge
            Nodes[Dst].Distance = NewDistance;
            Nodes[Dst].ParentNode = Src;
            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
            // Add the node to the queue, if it is not there yet
            if (!Nodes[Dst].Taken) {
              Queue.push(Dst);
              Nodes[Dst].Taken = true;
            }
          }
        }
      }
    }

    return Nodes[Target].Distance != INF;
  }

  /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting path");
    uint64_t Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];

      // Sending flow forward frees the same amount on the reverse edge.
      Edge.Flow += PathCapacity;
      RevEdge.Flow -= PathCapacity;

      Now = Pred;
    }
  }

  /// Find an Augmenting DAG order using a modified version of DFS in which we
  /// can visit a node multiple times. In the DFS search, when scanning each
  /// edge out of a node, continue search at Edge.Dst endpoint if it has not
  /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
  /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
  /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
  /// that starts with Source and ends with Target.
  std::vector<uint64_t> findAugmentingDAG() {
    // We use a stack based implementation of DFS to avoid recursion.
    // Defining DFS data structures:
    // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
    //  - we are currently visiting Nodes[NodeIdx] and
    //  - the next edge to scan is Edges[NodeIdx][EdgeIdx]
    typedef std::pair<uint64_t, uint64_t> StackItemType;
    std::stack<StackItemType> Stack;
    std::vector<uint64_t> AugmentingOrder;

    // Phase 0: Initialize Node attributes and Time for DFS run
    for (auto &Node : Nodes) {
      Node.Discovery = 0;
      Node.Finish = 0;
      Node.NumCalls = 0;
      Node.Taken = false;
    }
    uint64_t Time = 0;
    // Mark Target as Taken
    // Taken attribute will be propagated backwards from Target towards Source
    Nodes[Target].Taken = true;

    // Phase 1: Start DFS traversal from Source
    Stack.emplace(Source, 0);
    Nodes[Source].Discovery = ++Time;
    while (!Stack.empty()) {
      auto NodeIdx = Stack.top().first;
      auto EdgeIdx = Stack.top().second;

      // If we haven't scanned all edges out of NodeIdx, continue scanning
      if (EdgeIdx < Edges[NodeIdx].size()) {
        auto &Edge = Edges[NodeIdx][EdgeIdx];
        auto &Dst = Nodes[Edge.Dst];
        Stack.top().second++;

        if (Edge.OnShortestPath) {
          // If we haven't seen Edge.Dst so far, continue DFS search there
          if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) {
            Dst.Discovery = ++Time;
            Stack.emplace(Edge.Dst, 0);
            Dst.NumCalls++;
          } else if (Dst.Taken && Dst.Finish != 0) {
            // Else, if Edge.Dst is already finished and known to reach Target,
            // then NodeIdx reaches Target through this edge as well
            Nodes[NodeIdx].Taken = true;
          }
        }
      } else {
        // If we are done scanning all edges out of NodeIdx
        Stack.pop();
        // If we haven't found a path from NodeIdx to Target, forget about it
        if (!Nodes[NodeIdx].Taken) {
          Nodes[NodeIdx].Discovery = 0;
        } else {
          // If we have found a path from NodeIdx to Target, then finish NodeIdx
          // and propagate Taken flag to DFS parent unless at the Source
          Nodes[NodeIdx].Finish = ++Time;
          // NodeIdx == Source if and only if the stack is empty
          if (NodeIdx != Source) {
            assert(!Stack.empty() && "empty stack while running dfs");
            Nodes[Stack.top().first].Taken = true;
          }
          AugmentingOrder.push_back(NodeIdx);
        }
      }
    }
    // Nodes are collected decreasing Finish time, so the order is reversed
    std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());

    // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
    for (size_t Src : AugmentingOrder) {
      AugmentingEdges[Src].clear();
      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        // A forward edge goes from a larger Finish time to a smaller one,
        // which guarantees acyclicity of the extracted subgraph.
        if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
            Nodes[Dst].Finish < Nodes[Src].Finish) {
          AugmentingEdges[Src].push_back(&Edge);
        }
      }
      assert((Src == Target || !AugmentingEdges[Src].empty()) &&
             "incorrectly constructed augmenting edges");
    }

    return AugmentingOrder;
  }

  /// Update the current flow along the given (acyclic) subgraph specified by
  /// the vertex order, AugmentingOrder. The objective is to send as much flow
  /// as possible while evenly distributing flow among successors of each node.
  /// After the update at least one edge is saturated.
  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
    // Phase 0: Initialization
    for (uint64_t Src : AugmentingOrder) {
      Nodes[Src].FracFlow = 0;
      Nodes[Src].IntFlow = 0;
      for (auto &Edge : AugmentingEdges[Src]) {
        Edge->AugmentedFlow = 0;
      }
    }

    // Phase 1: Send a unit of fractional flow along the DAG; this determines
    // the largest integral amount, MaxFlowAmount, scalable without exceeding
    // any finite edge capacity.
    uint64_t MaxFlowAmount = INF;
    Nodes[Source].FracFlow = 1.0;
    for (uint64_t Src : AugmentingOrder) {
      assert((Src == Target || Nodes[Src].FracFlow > 0.0) &&
             "incorrectly computed fractional flow");
      // Distribute flow evenly among successors of Src
      uint64_t Degree = AugmentingEdges[Src].size();
      for (auto &Edge : AugmentingEdges[Src]) {
        double EdgeFlow = Nodes[Src].FracFlow / Degree;
        Nodes[Edge->Dst].FracFlow += EdgeFlow;
        if (Edge->Capacity == INF)
          continue;
        uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
        MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
      }
    }
    // Stop early if we cannot send any (integral) flow from Source to Target
    if (MaxFlowAmount == 0)
      return false;

    // Phase 2: Send an integral flow of MaxFlowAmount
    Nodes[Source].IntFlow = MaxFlowAmount;
    for (uint64_t Src : AugmentingOrder) {
      if (Src == Target)
        break;
      // Distribute flow evenly among successors of Src, rounding up to make
      // sure all flow is sent
      uint64_t Degree = AugmentingEdges[Src].size();
      // We are guaranteed that Node[Src].IntFlow <= SuccFlow * Degree
      uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
        EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
        Nodes[Dst].IntFlow += EdgeFlow;
        Nodes[Src].IntFlow -= EdgeFlow;
        Edge->AugmentedFlow += EdgeFlow;
      }
    }
    assert(Nodes[Target].IntFlow <= MaxFlowAmount);
    Nodes[Target].IntFlow = 0;

    // Phase 3: Send excess flow back traversing the nodes backwards.
    // Because of rounding, not all flow can be sent along the edges of Src.
    // Hence, sending the remaining flow back to maintain flow conservation
    for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
      uint64_t Src = AugmentingOrder[Idx - 1];
      // Try to send excess flow back along each edge.
      // Make sure we only send back flow we just augmented (AugmentedFlow).
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        if (Nodes[Dst].IntFlow == 0)
          continue;
        uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
        Nodes[Dst].IntFlow -= EdgeFlow;
        Nodes[Src].IntFlow += EdgeFlow;
        Edge->AugmentedFlow -= EdgeFlow;
      }
    }

    // Phase 4: Update flow values along all edges
    bool HasSaturatedEdges = false;
    for (uint64_t Src : AugmentingOrder) {
      // Verify that we have sent all the excess flow from the node
      assert(Src == Source || Nodes[Src].IntFlow == 0);
      for (auto &Edge : AugmentingEdges[Src]) {
        assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
        // Update flow values along the edge and its reverse copy
        auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
        Edge->Flow += Edge->AugmentedFlow;
        RevEdge.Flow -= Edge->AugmentedFlow;
        if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
          HasSaturatedEdges = true;
      }
    }

    // The augmentation is successful iff at least one edge becomes saturated
    return HasSaturatedEdges;
  }

  /// Identify candidate (shortest) edges for augmentation; sets
  /// Edge.OnShortestPath for every edge in the network.
  void identifyShortestEdges(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
    // To make sure the augmentation DAG contains only edges with large residual
    // capacity, we prune all edges whose capacity is below a fraction of
    // the capacity of the augmented path.
    // (All edges of the path itself are always in the DAG)
    uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));

    // Decide which edges are on a shortest path from Source to Target
    for (size_t Src = 0; Src < Nodes.size(); Src++) {
      // An edge cannot be augmenting if the endpoint has large distance
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        // An edge is on some shortest path iff it is tight:
        // Dist[Dst] == Dist[Src] + Cost, and it has enough residual capacity.
        Edge.OnShortestPath =
            Src != Target && Dst != Source &&
            Nodes[Dst].Distance <= Nodes[Target].Distance &&
            Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
            Edge.Capacity > Edge.Flow &&
            uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
      }
    }
  }

  /// Maximum number of DFS iterations for DAG finding.
  static constexpr uint64_t MaxDfsCalls = 10;

  /// A node in a flow network.
  struct Node {
    /// The cost of the cheapest path from the source to the current node.
    int64_t Distance;
    /// The node preceding the current one in the path.
    uint64_t ParentNode;
    /// The index of the edge between ParentNode and the current node.
    uint64_t ParentEdgeIndex;
    /// An indicator of whether the current node is in a queue.
    bool Taken;

    /// Data fields utilized in DAG-augmentation:
    /// Fractional flow.
    double FracFlow;
    /// Integral flow.
    uint64_t IntFlow;
    /// Discovery time.
    uint64_t Discovery;
    /// Finish time.
    uint64_t Finish;
    /// Number of times the DFS has entered the node (bounded by MaxDfsCalls).
    uint64_t NumCalls;
  };

  /// An edge in a flow network.
  struct Edge {
    /// The cost of the edge.
    int64_t Cost;
    /// The capacity of the edge.
    int64_t Capacity;
    /// The current flow on the edge.
    int64_t Flow;
    /// The destination node of the edge.
    uint64_t Dst;
    /// The index of the reverse edge between Dst and the current node.
    uint64_t RevEdgeIndex;

    /// Data fields utilized in DAG-augmentation:
    /// Whether the edge is currently on a shortest path from Source to Target.
    bool OnShortestPath;
    /// Extra flow along the edge.
    uint64_t AugmentedFlow;
  };

  /// The set of network nodes.
  std::vector<Node> Nodes;
  /// The set of network edges.
  std::vector<std::vector<Edge>> Edges;
  /// Source node of the flow.
  uint64_t Source;
  /// Target (sink) node of the flow.
  uint64_t Target;
  /// Augmenting edges.
  std::vector<std::vector<Edge *>> AugmentingEdges;
  /// Params for flow computation.
  const ProfiParams &Params;
};
583
/// A post-processing adjustment of the control flow. It applies two steps by
/// rerouting some flow and making it more realistic:
///
/// - First, it removes all isolated components ("islands") with a positive flow
///   that are unreachable from the entry block. For every such component, we
///   find the shortest path from the entry to an exit passing through the
///   component, and increase the flow by one unit along the path.
///
/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
///   with no sampled counts. Then it rebalances the flow that goes through such
///   a subgraph so that each branch is taken with probability 50%.
///   An unknown subgraph is such that for every two nodes u and v:
///     - u dominates v and u is not unknown;
///     - v post-dominates u; and
///     - all inner-nodes of all (u,v)-paths are unknown.
///
600 class FlowAdjuster {
601 public:
  /// Construct an adjuster over a function; both references are stored, so
  /// they must outlive this object.
  FlowAdjuster(const ProfiParams &Params, FlowFunction &Func)
      : Params(Params), Func(Func) {}
604
  /// Apply the post-processing: joining isolated positive-flow components and
  /// rebalancing unknown subgraphs, each gated by the corresponding parameter.
  void run() {
    if (Params.JoinIslands) {
      // Adjust the flow to get rid of isolated components
      joinIsolatedComponents();
    }

    if (Params.RebalanceUnknown) {
      // Rebalance the flow inside unknown subgraphs
      rebalanceUnknownSubgraphs();
    }
  }
617
618 private:
  /// Attach isolated positive-flow components to the main flow: for every
  /// block with positive flow that is unreachable from the entry via
  /// positive-flow jumps, send one extra unit of flow from the entry to an
  /// exit through that block.
  void joinIsolatedComponents() {
    // Find blocks that are reachable from the source
    auto Visited = BitVector(NumBlocks(), false);
    findReachable(Func.Entry, Visited);

    // Iterate over all non-reachable blocks and adjust their weights
    for (uint64_t I = 0; I < NumBlocks(); I++) {
      auto &Block = Func.Blocks[I];
      if (Block.Flow > 0 && !Visited[I]) {
        // Find a path from the entry to an exit passing through the block I
        auto Path = findShortestPath(I);
        // Increase the flow along the path
        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
               "incorrectly computed path adjusting control flow");
        Func.Blocks[Func.Entry].Flow += 1;
        for (auto &Jump : Path) {
          Jump->Flow += 1;
          Func.Blocks[Jump->Target].Flow += 1;
          // Update reachability incrementally: the path's jumps now carry
          // positive flow, so blocks along it become reachable.
          findReachable(Jump->Target, Visited);
        }
      }
    }
  }
643
644 /// Run BFS from a given block along the jumps with a positive flow and mark
645 /// all reachable blocks.
findReachable(uint64_t Src,BitVector & Visited)646 void findReachable(uint64_t Src, BitVector &Visited) {
647 if (Visited[Src])
648 return;
649 std::queue<uint64_t> Queue;
650 Queue.push(Src);
651 Visited[Src] = true;
652 while (!Queue.empty()) {
653 Src = Queue.front();
654 Queue.pop();
655 for (auto *Jump : Func.Blocks[Src].SuccJumps) {
656 uint64_t Dst = Jump->Target;
657 if (Jump->Flow > 0 && !Visited[Dst]) {
658 Queue.push(Dst);
659 Visited[Dst] = true;
660 }
661 }
662 }
663 }
664
665 /// Find the shortest path from the entry block to an exit block passing
666 /// through a given block.
findShortestPath(uint64_t BlockIdx)667 std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
668 // A path from the entry block to BlockIdx
669 auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
670 // A path from BlockIdx to an exit block
671 auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);
672
673 // Concatenate the two paths
674 std::vector<FlowJump *> Result;
675 Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
676 Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
677 return Result;
678 }
679
  /// Apply the Dijkstra algorithm to find the shortest path from a given
  /// Source to a given Target block.
  /// If Target == AnyExitBlock (-1), then the path ends at the closest exit
  /// block. Edge lengths are given by jumpDistance().
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
    // Quit early, if possible
    if (Source == Target)
      return std::vector<FlowJump *>();
    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
      return std::vector<FlowJump *>();

    // Initialize data structures
    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
    Distance[Source] = 0;
    // The ordered set acts as a min-priority queue keyed by (distance, block).
    std::set<std::pair<uint64_t, uint64_t>> Queue;
    Queue.insert(std::make_pair(Distance[Source], Source));

    // Run the Dijkstra algorithm
    while (!Queue.empty()) {
      uint64_t Src = Queue.begin()->second;
      Queue.erase(Queue.begin());
      // If we found a solution, quit early
      if (Src == Target ||
          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
        break;

      for (auto *Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        int64_t JumpDist = jumpDistance(Jump);
        if (Distance[Dst] > Distance[Src] + JumpDist) {
          // Decrease-key: remove the stale entry (no-op if absent) and
          // re-insert Dst with the improved distance.
          Queue.erase(std::make_pair(Distance[Dst], Dst));

          Distance[Dst] = Distance[Src] + JumpDist;
          Parent[Dst] = Jump;

          Queue.insert(std::make_pair(Distance[Dst], Dst));
        }
      }
    }
    // If Target is not provided, find the closest exit block
    if (Target == AnyExitBlock) {
      for (uint64_t I = 0; I < NumBlocks(); I++) {
        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
            Target = I;
          }
        }
      }
    }
    assert(Parent[Target] != nullptr && "a path does not exist");

    // Extract the constructed path by walking the Parent jumps backwards
    std::vector<FlowJump *> Result;
    uint64_t Now = Target;
    while (Now != Source) {
      assert(Now == Parent[Now]->Target && "incorrect parent jump");
      Result.push_back(Parent[Now]);
      Now = Parent[Now]->Source;
    }
    // Reverse the path, since it is extracted from Target to Source
    std::reverse(Result.begin(), Result.end());
    return Result;
  }
743
  /// A distance of a path for a given jump.
  /// In order to incite the path to use blocks/jumps with large positive flow,
  /// and avoid changing branch probability of outgoing edges drastically,
  /// set the jump distance so as:
  ///   - to minimize the number of unlikely jumps used and subject to that,
  ///   - to minimize the number of Flow == 0 jumps used and subject to that,
  ///   - to minimize total multiplicative Flow increase for the remaining
  ///     edges.
  /// To capture this objective with integer distances, we round off fractional
  /// parts to a multiple of 1 / BaseDistance.
  int64_t jumpDistance(FlowJump *Jump) const {
    // Unlikely jumps dominate every other cost component.
    if (Jump->IsUnlikely)
      return Params.CostUnlikely;
    // BaseDistance scales the per-jump cost so that any path made of
    // positive-flow jumps stays cheaper than a single zero-flow jump.
    uint64_t BaseDistance =
        std::max(FlowAdjuster::MinBaseDistance,
                 std::min(Func.Blocks[Func.Entry].Flow,
                          Params.CostUnlikely / (2 * (NumBlocks() + 1))));
    if (Jump->Flow > 0)
      return BaseDistance + BaseDistance / Jump->Flow;
    // A zero-flow jump costs more than any path of positive-flow jumps.
    return 2 * BaseDistance * (NumBlocks() + 1);
  };
764
NumBlocks() const765 uint64_t NumBlocks() const { return Func.Blocks.size(); }
766
  /// Rebalance unknown subgraphs so that the flow is split evenly across the
  /// outgoing branches of every block of the subgraph. The method iterates over
  /// blocks with known weight and identifies unknown subgraphs rooted at the
  /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
  void rebalanceUnknownSubgraphs() {
    // Try to find unknown subgraphs from each block
    for (const FlowBlock &SrcBlock : Func.Blocks) {
      // Verify if rebalancing rooted at SrcBlock is feasible
      if (!canRebalanceAtRoot(&SrcBlock))
        continue;

      // Find an unknown subgraph starting at SrcBlock. Along the way,
      // fill in known destinations and intermediate unknown blocks.
      std::vector<FlowBlock *> UnknownBlocks;
      std::vector<FlowBlock *> KnownDstBlocks;
      findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks);

      // Verify if rebalancing of the subgraph is feasible. If the search is
      // successful, find the unique destination block (which can be null)
      FlowBlock *DstBlock = nullptr;
      if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks,
                                DstBlock))
        continue;

      // We cannot rebalance subgraphs containing cycles among unknown blocks
      if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks))
        continue;

      // Rebalance the flow
      rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks);
    }
  }
799
800 /// Verify if rebalancing rooted at a given block is possible.
canRebalanceAtRoot(const FlowBlock * SrcBlock)801 bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
802 // Do not attempt to find unknown subgraphs from an unknown or a
803 // zero-flow block
804 if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0)
805 return false;
806
807 // Do not attempt to process subgraphs from a block w/o unknown sucessors
808 bool HasUnknownSuccs = false;
809 for (auto *Jump : SrcBlock->SuccJumps) {
810 if (Func.Blocks[Jump->Target].HasUnknownWeight) {
811 HasUnknownSuccs = true;
812 break;
813 }
814 }
815 if (!HasUnknownSuccs)
816 return false;
817
818 return true;
819 }
820
  /// Find an unknown subgraph starting at block SrcBlock. The method sets
  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
  void findUnknownSubgraph(const FlowBlock *SrcBlock,
                           std::vector<FlowBlock *> &KnownDstBlocks,
                           std::vector<FlowBlock *> &UnknownBlocks) {
    // Run BFS from SrcBlock and make sure all paths are going through unknown
    // blocks and end at a known DstBlock
    auto Visited = BitVector(NumBlocks(), false);
    std::queue<uint64_t> Queue;

    Queue.push(SrcBlock->Index);
    Visited[SrcBlock->Index] = true;
    while (!Queue.empty()) {
      auto &Block = Func.Blocks[Queue.front()];
      Queue.pop();
      // Process blocks reachable from Block
      for (auto *Jump : Block.SuccJumps) {
        // If Jump can be ignored, skip it
        if (ignoreJump(SrcBlock, nullptr, Jump))
          continue;

        uint64_t Dst = Jump->Target;
        // If Dst has been visited, skip Jump
        if (Visited[Dst])
          continue;
        // Process block Dst: known-weight blocks terminate the search (they
        // are candidate sinks); unknown-weight blocks are explored further.
        Visited[Dst] = true;
        if (!Func.Blocks[Dst].HasUnknownWeight) {
          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
        } else {
          Queue.push(Dst);
          UnknownBlocks.push_back(&Func.Blocks[Dst]);
        }
      }
    }
  }
857
858 /// Verify if rebalancing of the subgraph is feasible. If the checks are
859 /// successful, set the unique destination block, DstBlock (can be null).
canRebalanceSubgraph(const FlowBlock * SrcBlock,const std::vector<FlowBlock * > & KnownDstBlocks,const std::vector<FlowBlock * > & UnknownBlocks,FlowBlock * & DstBlock)860 bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
861 const std::vector<FlowBlock *> &KnownDstBlocks,
862 const std::vector<FlowBlock *> &UnknownBlocks,
863 FlowBlock *&DstBlock) {
864 // If the list of unknown blocks is empty, we don't need rebalancing
865 if (UnknownBlocks.empty())
866 return false;
867
868 // If there are multiple known sinks, we can't rebalance
869 if (KnownDstBlocks.size() > 1)
870 return false;
871 DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
872
873 // Verify sinks of the subgraph
874 for (auto *Block : UnknownBlocks) {
875 if (Block->SuccJumps.empty()) {
876 // If there are multiple (known and unknown) sinks, we can't rebalance
877 if (DstBlock != nullptr)
878 return false;
879 continue;
880 }
881 size_t NumIgnoredJumps = 0;
882 for (auto *Jump : Block->SuccJumps) {
883 if (ignoreJump(SrcBlock, DstBlock, Jump))
884 NumIgnoredJumps++;
885 }
886 // If there is a non-sink block in UnknownBlocks with all jumps ignored,
887 // then we can't rebalance
888 if (NumIgnoredJumps == Block->SuccJumps.size())
889 return false;
890 }
891
892 return true;
893 }
894
895 /// Decide whether the Jump is ignored while processing an unknown subgraphs
896 /// rooted at basic block SrcBlock with the destination block, DstBlock.
ignoreJump(const FlowBlock * SrcBlock,const FlowBlock * DstBlock,const FlowJump * Jump)897 bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
898 const FlowJump *Jump) {
899 // Ignore unlikely jumps with zero flow
900 if (Jump->IsUnlikely && Jump->Flow == 0)
901 return true;
902
903 auto JumpSource = &Func.Blocks[Jump->Source];
904 auto JumpTarget = &Func.Blocks[Jump->Target];
905
906 // Do not ignore jumps coming into DstBlock
907 if (DstBlock != nullptr && JumpTarget == DstBlock)
908 return false;
909
910 // Ignore jumps out of SrcBlock to known blocks
911 if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock)
912 return true;
913
914 // Ignore jumps to known blocks with zero flow
915 if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0)
916 return true;
917
918 return false;
919 }
920
921 /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
922 /// UnknownBlocks in the topological order (so that all jumps are "forward").
isAcyclicSubgraph(const FlowBlock * SrcBlock,const FlowBlock * DstBlock,std::vector<FlowBlock * > & UnknownBlocks)923 bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
924 std::vector<FlowBlock *> &UnknownBlocks) {
925 // Extract local in-degrees in the considered subgraph
926 auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
927 auto fillInDegree = [&](const FlowBlock *Block) {
928 for (auto *Jump : Block->SuccJumps) {
929 if (ignoreJump(SrcBlock, DstBlock, Jump))
930 continue;
931 LocalInDegree[Jump->Target]++;
932 }
933 };
934 fillInDegree(SrcBlock);
935 for (auto *Block : UnknownBlocks) {
936 fillInDegree(Block);
937 }
938 // A loop containing SrcBlock
939 if (LocalInDegree[SrcBlock->Index] > 0)
940 return false;
941
942 std::vector<FlowBlock *> AcyclicOrder;
943 std::queue<uint64_t> Queue;
944 Queue.push(SrcBlock->Index);
945 while (!Queue.empty()) {
946 FlowBlock *Block = &Func.Blocks[Queue.front()];
947 Queue.pop();
948 // Stop propagation once we reach DstBlock, if any
949 if (DstBlock != nullptr && Block == DstBlock)
950 break;
951
952 // Keep an acyclic order of unknown blocks
953 if (Block->HasUnknownWeight && Block != SrcBlock)
954 AcyclicOrder.push_back(Block);
955
956 // Add to the queue all successors with zero local in-degree
957 for (auto *Jump : Block->SuccJumps) {
958 if (ignoreJump(SrcBlock, DstBlock, Jump))
959 continue;
960 uint64_t Dst = Jump->Target;
961 LocalInDegree[Dst]--;
962 if (LocalInDegree[Dst] == 0) {
963 Queue.push(Dst);
964 }
965 }
966 }
967
968 // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
969 // of all blocks
970 if (UnknownBlocks.size() != AcyclicOrder.size())
971 return false;
972 UnknownBlocks = AcyclicOrder;
973 return true;
974 }
975
976 /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
977 /// having UnknownBlocks intermediate blocks.
rebalanceUnknownSubgraph(const FlowBlock * SrcBlock,const FlowBlock * DstBlock,const std::vector<FlowBlock * > & UnknownBlocks)978 void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
979 const FlowBlock *DstBlock,
980 const std::vector<FlowBlock *> &UnknownBlocks) {
981 assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
982
983 // Ditribute flow from the source block
984 uint64_t BlockFlow = 0;
985 // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
986 for (auto *Jump : SrcBlock->SuccJumps) {
987 if (ignoreJump(SrcBlock, DstBlock, Jump))
988 continue;
989 BlockFlow += Jump->Flow;
990 }
991 rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
992
993 // Ditribute flow from the remaining blocks
994 for (auto *Block : UnknownBlocks) {
995 assert(Block->HasUnknownWeight && "incorrect unknown subgraph");
996 uint64_t BlockFlow = 0;
997 // Block's flow is the sum of incoming flows
998 for (auto *Jump : Block->PredJumps) {
999 BlockFlow += Jump->Flow;
1000 }
1001 Block->Flow = BlockFlow;
1002 rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
1003 }
1004 }
1005
1006 /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
1007 /// and ending at DstBlock.
rebalanceBlock(const FlowBlock * SrcBlock,const FlowBlock * DstBlock,const FlowBlock * Block,uint64_t BlockFlow)1008 void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
1009 const FlowBlock *Block, uint64_t BlockFlow) {
1010 // Process all successor jumps and update corresponding flow values
1011 size_t BlockDegree = 0;
1012 for (auto *Jump : Block->SuccJumps) {
1013 if (ignoreJump(SrcBlock, DstBlock, Jump))
1014 continue;
1015 BlockDegree++;
1016 }
1017 // If all successor jumps of the block are ignored, skip it
1018 if (DstBlock == nullptr && BlockDegree == 0)
1019 return;
1020 assert(BlockDegree > 0 && "all outgoing jumps are ignored");
1021
1022 // Each of the Block's successors gets the following amount of flow.
1023 // Rounding the value up so that all flow is propagated
1024 uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
1025 for (auto *Jump : Block->SuccJumps) {
1026 if (ignoreJump(SrcBlock, DstBlock, Jump))
1027 continue;
1028 uint64_t Flow = std::min(SuccFlow, BlockFlow);
1029 Jump->Flow = Flow;
1030 BlockFlow -= Flow;
1031 }
1032 assert(BlockFlow == 0 && "not all flow is propagated");
1033 }
1034
1035 /// A constant indicating an arbitrary exit block of a function.
1036 static constexpr uint64_t AnyExitBlock = uint64_t(-1);
1037 /// Minimum BaseDistance for the jump distance values in island joining.
1038 static constexpr uint64_t MinBaseDistance = 10000;
1039
1040 /// Params for flow computation.
1041 const ProfiParams &Params;
1042 /// The function.
1043 FlowFunction &Func;
1044 };
1045
1046 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
1047 const FlowBlock &Block);
1048 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
1049 const FlowJump &Jump);
1050
1051 /// Initializing flow network for a given function.
1052 ///
1053 /// Every block is split into two nodes that are responsible for (i) an
1054 /// incoming flow, (ii) an outgoing flow; they penalize an increase or a
1055 /// reduction of the block weight.
initializeNetwork(const ProfiParams & Params,MinCostMaxFlow & Network,FlowFunction & Func)1056 void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network,
1057 FlowFunction &Func) {
1058 uint64_t NumBlocks = Func.Blocks.size();
1059 assert(NumBlocks > 1 && "Too few blocks in a function");
1060 uint64_t NumJumps = Func.Jumps.size();
1061 assert(NumJumps > 0 && "Too few jumps in a function");
1062
1063 // Introducing dummy source/sink pairs to allow flow circulation.
1064 // The nodes corresponding to blocks of the function have indices in
1065 // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the
1066 // next four values.
1067 uint64_t S = 2 * NumBlocks;
1068 uint64_t T = S + 1;
1069 uint64_t S1 = S + 2;
1070 uint64_t T1 = S + 3;
1071
1072 Network.initialize(2 * NumBlocks + 4, S1, T1);
1073
1074 // Initialize nodes of the flow network
1075 for (uint64_t B = 0; B < NumBlocks; B++) {
1076 auto &Block = Func.Blocks[B];
1077
1078 // Split every block into two auxiliary nodes to allow
1079 // increase/reduction of the block count.
1080 uint64_t Bin = 2 * B;
1081 uint64_t Bout = 2 * B + 1;
1082
1083 // Edges from S and to T
1084 if (Block.isEntry()) {
1085 Network.addEdge(S, Bin, 0);
1086 } else if (Block.isExit()) {
1087 Network.addEdge(Bout, T, 0);
1088 }
1089
1090 // Assign costs for increasing/decreasing the block counts
1091 auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params, Block);
1092
1093 // Add the corresponding edges to the network
1094 Network.addEdge(Bin, Bout, AuxCostInc);
1095 if (Block.Weight > 0) {
1096 Network.addEdge(Bout, Bin, Block.Weight, AuxCostDec);
1097 Network.addEdge(S1, Bout, Block.Weight, 0);
1098 Network.addEdge(Bin, T1, Block.Weight, 0);
1099 }
1100 }
1101
1102 // Initialize edges of the flow network
1103 for (uint64_t J = 0; J < NumJumps; J++) {
1104 auto &Jump = Func.Jumps[J];
1105
1106 // Get the endpoints corresponding to the jump
1107 uint64_t Jin = 2 * Jump.Source + 1;
1108 uint64_t Jout = 2 * Jump.Target;
1109
1110 // Assign costs for increasing/decreasing the jump counts
1111 auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump);
1112
1113 // Add the corresponding edges to the network
1114 Network.addEdge(Jin, Jout, AuxCostInc);
1115 if (Jump.Weight > 0) {
1116 Network.addEdge(Jout, Jin, Jump.Weight, AuxCostDec);
1117 Network.addEdge(S1, Jout, Jump.Weight, 0);
1118 Network.addEdge(Jin, T1, Jump.Weight, 0);
1119 }
1120 }
1121
1122 // Make sure we have a valid flow circulation
1123 Network.addEdge(T, S, 0);
1124 }
1125
1126 /// Assign costs for increasing/decreasing the block counts.
assignBlockCosts(const ProfiParams & Params,const FlowBlock & Block)1127 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
1128 const FlowBlock &Block) {
1129 // Modifying the weight of an unlikely block is expensive
1130 if (Block.IsUnlikely)
1131 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1132
1133 // Assign default values for the costs
1134 int64_t CostInc = Params.CostBlockInc;
1135 int64_t CostDec = Params.CostBlockDec;
1136 // Update the costs depending on the block metadata
1137 if (Block.HasUnknownWeight) {
1138 CostInc = Params.CostBlockUnknownInc;
1139 CostDec = 0;
1140 } else {
1141 // Increasing the count for "cold" blocks with zero initial count is more
1142 // expensive than for "hot" ones
1143 if (Block.Weight == 0)
1144 CostInc = Params.CostBlockZeroInc;
1145 // Modifying the count of the entry block is expensive
1146 if (Block.isEntry()) {
1147 CostInc = Params.CostBlockEntryInc;
1148 CostDec = Params.CostBlockEntryDec;
1149 }
1150 }
1151 return std::make_pair(CostInc, CostDec);
1152 }
1153
1154 /// Assign costs for increasing/decreasing the jump counts.
assignJumpCosts(const ProfiParams & Params,const FlowJump & Jump)1155 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
1156 const FlowJump &Jump) {
1157 // Modifying the weight of an unlikely jump is expensive
1158 if (Jump.IsUnlikely)
1159 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1160
1161 // Assign default values for the costs
1162 int64_t CostInc = Params.CostJumpInc;
1163 int64_t CostDec = Params.CostJumpDec;
1164 // Update the costs depending on the block metadata
1165 if (Jump.Source + 1 == Jump.Target) {
1166 // Adjusting the fall-through branch
1167 CostInc = Params.CostJumpFTInc;
1168 CostDec = Params.CostJumpFTDec;
1169 }
1170 if (Jump.HasUnknownWeight) {
1171 // The cost is different for fall-through and non-fall-through branches
1172 if (Jump.Source + 1 == Jump.Target)
1173 CostInc = Params.CostJumpUnknownFTInc;
1174 else
1175 CostInc = Params.CostJumpUnknownInc;
1176 CostDec = 0;
1177 } else {
1178 assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight");
1179 }
1180 return std::make_pair(CostInc, CostDec);
1181 }
1182
1183 /// Extract resulting block and edge counts from the flow network.
extractWeights(const ProfiParams & Params,MinCostMaxFlow & Network,FlowFunction & Func)1184 void extractWeights(const ProfiParams &Params, MinCostMaxFlow &Network,
1185 FlowFunction &Func) {
1186 uint64_t NumBlocks = Func.Blocks.size();
1187 uint64_t NumJumps = Func.Jumps.size();
1188
1189 // Extract resulting jump counts
1190 for (uint64_t J = 0; J < NumJumps; J++) {
1191 auto &Jump = Func.Jumps[J];
1192 uint64_t SrcOut = 2 * Jump.Source + 1;
1193 uint64_t DstIn = 2 * Jump.Target;
1194
1195 int64_t Flow = 0;
1196 int64_t AuxFlow = Network.getFlow(SrcOut, DstIn);
1197 if (Jump.Source != Jump.Target)
1198 Flow = int64_t(Jump.Weight) + AuxFlow;
1199 else
1200 Flow = int64_t(Jump.Weight) + (AuxFlow > 0 ? AuxFlow : 0);
1201
1202 Jump.Flow = Flow;
1203 assert(Flow >= 0 && "negative jump flow");
1204 }
1205
1206 // Extract resulting block counts
1207 auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1208 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1209 for (auto &Jump : Func.Jumps) {
1210 InFlow[Jump.Target] += Jump.Flow;
1211 OutFlow[Jump.Source] += Jump.Flow;
1212 }
1213 for (uint64_t B = 0; B < NumBlocks; B++) {
1214 auto &Block = Func.Blocks[B];
1215 Block.Flow = std::max(OutFlow[B], InFlow[B]);
1216 }
1217 }
1218
1219 #ifndef NDEBUG
1220 /// Verify that the provided block/jump weights are as expected.
verifyInput(const FlowFunction & Func)1221 void verifyInput(const FlowFunction &Func) {
1222 // Verify entry and exit blocks
1223 assert(Func.Entry == 0 && Func.Blocks[0].isEntry());
1224 size_t NumExitBlocks = 0;
1225 for (size_t I = 1; I < Func.Blocks.size(); I++) {
1226 assert(!Func.Blocks[I].isEntry() && "multiple entry blocks");
1227 if (Func.Blocks[I].isExit())
1228 NumExitBlocks++;
1229 }
1230 assert(NumExitBlocks > 0 && "cannot find exit blocks");
1231
1232 // Verify that there are no parallel edges
1233 for (auto &Block : Func.Blocks) {
1234 std::unordered_set<uint64_t> UniqueSuccs;
1235 for (auto &Jump : Block.SuccJumps) {
1236 auto It = UniqueSuccs.insert(Jump->Target);
1237 assert(It.second && "input CFG contains parallel edges");
1238 }
1239 }
1240 // Verify CFG jumps
1241 for (auto &Block : Func.Blocks) {
1242 assert((!Block.isEntry() || !Block.isExit()) &&
1243 "a block cannot be an entry and an exit");
1244 }
1245 // Verify input block weights
1246 for (auto &Block : Func.Blocks) {
1247 assert((!Block.HasUnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
1248 "non-zero weight of a block w/o weight except for an entry");
1249 }
1250 // Verify input jump weights
1251 for (auto &Jump : Func.Jumps) {
1252 assert((!Jump.HasUnknownWeight || Jump.Weight == 0) &&
1253 "non-zero weight of a jump w/o weight");
1254 }
1255 }
1256
1257 /// Verify that the computed flow values satisfy flow conservation rules.
verifyOutput(const FlowFunction & Func)1258 void verifyOutput(const FlowFunction &Func) {
1259 const uint64_t NumBlocks = Func.Blocks.size();
1260 auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1261 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1262 for (const auto &Jump : Func.Jumps) {
1263 InFlow[Jump.Target] += Jump.Flow;
1264 OutFlow[Jump.Source] += Jump.Flow;
1265 }
1266
1267 uint64_t TotalInFlow = 0;
1268 uint64_t TotalOutFlow = 0;
1269 for (uint64_t I = 0; I < NumBlocks; I++) {
1270 auto &Block = Func.Blocks[I];
1271 if (Block.isEntry()) {
1272 TotalInFlow += Block.Flow;
1273 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
1274 } else if (Block.isExit()) {
1275 TotalOutFlow += Block.Flow;
1276 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
1277 } else {
1278 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
1279 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
1280 }
1281 }
1282 assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
1283
1284 // Verify that there are no isolated flow components
1285 // One could modify FlowFunction to hold edges indexed by the sources, which
1286 // will avoid a creation of the object
1287 auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
1288 for (const auto &Jump : Func.Jumps) {
1289 if (Jump.Flow > 0) {
1290 PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
1291 }
1292 }
1293
1294 // Run BFS from the source along edges with positive flow
1295 std::queue<uint64_t> Queue;
1296 auto Visited = BitVector(NumBlocks, false);
1297 Queue.push(Func.Entry);
1298 Visited[Func.Entry] = true;
1299 while (!Queue.empty()) {
1300 uint64_t Src = Queue.front();
1301 Queue.pop();
1302 for (uint64_t Dst : PositiveFlowEdges[Src]) {
1303 if (!Visited[Dst]) {
1304 Queue.push(Dst);
1305 Visited[Dst] = true;
1306 }
1307 }
1308 }
1309
1310 // Verify that every block that has a positive flow is reached from the source
1311 // along edges with a positive flow
1312 for (uint64_t I = 0; I < NumBlocks; I++) {
1313 auto &Block = Func.Blocks[I];
1314 assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
1315 }
1316 }
1317 #endif
1318
1319 } // end of anonymous namespace
1320
1321 /// Apply the profile inference algorithm for a given function and provided
1322 /// profi options
applyFlowInference(const ProfiParams & Params,FlowFunction & Func)1323 void llvm::applyFlowInference(const ProfiParams &Params, FlowFunction &Func) {
1324 // Check if the function has samples and assign initial flow values
1325 bool HasSamples = false;
1326 for (FlowBlock &Block : Func.Blocks) {
1327 if (Block.Weight > 0)
1328 HasSamples = true;
1329 Block.Flow = Block.Weight;
1330 }
1331 for (FlowJump &Jump : Func.Jumps) {
1332 if (Jump.Weight > 0)
1333 HasSamples = true;
1334 Jump.Flow = Jump.Weight;
1335 }
1336
1337 // Quit early for functions with a single block or ones w/o samples
1338 if (Func.Blocks.size() <= 1 || !HasSamples)
1339 return;
1340
1341 #ifndef NDEBUG
1342 // Verify the input data
1343 verifyInput(Func);
1344 #endif
1345
1346 // Create and apply an inference network model
1347 auto InferenceNetwork = MinCostMaxFlow(Params);
1348 initializeNetwork(Params, InferenceNetwork, Func);
1349 InferenceNetwork.run();
1350
1351 // Extract flow values for every block and every edge
1352 extractWeights(Params, InferenceNetwork, Func);
1353
1354 // Post-processing adjustments to the flow
1355 auto Adjuster = FlowAdjuster(Params, Func);
1356 Adjuster.run();
1357
1358 #ifndef NDEBUG
1359 // Verify the result
1360 verifyOutput(Func);
1361 #endif
1362 }
1363
1364 /// Apply the profile inference algorithm for a given flow function
applyFlowInference(FlowFunction & Func)1365 void llvm::applyFlowInference(FlowFunction &Func) {
1366 ProfiParams Params;
1367 // Set the params from the command-line flags.
1368 Params.EvenFlowDistribution = SampleProfileEvenFlowDistribution;
1369 Params.RebalanceUnknown = SampleProfileRebalanceUnknown;
1370 Params.JoinIslands = SampleProfileJoinIslands;
1371 Params.CostBlockInc = SampleProfileProfiCostBlockInc;
1372 Params.CostBlockDec = SampleProfileProfiCostBlockDec;
1373 Params.CostBlockEntryInc = SampleProfileProfiCostBlockEntryInc;
1374 Params.CostBlockEntryDec = SampleProfileProfiCostBlockEntryDec;
1375 Params.CostBlockZeroInc = SampleProfileProfiCostBlockZeroInc;
1376 Params.CostBlockUnknownInc = SampleProfileProfiCostBlockUnknownInc;
1377
1378 applyFlowInference(Params, Func);
1379 }
1380