//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a profile inference algorithm. Given an incomplete and
// possibly imprecise block counts, the algorithm reconstructs realistic block
// and edge counts that satisfy flow conservation rules, while minimally modify
// input block counts.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/SampleProfileInference.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <queue>
#include <set>
#include <stack>
#include <unordered_set>

using namespace llvm;
#define DEBUG_TYPE "sample-profile-inference"

namespace {

// Knobs enabling/disabling individual inference heuristics (see
// MinCostMaxFlow and FlowAdjuster below).
static cl::opt<bool> SampleProfileEvenFlowDistribution(
    "sample-profile-even-flow-distribution", cl::init(true), cl::Hidden,
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."));

static cl::opt<bool> SampleProfileRebalanceUnknown(
    "sample-profile-rebalance-unknown", cl::init(true), cl::Hidden,
    cl::desc("Evenly re-distribute flow among unknown subgraphs."));

static cl::opt<bool> SampleProfileJoinIslands(
    "sample-profile-join-islands", cl::init(true), cl::Hidden,
    cl::desc("Join isolated components having positive flow."));

// Per-unit costs of adjusting block counts in the min-cost flow network; a
// lower cost makes the corresponding adjustment more likely to be chosen.
static cl::opt<unsigned> SampleProfileProfiCostBlockInc(
    "sample-profile-profi-cost-block-inc", cl::init(10), cl::Hidden,
    cl::desc("The cost of increasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockDec(
    "sample-profile-profi-cost-block-dec", cl::init(20), cl::Hidden,
    cl::desc("The cost of decreasing a block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryInc(
    "sample-profile-profi-cost-block-entry-inc", cl::init(40), cl::Hidden,
    cl::desc("The cost of increasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockEntryDec(
    "sample-profile-profi-cost-block-entry-dec", cl::init(10), cl::Hidden,
    cl::desc("The cost of decreasing the entry block's count by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockZeroInc(
    "sample-profile-profi-cost-block-zero-inc", cl::init(11), cl::Hidden,
    cl::desc("The cost of increasing a count of zero-weight block by one."));

static cl::opt<unsigned> SampleProfileProfiCostBlockUnknownInc(
    "sample-profile-profi-cost-block-unknown-inc", cl::init(0), cl::Hidden,
    cl::desc("The cost of increasing an unknown block's count by one."));

/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution.
static constexpr int64_t INF = ((int64_t)1) << 50;

/// The minimum-cost maximum flow algorithm.
///
/// The algorithm finds the maximum flow of minimum cost on a given (directed)
/// network using a modified version of the classical Moore-Bellman-Ford
/// approach. The algorithm applies a number of augmentation iterations in which
/// flow is sent along paths of positive capacity from the source to the sink.
/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
/// m is the number of edges, n is the number of vertices, and v(f) is the
/// value of the maximum flow. However, the observed running time on typical
/// instances is sub-quadratic, that is, o(n^2).
///
/// The input is a set of edges with specified costs and capacities, and a pair
/// of nodes (source and sink). The output is the flow along each edge of the
/// minimum total cost respecting the given edge capacities.
class MinCostMaxFlow {
public:
  MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {}

  // Initialize algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
    Source = SourceNode;
    Target = SinkNode;

    Nodes = std::vector<Node>(NodeCount);
    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
    // The DAG-augmentation structures are needed only when the flow is
    // distributed evenly among equally likely options
    if (Params.EvenFlowDistribution)
      AugmentingEdges =
          std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
  }

  // Run the algorithm. Returns the total cost of the computed flow.
  int64_t run() {
    LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n");

    // Iteratively find an augmentation path/dag in the network and send the
    // flow along its edges
    size_t AugmentationIters = applyFlowAugmentation();

    // Compute the total flow and its cost
    int64_t TotalCost = 0;
    int64_t TotalFlow = 0;
    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
      for (auto &Edge : Edges[Src]) {
        if (Edge.Flow > 0) {
          TotalCost += Edge.Cost * Edge.Flow;
          // Only edges leaving the source contribute to the flow value
          if (Src == Source)
            TotalFlow += Edge.Flow;
        }
      }
    }
    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
                      << " iterations with " << TotalFlow << " total flow"
                      << " of " << TotalCost << " cost\n");
    (void)TotalFlow;
    (void)AugmentationIters;
    return TotalCost;
  }

  /// Adding an edge to the network with a specified capacity and a cost.
  /// Multiple edges between a pair of nodes are allowed but self-edges
  /// are not supported.
133 void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { 134 assert(Capacity > 0 && "adding an edge of zero capacity"); 135 assert(Src != Dst && "loop edge are not supported"); 136 137 Edge SrcEdge; 138 SrcEdge.Dst = Dst; 139 SrcEdge.Cost = Cost; 140 SrcEdge.Capacity = Capacity; 141 SrcEdge.Flow = 0; 142 SrcEdge.RevEdgeIndex = Edges[Dst].size(); 143 144 Edge DstEdge; 145 DstEdge.Dst = Src; 146 DstEdge.Cost = -Cost; 147 DstEdge.Capacity = 0; 148 DstEdge.Flow = 0; 149 DstEdge.RevEdgeIndex = Edges[Src].size(); 150 151 Edges[Src].push_back(SrcEdge); 152 Edges[Dst].push_back(DstEdge); 153 } 154 155 /// Adding an edge to the network of infinite capacity and a given cost. 156 void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { 157 addEdge(Src, Dst, INF, Cost); 158 } 159 160 /// Get the total flow from a given source node. 161 /// Returns a list of pairs (target node, amount of flow to the target). 162 std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { 163 std::vector<std::pair<uint64_t, int64_t>> Flow; 164 for (const auto &Edge : Edges[Src]) { 165 if (Edge.Flow > 0) 166 Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); 167 } 168 return Flow; 169 } 170 171 /// Get the total flow between a pair of nodes. 172 int64_t getFlow(uint64_t Src, uint64_t Dst) const { 173 int64_t Flow = 0; 174 for (const auto &Edge : Edges[Src]) { 175 if (Edge.Dst == Dst) { 176 Flow += Edge.Flow; 177 } 178 } 179 return Flow; 180 } 181 182 private: 183 /// Iteratively find an augmentation path/dag in the network and send the 184 /// flow along its edges. The method returns the number of applied iterations. 
  size_t applyFlowAugmentation() {
    size_t AugmentationIters = 0;
    while (findAugmentingPath()) {
      uint64_t PathCapacity = computeAugmentingPathCapacity();
      while (PathCapacity > 0) {
        bool Progress = false;
        if (Params.EvenFlowDistribution) {
          // Identify node/edge candidates for augmentation
          identifyShortestEdges(PathCapacity);

          // Find an augmenting DAG
          auto AugmentingOrder = findAugmentingDAG();

          // Apply the DAG augmentation
          Progress = augmentFlowAlongDAG(AugmentingOrder);
          PathCapacity = computeAugmentingPathCapacity();
        }

        // Fall back to the single-path augmentation when the DAG-based step
        // made no progress (or is disabled)
        if (!Progress) {
          augmentFlowAlongPath(PathCapacity);
          PathCapacity = 0;
        }

        AugmentationIters++;
      }
    }
    return AugmentationIters;
  }

  /// Compute the capacity of the canonical augmenting path. If the path is
  /// saturated (that is, no flow can be sent along the path), then return 0.
  uint64_t computeAugmentingPathCapacity() {
    uint64_t PathCapacity = INF;
    uint64_t Now = Target;
    // Walk the parent pointers computed by findAugmentingPath() from the sink
    // back to the source, tracking the smallest residual capacity on the way
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];

      assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
      uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
      PathCapacity = std::min(PathCapacity, EdgeCapacity);

      Now = Pred;
    }
    return PathCapacity;
  }

  /// Check for existence of an augmenting path with a positive capacity.
  bool findAugmentingPath() {
    // Initialize data structures
    for (auto &Node : Nodes) {
      Node.Distance = INF;
      Node.ParentNode = uint64_t(-1);
      Node.ParentEdgeIndex = uint64_t(-1);
      Node.Taken = false;
    }

    // Queue-based Moore-Bellman-Ford relaxation over the residual network
    std::queue<uint64_t> Queue;
    Queue.push(Source);
    Nodes[Source].Distance = 0;
    Nodes[Source].Taken = true;
    while (!Queue.empty()) {
      uint64_t Src = Queue.front();
      Queue.pop();
      Nodes[Src].Taken = false;
      // Although the residual network contains edges with negative costs
      // (in particular, backward edges), it can be shown that there are no
      // negative-weight cycles and the following two invariants are maintained:
      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
      // where Dist is the length of the shortest path between two nodes. This
      // allows to prune the search-space of the path-finding algorithm using
      // the following early-stop criteria:
      // -- If we find a path with zero-distance from Source to Target, stop the
      //    search, as the path is the shortest since Dist[Source, Target] >= 0;
      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
      //    process node V, as it is guaranteed _not_ to be on a shortest path
      //    from Source to Target; it follows from inequalities
      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
      //                         >= Dist[Source, V]
      if (!Params.EvenFlowDistribution && Nodes[Target].Distance == 0)
        break;
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      // Process adjacent edges
      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
        auto &Edge = Edges[Src][EdgeIdx];
        // Only edges with remaining residual capacity can be relaxed
        if (Edge.Flow < Edge.Capacity) {
          uint64_t Dst = Edge.Dst;
          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
          if (Nodes[Dst].Distance > NewDistance) {
            // Update the distance and the parent node/edge
            Nodes[Dst].Distance = NewDistance;
            Nodes[Dst].ParentNode = Src;
            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
            // Add the node to the queue, if it is not there yet
            if (!Nodes[Dst].Taken) {
              Queue.push(Dst);
              Nodes[Dst].Taken = true;
            }
          }
        }
      }
    }

    return Nodes[Target].Distance != INF;
  }

  /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting path");
    uint64_t Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];

      // Push PathCapacity units along the edge and pull the same amount from
      // its residual twin
      Edge.Flow += PathCapacity;
      RevEdge.Flow -= PathCapacity;

      Now = Pred;
    }
  }

  /// Find an Augmenting DAG order using a modified version of DFS in which we
  /// can visit a node multiple times. In the DFS search, when scanning each
  /// edge out of a node, continue search at Edge.Dst endpoint if it has not
  /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
  /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
  /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
  /// that starts with Source and ends with Target.
  std::vector<uint64_t> findAugmentingDAG() {
    // We use a stack based implementation of DFS to avoid recursion.
    // Defining DFS data structures:
    // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
    //  - we are currently visiting Nodes[NodeIdx] and
    //  - the next edge to scan is Edges[NodeIdx][EdgeIdx]
    typedef std::pair<uint64_t, uint64_t> StackItemType;
    std::stack<StackItemType> Stack;
    std::vector<uint64_t> AugmentingOrder;

    // Phase 0: Initialize Node attributes and Time for DFS run
    for (auto &Node : Nodes) {
      Node.Discovery = 0;
      Node.Finish = 0;
      Node.NumCalls = 0;
      Node.Taken = false;
    }
    uint64_t Time = 0;
    // Mark Target as Taken
    // Taken attribute will be propagated backwards from Target towards Source
    Nodes[Target].Taken = true;

    // Phase 1: Start DFS traversal from Source
    Stack.emplace(Source, 0);
    Nodes[Source].Discovery = ++Time;
    while (!Stack.empty()) {
      auto NodeIdx = Stack.top().first;
      auto EdgeIdx = Stack.top().second;

      // If we haven't scanned all edges out of NodeIdx, continue scanning
      if (EdgeIdx < Edges[NodeIdx].size()) {
        auto &Edge = Edges[NodeIdx][EdgeIdx];
        auto &Dst = Nodes[Edge.Dst];
        // Advance the edge cursor before possibly descending into Dst
        Stack.top().second++;

        if (Edge.OnShortestPath) {
          // If we haven't seen Edge.Dst so far, continue DFS search there
          if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) {
            Dst.Discovery = ++Time;
            Stack.emplace(Edge.Dst, 0);
            Dst.NumCalls++;
          } else if (Dst.Taken && Dst.Finish != 0) {
            // Else, if Edge.Dst already has a finished path to Target, then
            // NodeIdx reaches Target through this edge as well
            Nodes[NodeIdx].Taken = true;
          }
        }
      } else {
        // If we are done scanning all edge out of NodeIdx
        Stack.pop();
        // If we haven't found a path from NodeIdx to Target, forget about it
        if (!Nodes[NodeIdx].Taken) {
          Nodes[NodeIdx].Discovery = 0;
        } else {
          // If we have found a path from NodeIdx to Target, then finish NodeIdx
          // and propagate Taken flag to DFS parent unless at the Source
          Nodes[NodeIdx].Finish = ++Time;
          // NodeIdx == Source if and only if the stack is empty
          if (NodeIdx != Source) {
            assert(!Stack.empty() && "empty stack while running dfs");
            Nodes[Stack.top().first].Taken = true;
          }
          AugmentingOrder.push_back(NodeIdx);
        }
      }
    }
    // Nodes are collected decreasing Finish time, so the order is reversed
    std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());

    // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
    for (size_t Src : AugmentingOrder) {
      AugmentingEdges[Src].clear();
      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        // A forward DAG edge connects Taken nodes and goes against the Finish
        // times (which decrease along the augmenting order)
        if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
            Nodes[Dst].Finish < Nodes[Src].Finish) {
          AugmentingEdges[Src].push_back(&Edge);
        }
      }
      assert((Src == Target || !AugmentingEdges[Src].empty()) &&
             "incorrectly constructed augmenting edges");
    }

    return AugmentingOrder;
  }

  /// Update the current flow along the given (acyclic) subgraph specified by
  /// the vertex order, AugmentingOrder. The objective is to send as much flow
  /// as possible while evenly distributing flow among successors of each node.
  /// After the update at least one edge is saturated.
  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
    // Phase 0: Initialization
    for (uint64_t Src : AugmentingOrder) {
      Nodes[Src].FracFlow = 0;
      Nodes[Src].IntFlow = 0;
      for (auto &Edge : AugmentingEdges[Src]) {
        Edge->AugmentedFlow = 0;
      }
    }

    // Phase 1: Send a unit of fractional flow along the DAG
    uint64_t MaxFlowAmount = INF;
    Nodes[Source].FracFlow = 1.0;
    for (uint64_t Src : AugmentingOrder) {
      assert((Src == Target || Nodes[Src].FracFlow > 0.0) &&
             "incorrectly computed fractional flow");
      // Distribute flow evenly among successors of Src
      uint64_t Degree = AugmentingEdges[Src].size();
      for (auto &Edge : AugmentingEdges[Src]) {
        double EdgeFlow = Nodes[Src].FracFlow / Degree;
        Nodes[Edge->Dst].FracFlow += EdgeFlow;
        if (Edge->Capacity == INF)
          continue;
        // The sendable integral amount is limited by the edge with the
        // smallest ratio of residual capacity to fractional flow through it
        uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
        MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
      }
    }
    // Stop early if we cannot send any (integral) flow from Source to Target
    if (MaxFlowAmount == 0)
      return false;

    // Phase 2: Send an integral flow of MaxFlowAmount
    Nodes[Source].IntFlow = MaxFlowAmount;
    for (uint64_t Src : AugmentingOrder) {
      if (Src == Target)
        break;
      // Distribute flow evenly among successors of Src, rounding up to make
      // sure all flow is sent
      uint64_t Degree = AugmentingEdges[Src].size();
      // We are guaranteed that Node[Src].IntFlow <= SuccFlow * Degree
      uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
        EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
        Nodes[Dst].IntFlow += EdgeFlow;
        Nodes[Src].IntFlow -= EdgeFlow;
        Edge->AugmentedFlow += EdgeFlow;
      }
    }
    assert(Nodes[Target].IntFlow <= MaxFlowAmount);
    Nodes[Target].IntFlow = 0;

    // Phase 3: Send excess flow back traversing the nodes backwards.
    // Because of rounding, not all flow can be sent along the edges of Src.
    // Hence, sending the remaining flow back to maintain flow conservation
    for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
      uint64_t Src = AugmentingOrder[Idx - 1];
      // Try to send excess flow back along each edge.
      // Make sure we only send back flow we just augmented (AugmentedFlow).
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        if (Nodes[Dst].IntFlow == 0)
          continue;
        uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
        Nodes[Dst].IntFlow -= EdgeFlow;
        Nodes[Src].IntFlow += EdgeFlow;
        Edge->AugmentedFlow -= EdgeFlow;
      }
    }

    // Phase 4: Update flow values along all edges
    bool HasSaturatedEdges = false;
    for (uint64_t Src : AugmentingOrder) {
      // Verify that we have sent all the excess flow from the node
      assert(Src == Source || Nodes[Src].IntFlow == 0);
      for (auto &Edge : AugmentingEdges[Src]) {
        assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
        // Update flow values along the edge and its reverse copy
        auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
        Edge->Flow += Edge->AugmentedFlow;
        RevEdge.Flow -= Edge->AugmentedFlow;
        if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
          HasSaturatedEdges = true;
      }
    }

    // The augmentation is successful iff at least one edge becomes saturated
    return HasSaturatedEdges;
  }

  /// Identify candidate (shortest) edges for augmentation.
  void identifyShortestEdges(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
    // To make sure the augmentation DAG contains only edges with large residual
    // capacity, we prune all edges whose capacity is below a fraction of
    // the capacity of the augmented path.
    // (All edges of the path itself are always in the DAG)
    uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));

    // Decide which edges are on a shortest path from Source to Target
    for (size_t Src = 0; Src < Nodes.size(); Src++) {
      // An edge cannot be augmenting if the endpoint has large distance
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        // An edge is on a shortest path iff it is "tight" with respect to the
        // computed distances and has enough residual capacity
        Edge.OnShortestPath =
            Src != Target && Dst != Source &&
            Nodes[Dst].Distance <= Nodes[Target].Distance &&
            Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
            Edge.Capacity > Edge.Flow &&
            uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
      }
    }
  }

  /// Maximum number of DFS iterations for DAG finding.
  static constexpr uint64_t MaxDfsCalls = 10;

  /// A node in a flow network.
  struct Node {
    /// The cost of the cheapest path from the source to the current node.
    int64_t Distance;
    /// The node preceding the current one in the path.
    uint64_t ParentNode;
    /// The index of the edge between ParentNode and the current node.
    uint64_t ParentEdgeIndex;
    /// An indicator of whether the current node is in a queue.
    bool Taken;

    /// Data fields utilized in DAG-augmentation:
    /// Fractional flow.
    double FracFlow;
    /// Integral flow.
    uint64_t IntFlow;
    /// Discovery time.
    uint64_t Discovery;
    /// Finish time.
    uint64_t Finish;
    /// Number of DFS visits made into the node.
    uint64_t NumCalls;
  };

  /// An edge in a flow network.
  struct Edge {
    /// The cost of the edge.
    int64_t Cost;
    /// The capacity of the edge.
    int64_t Capacity;
    /// The current flow on the edge.
    int64_t Flow;
    /// The destination node of the edge.
    uint64_t Dst;
    /// The index of the reverse edge between Dst and the current node.
    uint64_t RevEdgeIndex;

    /// Data fields utilized in DAG-augmentation:
    /// Whether the edge is currently on a shortest path from Source to Target.
    bool OnShortestPath;
    /// Extra flow along the edge.
    uint64_t AugmentedFlow;
  };

  /// The set of network nodes.
  std::vector<Node> Nodes;
  /// The set of network edges.
  std::vector<std::vector<Edge>> Edges;
  /// Source node of the flow.
  uint64_t Source;
  /// Target (sink) node of the flow.
  uint64_t Target;
  /// Augmenting edges.
  std::vector<std::vector<Edge *>> AugmentingEdges;
  /// Params for flow computation.
  const ProfiParams &Params;
};

/// A post-processing adjustment of the control flow. It applies two steps by
/// rerouting some flow and making it more realistic:
///
/// - First, it removes all isolated components ("islands") with a positive flow
///   that are unreachable from the entry block. For every such component, we
///   find the shortest path from the entry to an exit passing through the
///   component, and increase the flow by one unit along the path.
///
/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
///   with no sampled counts. Then it rebalances the flow that goes through such
///   a subgraph so that each branch is taken with probability 50%.
///   An unknown subgraph is such that for every two nodes u and v:
///     - u dominates v and u is not unknown;
///     - v post-dominates u; and
///     - all inner-nodes of all (u,v)-paths are unknown.
///
class FlowAdjuster {
public:
  FlowAdjuster(const ProfiParams &Params, FlowFunction &Func)
      : Params(Params), Func(Func) {}

  /// Apply the post-processing.
  void run() {
    if (Params.JoinIslands) {
      // Adjust the flow to get rid of isolated components
      joinIsolatedComponents();
    }

    if (Params.RebalanceUnknown) {
      // Rebalance the flow inside unknown subgraphs
      rebalanceUnknownSubgraphs();
    }
  }

private:
  /// Attach every isolated positive-flow component to the main flow by
  /// routing one extra unit of flow from the entry, through the component,
  /// to an exit along the shortest such path.
  void joinIsolatedComponents() {
    // Find blocks that are reachable from the source
    auto Visited = BitVector(NumBlocks(), false);
    findReachable(Func.Entry, Visited);

    // Iterate over all non-reachable blocks and adjust their weights
    for (uint64_t I = 0; I < NumBlocks(); I++) {
      auto &Block = Func.Blocks[I];
      if (Block.Flow > 0 && !Visited[I]) {
        // Find a path from the entry to an exit passing through the block I
        auto Path = findShortestPath(I);
        // Increase the flow along the path
        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
               "incorrectly computed path adjusting control flow");
        Func.Blocks[Func.Entry].Flow += 1;
        for (auto &Jump : Path) {
          Jump->Flow += 1;
          Func.Blocks[Jump->Target].Flow += 1;
          // Update reachability
          findReachable(Jump->Target, Visited);
        }
      }
    }
  }

  /// Run BFS from a given block along the jumps with a positive flow and mark
  /// all reachable blocks.
646 void findReachable(uint64_t Src, BitVector &Visited) { 647 if (Visited[Src]) 648 return; 649 std::queue<uint64_t> Queue; 650 Queue.push(Src); 651 Visited[Src] = true; 652 while (!Queue.empty()) { 653 Src = Queue.front(); 654 Queue.pop(); 655 for (auto *Jump : Func.Blocks[Src].SuccJumps) { 656 uint64_t Dst = Jump->Target; 657 if (Jump->Flow > 0 && !Visited[Dst]) { 658 Queue.push(Dst); 659 Visited[Dst] = true; 660 } 661 } 662 } 663 } 664 665 /// Find the shortest path from the entry block to an exit block passing 666 /// through a given block. 667 std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) { 668 // A path from the entry block to BlockIdx 669 auto ForwardPath = findShortestPath(Func.Entry, BlockIdx); 670 // A path from BlockIdx to an exit block 671 auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock); 672 673 // Concatenate the two paths 674 std::vector<FlowJump *> Result; 675 Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); 676 Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); 677 return Result; 678 } 679 680 /// Apply the Dijkstra algorithm to find the shortest path from a given 681 /// Source to a given Target block. 682 /// If Target == -1, then the path ends at an exit block. 
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
    // Quit early, if possible
    if (Source == Target)
      return std::vector<FlowJump *>();
    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
      return std::vector<FlowJump *>();

    // Initialize data structures
    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
    Distance[Source] = 0;
    // std::set of (distance, block) pairs serves as the priority queue
    std::set<std::pair<uint64_t, uint64_t>> Queue;
    Queue.insert(std::make_pair(Distance[Source], Source));

    // Run the Dijkstra algorithm
    while (!Queue.empty()) {
      uint64_t Src = Queue.begin()->second;
      Queue.erase(Queue.begin());
      // If we found a solution, quit early
      if (Src == Target ||
          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
        break;

      for (auto *Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        int64_t JumpDist = jumpDistance(Jump);
        if (Distance[Dst] > Distance[Src] + JumpDist) {
          // Erase the stale entry before re-inserting Dst with the decreased
          // key, keeping the set-based priority queue consistent
          Queue.erase(std::make_pair(Distance[Dst], Dst));

          Distance[Dst] = Distance[Src] + JumpDist;
          Parent[Dst] = Jump;

          Queue.insert(std::make_pair(Distance[Dst], Dst));
        }
      }
    }
    // If Target is not provided, find the closest exit block
    if (Target == AnyExitBlock) {
      for (uint64_t I = 0; I < NumBlocks(); I++) {
        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
            Target = I;
          }
        }
      }
    }
    assert(Parent[Target] != nullptr && "a path does not exist");

    // Extract the constructed path
    std::vector<FlowJump *> Result;
    uint64_t Now = Target;
    while (Now != Source) {
      assert(Now == Parent[Now]->Target && "incorrect parent jump");
      Result.push_back(Parent[Now]);
      Now = Parent[Now]->Source;
    }
    // Reverse the path, since it is extracted from Target to Source
    std::reverse(Result.begin(), Result.end());
    return Result;
  }

  /// A distance of a path for a given jump.
  /// In order to incite the path to use blocks/jumps with large positive flow,
  /// and avoid changing branch probability of outgoing edges drastically,
  /// set the jump distance so as:
  ///   - to minimize the number of unlikely jumps used and subject to that,
  ///   - to minimize the number of Flow == 0 jumps used and subject to that,
  ///   - minimizes total multiplicative Flow increase for the remaining edges.
  /// To capture this objective with integer distances, we round off fractional
  /// parts to a multiple of 1 / BaseDistance.
  int64_t jumpDistance(FlowJump *Jump) const {
    if (Jump->IsUnlikely)
      return Params.CostUnlikely;
    uint64_t BaseDistance =
        std::max(FlowAdjuster::MinBaseDistance,
                 std::min(Func.Blocks[Func.Entry].Flow,
                          Params.CostUnlikely / (2 * (NumBlocks() + 1))));
    if (Jump->Flow > 0)
      return BaseDistance + BaseDistance / Jump->Flow;
    // Jumps with no flow are the most expensive non-unlikely option
    return 2 * BaseDistance * (NumBlocks() + 1);
  };

  uint64_t NumBlocks() const { return Func.Blocks.size(); }

  /// Rebalance unknown subgraphs so that the flow is split evenly across the
  /// outgoing branches of every block of the subgraph. The method iterates over
  /// blocks with known weight and identifies unknown subgraphs rooted at the
  /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
  void rebalanceUnknownSubgraphs() {
    // Try to find unknown subgraphs from each block
    for (const FlowBlock &SrcBlock : Func.Blocks) {
      // Verify if rebalancing rooted at SrcBlock is feasible
      if (!canRebalanceAtRoot(&SrcBlock))
        continue;

      // Find an unknown subgraph starting at SrcBlock. Along the way,
      // fill in known destinations and intermediate unknown blocks.
      std::vector<FlowBlock *> UnknownBlocks;
      std::vector<FlowBlock *> KnownDstBlocks;
      findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks);

      // Verify if rebalancing of the subgraph is feasible. If the search is
      // successful, find the unique destination block (which can be null)
      FlowBlock *DstBlock = nullptr;
      if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks,
                                DstBlock))
        continue;

      // We cannot rebalance subgraphs containing cycles among unknown blocks
      if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks))
        continue;

      // Rebalance the flow
      rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks);
    }
  }

  /// Verify if rebalancing rooted at a given block is possible.
  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
    // Do not attempt to find unknown subgraphs from an unknown or a
    // zero-flow block
    if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0)
      return false;

    // Do not attempt to process subgraphs from a block w/o unknown successors
    bool HasUnknownSuccs = false;
    for (auto *Jump : SrcBlock->SuccJumps) {
      if (Func.Blocks[Jump->Target].HasUnknownWeight) {
        HasUnknownSuccs = true;
        break;
      }
    }
    if (!HasUnknownSuccs)
      return false;

    return true;
  }

  /// Find an unknown subgraph starting at block SrcBlock. The method sets
  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
  void findUnknownSubgraph(const FlowBlock *SrcBlock,
                           std::vector<FlowBlock *> &KnownDstBlocks,
                           std::vector<FlowBlock *> &UnknownBlocks) {
    // Run BFS from SrcBlock and make sure all paths are going through unknown
    // blocks and end at a known DstBlock
    auto Visited = BitVector(NumBlocks(), false);
    std::queue<uint64_t> Queue;

    Queue.push(SrcBlock->Index);
    Visited[SrcBlock->Index] = true;
    while (!Queue.empty()) {
      auto &Block = Func.Blocks[Queue.front()];
      Queue.pop();
      // Process blocks reachable from Block
      for (auto *Jump : Block.SuccJumps) {
        // If Jump can be ignored, skip it
        if (ignoreJump(SrcBlock, nullptr, Jump))
          continue;

        uint64_t Dst = Jump->Target;
        // If Dst has been visited, skip Jump
        if (Visited[Dst])
          continue;
        // Process block Dst
        Visited[Dst] = true;
        if (!Func.Blocks[Dst].HasUnknownWeight) {
          // A known-weight block terminates the search along this path
          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
        } else {
          // An unknown block is an inner node of the subgraph; keep expanding
          Queue.push(Dst);
          UnknownBlocks.push_back(&Func.Blocks[Dst]);
        }
      }
    }
  }

  /// Verify if rebalancing of the subgraph is feasible. If the checks are
  /// successful, set the unique destination block, DstBlock (can be null).
  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
                            const std::vector<FlowBlock *> &KnownDstBlocks,
                            const std::vector<FlowBlock *> &UnknownBlocks,
                            FlowBlock *&DstBlock) {
    // If the list of unknown blocks is empty, we don't need rebalancing
    if (UnknownBlocks.empty())
      return false;

    // If there are multiple known sinks, we can't rebalance
    if (KnownDstBlocks.size() > 1)
      return false;
    DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();

    // Verify sinks of the subgraph
    for (auto *Block : UnknownBlocks) {
      if (Block->SuccJumps.empty()) {
        // If there are multiple (known and unknown) sinks, we can't rebalance
        if (DstBlock != nullptr)
          return false;
        continue;
      }
      size_t NumIgnoredJumps = 0;
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          NumIgnoredJumps++;
      }
      // If there is a non-sink block in UnknownBlocks with all jumps ignored,
      // then we can't rebalance
      if (NumIgnoredJumps == Block->SuccJumps.size())
        return false;
    }

    return true;
  }

  /// Decide whether the Jump is ignored while processing an unknown subgraph
  /// rooted at basic block SrcBlock with the destination block, DstBlock.
  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                  const FlowJump *Jump) {
    // Ignore unlikely jumps with zero flow
    if (Jump->IsUnlikely && Jump->Flow == 0)
      return true;

    auto JumpSource = &Func.Blocks[Jump->Source];
    auto JumpTarget = &Func.Blocks[Jump->Target];

    // Do not ignore jumps coming into DstBlock
    if (DstBlock != nullptr && JumpTarget == DstBlock)
      return false;

    // Ignore jumps out of SrcBlock to known blocks
    if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock)
      return true;

    // Ignore jumps to known blocks with zero flow
    if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0)
      return true;

    return false;
  }

  /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
  /// UnknownBlocks in the topological order (so that all jumps are "forward").
  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                         std::vector<FlowBlock *> &UnknownBlocks) {
    // Extract local in-degrees in the considered subgraph
    auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
    auto fillInDegree = [&](const FlowBlock *Block) {
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        LocalInDegree[Jump->Target]++;
      }
    };
    fillInDegree(SrcBlock);
    for (auto *Block : UnknownBlocks) {
      fillInDegree(Block);
    }
    // A loop containing SrcBlock
    if (LocalInDegree[SrcBlock->Index] > 0)
      return false;

    // Propagate in Kahn's topological-sort fashion, enqueueing a block once
    // its local in-degree drops to zero
    std::vector<FlowBlock *> AcyclicOrder;
    std::queue<uint64_t> Queue;
    Queue.push(SrcBlock->Index);
    while (!Queue.empty()) {
      FlowBlock *Block = &Func.Blocks[Queue.front()];
      Queue.pop();
      // Stop propagation once we reach DstBlock, if any
      if (DstBlock != nullptr && Block == DstBlock)
        break;

      // Keep an acyclic order of unknown blocks
      if (Block->HasUnknownWeight && Block != SrcBlock)
        AcyclicOrder.push_back(Block);

      // Add to the queue all successors with zero local in-degree
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        uint64_t Dst = Jump->Target;
        LocalInDegree[Dst]--;
        if (LocalInDegree[Dst] == 0) {
          Queue.push(Dst);
        }
      }
    }

    // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
    // of all blocks
    if (UnknownBlocks.size() != AcyclicOrder.size())
      return false;
    UnknownBlocks = AcyclicOrder;
    return true;
  }

  /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
  /// having UnknownBlocks intermediate blocks.
  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
                                const FlowBlock *DstBlock,
                                const std::vector<FlowBlock *> &UnknownBlocks) {
    assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");

    // Distribute flow from the source block
    uint64_t BlockFlow = 0;
    // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
    for (auto *Jump : SrcBlock->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockFlow += Jump->Flow;
    }
    rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);

    // Distribute flow from the remaining blocks. UnknownBlocks is in a
    // topological order (established by isAcyclicSubgraph), so all incoming
    // jump flows are final by the time a block is processed
    for (auto *Block : UnknownBlocks) {
      assert(Block->HasUnknownWeight && "incorrect unknown subgraph");
      uint64_t BlockFlow = 0;
      // Block's flow is the sum of incoming flows
      for (auto *Jump : Block->PredJumps) {
        BlockFlow += Jump->Flow;
      }
      Block->Flow = BlockFlow;
      rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
    }
  }

  /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
  /// and ending at DstBlock.
  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                      const FlowBlock *Block, uint64_t BlockFlow) {
    // Count the successor jumps that participate in the redistribution
    size_t BlockDegree = 0;
    for (auto *Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockDegree++;
    }
    // If all successor jumps of the block are ignored, skip it
    if (DstBlock == nullptr && BlockDegree == 0)
      return;
    assert(BlockDegree > 0 && "all outgoing jumps are ignored");

    // Each of the Block's successors gets the following amount of flow.
    // Rounding the value up so that all flow is propagated
    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
    for (auto *Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      // The last successors may receive less than SuccFlow so that the total
      // assigned flow never exceeds BlockFlow
      uint64_t Flow = std::min(SuccFlow, BlockFlow);
      Jump->Flow = Flow;
      BlockFlow -= Flow;
    }
    assert(BlockFlow == 0 && "not all flow is propagated");
  }

  /// A constant indicating an arbitrary exit block of a function.
  static constexpr uint64_t AnyExitBlock = uint64_t(-1);
  /// Minimum BaseDistance for the jump distance values in island joining.
  static constexpr uint64_t MinBaseDistance = 10000;

  /// Params for flow computation.
  const ProfiParams &Params;
  /// The function.
  FlowFunction &Func;
};

std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
                                             const FlowBlock &Block);
std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
                                            const FlowJump &Jump);

/// Initializing flow network for a given function.
///
/// Every block is split into two nodes that are responsible for (i) an
/// incoming flow, (ii) an outgoing flow; they penalize an increase or a
/// reduction of the block weight.
void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network,
                       FlowFunction &Func) {
  uint64_t NumBlocks = Func.Blocks.size();
  assert(NumBlocks > 1 && "Too few blocks in a function");
  uint64_t NumJumps = Func.Jumps.size();
  assert(NumJumps > 0 && "Too few jumps in a function");

  // Introducing dummy source/sink pairs to allow flow circulation.
  // The nodes corresponding to blocks of the function have indices in
  // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the
  // next four values.
  uint64_t S = 2 * NumBlocks;
  uint64_t T = S + 1;
  uint64_t S1 = S + 2;
  uint64_t T1 = S + 3;

  Network.initialize(2 * NumBlocks + 4, S1, T1);

  // Initialize nodes of the flow network
  for (uint64_t B = 0; B < NumBlocks; B++) {
    auto &Block = Func.Blocks[B];

    // Split every block into two auxiliary nodes to allow
    // increase/reduction of the block count.
    uint64_t Bin = 2 * B;
    uint64_t Bout = 2 * B + 1;

    // Edges from S and to T
    if (Block.isEntry()) {
      Network.addEdge(S, Bin, 0);
    } else if (Block.isExit()) {
      Network.addEdge(Bout, T, 0);
    }

    // Assign costs for increasing/decreasing the block counts
    auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params, Block);

    // Add the corresponding edges to the network
    Network.addEdge(Bin, Bout, AuxCostInc);
    if (Block.Weight > 0) {
      Network.addEdge(Bout, Bin, Block.Weight, AuxCostDec);
      Network.addEdge(S1, Bout, Block.Weight, 0);
      Network.addEdge(Bin, T1, Block.Weight, 0);
    }
  }

  // Initialize edges of the flow network
  for (uint64_t J = 0; J < NumJumps; J++) {
    auto &Jump = Func.Jumps[J];

    // Get the endpoints corresponding to the jump
    uint64_t Jin = 2 * Jump.Source + 1;
    uint64_t Jout = 2 * Jump.Target;

    // Assign costs for increasing/decreasing the jump counts
    auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump);

    // Add the corresponding edges to the network
    Network.addEdge(Jin, Jout, AuxCostInc);
    if (Jump.Weight > 0) {
      Network.addEdge(Jout, Jin, Jump.Weight, AuxCostDec);
      Network.addEdge(S1, Jout, Jump.Weight, 0);
      Network.addEdge(Jin, T1, Jump.Weight, 0);
    }
  }

  // Make sure we have a valid flow circulation
  Network.addEdge(T, S, 0);
}

/// Assign costs for increasing/decreasing the block counts.
1127 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params, 1128 const FlowBlock &Block) { 1129 // Modifying the weight of an unlikely block is expensive 1130 if (Block.IsUnlikely) 1131 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely); 1132 1133 // Assign default values for the costs 1134 int64_t CostInc = Params.CostBlockInc; 1135 int64_t CostDec = Params.CostBlockDec; 1136 // Update the costs depending on the block metadata 1137 if (Block.HasUnknownWeight) { 1138 CostInc = Params.CostBlockUnknownInc; 1139 CostDec = 0; 1140 } else { 1141 // Increasing the count for "cold" blocks with zero initial count is more 1142 // expensive than for "hot" ones 1143 if (Block.Weight == 0) 1144 CostInc = Params.CostBlockZeroInc; 1145 // Modifying the count of the entry block is expensive 1146 if (Block.isEntry()) { 1147 CostInc = Params.CostBlockEntryInc; 1148 CostDec = Params.CostBlockEntryDec; 1149 } 1150 } 1151 return std::make_pair(CostInc, CostDec); 1152 } 1153 1154 /// Assign costs for increasing/decreasing the jump counts. 
1155 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params, 1156 const FlowJump &Jump) { 1157 // Modifying the weight of an unlikely jump is expensive 1158 if (Jump.IsUnlikely) 1159 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely); 1160 1161 // Assign default values for the costs 1162 int64_t CostInc = Params.CostJumpInc; 1163 int64_t CostDec = Params.CostJumpDec; 1164 // Update the costs depending on the block metadata 1165 if (Jump.Source + 1 == Jump.Target) { 1166 // Adjusting the fall-through branch 1167 CostInc = Params.CostJumpFTInc; 1168 CostDec = Params.CostJumpFTDec; 1169 } 1170 if (Jump.HasUnknownWeight) { 1171 // The cost is different for fall-through and non-fall-through branches 1172 if (Jump.Source + 1 == Jump.Target) 1173 CostInc = Params.CostJumpUnknownFTInc; 1174 else 1175 CostInc = Params.CostJumpUnknownInc; 1176 CostDec = 0; 1177 } else { 1178 assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight"); 1179 } 1180 return std::make_pair(CostInc, CostDec); 1181 } 1182 1183 /// Extract resulting block and edge counts from the flow network. 1184 void extractWeights(const ProfiParams &Params, MinCostMaxFlow &Network, 1185 FlowFunction &Func) { 1186 uint64_t NumBlocks = Func.Blocks.size(); 1187 uint64_t NumJumps = Func.Jumps.size(); 1188 1189 // Extract resulting jump counts 1190 for (uint64_t J = 0; J < NumJumps; J++) { 1191 auto &Jump = Func.Jumps[J]; 1192 uint64_t SrcOut = 2 * Jump.Source + 1; 1193 uint64_t DstIn = 2 * Jump.Target; 1194 1195 int64_t Flow = 0; 1196 int64_t AuxFlow = Network.getFlow(SrcOut, DstIn); 1197 if (Jump.Source != Jump.Target) 1198 Flow = int64_t(Jump.Weight) + AuxFlow; 1199 else 1200 Flow = int64_t(Jump.Weight) + (AuxFlow > 0 ? 
AuxFlow : 0); 1201 1202 Jump.Flow = Flow; 1203 assert(Flow >= 0 && "negative jump flow"); 1204 } 1205 1206 // Extract resulting block counts 1207 auto InFlow = std::vector<uint64_t>(NumBlocks, 0); 1208 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); 1209 for (auto &Jump : Func.Jumps) { 1210 InFlow[Jump.Target] += Jump.Flow; 1211 OutFlow[Jump.Source] += Jump.Flow; 1212 } 1213 for (uint64_t B = 0; B < NumBlocks; B++) { 1214 auto &Block = Func.Blocks[B]; 1215 Block.Flow = std::max(OutFlow[B], InFlow[B]); 1216 } 1217 } 1218 1219 #ifndef NDEBUG 1220 /// Verify that the provided block/jump weights are as expected. 1221 void verifyInput(const FlowFunction &Func) { 1222 // Verify entry and exit blocks 1223 assert(Func.Entry == 0 && Func.Blocks[0].isEntry()); 1224 size_t NumExitBlocks = 0; 1225 for (size_t I = 1; I < Func.Blocks.size(); I++) { 1226 assert(!Func.Blocks[I].isEntry() && "multiple entry blocks"); 1227 if (Func.Blocks[I].isExit()) 1228 NumExitBlocks++; 1229 } 1230 assert(NumExitBlocks > 0 && "cannot find exit blocks"); 1231 1232 // Verify that there are no parallel edges 1233 for (auto &Block : Func.Blocks) { 1234 std::unordered_set<uint64_t> UniqueSuccs; 1235 for (auto &Jump : Block.SuccJumps) { 1236 auto It = UniqueSuccs.insert(Jump->Target); 1237 assert(It.second && "input CFG contains parallel edges"); 1238 } 1239 } 1240 // Verify CFG jumps 1241 for (auto &Block : Func.Blocks) { 1242 assert((!Block.isEntry() || !Block.isExit()) && 1243 "a block cannot be an entry and an exit"); 1244 } 1245 // Verify input block weights 1246 for (auto &Block : Func.Blocks) { 1247 assert((!Block.HasUnknownWeight || Block.Weight == 0 || Block.isEntry()) && 1248 "non-zero weight of a block w/o weight except for an entry"); 1249 } 1250 // Verify input jump weights 1251 for (auto &Jump : Func.Jumps) { 1252 assert((!Jump.HasUnknownWeight || Jump.Weight == 0) && 1253 "non-zero weight of a jump w/o weight"); 1254 } 1255 } 1256 1257 /// Verify that the computed flow values satisfy 
/// flow conservation rules.
void verifyOutput(const FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();
  // Per-block sums of incoming and outgoing jump flows
  auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
  auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
  for (const auto &Jump : Func.Jumps) {
    InFlow[Jump.Target] += Jump.Flow;
    OutFlow[Jump.Source] += Jump.Flow;
  }

  // Flow conservation: a block's flow equals its outgoing flow (except for
  // exits) and its incoming flow (except for the entry)
  uint64_t TotalInFlow = 0;
  uint64_t TotalOutFlow = 0;
  for (uint64_t I = 0; I < NumBlocks; I++) {
    auto &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      TotalInFlow += Block.Flow;
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
    } else if (Block.isExit()) {
      TotalOutFlow += Block.Flow;
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    } else {
      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
    }
  }
  assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");

  // Verify that there are no isolated flow components
  // One could modify FlowFunction to hold edges indexed by the sources, which
  // will avoid a creation of the object
  auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
  for (const auto &Jump : Func.Jumps) {
    if (Jump.Flow > 0) {
      PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
    }
  }

  // Run BFS from the source along edges with positive flow
  std::queue<uint64_t> Queue;
  auto Visited = BitVector(NumBlocks, false);
  Queue.push(Func.Entry);
  Visited[Func.Entry] = true;
  while (!Queue.empty()) {
    uint64_t Src = Queue.front();
    Queue.pop();
    for (uint64_t Dst : PositiveFlowEdges[Src]) {
      if (!Visited[Dst]) {
        Queue.push(Dst);
        Visited[Dst] = true;
      }
    }
  }

  // Verify that every block that has a positive flow is reached from the
  // source along edges with a positive flow
  for (uint64_t I = 0; I < NumBlocks; I++) {
    auto &Block = Func.Blocks[I];
    assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
  }
}
#endif

} // end of anonymous namespace

/// Apply the profile inference algorithm for a given function and provided
/// profi options
void llvm::applyFlowInference(const ProfiParams &Params, FlowFunction &Func) {
  // Check if the function has samples and assign initial flow values
  bool HasSamples = false;
  for (FlowBlock &Block : Func.Blocks) {
    if (Block.Weight > 0)
      HasSamples = true;
    Block.Flow = Block.Weight;
  }
  for (FlowJump &Jump : Func.Jumps) {
    if (Jump.Weight > 0)
      HasSamples = true;
    Jump.Flow = Jump.Weight;
  }

  // Quit early for functions with a single block or ones w/o samples
  if (Func.Blocks.size() <= 1 || !HasSamples)
    return;

#ifndef NDEBUG
  // Verify the input data
  verifyInput(Func);
#endif

  // Create and apply an inference network model
  auto InferenceNetwork = MinCostMaxFlow(Params);
  initializeNetwork(Params, InferenceNetwork, Func);
  InferenceNetwork.run();

  // Extract flow values for every block and every edge
  extractWeights(Params, InferenceNetwork, Func);

  // Post-processing adjustments to the flow
  auto Adjuster = FlowAdjuster(Params, Func);
  Adjuster.run();

#ifndef NDEBUG
  // Verify the result
  verifyOutput(Func);
#endif
}

/// Apply the profile inference algorithm for a given flow function
void llvm::applyFlowInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = SampleProfileEvenFlowDistribution;
  Params.RebalanceUnknown = SampleProfileRebalanceUnknown;
  Params.JoinIslands = SampleProfileJoinIslands;
  Params.CostBlockInc = SampleProfileProfiCostBlockInc;
  Params.CostBlockDec = SampleProfileProfiCostBlockDec;
  Params.CostBlockEntryInc = SampleProfileProfiCostBlockEntryInc;
  Params.CostBlockEntryDec = SampleProfileProfiCostBlockEntryDec;
  Params.CostBlockZeroInc = SampleProfileProfiCostBlockZeroInc;
  Params.CostBlockUnknownInc = SampleProfileProfiCostBlockUnknownInc;

  applyFlowInference(Params, Func);
}