1 //===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a profile inference algorithm. Given an incomplete and 10 // possibly imprecise block counts, the algorithm reconstructs realistic block 11 // and edge counts that satisfy flow conservation rules, while minimally modify 12 // input block counts. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/Transforms/Utils/SampleProfileInference.h" 17 #include "llvm/ADT/BitVector.h" 18 #include "llvm/Support/CommandLine.h" 19 #include "llvm/Support/Debug.h" 20 #include <queue> 21 #include <set> 22 #include <stack> 23 24 using namespace llvm; 25 #define DEBUG_TYPE "sample-profile-inference" 26 27 namespace { 28 29 static cl::opt<bool> SampleProfileEvenFlowDistribution( 30 "sample-profile-even-flow-distribution", cl::init(true), cl::Hidden, 31 cl::desc("Try to evenly distribute flow when there are multiple equally " 32 "likely options.")); 33 34 static cl::opt<bool> SampleProfileRebalanceUnknown( 35 "sample-profile-rebalance-unknown", cl::init(true), cl::Hidden, 36 cl::desc("Evenly re-distribute flow among unknown subgraphs.")); 37 38 static cl::opt<bool> SampleProfileJoinIslands( 39 "sample-profile-join-islands", cl::init(true), cl::Hidden, 40 cl::desc("Join isolated components having positive flow.")); 41 42 static cl::opt<unsigned> SampleProfileProfiCostBlockInc( 43 "sample-profile-profi-cost-block-inc", cl::init(10), cl::Hidden, 44 cl::desc("The cost of increasing a block's count by one.")); 45 46 static cl::opt<unsigned> SampleProfileProfiCostBlockDec( 47 "sample-profile-profi-cost-block-dec", cl::init(20), cl::Hidden, 48 cl::desc("The cost of decreasing a block's count by one.")); 49 50 static cl::opt<unsigned> SampleProfileProfiCostBlockEntryInc( 51 "sample-profile-profi-cost-block-entry-inc", cl::init(40), cl::Hidden, 52 cl::desc("The cost of increasing the entry block's count by one.")); 53 54 static cl::opt<unsigned> SampleProfileProfiCostBlockEntryDec( 55 "sample-profile-profi-cost-block-entry-dec", cl::init(10), cl::Hidden, 56 cl::desc("The cost of decreasing the entry block's count by one.")); 57 58 static cl::opt<unsigned> SampleProfileProfiCostBlockZeroInc( 59 "sample-profile-profi-cost-block-zero-inc", cl::init(11), cl::Hidden, 60 cl::desc("The cost of increasing a count of zero-weight block by one.")); 61 62 static cl::opt<unsigned> SampleProfileProfiCostBlockUnknownInc( 63 "sample-profile-profi-cost-block-unknown-inc", cl::init(0), cl::Hidden, 64 cl::desc("The cost of increasing an unknown block's count by one.")); 65 66 /// A value indicating an infinite flow/capacity/weight of a block/edge. 67 /// Not using numeric_limits<int64_t>::max(), as the values can be summed up 68 /// during the execution. 69 static constexpr int64_t INF = ((int64_t)1) << 50; 70 71 /// The minimum-cost maximum flow algorithm. 72 /// 73 /// The algorithm finds the maximum flow of minimum cost on a given (directed) 74 /// network using a modified version of the classical Moore-Bellman-Ford 75 /// approach. The algorithm applies a number of augmentation iterations in which 76 /// flow is sent along paths of positive capacity from the source to the sink. 77 /// The worst-case time complexity of the implementation is O(v(f)*m*n), where 78 /// where m is the number of edges, n is the number of vertices, and v(f) is the 79 /// value of the maximum flow. However, the observed running time on typical 80 /// instances is sub-quadratic, that is, o(n^2). 81 /// 82 /// The input is a set of edges with specified costs and capacities, and a pair 83 /// of nodes (source and sink). The output is the flow along each edge of the 84 /// minimum total cost respecting the given edge capacities. 85 class MinCostMaxFlow { 86 public: 87 MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {} 88 89 // Initialize algorithm's data structures for a network of a given size. 90 void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) { 91 Source = SourceNode; 92 Target = SinkNode; 93 94 Nodes = std::vector<Node>(NodeCount); 95 Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>()); 96 if (Params.EvenFlowDistribution) 97 AugmentingEdges = 98 std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>()); 99 } 100 101 // Run the algorithm. 102 int64_t run() { 103 LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n"); 104 105 // Iteratively find an augmentation path/dag in the network and send the 106 // flow along its edges 107 size_t AugmentationIters = applyFlowAugmentation(); 108 109 // Compute the total flow and its cost 110 int64_t TotalCost = 0; 111 int64_t TotalFlow = 0; 112 for (uint64_t Src = 0; Src < Nodes.size(); Src++) { 113 for (auto &Edge : Edges[Src]) { 114 if (Edge.Flow > 0) { 115 TotalCost += Edge.Cost * Edge.Flow; 116 if (Src == Source) 117 TotalFlow += Edge.Flow; 118 } 119 } 120 } 121 LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters 122 << " iterations with " << TotalFlow << " total flow" 123 << " of " << TotalCost << " cost\n"); 124 (void)TotalFlow; 125 (void)AugmentationIters; 126 return TotalCost; 127 } 128 129 /// Adding an edge to the network with a specified capacity and a cost. 130 /// Multiple edges between a pair of nodes are allowed but self-edges 131 /// are not supported. 132 void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { 133 assert(Capacity > 0 && "adding an edge of zero capacity"); 134 assert(Src != Dst && "loop edge are not supported"); 135 136 Edge SrcEdge; 137 SrcEdge.Dst = Dst; 138 SrcEdge.Cost = Cost; 139 SrcEdge.Capacity = Capacity; 140 SrcEdge.Flow = 0; 141 SrcEdge.RevEdgeIndex = Edges[Dst].size(); 142 143 Edge DstEdge; 144 DstEdge.Dst = Src; 145 DstEdge.Cost = -Cost; 146 DstEdge.Capacity = 0; 147 DstEdge.Flow = 0; 148 DstEdge.RevEdgeIndex = Edges[Src].size(); 149 150 Edges[Src].push_back(SrcEdge); 151 Edges[Dst].push_back(DstEdge); 152 } 153 154 /// Adding an edge to the network of infinite capacity and a given cost. 155 void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { 156 addEdge(Src, Dst, INF, Cost); 157 } 158 159 /// Get the total flow from a given source node. 160 /// Returns a list of pairs (target node, amount of flow to the target). 161 const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { 162 std::vector<std::pair<uint64_t, int64_t>> Flow; 163 for (const auto &Edge : Edges[Src]) { 164 if (Edge.Flow > 0) 165 Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); 166 } 167 return Flow; 168 } 169 170 /// Get the total flow between a pair of nodes. 171 int64_t getFlow(uint64_t Src, uint64_t Dst) const { 172 int64_t Flow = 0; 173 for (const auto &Edge : Edges[Src]) { 174 if (Edge.Dst == Dst) { 175 Flow += Edge.Flow; 176 } 177 } 178 return Flow; 179 } 180 181 private: 182 /// Iteratively find an augmentation path/dag in the network and send the 183 /// flow along its edges. The method returns the number of applied iterations. 184 size_t applyFlowAugmentation() { 185 size_t AugmentationIters = 0; 186 while (findAugmentingPath()) { 187 uint64_t PathCapacity = computeAugmentingPathCapacity(); 188 while (PathCapacity > 0) { 189 bool Progress = false; 190 if (Params.EvenFlowDistribution) { 191 // Identify node/edge candidates for augmentation 192 identifyShortestEdges(PathCapacity); 193 194 // Find an augmenting DAG 195 auto AugmentingOrder = findAugmentingDAG(); 196 197 // Apply the DAG augmentation 198 Progress = augmentFlowAlongDAG(AugmentingOrder); 199 PathCapacity = computeAugmentingPathCapacity(); 200 } 201 202 if (!Progress) { 203 augmentFlowAlongPath(PathCapacity); 204 PathCapacity = 0; 205 } 206 207 AugmentationIters++; 208 } 209 } 210 return AugmentationIters; 211 } 212 213 /// Compute the capacity of the cannonical augmenting path. If the path is 214 /// saturated (that is, no flow can be sent along the path), then return 0. 215 uint64_t computeAugmentingPathCapacity() { 216 uint64_t PathCapacity = INF; 217 uint64_t Now = Target; 218 while (Now != Source) { 219 uint64_t Pred = Nodes[Now].ParentNode; 220 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; 221 222 assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow"); 223 uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow); 224 PathCapacity = std::min(PathCapacity, EdgeCapacity); 225 226 Now = Pred; 227 } 228 return PathCapacity; 229 } 230 231 /// Check for existence of an augmenting path with a positive capacity. 232 bool findAugmentingPath() { 233 // Initialize data structures 234 for (auto &Node : Nodes) { 235 Node.Distance = INF; 236 Node.ParentNode = uint64_t(-1); 237 Node.ParentEdgeIndex = uint64_t(-1); 238 Node.Taken = false; 239 } 240 241 std::queue<uint64_t> Queue; 242 Queue.push(Source); 243 Nodes[Source].Distance = 0; 244 Nodes[Source].Taken = true; 245 while (!Queue.empty()) { 246 uint64_t Src = Queue.front(); 247 Queue.pop(); 248 Nodes[Src].Taken = false; 249 // Although the residual network contains edges with negative costs 250 // (in particular, backward edges), it can be shown that there are no 251 // negative-weight cycles and the following two invariants are maintained: 252 // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V, 253 // where Dist is the length of the shortest path between two nodes. This 254 // allows to prune the search-space of the path-finding algorithm using 255 // the following early-stop criteria: 256 // -- If we find a path with zero-distance from Source to Target, stop the 257 // search, as the path is the shortest since Dist[Source, Target] >= 0; 258 // -- If we have Dist[Source, V] > Dist[Source, Target], then do not 259 // process node V, as it is guaranteed _not_ to be on a shortest path 260 // from Source to Target; it follows from inequalities 261 // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] 262 // >= Dist[Source, V] 263 if (!Params.EvenFlowDistribution && Nodes[Target].Distance == 0) 264 break; 265 if (Nodes[Src].Distance > Nodes[Target].Distance) 266 continue; 267 268 // Process adjacent edges 269 for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) { 270 auto &Edge = Edges[Src][EdgeIdx]; 271 if (Edge.Flow < Edge.Capacity) { 272 uint64_t Dst = Edge.Dst; 273 int64_t NewDistance = Nodes[Src].Distance + Edge.Cost; 274 if (Nodes[Dst].Distance > NewDistance) { 275 // Update the distance and the parent node/edge 276 Nodes[Dst].Distance = NewDistance; 277 Nodes[Dst].ParentNode = Src; 278 Nodes[Dst].ParentEdgeIndex = EdgeIdx; 279 // Add the node to the queue, if it is not there yet 280 if (!Nodes[Dst].Taken) { 281 Queue.push(Dst); 282 Nodes[Dst].Taken = true; 283 } 284 } 285 } 286 } 287 } 288 289 return Nodes[Target].Distance != INF; 290 } 291 292 /// Update the current flow along the augmenting path. 293 void augmentFlowAlongPath(uint64_t PathCapacity) { 294 assert(PathCapacity > 0 && "found an incorrect augmenting path"); 295 uint64_t Now = Target; 296 while (Now != Source) { 297 uint64_t Pred = Nodes[Now].ParentNode; 298 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; 299 auto &RevEdge = Edges[Now][Edge.RevEdgeIndex]; 300 301 Edge.Flow += PathCapacity; 302 RevEdge.Flow -= PathCapacity; 303 304 Now = Pred; 305 } 306 } 307 308 /// Find an Augmenting DAG order using a modified version of DFS in which we 309 /// can visit a node multiple times. In the DFS search, when scanning each 310 /// edge out of a node, continue search at Edge.Dst endpoint if it has not 311 /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm 312 /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time. 313 /// It returns an Augmenting Order (Taken nodes in decreasing Finish time) 314 /// that starts with Source and ends with Target. 315 std::vector<uint64_t> findAugmentingDAG() { 316 // We use a stack based implemenation of DFS to avoid recursion. 317 // Defining DFS data structures: 318 // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that 319 // - we are currently visiting Nodes[NodeIdx] and 320 // - the next edge to scan is Edges[NodeIdx][EdgeIdx] 321 typedef std::pair<uint64_t, uint64_t> StackItemType; 322 std::stack<StackItemType> Stack; 323 std::vector<uint64_t> AugmentingOrder; 324 325 // Phase 0: Initialize Node attributes and Time for DFS run 326 for (auto &Node : Nodes) { 327 Node.Discovery = 0; 328 Node.Finish = 0; 329 Node.NumCalls = 0; 330 Node.Taken = false; 331 } 332 uint64_t Time = 0; 333 // Mark Target as Taken 334 // Taken attribute will be propagated backwards from Target towards Source 335 Nodes[Target].Taken = true; 336 337 // Phase 1: Start DFS traversal from Source 338 Stack.emplace(Source, 0); 339 Nodes[Source].Discovery = ++Time; 340 while (!Stack.empty()) { 341 auto NodeIdx = Stack.top().first; 342 auto EdgeIdx = Stack.top().second; 343 344 // If we haven't scanned all edges out of NodeIdx, continue scanning 345 if (EdgeIdx < Edges[NodeIdx].size()) { 346 auto &Edge = Edges[NodeIdx][EdgeIdx]; 347 auto &Dst = Nodes[Edge.Dst]; 348 Stack.top().second++; 349 350 if (Edge.OnShortestPath) { 351 // If we haven't seen Edge.Dst so far, continue DFS search there 352 if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) { 353 Dst.Discovery = ++Time; 354 Stack.emplace(Edge.Dst, 0); 355 Dst.NumCalls++; 356 } else if (Dst.Taken && Dst.Finish != 0) { 357 // Else, if Edge.Dst already have a path to Target, so that NodeIdx 358 Nodes[NodeIdx].Taken = true; 359 } 360 } 361 } else { 362 // If we are done scanning all edge out of NodeIdx 363 Stack.pop(); 364 // If we haven't found a path from NodeIdx to Target, forget about it 365 if (!Nodes[NodeIdx].Taken) { 366 Nodes[NodeIdx].Discovery = 0; 367 } else { 368 // If we have found a path from NodeIdx to Target, then finish NodeIdx 369 // and propagate Taken flag to DFS parent unless at the Source 370 Nodes[NodeIdx].Finish = ++Time; 371 // NodeIdx == Source if and only if the stack is empty 372 if (NodeIdx != Source) { 373 assert(!Stack.empty() && "empty stack while running dfs"); 374 Nodes[Stack.top().first].Taken = true; 375 } 376 AugmentingOrder.push_back(NodeIdx); 377 } 378 } 379 } 380 // Nodes are collected decreasing Finish time, so the order is reversed 381 std::reverse(AugmentingOrder.begin(), AugmentingOrder.end()); 382 383 // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges 384 for (size_t Src : AugmentingOrder) { 385 AugmentingEdges[Src].clear(); 386 for (auto &Edge : Edges[Src]) { 387 uint64_t Dst = Edge.Dst; 388 if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken && 389 Nodes[Dst].Finish < Nodes[Src].Finish) { 390 AugmentingEdges[Src].push_back(&Edge); 391 } 392 } 393 assert((Src == Target || !AugmentingEdges[Src].empty()) && 394 "incorrectly constructed augmenting edges"); 395 } 396 397 return AugmentingOrder; 398 } 399 400 /// Update the current flow along the given (acyclic) subgraph specified by 401 /// the vertex order, AugmentingOrder. The objective is to send as much flow 402 /// as possible while evenly distributing flow among successors of each node. 403 /// After the update at least one edge is saturated. 404 bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) { 405 // Phase 0: Initialization 406 for (uint64_t Src : AugmentingOrder) { 407 Nodes[Src].FracFlow = 0; 408 Nodes[Src].IntFlow = 0; 409 for (auto &Edge : AugmentingEdges[Src]) { 410 Edge->AugmentedFlow = 0; 411 } 412 } 413 414 // Phase 1: Send a unit of fractional flow along the DAG 415 uint64_t MaxFlowAmount = INF; 416 Nodes[Source].FracFlow = 1.0; 417 for (uint64_t Src : AugmentingOrder) { 418 assert((Src == Target || Nodes[Src].FracFlow > 0.0) && 419 "incorrectly computed fractional flow"); 420 // Distribute flow evenly among successors of Src 421 uint64_t Degree = AugmentingEdges[Src].size(); 422 for (auto &Edge : AugmentingEdges[Src]) { 423 double EdgeFlow = Nodes[Src].FracFlow / Degree; 424 Nodes[Edge->Dst].FracFlow += EdgeFlow; 425 if (Edge->Capacity == INF) 426 continue; 427 uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow; 428 MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow); 429 } 430 } 431 // Stop early if we cannot send any (integral) flow from Source to Target 432 if (MaxFlowAmount == 0) 433 return false; 434 435 // Phase 2: Send an integral flow of MaxFlowAmount 436 Nodes[Source].IntFlow = MaxFlowAmount; 437 for (uint64_t Src : AugmentingOrder) { 438 if (Src == Target) 439 break; 440 // Distribute flow evenly among successors of Src, rounding up to make 441 // sure all flow is sent 442 uint64_t Degree = AugmentingEdges[Src].size(); 443 // We are guaranteeed that Node[Src].IntFlow <= SuccFlow * Degree 444 uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree; 445 for (auto &Edge : AugmentingEdges[Src]) { 446 uint64_t Dst = Edge->Dst; 447 uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow); 448 EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow)); 449 Nodes[Dst].IntFlow += EdgeFlow; 450 Nodes[Src].IntFlow -= EdgeFlow; 451 Edge->AugmentedFlow += EdgeFlow; 452 } 453 } 454 assert(Nodes[Target].IntFlow <= MaxFlowAmount); 455 Nodes[Target].IntFlow = 0; 456 457 // Phase 3: Send excess flow back traversing the nodes backwards. 458 // Because of rounding, not all flow can be sent along the edges of Src. 459 // Hence, sending the remaining flow back to maintain flow conservation 460 for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) { 461 uint64_t Src = AugmentingOrder[Idx - 1]; 462 // Try to send excess flow back along each edge. 463 // Make sure we only send back flow we just augmented (AugmentedFlow). 464 for (auto &Edge : AugmentingEdges[Src]) { 465 uint64_t Dst = Edge->Dst; 466 if (Nodes[Dst].IntFlow == 0) 467 continue; 468 uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow); 469 Nodes[Dst].IntFlow -= EdgeFlow; 470 Nodes[Src].IntFlow += EdgeFlow; 471 Edge->AugmentedFlow -= EdgeFlow; 472 } 473 } 474 475 // Phase 4: Update flow values along all edges 476 bool HasSaturatedEdges = false; 477 for (uint64_t Src : AugmentingOrder) { 478 // Verify that we have sent all the excess flow from the node 479 assert(Src == Source || Nodes[Src].IntFlow == 0); 480 for (auto &Edge : AugmentingEdges[Src]) { 481 assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow); 482 // Update flow values along the edge and its reverse copy 483 auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex]; 484 Edge->Flow += Edge->AugmentedFlow; 485 RevEdge.Flow -= Edge->AugmentedFlow; 486 if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0) 487 HasSaturatedEdges = true; 488 } 489 } 490 491 // The augmentation is successful iff at least one edge becomes saturated 492 return HasSaturatedEdges; 493 } 494 495 /// Identify candidate (shortest) edges for augmentation. 496 void identifyShortestEdges(uint64_t PathCapacity) { 497 assert(PathCapacity > 0 && "found an incorrect augmenting DAG"); 498 // To make sure the augmentation DAG contains only edges with large residual 499 // capacity, we prune all edges whose capacity is below a fraction of 500 // the capacity of the augmented path. 501 // (All edges of the path itself are always in the DAG) 502 uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1)); 503 504 // Decide which edges are on a shortest path from Source to Target 505 for (size_t Src = 0; Src < Nodes.size(); Src++) { 506 // An edge cannot be augmenting if the endpoint has large distance 507 if (Nodes[Src].Distance > Nodes[Target].Distance) 508 continue; 509 510 for (auto &Edge : Edges[Src]) { 511 uint64_t Dst = Edge.Dst; 512 Edge.OnShortestPath = 513 Src != Target && Dst != Source && 514 Nodes[Dst].Distance <= Nodes[Target].Distance && 515 Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost && 516 Edge.Capacity > Edge.Flow && 517 uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity; 518 } 519 } 520 } 521 522 /// Maximum number of DFS iterations for DAG finding. 523 static constexpr uint64_t MaxDfsCalls = 10; 524 525 /// A node in a flow network. 526 struct Node { 527 /// The cost of the cheapest path from the source to the current node. 528 int64_t Distance; 529 /// The node preceding the current one in the path. 530 uint64_t ParentNode; 531 /// The index of the edge between ParentNode and the current node. 532 uint64_t ParentEdgeIndex; 533 /// An indicator of whether the current node is in a queue. 534 bool Taken; 535 536 /// Data fields utilized in DAG-augmentation: 537 /// Fractional flow. 538 double FracFlow; 539 /// Integral flow. 540 uint64_t IntFlow; 541 /// Discovery time. 542 uint64_t Discovery; 543 /// Finish time. 544 uint64_t Finish; 545 /// NumCalls. 546 uint64_t NumCalls; 547 }; 548 549 /// An edge in a flow network. 550 struct Edge { 551 /// The cost of the edge. 552 int64_t Cost; 553 /// The capacity of the edge. 554 int64_t Capacity; 555 /// The current flow on the edge. 556 int64_t Flow; 557 /// The destination node of the edge. 558 uint64_t Dst; 559 /// The index of the reverse edge between Dst and the current node. 560 uint64_t RevEdgeIndex; 561 562 /// Data fields utilized in DAG-augmentation: 563 /// Whether the edge is currently on a shortest path from Source to Target. 564 bool OnShortestPath; 565 /// Extra flow along the edge. 566 uint64_t AugmentedFlow; 567 }; 568 569 /// The set of network nodes. 570 std::vector<Node> Nodes; 571 /// The set of network edges. 572 std::vector<std::vector<Edge>> Edges; 573 /// Source node of the flow. 574 uint64_t Source; 575 /// Target (sink) node of the flow. 576 uint64_t Target; 577 /// Augmenting edges. 578 std::vector<std::vector<Edge *>> AugmentingEdges; 579 /// Params for flow computation. 580 const ProfiParams &Params; 581 }; 582 583 /// A post-processing adjustment of the control flow. It applies two steps by 584 /// rerouting some flow and making it more realistic: 585 /// 586 /// - First, it removes all isolated components ("islands") with a positive flow 587 /// that are unreachable from the entry block. For every such component, we 588 /// find the shortest from the entry to an exit passing through the component, 589 /// and increase the flow by one unit along the path. 590 /// 591 /// - Second, it identifies all "unknown subgraphs" consisting of basic blocks 592 /// with no sampled counts. Then it rebalnces the flow that goes through such 593 /// a subgraph so that each branch is taken with probability 50%. 594 /// An unknown subgraph is such that for every two nodes u and v: 595 /// - u dominates v and u is not unknown; 596 /// - v post-dominates u; and 597 /// - all inner-nodes of all (u,v)-paths are unknown. 598 /// 599 class FlowAdjuster { 600 public: 601 FlowAdjuster(const ProfiParams &Params, FlowFunction &Func) 602 : Params(Params), Func(Func) {} 603 604 /// Apply the post-processing. 605 void run() { 606 if (Params.JoinIslands) { 607 // Adjust the flow to get rid of isolated components 608 joinIsolatedComponents(); 609 } 610 611 if (Params.RebalanceUnknown) { 612 // Rebalance the flow inside unknown subgraphs 613 rebalanceUnknownSubgraphs(); 614 } 615 } 616 617 private: 618 void joinIsolatedComponents() { 619 // Find blocks that are reachable from the source 620 auto Visited = BitVector(NumBlocks(), false); 621 findReachable(Func.Entry, Visited); 622 623 // Iterate over all non-reachable blocks and adjust their weights 624 for (uint64_t I = 0; I < NumBlocks(); I++) { 625 auto &Block = Func.Blocks[I]; 626 if (Block.Flow > 0 && !Visited[I]) { 627 // Find a path from the entry to an exit passing through the block I 628 auto Path = findShortestPath(I); 629 // Increase the flow along the path 630 assert(Path.size() > 0 && Path[0]->Source == Func.Entry && 631 "incorrectly computed path adjusting control flow"); 632 Func.Blocks[Func.Entry].Flow += 1; 633 for (auto &Jump : Path) { 634 Jump->Flow += 1; 635 Func.Blocks[Jump->Target].Flow += 1; 636 // Update reachability 637 findReachable(Jump->Target, Visited); 638 } 639 } 640 } 641 } 642 643 /// Run BFS from a given block along the jumps with a positive flow and mark 644 /// all reachable blocks. 645 void findReachable(uint64_t Src, BitVector &Visited) { 646 if (Visited[Src]) 647 return; 648 std::queue<uint64_t> Queue; 649 Queue.push(Src); 650 Visited[Src] = true; 651 while (!Queue.empty()) { 652 Src = Queue.front(); 653 Queue.pop(); 654 for (auto *Jump : Func.Blocks[Src].SuccJumps) { 655 uint64_t Dst = Jump->Target; 656 if (Jump->Flow > 0 && !Visited[Dst]) { 657 Queue.push(Dst); 658 Visited[Dst] = true; 659 } 660 } 661 } 662 } 663 664 /// Find the shortest path from the entry block to an exit block passing 665 /// through a given block. 666 std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) { 667 // A path from the entry block to BlockIdx 668 auto ForwardPath = findShortestPath(Func.Entry, BlockIdx); 669 // A path from BlockIdx to an exit block 670 auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock); 671 672 // Concatenate the two paths 673 std::vector<FlowJump *> Result; 674 Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); 675 Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); 676 return Result; 677 } 678 679 /// Apply the Dijkstra algorithm to find the shortest path from a given 680 /// Source to a given Target block. 681 /// If Target == -1, then the path ends at an exit block. 682 std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) { 683 // Quit early, if possible 684 if (Source == Target) 685 return std::vector<FlowJump *>(); 686 if (Func.Blocks[Source].isExit() && Target == AnyExitBlock) 687 return std::vector<FlowJump *>(); 688 689 // Initialize data structures 690 auto Distance = std::vector<int64_t>(NumBlocks(), INF); 691 auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr); 692 Distance[Source] = 0; 693 std::set<std::pair<uint64_t, uint64_t>> Queue; 694 Queue.insert(std::make_pair(Distance[Source], Source)); 695 696 // Run the Dijkstra algorithm 697 while (!Queue.empty()) { 698 uint64_t Src = Queue.begin()->second; 699 Queue.erase(Queue.begin()); 700 // If we found a solution, quit early 701 if (Src == Target || 702 (Func.Blocks[Src].isExit() && Target == AnyExitBlock)) 703 break; 704 705 for (auto *Jump : Func.Blocks[Src].SuccJumps) { 706 uint64_t Dst = Jump->Target; 707 int64_t JumpDist = jumpDistance(Jump); 708 if (Distance[Dst] > Distance[Src] + JumpDist) { 709 Queue.erase(std::make_pair(Distance[Dst], Dst)); 710 711 Distance[Dst] = Distance[Src] + JumpDist; 712 Parent[Dst] = Jump; 713 714 Queue.insert(std::make_pair(Distance[Dst], Dst)); 715 } 716 } 717 } 718 // If Target is not provided, find the closest exit block 719 if (Target == AnyExitBlock) { 720 for (uint64_t I = 0; I < NumBlocks(); I++) { 721 if (Func.Blocks[I].isExit() && Parent[I] != nullptr) { 722 if (Target == AnyExitBlock || Distance[Target] > Distance[I]) { 723 Target = I; 724 } 725 } 726 } 727 } 728 assert(Parent[Target] != nullptr && "a path does not exist"); 729 730 // Extract the constructed path 731 std::vector<FlowJump *> Result; 732 uint64_t Now = Target; 733 while (Now != Source) { 734 assert(Now == Parent[Now]->Target && "incorrect parent jump"); 735 Result.push_back(Parent[Now]); 736 Now = Parent[Now]->Source; 737 } 738 // Reverse the path, since it is extracted from Target to Source 739 std::reverse(Result.begin(), Result.end()); 740 return Result; 741 } 742 743 /// A distance of a path for a given jump. 744 /// In order to incite the path to use blocks/jumps with large positive flow, 745 /// and avoid changing branch probability of outgoing edges drastically, 746 /// set the jump distance so as: 747 /// - to minimize the number of unlikely jumps used and subject to that, 748 /// - to minimize the number of Flow == 0 jumps used and subject to that, 749 /// - minimizes total multiplicative Flow increase for the remaining edges. 750 /// To capture this objective with integer distances, we round off fractional 751 /// parts to a multiple of 1 / BaseDistance. 752 int64_t jumpDistance(FlowJump *Jump) const { 753 if (Jump->IsUnlikely) 754 return Params.CostUnlikely; 755 uint64_t BaseDistance = 756 std::max(FlowAdjuster::MinBaseDistance, 757 std::min(Func.Blocks[Func.Entry].Flow, 758 Params.CostUnlikely / (2 * (NumBlocks() + 1)))); 759 if (Jump->Flow > 0) 760 return BaseDistance + BaseDistance / Jump->Flow; 761 return 2 * BaseDistance * (NumBlocks() + 1); 762 }; 763 764 uint64_t NumBlocks() const { return Func.Blocks.size(); } 765 766 /// Rebalance unknown subgraphs so that the flow is split evenly across the 767 /// outgoing branches of every block of the subgraph. The method iterates over 768 /// blocks with known weight and identifies unknown subgraphs rooted at the 769 /// blocks. Then it verifies if flow rebalancing is feasible and applies it. 770 void rebalanceUnknownSubgraphs() { 771 // Try to find unknown subgraphs from each block 772 for (const FlowBlock &SrcBlock : Func.Blocks) { 773 // Verify if rebalancing rooted at SrcBlock is feasible 774 if (!canRebalanceAtRoot(&SrcBlock)) 775 continue; 776 777 // Find an unknown subgraphs starting at SrcBlock. Along the way, 778 // fill in known destinations and intermediate unknown blocks. 779 std::vector<FlowBlock *> UnknownBlocks; 780 std::vector<FlowBlock *> KnownDstBlocks; 781 findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks); 782 783 // Verify if rebalancing of the subgraph is feasible. If the search is 784 // successful, find the unique destination block (which can be null) 785 FlowBlock *DstBlock = nullptr; 786 if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks, 787 DstBlock)) 788 continue; 789 790 // We cannot rebalance subgraphs containing cycles among unknown blocks 791 if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks)) 792 continue; 793 794 // Rebalance the flow 795 rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks); 796 } 797 } 798 799 /// Verify if rebalancing rooted at a given block is possible. 800 bool canRebalanceAtRoot(const FlowBlock *SrcBlock) { 801 // Do not attempt to find unknown subgraphs from an unknown or a 802 // zero-flow block 803 if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0) 804 return false; 805 806 // Do not attempt to process subgraphs from a block w/o unknown sucessors 807 bool HasUnknownSuccs = false; 808 for (auto *Jump : SrcBlock->SuccJumps) { 809 if (Func.Blocks[Jump->Target].HasUnknownWeight) { 810 HasUnknownSuccs = true; 811 break; 812 } 813 } 814 if (!HasUnknownSuccs) 815 return false; 816 817 return true; 818 } 819 820 /// Find an unknown subgraph starting at block SrcBlock. The method sets 821 /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks. 822 void findUnknownSubgraph(const FlowBlock *SrcBlock, 823 std::vector<FlowBlock *> &KnownDstBlocks, 824 std::vector<FlowBlock *> &UnknownBlocks) { 825 // Run BFS from SrcBlock and make sure all paths are going through unknown 826 // blocks and end at a known DstBlock 827 auto Visited = BitVector(NumBlocks(), false); 828 std::queue<uint64_t> Queue; 829 830 Queue.push(SrcBlock->Index); 831 Visited[SrcBlock->Index] = true; 832 while (!Queue.empty()) { 833 auto &Block = Func.Blocks[Queue.front()]; 834 Queue.pop(); 835 // Process blocks reachable from Block 836 for (auto *Jump : Block.SuccJumps) { 837 // If Jump can be ignored, skip it 838 if (ignoreJump(SrcBlock, nullptr, Jump)) 839 continue; 840 841 uint64_t Dst = Jump->Target; 842 // If Dst has been visited, skip Jump 843 if (Visited[Dst]) 844 continue; 845 // Process block Dst 846 Visited[Dst] = true; 847 if (!Func.Blocks[Dst].HasUnknownWeight) { 848 KnownDstBlocks.push_back(&Func.Blocks[Dst]); 849 } else { 850 Queue.push(Dst); 851 UnknownBlocks.push_back(&Func.Blocks[Dst]); 852 } 853 } 854 } 855 } 856 857 /// Verify if rebalancing of the subgraph is feasible. If the checks are 858 /// successful, set the unique destination block, DstBlock (can be null). 859 bool canRebalanceSubgraph(const FlowBlock *SrcBlock, 860 const std::vector<FlowBlock *> &KnownDstBlocks, 861 const std::vector<FlowBlock *> &UnknownBlocks, 862 FlowBlock *&DstBlock) { 863 // If the list of unknown blocks is empty, we don't need rebalancing 864 if (UnknownBlocks.empty()) 865 return false; 866 867 // If there are multiple known sinks, we can't rebalance 868 if (KnownDstBlocks.size() > 1) 869 return false; 870 DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front(); 871 872 // Verify sinks of the subgraph 873 for (auto *Block : UnknownBlocks) { 874 if (Block->SuccJumps.empty()) { 875 // If there are multiple (known and unknown) sinks, we can't rebalance 876 if (DstBlock != nullptr) 877 return false; 878 continue; 879 } 880 size_t NumIgnoredJumps = 0; 881 for (auto *Jump : Block->SuccJumps) { 882 if (ignoreJump(SrcBlock, DstBlock, Jump)) 883 NumIgnoredJumps++; 884 } 885 // If there is a non-sink block in UnknownBlocks with all jumps ignored, 886 // then we can't rebalance 887 if (NumIgnoredJumps == Block->SuccJumps.size()) 888 return false; 889 } 890 891 return true; 892 } 893 894 /// Decide whether the Jump is ignored while processing an unknown subgraphs 895 /// rooted at basic block SrcBlock with the destination block, DstBlock. 896 bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock, 897 const FlowJump *Jump) { 898 // Ignore unlikely jumps with zero flow 899 if (Jump->IsUnlikely && Jump->Flow == 0) 900 return true; 901 902 auto JumpSource = &Func.Blocks[Jump->Source]; 903 auto JumpTarget = &Func.Blocks[Jump->Target]; 904 905 // Do not ignore jumps coming into DstBlock 906 if (DstBlock != nullptr && JumpTarget == DstBlock) 907 return false; 908 909 // Ignore jumps out of SrcBlock to known blocks 910 if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock) 911 return true; 912 913 // Ignore jumps to known blocks with zero flow 914 if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0) 915 return true; 916 917 return false; 918 } 919 920 /// Verify if the given unknown subgraph is acyclic, and if yes, reorder 921 /// UnknownBlocks in the topological order (so that all jumps are "forward"). 922 bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock, 923 std::vector<FlowBlock *> &UnknownBlocks) { 924 // Extract local in-degrees in the considered subgraph 925 auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0); 926 auto fillInDegree = [&](const FlowBlock *Block) { 927 for (auto *Jump : Block->SuccJumps) { 928 if (ignoreJump(SrcBlock, DstBlock, Jump)) 929 continue; 930 LocalInDegree[Jump->Target]++; 931 } 932 }; 933 fillInDegree(SrcBlock); 934 for (auto *Block : UnknownBlocks) { 935 fillInDegree(Block); 936 } 937 // A loop containing SrcBlock 938 if (LocalInDegree[SrcBlock->Index] > 0) 939 return false; 940 941 std::vector<FlowBlock *> AcyclicOrder; 942 std::queue<uint64_t> Queue; 943 Queue.push(SrcBlock->Index); 944 while (!Queue.empty()) { 945 FlowBlock *Block = &Func.Blocks[Queue.front()]; 946 Queue.pop(); 947 // Stop propagation once we reach DstBlock, if any 948 if (DstBlock != nullptr && Block == DstBlock) 949 break; 950 951 // Keep an acyclic order of unknown blocks 952 if (Block->HasUnknownWeight && Block != SrcBlock) 953 AcyclicOrder.push_back(Block); 954 955 // Add to the queue all successors with zero local in-degree 956 for (auto *Jump : Block->SuccJumps) { 957 if (ignoreJump(SrcBlock, DstBlock, Jump)) 958 continue; 959 uint64_t Dst = Jump->Target; 960 LocalInDegree[Dst]--; 961 if (LocalInDegree[Dst] == 0) { 962 Queue.push(Dst); 963 } 964 } 965 } 966 967 // If there is a cycle in the subgraph, AcyclicOrder contains only a subset 968 // of all blocks 969 if (UnknownBlocks.size() != AcyclicOrder.size()) 970 return false; 971 UnknownBlocks = AcyclicOrder; 972 return true; 973 } 974 975 /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and 976 /// having UnknownBlocks intermediate blocks. 977 void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock, 978 const FlowBlock *DstBlock, 979 const std::vector<FlowBlock *> &UnknownBlocks) { 980 assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph"); 981 982 // Ditribute flow from the source block 983 uint64_t BlockFlow = 0; 984 // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps 985 for (auto *Jump : SrcBlock->SuccJumps) { 986 if (ignoreJump(SrcBlock, DstBlock, Jump)) 987 continue; 988 BlockFlow += Jump->Flow; 989 } 990 rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow); 991 992 // Ditribute flow from the remaining blocks 993 for (auto *Block : UnknownBlocks) { 994 assert(Block->HasUnknownWeight && "incorrect unknown subgraph"); 995 uint64_t BlockFlow = 0; 996 // Block's flow is the sum of incoming flows 997 for (auto *Jump : Block->PredJumps) { 998 BlockFlow += Jump->Flow; 999 } 1000 Block->Flow = BlockFlow; 1001 rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow); 1002 } 1003 } 1004 1005 /// Redistribute flow for a block in a subgraph rooted at SrcBlock, 1006 /// and ending at DstBlock. 1007 void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock, 1008 const FlowBlock *Block, uint64_t BlockFlow) { 1009 // Process all successor jumps and update corresponding flow values 1010 size_t BlockDegree = 0; 1011 for (auto *Jump : Block->SuccJumps) { 1012 if (ignoreJump(SrcBlock, DstBlock, Jump)) 1013 continue; 1014 BlockDegree++; 1015 } 1016 // If all successor jumps of the block are ignored, skip it 1017 if (DstBlock == nullptr && BlockDegree == 0) 1018 return; 1019 assert(BlockDegree > 0 && "all outgoing jumps are ignored"); 1020 1021 // Each of the Block's successors gets the following amount of flow. 1022 // Rounding the value up so that all flow is propagated 1023 uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree; 1024 for (auto *Jump : Block->SuccJumps) { 1025 if (ignoreJump(SrcBlock, DstBlock, Jump)) 1026 continue; 1027 uint64_t Flow = std::min(SuccFlow, BlockFlow); 1028 Jump->Flow = Flow; 1029 BlockFlow -= Flow; 1030 } 1031 assert(BlockFlow == 0 && "not all flow is propagated"); 1032 } 1033 1034 /// A constant indicating an arbitrary exit block of a function. 1035 static constexpr uint64_t AnyExitBlock = uint64_t(-1); 1036 /// Minimum BaseDistance for the jump distance values in island joining. 1037 static constexpr uint64_t MinBaseDistance = 10000; 1038 1039 /// Params for flow computation. 1040 const ProfiParams &Params; 1041 /// The function. 1042 FlowFunction &Func; 1043 }; 1044 1045 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params, 1046 const FlowBlock &Block); 1047 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params, 1048 const FlowJump &Jump); 1049 1050 /// Initializing flow network for a given function. 1051 /// 1052 /// Every block is split into two nodes that are responsible for (i) an 1053 /// incoming flow, (ii) an outgoing flow; they penalize an increase or a 1054 /// reduction of the block weight. 1055 void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network, 1056 FlowFunction &Func) { 1057 uint64_t NumBlocks = Func.Blocks.size(); 1058 assert(NumBlocks > 1 && "Too few blocks in a function"); 1059 uint64_t NumJumps = Func.Jumps.size(); 1060 assert(NumJumps > 0 && "Too few jumps in a function"); 1061 1062 // Introducing dummy source/sink pairs to allow flow circulation. 1063 // The nodes corresponding to blocks of the function have indicies in 1064 // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the 1065 // next four values. 1066 uint64_t S = 2 * NumBlocks; 1067 uint64_t T = S + 1; 1068 uint64_t S1 = S + 2; 1069 uint64_t T1 = S + 3; 1070 1071 Network.initialize(2 * NumBlocks + 4, S1, T1); 1072 1073 // Initialize nodes of the flow network 1074 for (uint64_t B = 0; B < NumBlocks; B++) { 1075 auto &Block = Func.Blocks[B]; 1076 1077 // Split every block into two auxiliary nodes to allow 1078 // increase/reduction of the block count. 1079 uint64_t Bin = 2 * B; 1080 uint64_t Bout = 2 * B + 1; 1081 1082 // Edges from S and to T 1083 if (Block.isEntry()) { 1084 Network.addEdge(S, Bin, 0); 1085 } else if (Block.isExit()) { 1086 Network.addEdge(Bout, T, 0); 1087 } 1088 1089 // Assign costs for increasing/decreasing the block counts 1090 auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params, Block); 1091 1092 // Add the corresponding edges to the network 1093 Network.addEdge(Bin, Bout, AuxCostInc); 1094 if (Block.Weight > 0) { 1095 Network.addEdge(Bout, Bin, Block.Weight, AuxCostDec); 1096 Network.addEdge(S1, Bout, Block.Weight, 0); 1097 Network.addEdge(Bin, T1, Block.Weight, 0); 1098 } 1099 } 1100 1101 // Initialize edges of the flow network 1102 for (uint64_t J = 0; J < NumJumps; J++) { 1103 auto &Jump = Func.Jumps[J]; 1104 1105 // Get the endpoints corresponding to the jump 1106 uint64_t Jin = 2 * Jump.Source + 1; 1107 uint64_t Jout = 2 * Jump.Target; 1108 1109 // Assign costs for increasing/decreasing the jump counts 1110 auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump); 1111 1112 // Add the corresponding edges to the network 1113 Network.addEdge(Jin, Jout, AuxCostInc); 1114 if (Jump.Weight > 0) { 1115 Network.addEdge(Jout, Jin, Jump.Weight, AuxCostDec); 1116 Network.addEdge(S1, Jout, Jump.Weight, 0); 1117 Network.addEdge(Jin, T1, Jump.Weight, 0); 1118 } 1119 } 1120 1121 // Make sure we have a valid flow circulation 1122 Network.addEdge(T, S, 0); 1123 } 1124 1125 /// Assign costs for increasing/decreasing the block counts. 1126 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params, 1127 const FlowBlock &Block) { 1128 // Modifying the weight of an unlikely block is expensive 1129 if (Block.IsUnlikely) 1130 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely); 1131 1132 // Assign default values for the costs 1133 int64_t CostInc = Params.CostBlockInc; 1134 int64_t CostDec = Params.CostBlockDec; 1135 // Update the costs depending on the block metadata 1136 if (Block.HasUnknownWeight) { 1137 CostInc = Params.CostBlockUnknownInc; 1138 CostDec = 0; 1139 } else { 1140 // Increasing the count for "cold" blocks with zero initial count is more 1141 // expensive than for "hot" ones 1142 if (Block.Weight == 0) 1143 CostInc = Params.CostBlockZeroInc; 1144 // Modifying the count of the entry block is expensive 1145 if (Block.isEntry()) { 1146 CostInc = Params.CostBlockEntryInc; 1147 CostDec = Params.CostBlockEntryDec; 1148 } 1149 } 1150 return std::make_pair(CostInc, CostDec); 1151 } 1152 1153 /// Assign costs for increasing/decreasing the jump counts. 1154 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params, 1155 const FlowJump &Jump) { 1156 // Modifying the weight of an unlikely jump is expensive 1157 if (Jump.IsUnlikely) 1158 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely); 1159 1160 // Assign default values for the costs 1161 int64_t CostInc = Params.CostJumpInc; 1162 int64_t CostDec = Params.CostJumpDec; 1163 // Update the costs depending on the block metadata 1164 if (Jump.Source + 1 == Jump.Target) { 1165 // Adjusting the fall-through branch 1166 CostInc = Params.CostJumpFTInc; 1167 CostDec = Params.CostJumpFTDec; 1168 } 1169 if (Jump.HasUnknownWeight) { 1170 // The cost is different for fall-through and non-fall-through branches 1171 if (Jump.Source + 1 == Jump.Target) 1172 CostInc = Params.CostJumpUnknownFTInc; 1173 else 1174 CostInc = Params.CostJumpUnknownInc; 1175 CostDec = 0; 1176 } else { 1177 assert(Jump.Weight > 0 && "found zero-weight jump with a positive weight"); 1178 } 1179 return std::make_pair(CostInc, CostDec); 1180 } 1181 1182 /// Extract resulting block and edge counts from the flow network. 1183 void extractWeights(const ProfiParams &Params, MinCostMaxFlow &Network, 1184 FlowFunction &Func) { 1185 uint64_t NumBlocks = Func.Blocks.size(); 1186 uint64_t NumJumps = Func.Jumps.size(); 1187 1188 // Extract resulting jump counts 1189 for (uint64_t J = 0; J < NumJumps; J++) { 1190 auto &Jump = Func.Jumps[J]; 1191 uint64_t SrcOut = 2 * Jump.Source + 1; 1192 uint64_t DstIn = 2 * Jump.Target; 1193 1194 int64_t Flow = 0; 1195 int64_t AuxFlow = Network.getFlow(SrcOut, DstIn); 1196 if (Jump.Source != Jump.Target) 1197 Flow = int64_t(Jump.Weight) + AuxFlow; 1198 else 1199 Flow = int64_t(Jump.Weight) + (AuxFlow > 0 ? AuxFlow : 0); 1200 1201 Jump.Flow = Flow; 1202 assert(Flow >= 0 && "negative jump flow"); 1203 } 1204 1205 // Extract resulting block counts 1206 auto InFlow = std::vector<uint64_t>(NumBlocks, 0); 1207 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); 1208 for (auto &Jump : Func.Jumps) { 1209 InFlow[Jump.Target] += Jump.Flow; 1210 OutFlow[Jump.Source] += Jump.Flow; 1211 } 1212 for (uint64_t B = 0; B < NumBlocks; B++) { 1213 auto &Block = Func.Blocks[B]; 1214 Block.Flow = std::max(OutFlow[B], InFlow[B]); 1215 } 1216 } 1217 1218 #ifndef NDEBUG 1219 /// Verify that the provided block/jump weights are as expected. 1220 void verifyInput(const FlowFunction &Func) { 1221 // Verify the entry block 1222 assert(Func.Entry == 0 && Func.Blocks[0].isEntry()); 1223 for (size_t I = 1; I < Func.Blocks.size(); I++) { 1224 assert(!Func.Blocks[I].isEntry() && "multiple entry blocks"); 1225 } 1226 // Verify CFG jumps 1227 for (auto &Block : Func.Blocks) { 1228 assert((!Block.isEntry() || !Block.isExit()) && 1229 "a block cannot be an entry and an exit"); 1230 } 1231 // Verify input block weights 1232 for (auto &Block : Func.Blocks) { 1233 assert((!Block.HasUnknownWeight || Block.Weight == 0 || Block.isEntry()) && 1234 "non-zero weight of a block w/o weight except for an entry"); 1235 } 1236 // Verify input jump weights 1237 for (auto &Jump : Func.Jumps) { 1238 assert((!Jump.HasUnknownWeight || Jump.Weight == 0) && 1239 "non-zero weight of a jump w/o weight"); 1240 } 1241 } 1242 1243 /// Verify that the computed flow values satisfy flow conservation rules. 1244 void verifyOutput(const FlowFunction &Func) { 1245 const uint64_t NumBlocks = Func.Blocks.size(); 1246 auto InFlow = std::vector<uint64_t>(NumBlocks, 0); 1247 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); 1248 for (const auto &Jump : Func.Jumps) { 1249 InFlow[Jump.Target] += Jump.Flow; 1250 OutFlow[Jump.Source] += Jump.Flow; 1251 } 1252 1253 uint64_t TotalInFlow = 0; 1254 uint64_t TotalOutFlow = 0; 1255 for (uint64_t I = 0; I < NumBlocks; I++) { 1256 auto &Block = Func.Blocks[I]; 1257 if (Block.isEntry()) { 1258 TotalInFlow += Block.Flow; 1259 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); 1260 } else if (Block.isExit()) { 1261 TotalOutFlow += Block.Flow; 1262 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); 1263 } else { 1264 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); 1265 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); 1266 } 1267 } 1268 assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); 1269 1270 // Verify that there are no isolated flow components 1271 // One could modify FlowFunction to hold edges indexed by the sources, which 1272 // will avoid a creation of the object 1273 auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks); 1274 for (const auto &Jump : Func.Jumps) { 1275 if (Jump.Flow > 0) { 1276 PositiveFlowEdges[Jump.Source].push_back(Jump.Target); 1277 } 1278 } 1279 1280 // Run BFS from the source along edges with positive flow 1281 std::queue<uint64_t> Queue; 1282 auto Visited = BitVector(NumBlocks, false); 1283 Queue.push(Func.Entry); 1284 Visited[Func.Entry] = true; 1285 while (!Queue.empty()) { 1286 uint64_t Src = Queue.front(); 1287 Queue.pop(); 1288 for (uint64_t Dst : PositiveFlowEdges[Src]) { 1289 if (!Visited[Dst]) { 1290 Queue.push(Dst); 1291 Visited[Dst] = true; 1292 } 1293 } 1294 } 1295 1296 // Verify that every block that has a positive flow is reached from the source 1297 // along edges with a positive flow 1298 for (uint64_t I = 0; I < NumBlocks; I++) { 1299 auto &Block = Func.Blocks[I]; 1300 assert((Visited[I] || Block.Flow == 0) && "an isolated flow component"); 1301 } 1302 } 1303 #endif 1304 1305 } // end of anonymous namespace 1306 1307 /// Apply the profile inference algorithm for a given function 1308 void llvm::applyFlowInference(const ProfiParams &Params, FlowFunction &Func) { 1309 #ifndef NDEBUG 1310 // Verify the input data 1311 verifyInput(Func); 1312 #endif 1313 1314 // Create and apply an inference network model 1315 auto InferenceNetwork = MinCostMaxFlow(Params); 1316 initializeNetwork(Params, InferenceNetwork, Func); 1317 InferenceNetwork.run(); 1318 1319 // Extract flow values for every block and every edge 1320 extractWeights(Params, InferenceNetwork, Func); 1321 1322 // Post-processing adjustments to the flow 1323 auto Adjuster = FlowAdjuster(Params, Func); 1324 Adjuster.run(); 1325 1326 #ifndef NDEBUG 1327 // Verify the result 1328 verifyOutput(Func); 1329 #endif 1330 } 1331 1332 /// Apply the profile inference algorithm for a given flow function 1333 void llvm::applyFlowInference(FlowFunction &Func) { 1334 ProfiParams Params; 1335 // Set the params from the command-line flags. 1336 Params.EvenFlowDistribution = SampleProfileEvenFlowDistribution; 1337 Params.RebalanceUnknown = SampleProfileRebalanceUnknown; 1338 Params.JoinIslands = SampleProfileJoinIslands; 1339 Params.CostBlockInc = SampleProfileProfiCostBlockInc; 1340 Params.CostBlockDec = SampleProfileProfiCostBlockDec; 1341 Params.CostBlockEntryInc = SampleProfileProfiCostBlockEntryInc; 1342 Params.CostBlockEntryDec = SampleProfileProfiCostBlockEntryDec; 1343 Params.CostBlockZeroInc = SampleProfileProfiCostBlockZeroInc; 1344 Params.CostBlockUnknownInc = SampleProfileProfiCostBlockUnknownInc; 1345 1346 applyFlowInference(Params, Func); 1347 } 1348