1 //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Loops should be simplified before this analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Analysis/BlockFrequencyInfoImpl.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/SCCIterator.h" 18 #include "llvm/Config/llvm-config.h" 19 #include "llvm/IR/Function.h" 20 #include "llvm/Support/BlockFrequency.h" 21 #include "llvm/Support/BranchProbability.h" 22 #include "llvm/Support/Compiler.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Support/MathExtras.h" 25 #include "llvm/Support/ScaledNumber.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include <algorithm> 28 #include <cassert> 29 #include <cstddef> 30 #include <cstdint> 31 #include <iterator> 32 #include <list> 33 #include <numeric> 34 #include <utility> 35 #include <vector> 36 37 using namespace llvm; 38 using namespace llvm::bfi_detail; 39 40 #define DEBUG_TYPE "block-freq" 41 42 namespace llvm { 43 cl::opt<bool> CheckBFIUnknownBlockQueries( 44 "check-bfi-unknown-block-queries", 45 cl::init(false), cl::Hidden, 46 cl::desc("Check if block frequency is queried for an unknown block " 47 "for debugging missed BFI updates")); 48 49 cl::opt<bool> UseIterativeBFIInference( 50 "use-iterative-bfi-inference", cl::Hidden, 51 cl::desc("Apply an iterative post-processing to infer correct BFI counts")); 52 53 cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock( 54 "iterative-bfi-max-iterations-per-block", cl::init(1000), cl::Hidden, 55 cl::desc("Iterative inference: maximum number of update iterations " 56 "per block")); 57 58 cl::opt<double> IterativeBFIPrecision( 59 "iterative-bfi-precision", cl::init(1e-12), cl::Hidden, 60 cl::desc("Iterative inference: delta convergence precision; smaller values " 61 "typically lead to better results at the cost of worsen runtime")); 62 } 63 64 ScaledNumber<uint64_t> BlockMass::toScaled() const { 65 if (isFull()) 66 return ScaledNumber<uint64_t>(1, 0); 67 return ScaledNumber<uint64_t>(getMass() + 1, -64); 68 } 69 70 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 71 LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } 72 #endif 73 74 static char getHexDigit(int N) { 75 assert(N < 16); 76 if (N < 10) 77 return '0' + N; 78 return 'a' + N - 10; 79 } 80 81 raw_ostream &BlockMass::print(raw_ostream &OS) const { 82 for (int Digits = 0; Digits < 16; ++Digits) 83 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); 84 return OS; 85 } 86 87 namespace { 88 89 using BlockNode = BlockFrequencyInfoImplBase::BlockNode; 90 using Distribution = BlockFrequencyInfoImplBase::Distribution; 91 using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; 92 using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; 93 using LoopData = BlockFrequencyInfoImplBase::LoopData; 94 using Weight = BlockFrequencyInfoImplBase::Weight; 95 using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; 96 97 /// Dithering mass distributer. 98 /// 99 /// This class splits up a single mass into portions by weight, dithering to 100 /// spread out error. No mass is lost. The dithering precision depends on the 101 /// precision of the product of \a BlockMass and \a BranchProbability. 102 /// 103 /// The distribution algorithm follows. 104 /// 105 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the 106 /// mass to distribute in \a RemMass. 107 /// 108 /// 2. For each portion: 109 /// 110 /// 1. Construct a branch probability, P, as the portion's weight divided 111 /// by the current value of \a RemWeight. 112 /// 2. Calculate the portion's mass as \a RemMass times P. 113 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting 114 /// the current portion's weight and mass. 115 struct DitheringDistributer { 116 uint32_t RemWeight; 117 BlockMass RemMass; 118 119 DitheringDistributer(Distribution &Dist, const BlockMass &Mass); 120 121 BlockMass takeMass(uint32_t Weight); 122 }; 123 124 } // end anonymous namespace 125 126 DitheringDistributer::DitheringDistributer(Distribution &Dist, 127 const BlockMass &Mass) { 128 Dist.normalize(); 129 RemWeight = Dist.Total; 130 RemMass = Mass; 131 } 132 133 BlockMass DitheringDistributer::takeMass(uint32_t Weight) { 134 assert(Weight && "invalid weight"); 135 assert(Weight <= RemWeight); 136 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); 137 138 // Decrement totals (dither). 139 RemWeight -= Weight; 140 RemMass -= Mass; 141 return Mass; 142 } 143 144 void Distribution::add(const BlockNode &Node, uint64_t Amount, 145 Weight::DistType Type) { 146 assert(Amount && "invalid weight of 0"); 147 uint64_t NewTotal = Total + Amount; 148 149 // Check for overflow. It should be impossible to overflow twice. 150 bool IsOverflow = NewTotal < Total; 151 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); 152 DidOverflow |= IsOverflow; 153 154 // Update the total. 155 Total = NewTotal; 156 157 // Save the weight. 158 Weights.push_back(Weight(Type, Node, Amount)); 159 } 160 161 static void combineWeight(Weight &W, const Weight &OtherW) { 162 assert(OtherW.TargetNode.isValid()); 163 if (!W.Amount) { 164 W = OtherW; 165 return; 166 } 167 assert(W.Type == OtherW.Type); 168 assert(W.TargetNode == OtherW.TargetNode); 169 assert(OtherW.Amount && "Expected non-zero weight"); 170 if (W.Amount > W.Amount + OtherW.Amount) 171 // Saturate on overflow. 172 W.Amount = UINT64_MAX; 173 else 174 W.Amount += OtherW.Amount; 175 } 176 177 static void combineWeightsBySorting(WeightList &Weights) { 178 // Sort so edges to the same node are adjacent. 179 llvm::sort(Weights, [](const Weight &L, const Weight &R) { 180 return L.TargetNode < R.TargetNode; 181 }); 182 183 // Combine adjacent edges. 184 WeightList::iterator O = Weights.begin(); 185 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; 186 ++O, (I = L)) { 187 *O = *I; 188 189 // Find the adjacent weights to the same node. 190 for (++L; L != E && I->TargetNode == L->TargetNode; ++L) 191 combineWeight(*O, *L); 192 } 193 194 // Erase extra entries. 195 Weights.erase(O, Weights.end()); 196 } 197 198 static void combineWeightsByHashing(WeightList &Weights) { 199 // Collect weights into a DenseMap. 200 using HashTable = DenseMap<BlockNode::IndexType, Weight>; 201 202 HashTable Combined(NextPowerOf2(2 * Weights.size())); 203 for (const Weight &W : Weights) 204 combineWeight(Combined[W.TargetNode.Index], W); 205 206 // Check whether anything changed. 207 if (Weights.size() == Combined.size()) 208 return; 209 210 // Fill in the new weights. 211 Weights.clear(); 212 Weights.reserve(Combined.size()); 213 for (const auto &I : Combined) 214 Weights.push_back(I.second); 215 } 216 217 static void combineWeights(WeightList &Weights) { 218 // Use a hash table for many successors to keep this linear. 219 if (Weights.size() > 128) { 220 combineWeightsByHashing(Weights); 221 return; 222 } 223 224 combineWeightsBySorting(Weights); 225 } 226 227 static uint64_t shiftRightAndRound(uint64_t N, int Shift) { 228 assert(Shift >= 0); 229 assert(Shift < 64); 230 if (!Shift) 231 return N; 232 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); 233 } 234 235 void Distribution::normalize() { 236 // Early exit for termination nodes. 237 if (Weights.empty()) 238 return; 239 240 // Only bother if there are multiple successors. 241 if (Weights.size() > 1) 242 combineWeights(Weights); 243 244 // Early exit when combined into a single successor. 245 if (Weights.size() == 1) { 246 Total = 1; 247 Weights.front().Amount = 1; 248 return; 249 } 250 251 // Determine how much to shift right so that the total fits into 32-bits. 252 // 253 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 254 // for each weight can cause a 32-bit overflow. 255 int Shift = 0; 256 if (DidOverflow) 257 Shift = 33; 258 else if (Total > UINT32_MAX) 259 Shift = 33 - countLeadingZeros(Total); 260 261 // Early exit if nothing needs to be scaled. 262 if (!Shift) { 263 // If we didn't overflow then combineWeights() shouldn't have changed the 264 // sum of the weights, but let's double-check. 265 assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), 266 [](uint64_t Sum, const Weight &W) { 267 return Sum + W.Amount; 268 }) && 269 "Expected total to be correct"); 270 return; 271 } 272 273 // Recompute the total through accumulation (rather than shifting it) so that 274 // it's accurate after shifting and any changes combineWeights() made above. 275 Total = 0; 276 277 // Sum the weights to each node and shift right if necessary. 278 for (Weight &W : Weights) { 279 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we 280 // can round here without concern about overflow. 281 assert(W.TargetNode.isValid()); 282 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); 283 assert(W.Amount <= UINT32_MAX); 284 285 // Update the total. 286 Total += W.Amount; 287 } 288 assert(Total <= UINT32_MAX); 289 } 290 291 void BlockFrequencyInfoImplBase::clear() { 292 // Swap with a default-constructed std::vector, since std::vector<>::clear() 293 // does not actually clear heap storage. 294 std::vector<FrequencyData>().swap(Freqs); 295 IsIrrLoopHeader.clear(); 296 std::vector<WorkingData>().swap(Working); 297 Loops.clear(); 298 } 299 300 /// Clear all memory not needed downstream. 301 /// 302 /// Releases all memory not used downstream. In particular, saves Freqs. 303 static void cleanup(BlockFrequencyInfoImplBase &BFI) { 304 std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); 305 SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); 306 BFI.clear(); 307 BFI.Freqs = std::move(SavedFreqs); 308 BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); 309 } 310 311 bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, 312 const LoopData *OuterLoop, 313 const BlockNode &Pred, 314 const BlockNode &Succ, 315 uint64_t Weight) { 316 if (!Weight) 317 Weight = 1; 318 319 auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { 320 return OuterLoop && OuterLoop->isHeader(Node); 321 }; 322 323 BlockNode Resolved = Working[Succ.Index].getResolvedNode(); 324 325 #ifndef NDEBUG 326 auto debugSuccessor = [&](const char *Type) { 327 dbgs() << " =>" 328 << " [" << Type << "] weight = " << Weight; 329 if (!isLoopHeader(Resolved)) 330 dbgs() << ", succ = " << getBlockName(Succ); 331 if (Resolved != Succ) 332 dbgs() << ", resolved = " << getBlockName(Resolved); 333 dbgs() << "\n"; 334 }; 335 (void)debugSuccessor; 336 #endif 337 338 if (isLoopHeader(Resolved)) { 339 LLVM_DEBUG(debugSuccessor("backedge")); 340 Dist.addBackedge(Resolved, Weight); 341 return true; 342 } 343 344 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { 345 LLVM_DEBUG(debugSuccessor(" exit ")); 346 Dist.addExit(Resolved, Weight); 347 return true; 348 } 349 350 if (Resolved < Pred) { 351 if (!isLoopHeader(Pred)) { 352 // If OuterLoop is an irreducible loop, we can't actually handle this. 353 assert((!OuterLoop || !OuterLoop->isIrreducible()) && 354 "unhandled irreducible control flow"); 355 356 // Irreducible backedge. Abort. 357 LLVM_DEBUG(debugSuccessor("abort!!!")); 358 return false; 359 } 360 361 // If "Pred" is a loop header, then this isn't really a backedge; rather, 362 // OuterLoop must be irreducible. These false backedges can come only from 363 // secondary loop headers. 364 assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && 365 "unhandled irreducible control flow"); 366 } 367 368 LLVM_DEBUG(debugSuccessor(" local ")); 369 Dist.addLocal(Resolved, Weight); 370 return true; 371 } 372 373 bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( 374 const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { 375 // Copy the exit map into Dist. 376 for (const auto &I : Loop.Exits) 377 if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, 378 I.second.getMass())) 379 // Irreducible backedge. 380 return false; 381 382 return true; 383 } 384 385 /// Compute the loop scale for a loop. 386 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { 387 // Compute loop scale. 388 LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); 389 390 // Infinite loops need special handling. If we give the back edge an infinite 391 // mass, they may saturate all the other scales in the function down to 1, 392 // making all the other region temperatures look exactly the same. Choose an 393 // arbitrary scale to avoid these issues. 394 // 395 // FIXME: An alternate way would be to select a symbolic scale which is later 396 // replaced to be the maximum of all computed scales plus 1. This would 397 // appropriately describe the loop as having a large scale, without skewing 398 // the final frequency computation. 399 const Scaled64 InfiniteLoopScale(1, 12); 400 401 // LoopScale == 1 / ExitMass 402 // ExitMass == HeadMass - BackedgeMass 403 BlockMass TotalBackedgeMass; 404 for (auto &Mass : Loop.BackedgeMass) 405 TotalBackedgeMass += Mass; 406 BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; 407 408 // Block scale stores the inverse of the scale. If this is an infinite loop, 409 // its exit mass will be zero. In this case, use an arbitrary scale for the 410 // loop scale. 411 Loop.Scale = 412 ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); 413 414 LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" 415 << BlockMass::getFull() << " - " << TotalBackedgeMass 416 << ")\n" 417 << " - scale = " << Loop.Scale << "\n"); 418 } 419 420 /// Package up a loop. 421 void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { 422 LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); 423 424 // Clear the subloop exits to prevent quadratic memory usage. 425 for (const BlockNode &M : Loop.Nodes) { 426 if (auto *Loop = Working[M.Index].getPackagedLoop()) 427 Loop->Exits.clear(); 428 LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); 429 } 430 Loop.IsPackaged = true; 431 } 432 433 #ifndef NDEBUG 434 static void debugAssign(const BlockFrequencyInfoImplBase &BFI, 435 const DitheringDistributer &D, const BlockNode &T, 436 const BlockMass &M, const char *Desc) { 437 dbgs() << " => assign " << M << " (" << D.RemMass << ")"; 438 if (Desc) 439 dbgs() << " [" << Desc << "]"; 440 if (T.isValid()) 441 dbgs() << " to " << BFI.getBlockName(T); 442 dbgs() << "\n"; 443 } 444 #endif 445 446 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, 447 LoopData *OuterLoop, 448 Distribution &Dist) { 449 BlockMass Mass = Working[Source.Index].getMass(); 450 LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n"); 451 452 // Distribute mass to successors as laid out in Dist. 453 DitheringDistributer D(Dist, Mass); 454 455 for (const Weight &W : Dist.Weights) { 456 // Check for a local edge (non-backedge and non-exit). 457 BlockMass Taken = D.takeMass(W.Amount); 458 if (W.Type == Weight::Local) { 459 Working[W.TargetNode.Index].getMass() += Taken; 460 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 461 continue; 462 } 463 464 // Backedges and exits only make sense if we're processing a loop. 465 assert(OuterLoop && "backedge or exit outside of loop"); 466 467 // Check for a backedge. 468 if (W.Type == Weight::Backedge) { 469 OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; 470 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); 471 continue; 472 } 473 474 // This must be an exit. 475 assert(W.Type == Weight::Exit); 476 OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); 477 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); 478 } 479 } 480 481 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, 482 const Scaled64 &Min, const Scaled64 &Max) { 483 // Scale the Factor to a size that creates integers. Ideally, integers would 484 // be scaled so that Max == UINT64_MAX so that they can be best 485 // differentiated. However, in the presence of large frequency values, small 486 // frequencies are scaled down to 1, making it impossible to differentiate 487 // small, unequal numbers. When the spread between Min and Max frequencies 488 // fits well within MaxBits, we make the scale be at least 8. 489 const unsigned MaxBits = 64; 490 const unsigned SpreadBits = (Max / Min).lg(); 491 Scaled64 ScalingFactor; 492 if (SpreadBits <= MaxBits - 3) { 493 // If the values are small enough, make the scaling factor at least 8 to 494 // allow distinguishing small values. 495 ScalingFactor = Min.inverse(); 496 ScalingFactor <<= 3; 497 } else { 498 // If the values need more than MaxBits to be represented, saturate small 499 // frequency values down to 1 by using a scaling factor that benefits large 500 // frequency values. 501 ScalingFactor = Scaled64(1, MaxBits) / Max; 502 } 503 504 // Translate the floats to integers. 505 LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max 506 << ", factor = " << ScalingFactor << "\n"); 507 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { 508 Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; 509 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); 510 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " 511 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled 512 << ", int = " << BFI.Freqs[Index].Integer << "\n"); 513 } 514 } 515 516 /// Unwrap a loop package. 517 /// 518 /// Visits all the members of a loop, adjusting their BlockData according to 519 /// the loop's pseudo-node. 520 static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { 521 LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) 522 << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale 523 << "\n"); 524 Loop.Scale *= Loop.Mass.toScaled(); 525 Loop.IsPackaged = false; 526 LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); 527 528 // Propagate the head scale through the loop. Since members are visited in 529 // RPO, the head scale will be updated by the loop scale first, and then the 530 // final head scale will be used for updated the rest of the members. 531 for (const BlockNode &N : Loop.Nodes) { 532 const auto &Working = BFI.Working[N.Index]; 533 Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale 534 : BFI.Freqs[N.Index].Scaled; 535 Scaled64 New = Loop.Scale * F; 536 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " 537 << New << "\n"); 538 F = New; 539 } 540 } 541 542 void BlockFrequencyInfoImplBase::unwrapLoops() { 543 // Set initial frequencies from loop-local masses. 544 for (size_t Index = 0; Index < Working.size(); ++Index) 545 Freqs[Index].Scaled = Working[Index].Mass.toScaled(); 546 547 for (LoopData &Loop : Loops) 548 unwrapLoop(*this, Loop); 549 } 550 551 void BlockFrequencyInfoImplBase::finalizeMetrics() { 552 // Unwrap loop packages in reverse post-order, tracking min and max 553 // frequencies. 554 auto Min = Scaled64::getLargest(); 555 auto Max = Scaled64::getZero(); 556 for (size_t Index = 0; Index < Working.size(); ++Index) { 557 // Update min/max scale. 558 Min = std::min(Min, Freqs[Index].Scaled); 559 Max = std::max(Max, Freqs[Index].Scaled); 560 } 561 562 // Convert to integers. 563 convertFloatingToInteger(*this, Min, Max); 564 565 // Clean up data structures. 566 cleanup(*this); 567 568 // Print out the final stats. 569 LLVM_DEBUG(dump()); 570 } 571 572 BlockFrequency 573 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { 574 if (!Node.isValid()) { 575 #ifndef NDEBUG 576 if (CheckBFIUnknownBlockQueries) { 577 SmallString<256> Msg; 578 raw_svector_ostream OS(Msg); 579 OS << "*** Detected BFI query for unknown block " << getBlockName(Node); 580 report_fatal_error(OS.str()); 581 } 582 #endif 583 return 0; 584 } 585 return Freqs[Node.Index].Integer; 586 } 587 588 Optional<uint64_t> 589 BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, 590 const BlockNode &Node, 591 bool AllowSynthetic) const { 592 return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(), 593 AllowSynthetic); 594 } 595 596 Optional<uint64_t> 597 BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, 598 uint64_t Freq, 599 bool AllowSynthetic) const { 600 auto EntryCount = F.getEntryCount(AllowSynthetic); 601 if (!EntryCount) 602 return None; 603 // Use 128 bit APInt to do the arithmetic to avoid overflow. 604 APInt BlockCount(128, EntryCount->getCount()); 605 APInt BlockFreq(128, Freq); 606 APInt EntryFreq(128, getEntryFreq()); 607 BlockCount *= BlockFreq; 608 // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned 609 // lshr by 1 gives EntryFreq/2. 610 BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq); 611 return BlockCount.getLimitedValue(); 612 } 613 614 bool 615 BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { 616 if (!Node.isValid()) 617 return false; 618 return IsIrrLoopHeader.test(Node.Index); 619 } 620 621 Scaled64 622 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { 623 if (!Node.isValid()) 624 return Scaled64::getZero(); 625 return Freqs[Node.Index].Scaled; 626 } 627 628 void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, 629 uint64_t Freq) { 630 assert(Node.isValid() && "Expected valid node"); 631 assert(Node.Index < Freqs.size() && "Expected legal index"); 632 Freqs[Node.Index].Integer = Freq; 633 } 634 635 std::string 636 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { 637 return {}; 638 } 639 640 std::string 641 BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { 642 return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); 643 } 644 645 raw_ostream & 646 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 647 const BlockNode &Node) const { 648 return OS << getFloatingBlockFreq(Node); 649 } 650 651 raw_ostream & 652 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 653 const BlockFrequency &Freq) const { 654 Scaled64 Block(Freq.getFrequency(), 0); 655 Scaled64 Entry(getEntryFreq(), 0); 656 657 return OS << Block / Entry; 658 } 659 660 void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { 661 Start = OuterLoop.getHeader(); 662 Nodes.reserve(OuterLoop.Nodes.size()); 663 for (auto N : OuterLoop.Nodes) 664 addNode(N); 665 indexNodes(); 666 } 667 668 void IrreducibleGraph::addNodesInFunction() { 669 Start = 0; 670 for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) 671 if (!BFI.Working[Index].isPackaged()) 672 addNode(Index); 673 indexNodes(); 674 } 675 676 void IrreducibleGraph::indexNodes() { 677 for (auto &I : Nodes) 678 Lookup[I.Node.Index] = &I; 679 } 680 681 void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, 682 const BFIBase::LoopData *OuterLoop) { 683 if (OuterLoop && OuterLoop->isHeader(Succ)) 684 return; 685 auto L = Lookup.find(Succ.Index); 686 if (L == Lookup.end()) 687 return; 688 IrrNode &SuccIrr = *L->second; 689 Irr.Edges.push_back(&SuccIrr); 690 SuccIrr.Edges.push_front(&Irr); 691 ++SuccIrr.NumIn; 692 } 693 694 namespace llvm { 695 696 template <> struct GraphTraits<IrreducibleGraph> { 697 using GraphT = bfi_detail::IrreducibleGraph; 698 using NodeRef = const GraphT::IrrNode *; 699 using ChildIteratorType = GraphT::IrrNode::iterator; 700 701 static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } 702 static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } 703 static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } 704 }; 705 706 } // end namespace llvm 707 708 /// Find extra irreducible headers. 709 /// 710 /// Find entry blocks and other blocks with backedges, which exist when \c G 711 /// contains irreducible sub-SCCs. 712 static void findIrreducibleHeaders( 713 const BlockFrequencyInfoImplBase &BFI, 714 const IrreducibleGraph &G, 715 const std::vector<const IrreducibleGraph::IrrNode *> &SCC, 716 LoopData::NodeList &Headers, LoopData::NodeList &Others) { 717 // Map from nodes in the SCC to whether it's an entry block. 718 SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; 719 720 // InSCC also acts the set of nodes in the graph. Seed it. 721 for (const auto *I : SCC) 722 InSCC[I] = false; 723 724 for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { 725 auto &Irr = *I->first; 726 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 727 if (InSCC.count(P)) 728 continue; 729 730 // This is an entry block. 731 I->second = true; 732 Headers.push_back(Irr.Node); 733 LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) 734 << "\n"); 735 break; 736 } 737 } 738 assert(Headers.size() >= 2 && 739 "Expected irreducible CFG; -loop-info is likely invalid"); 740 if (Headers.size() == InSCC.size()) { 741 // Every block is a header. 742 llvm::sort(Headers); 743 return; 744 } 745 746 // Look for extra headers from irreducible sub-SCCs. 747 for (const auto &I : InSCC) { 748 // Entry blocks are already headers. 749 if (I.second) 750 continue; 751 752 auto &Irr = *I.first; 753 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 754 // Skip forward edges. 755 if (P->Node < Irr.Node) 756 continue; 757 758 // Skip predecessors from entry blocks. These can have inverted 759 // ordering. 760 if (InSCC.lookup(P)) 761 continue; 762 763 // Store the extra header. 764 Headers.push_back(Irr.Node); 765 LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) 766 << "\n"); 767 break; 768 } 769 if (Headers.back() == Irr.Node) 770 // Added this as a header. 771 continue; 772 773 // This is not a header. 774 Others.push_back(Irr.Node); 775 LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); 776 } 777 llvm::sort(Headers); 778 llvm::sort(Others); 779 } 780 781 static void createIrreducibleLoop( 782 BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, 783 LoopData *OuterLoop, std::list<LoopData>::iterator Insert, 784 const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { 785 // Translate the SCC into RPO. 786 LLVM_DEBUG(dbgs() << " - found-scc\n"); 787 788 LoopData::NodeList Headers; 789 LoopData::NodeList Others; 790 findIrreducibleHeaders(BFI, G, SCC, Headers, Others); 791 792 auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), 793 Headers.end(), Others.begin(), Others.end()); 794 795 // Update loop hierarchy. 796 for (const auto &N : Loop->Nodes) 797 if (BFI.Working[N.Index].isLoopHeader()) 798 BFI.Working[N.Index].Loop->Parent = &*Loop; 799 else 800 BFI.Working[N.Index].Loop = &*Loop; 801 } 802 803 iterator_range<std::list<LoopData>::iterator> 804 BlockFrequencyInfoImplBase::analyzeIrreducible( 805 const IrreducibleGraph &G, LoopData *OuterLoop, 806 std::list<LoopData>::iterator Insert) { 807 assert((OuterLoop == nullptr) == (Insert == Loops.begin())); 808 auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); 809 810 for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { 811 if (I->size() < 2) 812 continue; 813 814 // Translate the SCC into RPO. 815 createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); 816 } 817 818 if (OuterLoop) 819 return make_range(std::next(Prev), Insert); 820 return make_range(Loops.begin(), Insert); 821 } 822 823 void 824 BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { 825 OuterLoop.Exits.clear(); 826 for (auto &Mass : OuterLoop.BackedgeMass) 827 Mass = BlockMass::getEmpty(); 828 auto O = OuterLoop.Nodes.begin() + 1; 829 for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) 830 if (!Working[I->Index].isPackaged()) 831 *O++ = *I; 832 OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); 833 } 834 835 void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { 836 assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); 837 838 // Since the loop has more than one header block, the mass flowing back into 839 // each header will be different. Adjust the mass in each header loop to 840 // reflect the masses flowing through back edges. 841 // 842 // To do this, we distribute the initial mass using the backedge masses 843 // as weights for the distribution. 844 BlockMass LoopMass = BlockMass::getFull(); 845 Distribution Dist; 846 847 LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n"); 848 for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { 849 auto &HeaderNode = Loop.Nodes[H]; 850 auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; 851 LLVM_DEBUG(dbgs() << " - Add back edge mass for node " 852 << getBlockName(HeaderNode) << ": " << BackedgeMass 853 << "\n"); 854 if (BackedgeMass.getMass() > 0) 855 Dist.addLocal(HeaderNode, BackedgeMass.getMass()); 856 else 857 LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); 858 } 859 860 DitheringDistributer D(Dist, LoopMass); 861 862 LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass 863 << " to headers using above weights\n"); 864 for (const Weight &W : Dist.Weights) { 865 BlockMass Taken = D.takeMass(W.Amount); 866 assert(W.Type == Weight::Local && "all weights should be local"); 867 Working[W.TargetNode.Index].getMass() = Taken; 868 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 869 } 870 } 871 872 void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { 873 BlockMass LoopMass = BlockMass::getFull(); 874 DitheringDistributer D(Dist, LoopMass); 875 for (const Weight &W : Dist.Weights) { 876 BlockMass Taken = D.takeMass(W.Amount); 877 assert(W.Type == Weight::Local && "all weights should be local"); 878 Working[W.TargetNode.Index].getMass() = Taken; 879 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 880 } 881 } 882