1 //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Loops should be simplified before this analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Analysis/BlockFrequencyInfoImpl.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/GraphTraits.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/SCCIterator.h" 19 #include "llvm/Config/llvm-config.h" 20 #include "llvm/IR/Function.h" 21 #include "llvm/Support/BlockFrequency.h" 22 #include "llvm/Support/BranchProbability.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/Debug.h" 25 #include "llvm/Support/ScaledNumber.h" 26 #include "llvm/Support/MathExtras.h" 27 #include "llvm/Support/raw_ostream.h" 28 #include <algorithm> 29 #include <cassert> 30 #include <cstddef> 31 #include <cstdint> 32 #include <iterator> 33 #include <list> 34 #include <numeric> 35 #include <utility> 36 #include <vector> 37 38 using namespace llvm; 39 using namespace llvm::bfi_detail; 40 41 #define DEBUG_TYPE "block-freq" 42 43 cl::opt<bool> CheckBFIUnknownBlockQueries( 44 "check-bfi-unknown-block-queries", 45 cl::init(false), cl::Hidden, 46 cl::desc("Check if block frequency is queried for an unknown block " 47 "for debugging missed BFI updates")); 48 49 ScaledNumber<uint64_t> BlockMass::toScaled() const { 50 if (isFull()) 51 return ScaledNumber<uint64_t>(1, 0); 52 return ScaledNumber<uint64_t>(getMass() + 1, -64); 53 } 54 55 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 56 LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } 57 #endif 58 59 static char getHexDigit(int N) { 60 assert(N < 16); 61 if (N < 10) 62 return '0' + N; 63 return 'a' + N - 10; 64 } 65 66 raw_ostream &BlockMass::print(raw_ostream &OS) const { 67 for (int Digits = 0; Digits < 16; ++Digits) 68 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); 69 return OS; 70 } 71 72 namespace { 73 74 using BlockNode = BlockFrequencyInfoImplBase::BlockNode; 75 using Distribution = BlockFrequencyInfoImplBase::Distribution; 76 using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; 77 using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; 78 using LoopData = BlockFrequencyInfoImplBase::LoopData; 79 using Weight = BlockFrequencyInfoImplBase::Weight; 80 using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; 81 82 /// Dithering mass distributer. 83 /// 84 /// This class splits up a single mass into portions by weight, dithering to 85 /// spread out error. No mass is lost. The dithering precision depends on the 86 /// precision of the product of \a BlockMass and \a BranchProbability. 87 /// 88 /// The distribution algorithm follows. 89 /// 90 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the 91 /// mass to distribute in \a RemMass. 92 /// 93 /// 2. For each portion: 94 /// 95 /// 1. Construct a branch probability, P, as the portion's weight divided 96 /// by the current value of \a RemWeight. 97 /// 2. Calculate the portion's mass as \a RemMass times P. 98 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting 99 /// the current portion's weight and mass. 100 struct DitheringDistributer { 101 uint32_t RemWeight; 102 BlockMass RemMass; 103 104 DitheringDistributer(Distribution &Dist, const BlockMass &Mass); 105 106 BlockMass takeMass(uint32_t Weight); 107 }; 108 109 } // end anonymous namespace 110 111 DitheringDistributer::DitheringDistributer(Distribution &Dist, 112 const BlockMass &Mass) { 113 Dist.normalize(); 114 RemWeight = Dist.Total; 115 RemMass = Mass; 116 } 117 118 BlockMass DitheringDistributer::takeMass(uint32_t Weight) { 119 assert(Weight && "invalid weight"); 120 assert(Weight <= RemWeight); 121 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); 122 123 // Decrement totals (dither). 124 RemWeight -= Weight; 125 RemMass -= Mass; 126 return Mass; 127 } 128 129 void Distribution::add(const BlockNode &Node, uint64_t Amount, 130 Weight::DistType Type) { 131 assert(Amount && "invalid weight of 0"); 132 uint64_t NewTotal = Total + Amount; 133 134 // Check for overflow. It should be impossible to overflow twice. 135 bool IsOverflow = NewTotal < Total; 136 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); 137 DidOverflow |= IsOverflow; 138 139 // Update the total. 140 Total = NewTotal; 141 142 // Save the weight. 143 Weights.push_back(Weight(Type, Node, Amount)); 144 } 145 146 static void combineWeight(Weight &W, const Weight &OtherW) { 147 assert(OtherW.TargetNode.isValid()); 148 if (!W.Amount) { 149 W = OtherW; 150 return; 151 } 152 assert(W.Type == OtherW.Type); 153 assert(W.TargetNode == OtherW.TargetNode); 154 assert(OtherW.Amount && "Expected non-zero weight"); 155 if (W.Amount > W.Amount + OtherW.Amount) 156 // Saturate on overflow. 157 W.Amount = UINT64_MAX; 158 else 159 W.Amount += OtherW.Amount; 160 } 161 162 static void combineWeightsBySorting(WeightList &Weights) { 163 // Sort so edges to the same node are adjacent. 164 llvm::sort(Weights, [](const Weight &L, const Weight &R) { 165 return L.TargetNode < R.TargetNode; 166 }); 167 168 // Combine adjacent edges. 169 WeightList::iterator O = Weights.begin(); 170 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; 171 ++O, (I = L)) { 172 *O = *I; 173 174 // Find the adjacent weights to the same node. 175 for (++L; L != E && I->TargetNode == L->TargetNode; ++L) 176 combineWeight(*O, *L); 177 } 178 179 // Erase extra entries. 180 Weights.erase(O, Weights.end()); 181 } 182 183 static void combineWeightsByHashing(WeightList &Weights) { 184 // Collect weights into a DenseMap. 185 using HashTable = DenseMap<BlockNode::IndexType, Weight>; 186 187 HashTable Combined(NextPowerOf2(2 * Weights.size())); 188 for (const Weight &W : Weights) 189 combineWeight(Combined[W.TargetNode.Index], W); 190 191 // Check whether anything changed. 192 if (Weights.size() == Combined.size()) 193 return; 194 195 // Fill in the new weights. 196 Weights.clear(); 197 Weights.reserve(Combined.size()); 198 for (const auto &I : Combined) 199 Weights.push_back(I.second); 200 } 201 202 static void combineWeights(WeightList &Weights) { 203 // Use a hash table for many successors to keep this linear. 204 if (Weights.size() > 128) { 205 combineWeightsByHashing(Weights); 206 return; 207 } 208 209 combineWeightsBySorting(Weights); 210 } 211 212 static uint64_t shiftRightAndRound(uint64_t N, int Shift) { 213 assert(Shift >= 0); 214 assert(Shift < 64); 215 if (!Shift) 216 return N; 217 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); 218 } 219 220 void Distribution::normalize() { 221 // Early exit for termination nodes. 222 if (Weights.empty()) 223 return; 224 225 // Only bother if there are multiple successors. 226 if (Weights.size() > 1) 227 combineWeights(Weights); 228 229 // Early exit when combined into a single successor. 230 if (Weights.size() == 1) { 231 Total = 1; 232 Weights.front().Amount = 1; 233 return; 234 } 235 236 // Determine how much to shift right so that the total fits into 32-bits. 237 // 238 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 239 // for each weight can cause a 32-bit overflow. 240 int Shift = 0; 241 if (DidOverflow) 242 Shift = 33; 243 else if (Total > UINT32_MAX) 244 Shift = 33 - countLeadingZeros(Total); 245 246 // Early exit if nothing needs to be scaled. 247 if (!Shift) { 248 // If we didn't overflow then combineWeights() shouldn't have changed the 249 // sum of the weights, but let's double-check. 250 assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), 251 [](uint64_t Sum, const Weight &W) { 252 return Sum + W.Amount; 253 }) && 254 "Expected total to be correct"); 255 return; 256 } 257 258 // Recompute the total through accumulation (rather than shifting it) so that 259 // it's accurate after shifting and any changes combineWeights() made above. 260 Total = 0; 261 262 // Sum the weights to each node and shift right if necessary. 263 for (Weight &W : Weights) { 264 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we 265 // can round here without concern about overflow. 266 assert(W.TargetNode.isValid()); 267 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); 268 assert(W.Amount <= UINT32_MAX); 269 270 // Update the total. 271 Total += W.Amount; 272 } 273 assert(Total <= UINT32_MAX); 274 } 275 276 void BlockFrequencyInfoImplBase::clear() { 277 // Swap with a default-constructed std::vector, since std::vector<>::clear() 278 // does not actually clear heap storage. 279 std::vector<FrequencyData>().swap(Freqs); 280 IsIrrLoopHeader.clear(); 281 std::vector<WorkingData>().swap(Working); 282 Loops.clear(); 283 } 284 285 /// Clear all memory not needed downstream. 286 /// 287 /// Releases all memory not used downstream. In particular, saves Freqs. 288 static void cleanup(BlockFrequencyInfoImplBase &BFI) { 289 std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); 290 SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); 291 BFI.clear(); 292 BFI.Freqs = std::move(SavedFreqs); 293 BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); 294 } 295 296 bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, 297 const LoopData *OuterLoop, 298 const BlockNode &Pred, 299 const BlockNode &Succ, 300 uint64_t Weight) { 301 if (!Weight) 302 Weight = 1; 303 304 auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { 305 return OuterLoop && OuterLoop->isHeader(Node); 306 }; 307 308 BlockNode Resolved = Working[Succ.Index].getResolvedNode(); 309 310 #ifndef NDEBUG 311 auto debugSuccessor = [&](const char *Type) { 312 dbgs() << " =>" 313 << " [" << Type << "] weight = " << Weight; 314 if (!isLoopHeader(Resolved)) 315 dbgs() << ", succ = " << getBlockName(Succ); 316 if (Resolved != Succ) 317 dbgs() << ", resolved = " << getBlockName(Resolved); 318 dbgs() << "\n"; 319 }; 320 (void)debugSuccessor; 321 #endif 322 323 if (isLoopHeader(Resolved)) { 324 LLVM_DEBUG(debugSuccessor("backedge")); 325 Dist.addBackedge(Resolved, Weight); 326 return true; 327 } 328 329 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { 330 LLVM_DEBUG(debugSuccessor(" exit ")); 331 Dist.addExit(Resolved, Weight); 332 return true; 333 } 334 335 if (Resolved < Pred) { 336 if (!isLoopHeader(Pred)) { 337 // If OuterLoop is an irreducible loop, we can't actually handle this. 338 assert((!OuterLoop || !OuterLoop->isIrreducible()) && 339 "unhandled irreducible control flow"); 340 341 // Irreducible backedge. Abort. 342 LLVM_DEBUG(debugSuccessor("abort!!!")); 343 return false; 344 } 345 346 // If "Pred" is a loop header, then this isn't really a backedge; rather, 347 // OuterLoop must be irreducible. These false backedges can come only from 348 // secondary loop headers. 349 assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && 350 "unhandled irreducible control flow"); 351 } 352 353 LLVM_DEBUG(debugSuccessor(" local ")); 354 Dist.addLocal(Resolved, Weight); 355 return true; 356 } 357 358 bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( 359 const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { 360 // Copy the exit map into Dist. 361 for (const auto &I : Loop.Exits) 362 if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, 363 I.second.getMass())) 364 // Irreducible backedge. 365 return false; 366 367 return true; 368 } 369 370 /// Compute the loop scale for a loop. 371 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { 372 // Compute loop scale. 373 LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); 374 375 // Infinite loops need special handling. If we give the back edge an infinite 376 // mass, they may saturate all the other scales in the function down to 1, 377 // making all the other region temperatures look exactly the same. Choose an 378 // arbitrary scale to avoid these issues. 379 // 380 // FIXME: An alternate way would be to select a symbolic scale which is later 381 // replaced to be the maximum of all computed scales plus 1. This would 382 // appropriately describe the loop as having a large scale, without skewing 383 // the final frequency computation. 384 const Scaled64 InfiniteLoopScale(1, 12); 385 386 // LoopScale == 1 / ExitMass 387 // ExitMass == HeadMass - BackedgeMass 388 BlockMass TotalBackedgeMass; 389 for (auto &Mass : Loop.BackedgeMass) 390 TotalBackedgeMass += Mass; 391 BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; 392 393 // Block scale stores the inverse of the scale. If this is an infinite loop, 394 // its exit mass will be zero. In this case, use an arbitrary scale for the 395 // loop scale. 396 Loop.Scale = 397 ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); 398 399 LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" 400 << BlockMass::getFull() << " - " << TotalBackedgeMass 401 << ")\n" 402 << " - scale = " << Loop.Scale << "\n"); 403 } 404 405 /// Package up a loop. 406 void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { 407 LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); 408 409 // Clear the subloop exits to prevent quadratic memory usage. 410 for (const BlockNode &M : Loop.Nodes) { 411 if (auto *Loop = Working[M.Index].getPackagedLoop()) 412 Loop->Exits.clear(); 413 LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); 414 } 415 Loop.IsPackaged = true; 416 } 417 418 #ifndef NDEBUG 419 static void debugAssign(const BlockFrequencyInfoImplBase &BFI, 420 const DitheringDistributer &D, const BlockNode &T, 421 const BlockMass &M, const char *Desc) { 422 dbgs() << " => assign " << M << " (" << D.RemMass << ")"; 423 if (Desc) 424 dbgs() << " [" << Desc << "]"; 425 if (T.isValid()) 426 dbgs() << " to " << BFI.getBlockName(T); 427 dbgs() << "\n"; 428 } 429 #endif 430 431 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, 432 LoopData *OuterLoop, 433 Distribution &Dist) { 434 BlockMass Mass = Working[Source.Index].getMass(); 435 LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n"); 436 437 // Distribute mass to successors as laid out in Dist. 438 DitheringDistributer D(Dist, Mass); 439 440 for (const Weight &W : Dist.Weights) { 441 // Check for a local edge (non-backedge and non-exit). 442 BlockMass Taken = D.takeMass(W.Amount); 443 if (W.Type == Weight::Local) { 444 Working[W.TargetNode.Index].getMass() += Taken; 445 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 446 continue; 447 } 448 449 // Backedges and exits only make sense if we're processing a loop. 450 assert(OuterLoop && "backedge or exit outside of loop"); 451 452 // Check for a backedge. 453 if (W.Type == Weight::Backedge) { 454 OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; 455 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); 456 continue; 457 } 458 459 // This must be an exit. 460 assert(W.Type == Weight::Exit); 461 OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); 462 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); 463 } 464 } 465 466 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, 467 const Scaled64 &Min, const Scaled64 &Max) { 468 // Scale the Factor to a size that creates integers. Ideally, integers would 469 // be scaled so that Max == UINT64_MAX so that they can be best 470 // differentiated. However, in the presence of large frequency values, small 471 // frequencies are scaled down to 1, making it impossible to differentiate 472 // small, unequal numbers. When the spread between Min and Max frequencies 473 // fits well within MaxBits, we make the scale be at least 8. 474 const unsigned MaxBits = 64; 475 const unsigned SpreadBits = (Max / Min).lg(); 476 Scaled64 ScalingFactor; 477 if (SpreadBits <= MaxBits - 3) { 478 // If the values are small enough, make the scaling factor at least 8 to 479 // allow distinguishing small values. 480 ScalingFactor = Min.inverse(); 481 ScalingFactor <<= 3; 482 } else { 483 // If the values need more than MaxBits to be represented, saturate small 484 // frequency values down to 1 by using a scaling factor that benefits large 485 // frequency values. 486 ScalingFactor = Scaled64(1, MaxBits) / Max; 487 } 488 489 // Translate the floats to integers. 490 LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max 491 << ", factor = " << ScalingFactor << "\n"); 492 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { 493 Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; 494 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); 495 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " 496 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled 497 << ", int = " << BFI.Freqs[Index].Integer << "\n"); 498 } 499 } 500 501 /// Unwrap a loop package. 502 /// 503 /// Visits all the members of a loop, adjusting their BlockData according to 504 /// the loop's pseudo-node. 505 static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { 506 LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) 507 << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale 508 << "\n"); 509 Loop.Scale *= Loop.Mass.toScaled(); 510 Loop.IsPackaged = false; 511 LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); 512 513 // Propagate the head scale through the loop. Since members are visited in 514 // RPO, the head scale will be updated by the loop scale first, and then the 515 // final head scale will be used for updated the rest of the members. 516 for (const BlockNode &N : Loop.Nodes) { 517 const auto &Working = BFI.Working[N.Index]; 518 Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale 519 : BFI.Freqs[N.Index].Scaled; 520 Scaled64 New = Loop.Scale * F; 521 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " 522 << New << "\n"); 523 F = New; 524 } 525 } 526 527 void BlockFrequencyInfoImplBase::unwrapLoops() { 528 // Set initial frequencies from loop-local masses. 529 for (size_t Index = 0; Index < Working.size(); ++Index) 530 Freqs[Index].Scaled = Working[Index].Mass.toScaled(); 531 532 for (LoopData &Loop : Loops) 533 unwrapLoop(*this, Loop); 534 } 535 536 void BlockFrequencyInfoImplBase::finalizeMetrics() { 537 // Unwrap loop packages in reverse post-order, tracking min and max 538 // frequencies. 539 auto Min = Scaled64::getLargest(); 540 auto Max = Scaled64::getZero(); 541 for (size_t Index = 0; Index < Working.size(); ++Index) { 542 // Update min/max scale. 543 Min = std::min(Min, Freqs[Index].Scaled); 544 Max = std::max(Max, Freqs[Index].Scaled); 545 } 546 547 // Convert to integers. 548 convertFloatingToInteger(*this, Min, Max); 549 550 // Clean up data structures. 551 cleanup(*this); 552 553 // Print out the final stats. 554 LLVM_DEBUG(dump()); 555 } 556 557 BlockFrequency 558 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { 559 if (!Node.isValid()) { 560 #ifndef NDEBUG 561 if (CheckBFIUnknownBlockQueries) { 562 SmallString<256> Msg; 563 raw_svector_ostream OS(Msg); 564 OS << "*** Detected BFI query for unknown block " << getBlockName(Node); 565 report_fatal_error(OS.str()); 566 } 567 #endif 568 return 0; 569 } 570 return Freqs[Node.Index].Integer; 571 } 572 573 Optional<uint64_t> 574 BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, 575 const BlockNode &Node, 576 bool AllowSynthetic) const { 577 return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(), 578 AllowSynthetic); 579 } 580 581 Optional<uint64_t> 582 BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, 583 uint64_t Freq, 584 bool AllowSynthetic) const { 585 auto EntryCount = F.getEntryCount(AllowSynthetic); 586 if (!EntryCount) 587 return None; 588 // Use 128 bit APInt to do the arithmetic to avoid overflow. 589 APInt BlockCount(128, EntryCount.getCount()); 590 APInt BlockFreq(128, Freq); 591 APInt EntryFreq(128, getEntryFreq()); 592 BlockCount *= BlockFreq; 593 // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned 594 // lshr by 1 gives EntryFreq/2. 595 BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq); 596 return BlockCount.getLimitedValue(); 597 } 598 599 bool 600 BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { 601 if (!Node.isValid()) 602 return false; 603 return IsIrrLoopHeader.test(Node.Index); 604 } 605 606 Scaled64 607 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { 608 if (!Node.isValid()) 609 return Scaled64::getZero(); 610 return Freqs[Node.Index].Scaled; 611 } 612 613 void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, 614 uint64_t Freq) { 615 assert(Node.isValid() && "Expected valid node"); 616 assert(Node.Index < Freqs.size() && "Expected legal index"); 617 Freqs[Node.Index].Integer = Freq; 618 } 619 620 std::string 621 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { 622 return {}; 623 } 624 625 std::string 626 BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { 627 return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); 628 } 629 630 raw_ostream & 631 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 632 const BlockNode &Node) const { 633 return OS << getFloatingBlockFreq(Node); 634 } 635 636 raw_ostream & 637 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 638 const BlockFrequency &Freq) const { 639 Scaled64 Block(Freq.getFrequency(), 0); 640 Scaled64 Entry(getEntryFreq(), 0); 641 642 return OS << Block / Entry; 643 } 644 645 void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { 646 Start = OuterLoop.getHeader(); 647 Nodes.reserve(OuterLoop.Nodes.size()); 648 for (auto N : OuterLoop.Nodes) 649 addNode(N); 650 indexNodes(); 651 } 652 653 void IrreducibleGraph::addNodesInFunction() { 654 Start = 0; 655 for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) 656 if (!BFI.Working[Index].isPackaged()) 657 addNode(Index); 658 indexNodes(); 659 } 660 661 void IrreducibleGraph::indexNodes() { 662 for (auto &I : Nodes) 663 Lookup[I.Node.Index] = &I; 664 } 665 666 void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, 667 const BFIBase::LoopData *OuterLoop) { 668 if (OuterLoop && OuterLoop->isHeader(Succ)) 669 return; 670 auto L = Lookup.find(Succ.Index); 671 if (L == Lookup.end()) 672 return; 673 IrrNode &SuccIrr = *L->second; 674 Irr.Edges.push_back(&SuccIrr); 675 SuccIrr.Edges.push_front(&Irr); 676 ++SuccIrr.NumIn; 677 } 678 679 namespace llvm { 680 681 template <> struct GraphTraits<IrreducibleGraph> { 682 using GraphT = bfi_detail::IrreducibleGraph; 683 using NodeRef = const GraphT::IrrNode *; 684 using ChildIteratorType = GraphT::IrrNode::iterator; 685 686 static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } 687 static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } 688 static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } 689 }; 690 691 } // end namespace llvm 692 693 /// Find extra irreducible headers. 694 /// 695 /// Find entry blocks and other blocks with backedges, which exist when \c G 696 /// contains irreducible sub-SCCs. 697 static void findIrreducibleHeaders( 698 const BlockFrequencyInfoImplBase &BFI, 699 const IrreducibleGraph &G, 700 const std::vector<const IrreducibleGraph::IrrNode *> &SCC, 701 LoopData::NodeList &Headers, LoopData::NodeList &Others) { 702 // Map from nodes in the SCC to whether it's an entry block. 703 SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; 704 705 // InSCC also acts the set of nodes in the graph. Seed it. 706 for (const auto *I : SCC) 707 InSCC[I] = false; 708 709 for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { 710 auto &Irr = *I->first; 711 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 712 if (InSCC.count(P)) 713 continue; 714 715 // This is an entry block. 716 I->second = true; 717 Headers.push_back(Irr.Node); 718 LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) 719 << "\n"); 720 break; 721 } 722 } 723 assert(Headers.size() >= 2 && 724 "Expected irreducible CFG; -loop-info is likely invalid"); 725 if (Headers.size() == InSCC.size()) { 726 // Every block is a header. 727 llvm::sort(Headers); 728 return; 729 } 730 731 // Look for extra headers from irreducible sub-SCCs. 732 for (const auto &I : InSCC) { 733 // Entry blocks are already headers. 734 if (I.second) 735 continue; 736 737 auto &Irr = *I.first; 738 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 739 // Skip forward edges. 740 if (P->Node < Irr.Node) 741 continue; 742 743 // Skip predecessors from entry blocks. These can have inverted 744 // ordering. 745 if (InSCC.lookup(P)) 746 continue; 747 748 // Store the extra header. 749 Headers.push_back(Irr.Node); 750 LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) 751 << "\n"); 752 break; 753 } 754 if (Headers.back() == Irr.Node) 755 // Added this as a header. 756 continue; 757 758 // This is not a header. 759 Others.push_back(Irr.Node); 760 LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); 761 } 762 llvm::sort(Headers); 763 llvm::sort(Others); 764 } 765 766 static void createIrreducibleLoop( 767 BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, 768 LoopData *OuterLoop, std::list<LoopData>::iterator Insert, 769 const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { 770 // Translate the SCC into RPO. 771 LLVM_DEBUG(dbgs() << " - found-scc\n"); 772 773 LoopData::NodeList Headers; 774 LoopData::NodeList Others; 775 findIrreducibleHeaders(BFI, G, SCC, Headers, Others); 776 777 auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), 778 Headers.end(), Others.begin(), Others.end()); 779 780 // Update loop hierarchy. 781 for (const auto &N : Loop->Nodes) 782 if (BFI.Working[N.Index].isLoopHeader()) 783 BFI.Working[N.Index].Loop->Parent = &*Loop; 784 else 785 BFI.Working[N.Index].Loop = &*Loop; 786 } 787 788 iterator_range<std::list<LoopData>::iterator> 789 BlockFrequencyInfoImplBase::analyzeIrreducible( 790 const IrreducibleGraph &G, LoopData *OuterLoop, 791 std::list<LoopData>::iterator Insert) { 792 assert((OuterLoop == nullptr) == (Insert == Loops.begin())); 793 auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); 794 795 for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { 796 if (I->size() < 2) 797 continue; 798 799 // Translate the SCC into RPO. 800 createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); 801 } 802 803 if (OuterLoop) 804 return make_range(std::next(Prev), Insert); 805 return make_range(Loops.begin(), Insert); 806 } 807 808 void 809 BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { 810 OuterLoop.Exits.clear(); 811 for (auto &Mass : OuterLoop.BackedgeMass) 812 Mass = BlockMass::getEmpty(); 813 auto O = OuterLoop.Nodes.begin() + 1; 814 for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) 815 if (!Working[I->Index].isPackaged()) 816 *O++ = *I; 817 OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); 818 } 819 820 void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { 821 assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); 822 823 // Since the loop has more than one header block, the mass flowing back into 824 // each header will be different. Adjust the mass in each header loop to 825 // reflect the masses flowing through back edges. 826 // 827 // To do this, we distribute the initial mass using the backedge masses 828 // as weights for the distribution. 829 BlockMass LoopMass = BlockMass::getFull(); 830 Distribution Dist; 831 832 LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n"); 833 for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { 834 auto &HeaderNode = Loop.Nodes[H]; 835 auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; 836 LLVM_DEBUG(dbgs() << " - Add back edge mass for node " 837 << getBlockName(HeaderNode) << ": " << BackedgeMass 838 << "\n"); 839 if (BackedgeMass.getMass() > 0) 840 Dist.addLocal(HeaderNode, BackedgeMass.getMass()); 841 else 842 LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); 843 } 844 845 DitheringDistributer D(Dist, LoopMass); 846 847 LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass 848 << " to headers using above weights\n"); 849 for (const Weight &W : Dist.Weights) { 850 BlockMass Taken = D.takeMass(W.Amount); 851 assert(W.Type == Weight::Local && "all weights should be local"); 852 Working[W.TargetNode.Index].getMass() = Taken; 853 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 854 } 855 } 856 857 void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { 858 BlockMass LoopMass = BlockMass::getFull(); 859 DitheringDistributer D(Dist, LoopMass); 860 for (const Weight &W : Dist.Weights) { 861 BlockMass Taken = D.takeMass(W.Amount); 862 assert(W.Type == Weight::Local && "all weights should be local"); 863 Working[W.TargetNode.Index].getMass() = Taken; 864 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 865 } 866 } 867