1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements basic block placement transformations using the CFG 10 // structure and branch probability estimates. 11 // 12 // The pass strives to preserve the structure of the CFG (that is, retain 13 // a topological ordering of basic blocks) in the absence of a *strong* signal 14 // to the contrary from probabilities. However, within the CFG structure, it 15 // attempts to choose an ordering which favors placing more likely sequences of 16 // blocks adjacent to each other. 17 // 18 // The algorithm works from the inner-most loop within a function outward, and 19 // at each stage walks through the basic blocks, trying to coalesce them into 20 // sequential chains where allowed by the CFG (or demanded by heavy 21 // probabilities). Finally, it walks the blocks in topological order, and the 22 // first time it reaches a chain of basic blocks, it schedules them in the 23 // function in-order. 24 // 25 //===----------------------------------------------------------------------===// 26 27 #include "llvm/CodeGen/MachineBlockPlacement.h" 28 #include "BranchFolding.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/DenseMap.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/ADT/SetVector.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/Analysis/BlockFrequencyInfoImpl.h" 37 #include "llvm/Analysis/ProfileSummaryInfo.h" 38 #include "llvm/CodeGen/MBFIWrapper.h" 39 #include "llvm/CodeGen/MachineBasicBlock.h" 40 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" 41 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" 42 #include "llvm/CodeGen/MachineFunction.h" 43 #include "llvm/CodeGen/MachineFunctionPass.h" 44 #include "llvm/CodeGen/MachineLoopInfo.h" 45 #include "llvm/CodeGen/MachinePostDominators.h" 46 #include "llvm/CodeGen/MachineSizeOpts.h" 47 #include "llvm/CodeGen/TailDuplicator.h" 48 #include "llvm/CodeGen/TargetInstrInfo.h" 49 #include "llvm/CodeGen/TargetLowering.h" 50 #include "llvm/CodeGen/TargetPassConfig.h" 51 #include "llvm/CodeGen/TargetSubtargetInfo.h" 52 #include "llvm/IR/DebugLoc.h" 53 #include "llvm/IR/Function.h" 54 #include "llvm/IR/PrintPasses.h" 55 #include "llvm/InitializePasses.h" 56 #include "llvm/Pass.h" 57 #include "llvm/Support/Allocator.h" 58 #include "llvm/Support/BlockFrequency.h" 59 #include "llvm/Support/BranchProbability.h" 60 #include "llvm/Support/CodeGen.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "llvm/Support/Compiler.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Target/TargetMachine.h" 66 #include "llvm/Transforms/Utils/CodeLayout.h" 67 #include <algorithm> 68 #include <cassert> 69 #include <cstdint> 70 #include <iterator> 71 #include <memory> 72 #include <string> 73 #include <tuple> 74 #include <utility> 75 #include <vector> 76 77 using namespace llvm; 78 79 #define DEBUG_TYPE "block-placement" 80 81 STATISTIC(NumCondBranches, "Number of conditional branches"); 82 STATISTIC(NumUncondBranches, "Number of unconditional branches"); 83 STATISTIC(CondBranchTakenFreq, 84 "Potential frequency of taking conditional branches"); 85 STATISTIC(UncondBranchTakenFreq, 86 "Potential frequency of taking unconditional branches"); 87 88 static cl::opt<unsigned> AlignAllBlock( 89 "align-all-blocks", 90 cl::desc("Force the alignment of all blocks in the function in log2 format " 91 "(e.g 4 means align on 16B boundaries)."), 92 cl::init(0), cl::Hidden); 93 94 static cl::opt<unsigned> AlignAllNonFallThruBlocks( 95 "align-all-nofallthru-blocks", 96 cl::desc("Force the alignment of all blocks that have no fall-through " 97 "predecessors (i.e. don't add nops that are executed). In log2 " 98 "format (e.g 4 means align on 16B boundaries)."), 99 cl::init(0), cl::Hidden); 100 101 static cl::opt<unsigned> MaxBytesForAlignmentOverride( 102 "max-bytes-for-alignment", 103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for " 104 "alignment"), 105 cl::init(0), cl::Hidden); 106 107 static cl::opt<unsigned> PredecessorLimit( 108 "block-placement-predecessor-limit", 109 cl::desc("For blocks with more predecessors, certain layout optimizations" 110 "will be disabled to prevent quadratic compile time."), 111 cl::init(1000), cl::Hidden); 112 113 // FIXME: Find a good default for this flag and remove the flag. 114 static cl::opt<unsigned> ExitBlockBias( 115 "block-placement-exit-block-bias", 116 cl::desc("Block frequency percentage a loop exit block needs " 117 "over the original exit to be considered the new exit."), 118 cl::init(0), cl::Hidden); 119 120 // Definition: 121 // - Outlining: placement of a basic block outside the chain or hot path. 122 123 static cl::opt<unsigned> LoopToColdBlockRatio( 124 "loop-to-cold-block-ratio", 125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " 126 "(frequency of block) is greater than this ratio"), 127 cl::init(5), cl::Hidden); 128 129 static cl::opt<bool> 130 ForceLoopColdBlock("force-loop-cold-block", 131 cl::desc("Force outlining cold blocks from loops."), 132 cl::init(false), cl::Hidden); 133 134 static cl::opt<bool> 135 PreciseRotationCost("precise-rotation-cost", 136 cl::desc("Model the cost of loop rotation more " 137 "precisely by using profile data."), 138 cl::init(false), cl::Hidden); 139 140 static cl::opt<bool> 141 ForcePreciseRotationCost("force-precise-rotation-cost", 142 cl::desc("Force the use of precise cost " 143 "loop rotation strategy."), 144 cl::init(false), cl::Hidden); 145 146 static cl::opt<unsigned> MisfetchCost( 147 "misfetch-cost", 148 cl::desc("Cost that models the probabilistic risk of an instruction " 149 "misfetch due to a jump comparing to falling through, whose cost " 150 "is zero."), 151 cl::init(1), cl::Hidden); 152 153 static cl::opt<unsigned> JumpInstCost("jump-inst-cost", 154 cl::desc("Cost of jump instructions."), 155 cl::init(1), cl::Hidden); 156 static cl::opt<bool> 157 TailDupPlacement("tail-dup-placement", 158 cl::desc("Perform tail duplication during placement. " 159 "Creates more fallthrough opportunities in " 160 "outline branches."), 161 cl::init(true), cl::Hidden); 162 163 static cl::opt<bool> 164 BranchFoldPlacement("branch-fold-placement", 165 cl::desc("Perform branch folding during placement. " 166 "Reduces code size."), 167 cl::init(true), cl::Hidden); 168 169 // Heuristic for tail duplication. 170 static cl::opt<unsigned> TailDupPlacementThreshold( 171 "tail-dup-placement-threshold", 172 cl::desc("Instruction cutoff for tail duplication during layout. " 173 "Tail merging during layout is forced to have a threshold " 174 "that won't conflict."), 175 cl::init(2), cl::Hidden); 176 177 // Heuristic for aggressive tail duplication. 178 static cl::opt<unsigned> TailDupPlacementAggressiveThreshold( 179 "tail-dup-placement-aggressive-threshold", 180 cl::desc("Instruction cutoff for aggressive tail duplication during " 181 "layout. Used at -O3. Tail merging during layout is forced to " 182 "have a threshold that won't conflict."), 183 cl::init(4), cl::Hidden); 184 185 // Heuristic for tail duplication. 186 static cl::opt<unsigned> TailDupPlacementPenalty( 187 "tail-dup-placement-penalty", 188 cl::desc( 189 "Cost penalty for blocks that can avoid breaking CFG by copying. " 190 "Copying can increase fallthrough, but it also increases icache " 191 "pressure. This parameter controls the penalty to account for that. " 192 "Percent as integer."), 193 cl::init(2), cl::Hidden); 194 195 // Heuristic for tail duplication if profile count is used in cost model. 196 static cl::opt<unsigned> TailDupProfilePercentThreshold( 197 "tail-dup-profile-percent-threshold", 198 cl::desc("If profile count information is used in tail duplication cost " 199 "model, the gained fall through number from tail duplication " 200 "should be at least this percent of hot count."), 201 cl::init(50), cl::Hidden); 202 203 // Heuristic for triangle chains. 204 static cl::opt<unsigned> TriangleChainCount( 205 "triangle-chain-count", 206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " 207 "triangle tail duplication heuristic to kick in. 0 to disable."), 208 cl::init(2), cl::Hidden); 209 210 // Use case: When block layout is visualized after MBP pass, the basic blocks 211 // are labeled in layout order; meanwhile blocks could be numbered in a 212 // different order. It's hard to map between the graph and pass output. 213 // With this option on, the basic blocks are renumbered in function layout 214 // order. For debugging only. 215 static cl::opt<bool> RenumberBlocksBeforeView( 216 "renumber-blocks-before-view", 217 cl::desc( 218 "If true, basic blocks are re-numbered before MBP layout is printed " 219 "into a dot graph. Only used when a function is being printed."), 220 cl::init(false), cl::Hidden); 221 222 static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks( 223 "ext-tsp-block-placement-max-blocks", 224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP " 225 "block placement."), 226 cl::init(UINT_MAX), cl::Hidden); 227 228 // Apply the ext-tsp algorithm minimizing the size of a binary. 229 static cl::opt<bool> 230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, 231 cl::desc("Use ext-tsp for size-aware block placement.")); 232 233 namespace llvm { 234 extern cl::opt<bool> EnableExtTspBlockPlacement; 235 extern cl::opt<bool> ApplyExtTspWithoutProfile; 236 extern cl::opt<unsigned> StaticLikelyProb; 237 extern cl::opt<unsigned> ProfileLikelyProb; 238 239 // Internal option used to control BFI display only after MBP pass. 240 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp: 241 // -view-block-layout-with-bfi= 242 extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI; 243 244 // Command line option to specify the name of the function for CFG dump 245 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= 246 extern cl::opt<std::string> ViewBlockFreqFuncName; 247 } // namespace llvm 248 249 namespace { 250 251 class BlockChain; 252 253 /// Type for our function-wide basic block -> block chain mapping. 254 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>; 255 256 /// A chain of blocks which will be laid out contiguously. 257 /// 258 /// This is the datastructure representing a chain of consecutive blocks that 259 /// are profitable to layout together in order to maximize fallthrough 260 /// probabilities and code locality. We also can use a block chain to represent 261 /// a sequence of basic blocks which have some external (correctness) 262 /// requirement for sequential layout. 263 /// 264 /// Chains can be built around a single basic block and can be merged to grow 265 /// them. They participate in a block-to-chain mapping, which is updated 266 /// automatically as chains are merged together. 267 class BlockChain { 268 /// The sequence of blocks belonging to this chain. 269 /// 270 /// This is the sequence of blocks for a particular chain. These will be laid 271 /// out in-order within the function. 272 SmallVector<MachineBasicBlock *, 4> Blocks; 273 274 /// A handle to the function-wide basic block to block chain mapping. 275 /// 276 /// This is retained in each block chain to simplify the computation of child 277 /// block chains for SCC-formation and iteration. We store the edges to child 278 /// basic blocks, and map them back to their associated chains using this 279 /// structure. 280 BlockToChainMapType &BlockToChain; 281 282 public: 283 /// Construct a new BlockChain. 284 /// 285 /// This builds a new block chain representing a single basic block in the 286 /// function. It also registers itself as the chain that block participates 287 /// in with the BlockToChain mapping. 288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB) 289 : Blocks(1, BB), BlockToChain(BlockToChain) { 290 assert(BB && "Cannot create a chain with a null basic block"); 291 BlockToChain[BB] = this; 292 } 293 294 /// Iterator over blocks within the chain. 295 using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator; 296 using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator; 297 298 /// Beginning of blocks within the chain. 299 iterator begin() { return Blocks.begin(); } 300 const_iterator begin() const { return Blocks.begin(); } 301 302 /// End of blocks within the chain. 303 iterator end() { return Blocks.end(); } 304 const_iterator end() const { return Blocks.end(); } 305 306 bool remove(MachineBasicBlock *BB) { 307 for (iterator i = begin(); i != end(); ++i) { 308 if (*i == BB) { 309 Blocks.erase(i); 310 return true; 311 } 312 } 313 return false; 314 } 315 316 /// Merge a block chain into this one. 317 /// 318 /// This routine merges a block chain into this one. It takes care of forming 319 /// a contiguous sequence of basic blocks, updating the edge list, and 320 /// updating the block -> chain mapping. It does not free or tear down the 321 /// old chain, but the old chain's block list is no longer valid. 322 void merge(MachineBasicBlock *BB, BlockChain *Chain) { 323 assert(BB && "Can't merge a null block."); 324 assert(!Blocks.empty() && "Can't merge into an empty chain."); 325 326 // Fast path in case we don't have a chain already. 327 if (!Chain) { 328 assert(!BlockToChain[BB] && 329 "Passed chain is null, but BB has entry in BlockToChain."); 330 Blocks.push_back(BB); 331 BlockToChain[BB] = this; 332 return; 333 } 334 335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain."); 336 assert(Chain->begin() != Chain->end()); 337 338 // Update the incoming blocks to point to this chain, and add them to the 339 // chain structure. 340 for (MachineBasicBlock *ChainBB : *Chain) { 341 Blocks.push_back(ChainBB); 342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain."); 343 BlockToChain[ChainBB] = this; 344 } 345 } 346 347 #ifndef NDEBUG 348 /// Dump the blocks in this chain. 349 LLVM_DUMP_METHOD void dump() { 350 for (MachineBasicBlock *MBB : *this) 351 MBB->dump(); 352 } 353 #endif // NDEBUG 354 355 /// Count of predecessors of any block within the chain which have not 356 /// yet been scheduled. In general, we will delay scheduling this chain 357 /// until those predecessors are scheduled (or we find a sufficiently good 358 /// reason to override this heuristic.) Note that when forming loop chains, 359 /// blocks outside the loop are ignored and treated as if they were already 360 /// scheduled. 361 /// 362 /// Note: This field is reinitialized multiple times - once for each loop, 363 /// and then once for the function as a whole. 364 unsigned UnscheduledPredecessors = 0; 365 }; 366 367 class MachineBlockPlacement { 368 /// A type for a block filter set. 369 using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>; 370 371 /// Pair struct containing basic block and taildup profitability 372 struct BlockAndTailDupResult { 373 MachineBasicBlock *BB = nullptr; 374 bool ShouldTailDup; 375 }; 376 377 /// Triple struct containing edge weight and the edge. 378 struct WeightedEdge { 379 BlockFrequency Weight; 380 MachineBasicBlock *Src = nullptr; 381 MachineBasicBlock *Dest = nullptr; 382 }; 383 384 /// work lists of blocks that are ready to be laid out 385 SmallVector<MachineBasicBlock *, 16> BlockWorkList; 386 SmallVector<MachineBasicBlock *, 16> EHPadWorkList; 387 388 /// Edges that have already been computed as optimal. 389 DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges; 390 391 /// Machine Function 392 MachineFunction *F = nullptr; 393 394 /// A handle to the branch probability pass. 395 const MachineBranchProbabilityInfo *MBPI = nullptr; 396 397 /// A handle to the function-wide block frequency pass. 398 std::unique_ptr<MBFIWrapper> MBFI; 399 400 /// A handle to the loop info. 401 MachineLoopInfo *MLI = nullptr; 402 403 /// Preferred loop exit. 404 /// Member variable for convenience. It may be removed by duplication deep 405 /// in the call stack. 406 MachineBasicBlock *PreferredLoopExit = nullptr; 407 408 /// A handle to the target's instruction info. 409 const TargetInstrInfo *TII = nullptr; 410 411 /// A handle to the target's lowering info. 412 const TargetLoweringBase *TLI = nullptr; 413 414 /// A handle to the post dominator tree. 415 MachinePostDominatorTree *MPDT = nullptr; 416 417 ProfileSummaryInfo *PSI = nullptr; 418 419 // Tail merging is also determined based on 420 // whether structured CFG is required. 421 bool AllowTailMerge; 422 423 CodeGenOptLevel OptLevel; 424 425 /// Duplicator used to duplicate tails during placement. 426 /// 427 /// Placement decisions can open up new tail duplication opportunities, but 428 /// since tail duplication affects placement decisions of later blocks, it 429 /// must be done inline. 430 TailDuplicator TailDup; 431 432 /// Partial tail duplication threshold. 433 BlockFrequency DupThreshold; 434 435 unsigned TailDupSize; 436 437 /// True: use block profile count to compute tail duplication cost. 438 /// False: use block frequency to compute tail duplication cost. 439 bool UseProfileCount = false; 440 441 /// Allocator and owner of BlockChain structures. 442 /// 443 /// We build BlockChains lazily while processing the loop structure of 444 /// a function. To reduce malloc traffic, we allocate them using this 445 /// slab-like allocator, and destroy them after the pass completes. An 446 /// important guarantee is that this allocator produces stable pointers to 447 /// the chains. 448 SpecificBumpPtrAllocator<BlockChain> ChainAllocator; 449 450 /// Function wide BasicBlock to BlockChain mapping. 451 /// 452 /// This mapping allows efficiently moving from any given basic block to the 453 /// BlockChain it participates in, if any. We use it to, among other things, 454 /// allow implicitly defining edges between chains as the existing edges 455 /// between basic blocks. 456 DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain; 457 458 #ifndef NDEBUG 459 /// The set of basic blocks that have terminators that cannot be fully 460 /// analyzed. These basic blocks cannot be re-ordered safely by 461 /// MachineBlockPlacement, and we must preserve physical layout of these 462 /// blocks and their successors through the pass. 463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits; 464 #endif 465 466 /// Get block profile count or frequency according to UseProfileCount. 467 /// The return value is used to model tail duplication cost. 468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) { 469 if (UseProfileCount) { 470 auto Count = MBFI->getBlockProfileCount(BB); 471 if (Count) 472 return BlockFrequency(*Count); 473 else 474 return BlockFrequency(0); 475 } else 476 return MBFI->getBlockFreq(BB); 477 } 478 479 /// Scale the DupThreshold according to basic block size. 480 BlockFrequency scaleThreshold(MachineBasicBlock *BB); 481 void initTailDupThreshold(); 482 483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and 484 /// if the count goes to 0, add them to the appropriate work list. 485 void markChainSuccessors(const BlockChain &Chain, 486 const MachineBasicBlock *LoopHeaderBB, 487 const BlockFilterSet *BlockFilter = nullptr); 488 489 /// Decrease the UnscheduledPredecessors count for a single block, and 490 /// if the count goes to 0, add them to the appropriate work list. 491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB, 492 const MachineBasicBlock *LoopHeaderBB, 493 const BlockFilterSet *BlockFilter = nullptr); 494 495 BranchProbability 496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain, 497 const BlockFilterSet *BlockFilter, 498 SmallVector<MachineBasicBlock *, 4> &Successors); 499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, 500 BlockFilterSet *BlockFilter); 501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates, 502 MachineBasicBlock *BB, 503 BlockFilterSet *BlockFilter); 504 bool repeatedlyTailDuplicateBlock( 505 MachineBasicBlock *BB, MachineBasicBlock *&LPred, 506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, 507 BlockFilterSet *BlockFilter, 508 MachineFunction::iterator &PrevUnplacedBlockIt, 509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt); 510 bool 511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, 512 BlockChain &Chain, BlockFilterSet *BlockFilter, 513 MachineFunction::iterator &PrevUnplacedBlockIt, 514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, 515 bool &DuplicatedToLPred); 516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB, 517 const MachineBasicBlock *Succ, 518 const BlockChain &SuccChain, 519 BranchProbability SuccProb, 520 BranchProbability RealSuccProb, 521 const BlockChain &Chain, 522 const BlockFilterSet *BlockFilter); 523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB, 524 const BlockChain &Chain, 525 const BlockFilterSet *BlockFilter); 526 MachineBasicBlock * 527 selectBestCandidateBlock(const BlockChain &Chain, 528 SmallVectorImpl<MachineBasicBlock *> &WorkList); 529 MachineBasicBlock * 530 getFirstUnplacedBlock(const BlockChain &PlacedChain, 531 MachineFunction::iterator &PrevUnplacedBlockIt); 532 MachineBasicBlock * 533 getFirstUnplacedBlock(const BlockChain &PlacedChain, 534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, 535 const BlockFilterSet *BlockFilter); 536 537 /// Add a basic block to the work list if it is appropriate. 538 /// 539 /// If the optional parameter BlockFilter is provided, only MBB 540 /// present in the set will be added to the worklist. If nullptr 541 /// is provided, no filtering occurs. 542 void fillWorkLists(const MachineBasicBlock *MBB, 543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds, 544 const BlockFilterSet *BlockFilter); 545 546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain, 547 BlockFilterSet *BlockFilter = nullptr); 548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock, 549 const MachineBasicBlock *OldTop); 550 bool hasViableTopFallthrough(const MachineBasicBlock *Top, 551 const BlockFilterSet &LoopBlockSet); 552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top, 553 const BlockFilterSet &LoopBlockSet); 554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop, 555 const MachineBasicBlock *OldTop, 556 const MachineBasicBlock *ExitBB, 557 const BlockFilterSet &LoopBlockSet); 558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop, 559 const MachineLoop &L, 560 const BlockFilterSet &LoopBlockSet); 561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L, 562 const BlockFilterSet &LoopBlockSet); 563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L, 564 const BlockFilterSet &LoopBlockSet, 565 BlockFrequency &ExitFreq); 566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L); 567 void buildLoopChains(const MachineLoop &L); 568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, 569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); 570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L, 571 const BlockFilterSet &LoopBlockSet); 572 void buildCFGChains(); 573 void optimizeBranches(); 574 void alignBlocks(); 575 /// Returns true if a block should be tail-duplicated to increase fallthrough 576 /// opportunities. 577 bool shouldTailDuplicate(MachineBasicBlock *BB); 578 /// Check the edge frequencies to see if tail duplication will increase 579 /// fallthroughs. 580 bool isProfitableToTailDup(const MachineBasicBlock *BB, 581 const MachineBasicBlock *Succ, 582 BranchProbability QProb, const BlockChain &Chain, 583 const BlockFilterSet *BlockFilter); 584 585 /// Check for a trellis layout. 586 bool isTrellis(const MachineBasicBlock *BB, 587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, 588 const BlockChain &Chain, const BlockFilterSet *BlockFilter); 589 590 /// Get the best successor given a trellis layout. 591 BlockAndTailDupResult getBestTrellisSuccessor( 592 const MachineBasicBlock *BB, 593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, 594 BranchProbability AdjustedSumProb, const BlockChain &Chain, 595 const BlockFilterSet *BlockFilter); 596 597 /// Get the best pair of non-conflicting edges. 598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges( 599 const MachineBasicBlock *BB, 600 MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges); 601 602 /// Returns true if a block can tail duplicate into all unplaced 603 /// predecessors. Filters based on loop. 604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB, 605 MachineBasicBlock *Succ, 606 const BlockChain &Chain, 607 const BlockFilterSet *BlockFilter); 608 609 /// Find chains of triangles to tail-duplicate where a global analysis works, 610 /// but a local analysis would not find them. 611 void precomputeTriangleChains(); 612 613 /// Apply a post-processing step optimizing block placement. 614 void applyExtTsp(bool OptForSize); 615 616 /// Modify the existing block placement in the function and adjust all jumps. 617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder); 618 619 /// Create a single CFG chain from the current block order. 620 void createCFGChainExtTsp(); 621 622 public: 623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI, 624 MachineLoopInfo *MLI, ProfileSummaryInfo *PSI, 625 std::unique_ptr<MBFIWrapper> MBFI, 626 MachinePostDominatorTree *MPDT, bool AllowTailMerge) 627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI), 628 AllowTailMerge(AllowTailMerge) {}; 629 630 bool run(MachineFunction &F); 631 632 static bool allowTailDupPlacement(MachineFunction &MF) { 633 return TailDupPlacement && !MF.getTarget().requiresStructuredCFG(); 634 } 635 }; 636 637 class MachineBlockPlacementLegacy : public MachineFunctionPass { 638 public: 639 static char ID; // Pass identification, replacement for typeid 640 641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) { 642 initializeMachineBlockPlacementLegacyPass(*PassRegistry::getPassRegistry()); 643 } 644 645 bool runOnMachineFunction(MachineFunction &MF) override { 646 if (skipFunction(MF.getFunction())) 647 return false; 648 649 auto *MBPI = 650 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI(); 651 auto MBFI = std::make_unique<MBFIWrapper>( 652 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI()); 653 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); 654 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF) 655 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>() 656 .getPostDomTree() 657 : nullptr; 658 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 659 auto *PassConfig = &getAnalysis<TargetPassConfig>(); 660 bool AllowTailMerge = PassConfig->getEnableTailMerge(); 661 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT, 662 AllowTailMerge) 663 .run(MF); 664 } 665 666 void getAnalysisUsage(AnalysisUsage &AU) const override { 667 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>(); 668 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>(); 669 if (TailDupPlacement) 670 AU.addRequired<MachinePostDominatorTreeWrapperPass>(); 671 AU.addRequired<MachineLoopInfoWrapperPass>(); 672 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 673 AU.addRequired<TargetPassConfig>(); 674 MachineFunctionPass::getAnalysisUsage(AU); 675 } 676 }; 677 678 } // end anonymous namespace 679 680 char MachineBlockPlacementLegacy::ID = 0; 681 682 char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID; 683 684 INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE, 685 "Branch Probability Basic Block Placement", false, false) 686 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) 687 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) 688 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) 689 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) 690 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 691 INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE, 692 "Branch Probability Basic Block Placement", false, false) 693 694 #ifndef NDEBUG 695 /// Helper to print the name of a MBB. 696 /// 697 /// Only used by debug logging. 698 static std::string getBlockName(const MachineBasicBlock *BB) { 699 std::string Result; 700 raw_string_ostream OS(Result); 701 OS << printMBBReference(*BB); 702 OS << " ('" << BB->getName() << "')"; 703 OS.flush(); 704 return Result; 705 } 706 #endif 707 708 /// Mark a chain's successors as having one fewer preds. 709 /// 710 /// When a chain is being merged into the "placed" chain, this routine will 711 /// quickly walk the successors of each block in the chain and mark them as 712 /// having one fewer active predecessor. It also adds any successors of this 713 /// chain which reach the zero-predecessor state to the appropriate worklist. 714 void MachineBlockPlacement::markChainSuccessors( 715 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, 716 const BlockFilterSet *BlockFilter) { 717 // Walk all the blocks in this chain, marking their successors as having 718 // a predecessor placed. 719 for (MachineBasicBlock *MBB : Chain) { 720 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter); 721 } 722 } 723 724 /// Mark a single block's successors as having one fewer preds. 725 /// 726 /// Under normal circumstances, this is only called by markChainSuccessors, 727 /// but if a block that was to be placed is completely tail-duplicated away, 728 /// and was duplicated into the chain end, we need to redo markBlockSuccessors 729 /// for just that block. 730 void MachineBlockPlacement::markBlockSuccessors( 731 const BlockChain &Chain, const MachineBasicBlock *MBB, 732 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { 733 // Add any successors for which this is the only un-placed in-loop 734 // predecessor to the worklist as a viable candidate for CFG-neutral 735 // placement. No subsequent placement of this block will violate the CFG 736 // shape, so we get to use heuristics to choose a favorable placement. 737 for (MachineBasicBlock *Succ : MBB->successors()) { 738 if (BlockFilter && !BlockFilter->count(Succ)) 739 continue; 740 BlockChain &SuccChain = *BlockToChain[Succ]; 741 // Disregard edges within a fixed chain, or edges to the loop header. 742 if (&Chain == &SuccChain || Succ == LoopHeaderBB) 743 continue; 744 745 // This is a cross-chain edge that is within the loop, so decrement the 746 // loop predecessor count of the destination chain. 747 if (SuccChain.UnscheduledPredecessors == 0 || 748 --SuccChain.UnscheduledPredecessors > 0) 749 continue; 750 751 auto *NewBB = *SuccChain.begin(); 752 if (NewBB->isEHPad()) 753 EHPadWorkList.push_back(NewBB); 754 else 755 BlockWorkList.push_back(NewBB); 756 } 757 } 758 759 /// This helper function collects the set of successors of block 760 /// \p BB that are allowed to be its layout successors, and return 761 /// the total branch probability of edges from \p BB to those 762 /// blocks. 763 BranchProbability MachineBlockPlacement::collectViableSuccessors( 764 const MachineBasicBlock *BB, const BlockChain &Chain, 765 const BlockFilterSet *BlockFilter, 766 SmallVector<MachineBasicBlock *, 4> &Successors) { 767 // Adjust edge probabilities by excluding edges pointing to blocks that is 768 // either not in BlockFilter or is already in the current chain. Consider the 769 // following CFG: 770 // 771 // --->A 772 // | / \ 773 // | B C 774 // | \ / \ 775 // ----D E 776 // 777 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after 778 // A->C is chosen as a fall-through, D won't be selected as a successor of C 779 // due to CFG constraint (the probability of C->D is not greater than 780 // HotProb to break topo-order). If we exclude E that is not in BlockFilter 781 // when calculating the probability of C->D, D will be selected and we 782 // will get A C D B as the layout of this loop. 783 auto AdjustedSumProb = BranchProbability::getOne(); 784 for (MachineBasicBlock *Succ : BB->successors()) { 785 bool SkipSucc = false; 786 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) { 787 SkipSucc = true; 788 } else { 789 BlockChain *SuccChain = BlockToChain[Succ]; 790 if (SuccChain == &Chain) { 791 SkipSucc = true; 792 } else if (Succ != *SuccChain->begin()) { 793 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ) 794 << " -> Mid chain!\n"); 795 continue; 796 } 797 } 798 if (SkipSucc) 799 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); 800 else 801 Successors.push_back(Succ); 802 } 803 804 return AdjustedSumProb; 805 } 806 807 /// The helper function returns the branch probability that is adjusted 808 /// or normalized over the new total \p AdjustedSumProb. 809 static BranchProbability 810 getAdjustedProbability(BranchProbability OrigProb, 811 BranchProbability AdjustedSumProb) { 812 BranchProbability SuccProb; 813 uint32_t SuccProbN = OrigProb.getNumerator(); 814 uint32_t SuccProbD = AdjustedSumProb.getNumerator(); 815 if (SuccProbN >= SuccProbD) 816 SuccProb = BranchProbability::getOne(); 817 else 818 SuccProb = BranchProbability(SuccProbN, SuccProbD); 819 820 return SuccProb; 821 } 822 823 /// Check if \p BB has exactly the successors in \p Successors. 824 static bool 825 hasSameSuccessors(MachineBasicBlock &BB, 826 SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { 827 if (BB.succ_size() != Successors.size()) 828 return false; 829 // We don't want to count self-loops 830 if (Successors.count(&BB)) 831 return false; 832 for (MachineBasicBlock *Succ : BB.successors()) 833 if (!Successors.count(Succ)) 834 return false; 835 return true; 836 } 837 838 /// Check if a block should be tail duplicated to increase fallthrough 839 /// opportunities. 840 /// \p BB Block to check. 841 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { 842 // Blocks with single successors don't create additional fallthrough 843 // opportunities. Don't duplicate them. TODO: When conditional exits are 844 // analyzable, allow them to be duplicated. 845 bool IsSimple = TailDup.isSimpleBB(BB); 846 847 if (BB->succ_size() == 1) 848 return false; 849 return TailDup.shouldTailDuplicate(IsSimple, *BB); 850 } 851 852 /// Compare 2 BlockFrequency's with a small penalty for \p A. 853 /// In order to be conservative, we apply a X% penalty to account for 854 /// increased icache pressure and static heuristics. For small frequencies 855 /// we use only the numerators to improve accuracy. For simplicity, we assume 856 /// the penalty is less than 100% 857 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. 858 static bool greaterWithBias(BlockFrequency A, BlockFrequency B, 859 BlockFrequency EntryFreq) { 860 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); 861 BlockFrequency Gain = A - B; 862 return (Gain / ThresholdProb) >= EntryFreq; 863 } 864 865 /// Check the edge frequencies to see if tail duplication will increase 866 /// fallthroughs. It only makes sense to call this function when 867 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is 868 /// always locally profitable if we would have picked \p Succ without 869 /// considering duplication. 870 bool MachineBlockPlacement::isProfitableToTailDup( 871 const MachineBasicBlock *BB, const MachineBasicBlock *Succ, 872 BranchProbability QProb, const BlockChain &Chain, 873 const BlockFilterSet *BlockFilter) { 874 // We need to do a probability calculation to make sure this is profitable. 875 // First: does succ have a successor that post-dominates? This affects the 876 // calculation. The 2 relevant cases are: 877 // BB BB 878 // | \Qout | \Qout 879 // P| C |P C 880 // = C' = C' 881 // | /Qin | /Qin 882 // | / | / 883 // Succ Succ 884 // / \ | \ V 885 // U/ =V |U \ 886 // / \ = D 887 // D E | / 888 // | / 889 // |/ 890 // PDom 891 // '=' : Branch taken for that CFG edge 892 // In the second case, Placing Succ while duplicating it into C prevents the 893 // fallthrough of Succ into either D or PDom, because they now have C as an 894 // unplaced predecessor 895 896 // Start by figuring out which case we fall into 897 MachineBasicBlock *PDom = nullptr; 898 SmallVector<MachineBasicBlock *, 4> SuccSuccs; 899 // Only scan the relevant successors 900 auto AdjustedSuccSumProb = 901 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs); 902 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ); 903 auto BBFreq = MBFI->getBlockFreq(BB); 904 auto SuccFreq = MBFI->getBlockFreq(Succ); 905 BlockFrequency P = BBFreq * PProb; 906 BlockFrequency Qout = BBFreq * QProb; 907 BlockFrequency EntryFreq = MBFI->getEntryFreq(); 908 // If there are no more successors, it is profitable to copy, as it strictly 909 // increases fallthrough. 910 if (SuccSuccs.size() == 0) 911 return greaterWithBias(P, Qout, EntryFreq); 912 913 auto BestSuccSucc = BranchProbability::getZero(); 914 // Find the PDom or the best Succ if no PDom exists. 915 for (MachineBasicBlock *SuccSucc : SuccSuccs) { 916 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); 917 if (Prob > BestSuccSucc) 918 BestSuccSucc = Prob; 919 if (PDom == nullptr) 920 if (MPDT->dominates(SuccSucc, Succ)) { 921 PDom = SuccSucc; 922 break; 923 } 924 } 925 // For the comparisons, we need to know Succ's best incoming edge that isn't 926 // from BB. 927 auto SuccBestPred = BlockFrequency(0); 928 for (MachineBasicBlock *SuccPred : Succ->predecessors()) { 929 if (SuccPred == Succ || SuccPred == BB || 930 BlockToChain[SuccPred] == &Chain || 931 (BlockFilter && !BlockFilter->count(SuccPred))) 932 continue; 933 auto Freq = 934 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ); 935 if (Freq > SuccBestPred) 936 SuccBestPred = Freq; 937 } 938 // Qin is Succ's best unplaced incoming edge that isn't BB 939 BlockFrequency Qin = SuccBestPred; 940 // If it doesn't have a post-dominating successor, here is the calculation: 941 // BB BB 942 // | \Qout | \ 943 // P| C | = 944 // = C' | C 945 // | /Qin | | 946 // | / | C' (+Succ) 947 // Succ Succ /| 948 // / \ | \/ | 949 // U/ =V | == | 950 // / \ | / \| 951 // D E D E 952 // '=' : Branch taken for that CFG edge 953 // Cost in the first case is: P + V 954 // For this calculation, we always assume P > Qout. If Qout > P 955 // The result of this function will be ignored at the caller. 956 // Let F = SuccFreq - Qin 957 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V 958 959 if (PDom == nullptr || !Succ->isSuccessor(PDom)) { 960 BranchProbability UProb = BestSuccSucc; 961 BranchProbability VProb = AdjustedSuccSumProb - UProb; 962 BlockFrequency F = SuccFreq - Qin; 963 BlockFrequency V = SuccFreq * VProb; 964 BlockFrequency QinU = std::min(Qin, F) * UProb; 965 BlockFrequency BaseCost = P + V; 966 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; 967 return greaterWithBias(BaseCost, DupCost, EntryFreq); 968 } 969 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); 970 BranchProbability VProb = AdjustedSuccSumProb - UProb; 971 BlockFrequency U = SuccFreq * UProb; 972 BlockFrequency V = SuccFreq * VProb; 973 BlockFrequency F = SuccFreq - Qin; 974 // If there is a post-dominating successor, here is the calculation: 975 // BB BB BB BB 976 // | \Qout | \ | \Qout | \ 977 // |P C | = |P C | = 978 // = C' |P C = C' |P C 979 // | /Qin | | | /Qin | | 980 // | / | C' (+Succ) | / | C' (+Succ) 981 // Succ Succ /| Succ Succ /| 982 // | \ V | \/ | | \ V | \/ | 983 // |U \ |U /\ =? |U = |U /\ | 984 // = D = = =?| | D | = =| 985 // | / |/ D | / |/ D 986 // | / | / | = | / 987 // |/ | / |/ | = 988 // Dom Dom Dom Dom 989 // '=' : Branch taken for that CFG edge 990 // The cost for taken branches in the first case is P + U 991 // Let F = SuccFreq - Qin 992 // The cost in the second case (assuming independence), given the layout: 993 // BB, Succ, (C+Succ), D, Dom or the layout: 994 // BB, Succ, D, Dom, (C+Succ) 995 // is Qout + max(F, Qin) * U + min(F, Qin) 996 // compare P + U vs Qout + P * U + Qin. 997 // 998 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. 999 // 1000 // For the 3rd case, the cost is P + 2 * V 1001 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V 1002 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V 1003 if (UProb > AdjustedSuccSumProb / 2 && 1004 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, 1005 Chain, BlockFilter)) 1006 // Cases 3 & 4 1007 return greaterWithBias( 1008 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), 1009 EntryFreq); 1010 // Cases 1 & 2 1011 return greaterWithBias((P + U), 1012 (Qout + std::min(Qin, F) * AdjustedSuccSumProb + 1013 std::max(Qin, F) * UProb), 1014 EntryFreq); 1015 } 1016 1017 /// Check for a trellis layout. \p BB is the upper part of a trellis if its 1018 /// successors form the lower part of a trellis. A successor set S forms the 1019 /// lower part of a trellis if all of the predecessors of S are either in S or 1020 /// have all of S as successors. We ignore trellises where BB doesn't have 2 1021 /// successors because for fewer than 2, it's trivial, and for 3 or greater they 1022 /// are very uncommon and complex to compute optimally. Allowing edges within S 1023 /// is not strictly a trellis, but the same algorithm works, so we allow it. 1024 bool MachineBlockPlacement::isTrellis( 1025 const MachineBasicBlock *BB, 1026 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, 1027 const BlockChain &Chain, const BlockFilterSet *BlockFilter) { 1028 // Technically BB could form a trellis with branching factor higher than 2. 1029 // But that's extremely uncommon. 1030 if (BB->succ_size() != 2 || ViableSuccs.size() != 2) 1031 return false; 1032 1033 SmallPtrSet<const MachineBasicBlock *, 2> Successors(llvm::from_range, 1034 BB->successors()); 1035 // To avoid reviewing the same predecessors twice. 1036 SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds; 1037 1038 for (MachineBasicBlock *Succ : ViableSuccs) { 1039 // Compile-time optimization: runtime is quadratic in the number of 1040 // predecessors. For such uncommon cases, exit early. 1041 if (Succ->pred_size() > PredecessorLimit) 1042 return false; 1043 1044 int PredCount = 0; 1045 for (auto *SuccPred : Succ->predecessors()) { 1046 // Allow triangle successors, but don't count them. 1047 if (Successors.count(SuccPred)) { 1048 // Make sure that it is actually a triangle. 1049 for (MachineBasicBlock *CheckSucc : SuccPred->successors()) 1050 if (!Successors.count(CheckSucc)) 1051 return false; 1052 continue; 1053 } 1054 const BlockChain *PredChain = BlockToChain[SuccPred]; 1055 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) || 1056 PredChain == &Chain || PredChain == BlockToChain[Succ]) 1057 continue; 1058 ++PredCount; 1059 // Perform the successor check only once. 1060 if (!SeenPreds.insert(SuccPred).second) 1061 continue; 1062 if (!hasSameSuccessors(*SuccPred, Successors)) 1063 return false; 1064 } 1065 // If one of the successors has only BB as a predecessor, it is not a 1066 // trellis. 1067 if (PredCount < 1) 1068 return false; 1069 } 1070 return true; 1071 } 1072 1073 /// Pick the highest total weight pair of edges that can both be laid out. 1074 /// The edges in \p Edges[0] are assumed to have a different destination than 1075 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either 1076 /// the individual highest weight edges to the 2 different destinations, or in 1077 /// case of a conflict, one of them should be replaced with a 2nd best edge. 1078 std::pair<MachineBlockPlacement::WeightedEdge, 1079 MachineBlockPlacement::WeightedEdge> 1080 MachineBlockPlacement::getBestNonConflictingEdges( 1081 const MachineBasicBlock *BB, 1082 MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> 1083 Edges) { 1084 // Sort the edges, and then for each successor, find the best incoming 1085 // predecessor. If the best incoming predecessors aren't the same, 1086 // then that is clearly the best layout. If there is a conflict, one of the 1087 // successors will have to fallthrough from the second best predecessor. We 1088 // compare which combination is better overall. 1089 1090 // Sort for highest frequency. 1091 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; }; 1092 1093 llvm::stable_sort(Edges[0], Cmp); 1094 llvm::stable_sort(Edges[1], Cmp); 1095 auto BestA = Edges[0].begin(); 1096 auto BestB = Edges[1].begin(); 1097 // Arrange for the correct answer to be in BestA and BestB 1098 // If the 2 best edges don't conflict, the answer is already there. 1099 if (BestA->Src == BestB->Src) { 1100 // Compare the total fallthrough of (Best + Second Best) for both pairs 1101 auto SecondBestA = std::next(BestA); 1102 auto SecondBestB = std::next(BestB); 1103 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight; 1104 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight; 1105 if (BestAScore < BestBScore) 1106 BestA = SecondBestA; 1107 else 1108 BestB = SecondBestB; 1109 } 1110 // Arrange for the BB edge to be in BestA if it exists. 1111 if (BestB->Src == BB) 1112 std::swap(BestA, BestB); 1113 return std::make_pair(*BestA, *BestB); 1114 } 1115 1116 /// Get the best successor from \p BB based on \p BB being part of a trellis. 1117 /// We only handle trellises with 2 successors, so the algorithm is 1118 /// straightforward: Find the best pair of edges that don't conflict. We find 1119 /// the best incoming edge for each successor in the trellis. If those conflict, 1120 /// we consider which of them should be replaced with the second best. 1121 /// Upon return the two best edges will be in \p BestEdges. If one of the edges 1122 /// comes from \p BB, it will be in \p BestEdges[0] 1123 MachineBlockPlacement::BlockAndTailDupResult 1124 MachineBlockPlacement::getBestTrellisSuccessor( 1125 const MachineBasicBlock *BB, 1126 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, 1127 BranchProbability AdjustedSumProb, const BlockChain &Chain, 1128 const BlockFilterSet *BlockFilter) { 1129 1130 BlockAndTailDupResult Result = {nullptr, false}; 1131 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range, 1132 BB->successors()); 1133 1134 // We assume size 2 because it's common. For general n, we would have to do 1135 // the Hungarian algorithm, but it's not worth the complexity because more 1136 // than 2 successors is fairly uncommon, and a trellis even more so. 1137 if (Successors.size() != 2 || ViableSuccs.size() != 2) 1138 return Result; 1139 1140 // Collect the edge frequencies of all edges that form the trellis. 1141 SmallVector<WeightedEdge, 8> Edges[2]; 1142 int SuccIndex = 0; 1143 for (auto *Succ : ViableSuccs) { 1144 for (MachineBasicBlock *SuccPred : Succ->predecessors()) { 1145 // Skip any placed predecessors that are not BB 1146 if (SuccPred != BB) { 1147 if (BlockFilter && !BlockFilter->count(SuccPred)) 1148 continue; 1149 const BlockChain *SuccPredChain = BlockToChain[SuccPred]; 1150 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ]) 1151 continue; 1152 } 1153 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) * 1154 MBPI->getEdgeProbability(SuccPred, Succ); 1155 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ}); 1156 } 1157 ++SuccIndex; 1158 } 1159 1160 // Pick the best combination of 2 edges from all the edges in the trellis. 1161 WeightedEdge BestA, BestB; 1162 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); 1163 1164 if (BestA.Src != BB) { 1165 // If we have a trellis, and BB doesn't have the best fallthrough edges, 1166 // we shouldn't choose any successor. We've already looked and there's a 1167 // better fallthrough edge for all the successors. 1168 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n"); 1169 return Result; 1170 } 1171 1172 // Did we pick the triangle edge? If tail-duplication is profitable, do 1173 // that instead. Otherwise merge the triangle edge now while we know it is 1174 // optimal. 1175 if (BestA.Dest == BestB.Src) { 1176 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2 1177 // would be better. 1178 MachineBasicBlock *Succ1 = BestA.Dest; 1179 MachineBasicBlock *Succ2 = BestB.Dest; 1180 // Check to see if tail-duplication would be profitable. 1181 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) && 1182 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && 1183 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), 1184 Chain, BlockFilter)) { 1185 LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability( 1186 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb); 1187 dbgs() << " Selected: " << getBlockName(Succ2) 1188 << ", probability: " << Succ2Prob 1189 << " (Tail Duplicate)\n"); 1190 Result.BB = Succ2; 1191 Result.ShouldTailDup = true; 1192 return Result; 1193 } 1194 } 1195 // We have already computed the optimal edge for the other side of the 1196 // trellis. 1197 ComputedEdges[BestB.Src] = {BestB.Dest, false}; 1198 1199 auto TrellisSucc = BestA.Dest; 1200 LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability( 1201 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb); 1202 dbgs() << " Selected: " << getBlockName(TrellisSucc) 1203 << ", probability: " << SuccProb << " (Trellis)\n"); 1204 Result.BB = TrellisSucc; 1205 return Result; 1206 } 1207 1208 /// When the option allowTailDupPlacement() is on, this method checks if the 1209 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated 1210 /// into all of its unplaced, unfiltered predecessors, that are not BB. 1211 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( 1212 const MachineBasicBlock *BB, MachineBasicBlock *Succ, 1213 const BlockChain &Chain, const BlockFilterSet *BlockFilter) { 1214 if (!shouldTailDuplicate(Succ)) 1215 return false; 1216 1217 // The result of canTailDuplicate. 1218 bool Duplicate = true; 1219 // Number of possible duplication. 1220 unsigned int NumDup = 0; 1221 1222 // For CFG checking. 1223 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range, 1224 BB->successors()); 1225 for (MachineBasicBlock *Pred : Succ->predecessors()) { 1226 // Make sure all unplaced and unfiltered predecessors can be 1227 // tail-duplicated into. 1228 // Skip any blocks that are already placed or not in this loop. 1229 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) || 1230 (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) 1231 continue; 1232 if (!TailDup.canTailDuplicate(Succ, Pred)) { 1233 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) 1234 // This will result in a trellis after tail duplication, so we don't 1235 // need to copy Succ into this predecessor. In the presence 1236 // of a trellis tail duplication can continue to be profitable. 1237 // For example: 1238 // A A 1239 // |\ |\ 1240 // | \ | \ 1241 // | C | C+BB 1242 // | / | | 1243 // |/ | | 1244 // BB => BB | 1245 // |\ |\/| 1246 // | \ |/\| 1247 // | D | D 1248 // | / | / 1249 // |/ |/ 1250 // Succ Succ 1251 // 1252 // After BB was duplicated into C, the layout looks like the one on the 1253 // right. BB and C now have the same successors. When considering 1254 // whether Succ can be duplicated into all its unplaced predecessors, we 1255 // ignore C. 1256 // We can do this because C already has a profitable fallthrough, namely 1257 // D. TODO(iteratee): ignore sufficiently cold predecessors for 1258 // duplication and for this test. 1259 // 1260 // This allows trellises to be laid out in 2 separate chains 1261 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic 1262 // because it allows the creation of 2 fallthrough paths with links 1263 // between them, and we correctly identify the best layout for these 1264 // CFGs. We want to extend trellises that the user created in addition 1265 // to trellises created by tail-duplication, so we just look for the 1266 // CFG. 1267 continue; 1268 Duplicate = false; 1269 continue; 1270 } 1271 NumDup++; 1272 } 1273 1274 // No possible duplication in current filter set. 1275 if (NumDup == 0) 1276 return false; 1277 1278 // If profile information is available, findDuplicateCandidates can do more 1279 // precise benefit analysis. 1280 if (F->getFunction().hasProfileData()) 1281 return true; 1282 1283 // This is mainly for function exit BB. 1284 // The integrated tail duplication is really designed for increasing 1285 // fallthrough from predecessors from Succ to its successors. We may need 1286 // other machanism to handle different cases. 1287 if (Succ->succ_empty()) 1288 return true; 1289 1290 // Plus the already placed predecessor. 1291 NumDup++; 1292 1293 // If the duplication candidate has more unplaced predecessors than 1294 // successors, the extra duplication can't bring more fallthrough. 1295 // 1296 // Pred1 Pred2 Pred3 1297 // \ | / 1298 // \ | / 1299 // \ | / 1300 // Dup 1301 // / \ 1302 // / \ 1303 // Succ1 Succ2 1304 // 1305 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup 1306 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2, 1307 // but the duplication into Pred3 can't increase fallthrough. 1308 // 1309 // A small number of extra duplication may not hurt too much. We need a better 1310 // heuristic to handle it. 1311 if ((NumDup > Succ->succ_size()) || !Duplicate) 1312 return false; 1313 1314 return true; 1315 } 1316 1317 /// Find chains of triangles where we believe it would be profitable to 1318 /// tail-duplicate them all, but a local analysis would not find them. 1319 /// There are 3 ways this can be profitable: 1320 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with 1321 /// longer chains) 1322 /// 2) The chains are statically correlated. Branch probabilities have a very 1323 /// U-shaped distribution. 1324 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] 1325 /// If the branches in a chain are likely to be from the same side of the 1326 /// distribution as their predecessor, but are independent at runtime, this 1327 /// transformation is profitable. (Because the cost of being wrong is a small 1328 /// fixed cost, unlike the standard triangle layout where the cost of being 1329 /// wrong scales with the # of triangles.) 1330 /// 3) The chains are dynamically correlated. If the probability that a previous 1331 /// branch was taken positively influences whether the next branch will be 1332 /// taken 1333 /// We believe that 2 and 3 are common enough to justify the small margin in 1. 1334 void MachineBlockPlacement::precomputeTriangleChains() { 1335 struct TriangleChain { 1336 std::vector<MachineBasicBlock *> Edges; 1337 1338 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst) 1339 : Edges({src, dst}) {} 1340 1341 void append(MachineBasicBlock *dst) { 1342 assert(getKey()->isSuccessor(dst) && 1343 "Attempting to append a block that is not a successor."); 1344 Edges.push_back(dst); 1345 } 1346 1347 unsigned count() const { return Edges.size() - 1; } 1348 1349 MachineBasicBlock *getKey() const { return Edges.back(); } 1350 }; 1351 1352 if (TriangleChainCount == 0) 1353 return; 1354 1355 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n"); 1356 // Map from last block to the chain that contains it. This allows us to extend 1357 // chains as we find new triangles. 1358 DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap; 1359 for (MachineBasicBlock &BB : *F) { 1360 // If BB doesn't have 2 successors, it doesn't start a triangle. 1361 if (BB.succ_size() != 2) 1362 continue; 1363 MachineBasicBlock *PDom = nullptr; 1364 for (MachineBasicBlock *Succ : BB.successors()) { 1365 if (!MPDT->dominates(Succ, &BB)) 1366 continue; 1367 PDom = Succ; 1368 break; 1369 } 1370 // If BB doesn't have a post-dominating successor, it doesn't form a 1371 // triangle. 1372 if (PDom == nullptr) 1373 continue; 1374 // If PDom has a hint that it is low probability, skip this triangle. 1375 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) 1376 continue; 1377 // If PDom isn't eligible for duplication, this isn't the kind of triangle 1378 // we're looking for. 1379 if (!shouldTailDuplicate(PDom)) 1380 continue; 1381 bool CanTailDuplicate = true; 1382 // If PDom can't tail-duplicate into it's non-BB predecessors, then this 1383 // isn't the kind of triangle we're looking for. 1384 for (MachineBasicBlock *Pred : PDom->predecessors()) { 1385 if (Pred == &BB) 1386 continue; 1387 if (!TailDup.canTailDuplicate(PDom, Pred)) { 1388 CanTailDuplicate = false; 1389 break; 1390 } 1391 } 1392 // If we can't tail-duplicate PDom to its predecessors, then skip this 1393 // triangle. 1394 if (!CanTailDuplicate) 1395 continue; 1396 1397 // Now we have an interesting triangle. Insert it if it's not part of an 1398 // existing chain. 1399 // Note: This cannot be replaced with a call insert() or emplace() because 1400 // the find key is BB, but the insert/emplace key is PDom. 1401 auto Found = TriangleChainMap.find(&BB); 1402 // If it is, remove the chain from the map, grow it, and put it back in the 1403 // map with the end as the new key. 1404 if (Found != TriangleChainMap.end()) { 1405 TriangleChain Chain = std::move(Found->second); 1406 TriangleChainMap.erase(Found); 1407 Chain.append(PDom); 1408 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); 1409 } else { 1410 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); 1411 assert(InsertResult.second && "Block seen twice."); 1412 (void)InsertResult; 1413 } 1414 } 1415 1416 // Iterating over a DenseMap is safe here, because the only thing in the body 1417 // of the loop is inserting into another DenseMap (ComputedEdges). 1418 // ComputedEdges is never iterated, so this doesn't lead to non-determinism. 1419 for (auto &ChainPair : TriangleChainMap) { 1420 TriangleChain &Chain = ChainPair.second; 1421 // Benchmarking has shown that due to branch correlation duplicating 2 or 1422 // more triangles is profitable, despite the calculations assuming 1423 // independence. 1424 if (Chain.count() < TriangleChainCount) 1425 continue; 1426 MachineBasicBlock *dst = Chain.Edges.back(); 1427 Chain.Edges.pop_back(); 1428 for (MachineBasicBlock *src : reverse(Chain.Edges)) { 1429 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" 1430 << getBlockName(dst) 1431 << " as pre-computed based on triangles.\n"); 1432 1433 auto InsertResult = ComputedEdges.insert({src, {dst, true}}); 1434 assert(InsertResult.second && "Block seen twice."); 1435 (void)InsertResult; 1436 1437 dst = src; 1438 } 1439 } 1440 } 1441 1442 // When profile is not present, return the StaticLikelyProb. 1443 // When profile is available, we need to handle the triangle-shape CFG. 1444 static BranchProbability 1445 getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) { 1446 if (!BB->getParent()->getFunction().hasProfileData()) 1447 return BranchProbability(StaticLikelyProb, 100); 1448 if (BB->succ_size() == 2) { 1449 const MachineBasicBlock *Succ1 = *BB->succ_begin(); 1450 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1); 1451 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { 1452 /* See case 1 below for the cost analysis. For BB->Succ to 1453 * be taken with smaller cost, the following needs to hold: 1454 * Prob(BB->Succ) > 2 * Prob(BB->Pred) 1455 * So the threshold T in the calculation below 1456 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) 1457 * So T / (1 - T) = 2, Yielding T = 2/3 1458 * Also adding user specified branch bias, we have 1459 * T = (2/3)*(ProfileLikelyProb/50) 1460 * = (2*ProfileLikelyProb)/150) 1461 */ 1462 return BranchProbability(2 * ProfileLikelyProb, 150); 1463 } 1464 } 1465 return BranchProbability(ProfileLikelyProb, 100); 1466 } 1467 1468 /// Checks to see if the layout candidate block \p Succ has a better layout 1469 /// predecessor than \c BB. If yes, returns true. 1470 /// \p SuccProb: The probability adjusted for only remaining blocks. 1471 /// Only used for logging 1472 /// \p RealSuccProb: The un-adjusted probability. 1473 /// \p Chain: The chain that BB belongs to and Succ is being considered for. 1474 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being 1475 /// considered 1476 bool MachineBlockPlacement::hasBetterLayoutPredecessor( 1477 const MachineBasicBlock *BB, const MachineBasicBlock *Succ, 1478 const BlockChain &SuccChain, BranchProbability SuccProb, 1479 BranchProbability RealSuccProb, const BlockChain &Chain, 1480 const BlockFilterSet *BlockFilter) { 1481 1482 // There isn't a better layout when there are no unscheduled predecessors. 1483 if (SuccChain.UnscheduledPredecessors == 0) 1484 return false; 1485 1486 // Compile-time optimization: runtime is quadratic in the number of 1487 // predecessors. For such uncommon cases, exit early. 1488 if (Succ->pred_size() > PredecessorLimit) 1489 return false; 1490 1491 // There are two basic scenarios here: 1492 // ------------------------------------- 1493 // Case 1: triangular shape CFG (if-then): 1494 // BB 1495 // | \ 1496 // | \ 1497 // | Pred 1498 // | / 1499 // Succ 1500 // In this case, we are evaluating whether to select edge -> Succ, e.g. 1501 // set Succ as the layout successor of BB. Picking Succ as BB's 1502 // successor breaks the CFG constraints (FIXME: define these constraints). 1503 // With this layout, Pred BB 1504 // is forced to be outlined, so the overall cost will be cost of the 1505 // branch taken from BB to Pred, plus the cost of back taken branch 1506 // from Pred to Succ, as well as the additional cost associated 1507 // with the needed unconditional jump instruction from Pred To Succ. 1508 1509 // The cost of the topological order layout is the taken branch cost 1510 // from BB to Succ, so to make BB->Succ a viable candidate, the following 1511 // must hold: 1512 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost 1513 // < freq(BB->Succ) * taken_branch_cost. 1514 // Ignoring unconditional jump cost, we get 1515 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e., 1516 // prob(BB->Succ) > 2 * prob(BB->Pred) 1517 // 1518 // When real profile data is available, we can precisely compute the 1519 // probability threshold that is needed for edge BB->Succ to be considered. 1520 // Without profile data, the heuristic requires the branch bias to be 1521 // a lot larger to make sure the signal is very strong (e.g. 80% default). 1522 // ----------------------------------------------------------------- 1523 // Case 2: diamond like CFG (if-then-else): 1524 // S 1525 // / \ 1526 // | \ 1527 // BB Pred 1528 // \ / 1529 // Succ 1530 // .. 1531 // 1532 // The current block is BB and edge BB->Succ is now being evaluated. 1533 // Note that edge S->BB was previously already selected because 1534 // prob(S->BB) > prob(S->Pred). 1535 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we 1536 // choose Pred, we will have a topological ordering as shown on the left 1537 // in the picture below. If we choose Succ, we have the solution as shown 1538 // on the right: 1539 // 1540 // topo-order: 1541 // 1542 // S----- ---S 1543 // | | | | 1544 // ---BB | | BB 1545 // | | | | 1546 // | Pred-- | Succ-- 1547 // | | | | 1548 // ---Succ ---Pred-- 1549 // 1550 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred) 1551 // = freq(S->Pred) + freq(S->BB) 1552 // 1553 // If we have profile data (i.e, branch probabilities can be trusted), the 1554 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 * 1555 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB). 1556 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which 1557 // means the cost of topological order is greater. 1558 // When profile data is not available, however, we need to be more 1559 // conservative. If the branch prediction is wrong, breaking the topo-order 1560 // will actually yield a layout with large cost. For this reason, we need 1561 // strong biased branch at block S with Prob(S->BB) in order to select 1562 // BB->Succ. This is equivalent to looking the CFG backward with backward 1563 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without 1564 // profile data). 1565 // -------------------------------------------------------------------------- 1566 // Case 3: forked diamond 1567 // S 1568 // / \ 1569 // / \ 1570 // BB Pred 1571 // | \ / | 1572 // | \ / | 1573 // | X | 1574 // | / \ | 1575 // | / \ | 1576 // S1 S2 1577 // 1578 // The current block is BB and edge BB->S1 is now being evaluated. 1579 // As above S->BB was already selected because 1580 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2). 1581 // 1582 // topo-order: 1583 // 1584 // S-------| ---S 1585 // | | | | 1586 // ---BB | | BB 1587 // | | | | 1588 // | Pred----| | S1---- 1589 // | | | | 1590 // --(S1 or S2) ---Pred-- 1591 // | 1592 // S2 1593 // 1594 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2) 1595 // + min(freq(Pred->S1), freq(Pred->S2)) 1596 // Non-topo-order cost: 1597 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2). 1598 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2)) 1599 // is 0. Then the non topo layout is better when 1600 // freq(S->Pred) < freq(BB->S1). 1601 // This is exactly what is checked below. 1602 // Note there are other shapes that apply (Pred may not be a single block, 1603 // but they all fit this general pattern.) 1604 BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB); 1605 1606 // Make sure that a hot successor doesn't have a globally more 1607 // important predecessor. 1608 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb; 1609 bool BadCFGConflict = false; 1610 1611 for (MachineBasicBlock *Pred : Succ->predecessors()) { 1612 BlockChain *PredChain = BlockToChain[Pred]; 1613 if (Pred == Succ || PredChain == &SuccChain || 1614 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain || 1615 Pred != *std::prev(PredChain->end()) || 1616 // This check is redundant except for look ahead. This function is 1617 // called for lookahead by isProfitableToTailDup when BB hasn't been 1618 // placed yet. 1619 (Pred == BB)) 1620 continue; 1621 // Do backward checking. 1622 // For all cases above, we need a backward checking to filter out edges that 1623 // are not 'strongly' biased. 1624 // BB Pred 1625 // \ / 1626 // Succ 1627 // We select edge BB->Succ if 1628 // freq(BB->Succ) > freq(Succ) * HotProb 1629 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) * 1630 // HotProb 1631 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb 1632 // Case 1 is covered too, because the first equation reduces to: 1633 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle) 1634 BlockFrequency PredEdgeFreq = 1635 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ); 1636 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) { 1637 BadCFGConflict = true; 1638 break; 1639 } 1640 } 1641 1642 if (BadCFGConflict) { 1643 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> " 1644 << SuccProb << " (prob) (non-cold CFG conflict)\n"); 1645 return true; 1646 } 1647 1648 return false; 1649 } 1650 1651 /// Select the best successor for a block. 1652 /// 1653 /// This looks across all successors of a particular block and attempts to 1654 /// select the "best" one to be the layout successor. It only considers direct 1655 /// successors which also pass the block filter. It will attempt to avoid 1656 /// breaking CFG structure, but cave and break such structures in the case of 1657 /// very hot successor edges. 1658 /// 1659 /// \returns The best successor block found, or null if none are viable, along 1660 /// with a boolean indicating if tail duplication is necessary. 1661 MachineBlockPlacement::BlockAndTailDupResult 1662 MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB, 1663 const BlockChain &Chain, 1664 const BlockFilterSet *BlockFilter) { 1665 const BranchProbability HotProb(StaticLikelyProb, 100); 1666 1667 BlockAndTailDupResult BestSucc = {nullptr, false}; 1668 auto BestProb = BranchProbability::getZero(); 1669 1670 SmallVector<MachineBasicBlock *, 4> Successors; 1671 auto AdjustedSumProb = 1672 collectViableSuccessors(BB, Chain, BlockFilter, Successors); 1673 1674 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) 1675 << "\n"); 1676 1677 // if we already precomputed the best successor for BB, return that if still 1678 // applicable. 1679 auto FoundEdge = ComputedEdges.find(BB); 1680 if (FoundEdge != ComputedEdges.end()) { 1681 MachineBasicBlock *Succ = FoundEdge->second.BB; 1682 ComputedEdges.erase(FoundEdge); 1683 BlockChain *SuccChain = BlockToChain[Succ]; 1684 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && 1685 SuccChain != &Chain && Succ == *SuccChain->begin()) 1686 return FoundEdge->second; 1687 } 1688 1689 // if BB is part of a trellis, Use the trellis to determine the optimal 1690 // fallthrough edges 1691 if (isTrellis(BB, Successors, Chain, BlockFilter)) 1692 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain, 1693 BlockFilter); 1694 1695 // For blocks with CFG violations, we may be able to lay them out anyway with 1696 // tail-duplication. We keep this vector so we can perform the probability 1697 // calculations the minimum number of times. 1698 SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4> 1699 DupCandidates; 1700 for (MachineBasicBlock *Succ : Successors) { 1701 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); 1702 BranchProbability SuccProb = 1703 getAdjustedProbability(RealSuccProb, AdjustedSumProb); 1704 1705 BlockChain &SuccChain = *BlockToChain[Succ]; 1706 // Skip the edge \c BB->Succ if block \c Succ has a better layout 1707 // predecessor that yields lower global cost. 1708 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, 1709 Chain, BlockFilter)) { 1710 // If tail duplication would make Succ profitable, place it. 1711 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ)) 1712 DupCandidates.emplace_back(SuccProb, Succ); 1713 continue; 1714 } 1715 1716 LLVM_DEBUG( 1717 dbgs() << " Candidate: " << getBlockName(Succ) 1718 << ", probability: " << SuccProb 1719 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") 1720 << "\n"); 1721 1722 if (BestSucc.BB && BestProb >= SuccProb) { 1723 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n"); 1724 continue; 1725 } 1726 1727 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n"); 1728 BestSucc.BB = Succ; 1729 BestProb = SuccProb; 1730 } 1731 // Handle the tail duplication candidates in order of decreasing probability. 1732 // Stop at the first one that is profitable. Also stop if they are less 1733 // profitable than BestSucc. Position is important because we preserve it and 1734 // prefer first best match. Here we aren't comparing in order, so we capture 1735 // the position instead. 1736 llvm::stable_sort(DupCandidates, 1737 [](std::tuple<BranchProbability, MachineBasicBlock *> L, 1738 std::tuple<BranchProbability, MachineBasicBlock *> R) { 1739 return std::get<0>(L) > std::get<0>(R); 1740 }); 1741 for (auto &Tup : DupCandidates) { 1742 BranchProbability DupProb; 1743 MachineBasicBlock *Succ; 1744 std::tie(DupProb, Succ) = Tup; 1745 if (DupProb < BestProb) 1746 break; 1747 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) && 1748 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { 1749 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ) 1750 << ", probability: " << DupProb 1751 << " (Tail Duplicate)\n"); 1752 BestSucc.BB = Succ; 1753 BestSucc.ShouldTailDup = true; 1754 break; 1755 } 1756 } 1757 1758 if (BestSucc.BB) 1759 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n"); 1760 1761 return BestSucc; 1762 } 1763 1764 /// Select the best block from a worklist. 1765 /// 1766 /// This looks through the provided worklist as a list of candidate basic 1767 /// blocks and select the most profitable one to place. The definition of 1768 /// profitable only really makes sense in the context of a loop. This returns 1769 /// the most frequently visited block in the worklist, which in the case of 1770 /// a loop, is the one most desirable to be physically close to the rest of the 1771 /// loop body in order to improve i-cache behavior. 1772 /// 1773 /// \returns The best block found, or null if none are viable. 1774 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( 1775 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { 1776 // Once we need to walk the worklist looking for a candidate, cleanup the 1777 // worklist of already placed entries. 1778 // FIXME: If this shows up on profiles, it could be folded (at the cost of 1779 // some code complexity) into the loop below. 1780 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) { 1781 return BlockToChain.lookup(BB) == &Chain; 1782 }); 1783 1784 if (WorkList.empty()) 1785 return nullptr; 1786 1787 bool IsEHPad = WorkList[0]->isEHPad(); 1788 1789 MachineBasicBlock *BestBlock = nullptr; 1790 BlockFrequency BestFreq; 1791 for (MachineBasicBlock *MBB : WorkList) { 1792 assert(MBB->isEHPad() == IsEHPad && 1793 "EHPad mismatch between block and work list."); 1794 1795 BlockChain &SuccChain = *BlockToChain[MBB]; 1796 if (&SuccChain == &Chain) 1797 continue; 1798 1799 assert(SuccChain.UnscheduledPredecessors == 0 && 1800 "Found CFG-violating block"); 1801 1802 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB); 1803 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> " 1804 << printBlockFreq(MBFI->getMBFI(), CandidateFreq) 1805 << " (freq)\n"); 1806 1807 // For ehpad, we layout the least probable first as to avoid jumping back 1808 // from least probable landingpads to more probable ones. 1809 // 1810 // FIXME: Using probability is probably (!) not the best way to achieve 1811 // this. We should probably have a more principled approach to layout 1812 // cleanup code. 1813 // 1814 // The goal is to get: 1815 // 1816 // +--------------------------+ 1817 // | V 1818 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume 1819 // 1820 // Rather than: 1821 // 1822 // +-------------------------------------+ 1823 // V | 1824 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup 1825 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq))) 1826 continue; 1827 1828 BestBlock = MBB; 1829 BestFreq = CandidateFreq; 1830 } 1831 1832 return BestBlock; 1833 } 1834 1835 /// Retrieve the first unplaced basic block in the entire function. 1836 /// 1837 /// This routine is called when we are unable to use the CFG to walk through 1838 /// all of the basic blocks and form a chain due to unnatural loops in the CFG. 1839 /// We walk through the function's blocks in order, starting from the 1840 /// LastUnplacedBlockIt. We update this iterator on each call to avoid 1841 /// re-scanning the entire sequence on repeated calls to this routine. 1842 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( 1843 const BlockChain &PlacedChain, 1844 MachineFunction::iterator &PrevUnplacedBlockIt) { 1845 1846 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E; 1847 ++I) { 1848 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) { 1849 PrevUnplacedBlockIt = I; 1850 // Now select the head of the chain to which the unplaced block belongs 1851 // as the block to place. This will force the entire chain to be placed, 1852 // and satisfies the requirements of merging chains. 1853 return *Chain->begin(); 1854 } 1855 } 1856 return nullptr; 1857 } 1858 1859 /// Retrieve the first unplaced basic block among the blocks in BlockFilter. 1860 /// 1861 /// This is similar to getFirstUnplacedBlock for the entire function, but since 1862 /// the size of BlockFilter is typically far less than the number of blocks in 1863 /// the entire function, iterating through the BlockFilter is more efficient. 1864 /// When processing the entire funciton, using the version without BlockFilter 1865 /// has a complexity of #(loops in function) * #(blocks in function), while this 1866 /// version has a complexity of sum(#(loops in block) foreach block in function) 1867 /// which is always smaller. For long function mostly sequential in structure, 1868 /// the complexity is amortized to 1 * #(blocks in function). 1869 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( 1870 const BlockChain &PlacedChain, 1871 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, 1872 const BlockFilterSet *BlockFilter) { 1873 assert(BlockFilter); 1874 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end(); 1875 ++PrevUnplacedBlockInFilterIt) { 1876 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt]; 1877 if (C != &PlacedChain) { 1878 return *C->begin(); 1879 } 1880 } 1881 return nullptr; 1882 } 1883 1884 void MachineBlockPlacement::fillWorkLists( 1885 const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, 1886 const BlockFilterSet *BlockFilter = nullptr) { 1887 BlockChain &Chain = *BlockToChain[MBB]; 1888 if (!UpdatedPreds.insert(&Chain).second) 1889 return; 1890 1891 assert( 1892 Chain.UnscheduledPredecessors == 0 && 1893 "Attempting to place block with unscheduled predecessors in worklist."); 1894 for (MachineBasicBlock *ChainBB : Chain) { 1895 assert(BlockToChain[ChainBB] == &Chain && 1896 "Block in chain doesn't match BlockToChain map."); 1897 for (MachineBasicBlock *Pred : ChainBB->predecessors()) { 1898 if (BlockFilter && !BlockFilter->count(Pred)) 1899 continue; 1900 if (BlockToChain[Pred] == &Chain) 1901 continue; 1902 ++Chain.UnscheduledPredecessors; 1903 } 1904 } 1905 1906 if (Chain.UnscheduledPredecessors != 0) 1907 return; 1908 1909 MachineBasicBlock *BB = *Chain.begin(); 1910 if (BB->isEHPad()) 1911 EHPadWorkList.push_back(BB); 1912 else 1913 BlockWorkList.push_back(BB); 1914 } 1915 1916 void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB, 1917 BlockChain &Chain, 1918 BlockFilterSet *BlockFilter) { 1919 assert(HeadBB && "BB must not be null.\n"); 1920 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); 1921 MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); 1922 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt; 1923 if (BlockFilter) 1924 PrevUnplacedBlockInFilterIt = BlockFilter->begin(); 1925 1926 const MachineBasicBlock *LoopHeaderBB = HeadBB; 1927 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); 1928 MachineBasicBlock *BB = *std::prev(Chain.end()); 1929 while (true) { 1930 assert(BB && "null block found at end of chain in loop."); 1931 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); 1932 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain."); 1933 1934 // Look for the best viable successor if there is one to place immediately 1935 // after this block. 1936 auto Result = selectBestSuccessor(BB, Chain, BlockFilter); 1937 MachineBasicBlock *BestSucc = Result.BB; 1938 bool ShouldTailDup = Result.ShouldTailDup; 1939 if (allowTailDupPlacement(*F)) 1940 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds( 1941 BB, BestSucc, Chain, BlockFilter)); 1942 1943 // If an immediate successor isn't available, look for the best viable 1944 // block among those we've identified as not violating the loop's CFG at 1945 // this point. This won't be a fallthrough, but it will increase locality. 1946 if (!BestSucc) 1947 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList); 1948 if (!BestSucc) 1949 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList); 1950 1951 if (!BestSucc) { 1952 if (BlockFilter) 1953 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt, 1954 BlockFilter); 1955 else 1956 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt); 1957 if (!BestSucc) 1958 break; 1959 1960 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the " 1961 "layout successor until the CFG reduces\n"); 1962 } 1963 1964 // Placement may have changed tail duplication opportunities. 1965 // Check for that now. 1966 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) { 1967 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, 1968 BlockFilter, PrevUnplacedBlockIt, 1969 PrevUnplacedBlockInFilterIt); 1970 // If the chosen successor was duplicated into BB, don't bother laying 1971 // it out, just go round the loop again with BB as the chain end. 1972 if (!BB->isSuccessor(BestSucc)) 1973 continue; 1974 } 1975 1976 // Place this block, updating the datastructures to reflect its placement. 1977 BlockChain &SuccChain = *BlockToChain[BestSucc]; 1978 // Zero out UnscheduledPredecessors for the successor we're about to merge 1979 // in case we selected a successor that didn't fit naturally into the CFG. 1980 SuccChain.UnscheduledPredecessors = 0; 1981 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " 1982 << getBlockName(BestSucc) << "\n"); 1983 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter); 1984 Chain.merge(BestSucc, &SuccChain); 1985 BB = *std::prev(Chain.end()); 1986 } 1987 1988 LLVM_DEBUG(dbgs() << "Finished forming chain for header block " 1989 << getBlockName(*Chain.begin()) << "\n"); 1990 } 1991 1992 // If bottom of block BB has only one successor OldTop, in most cases it is 1993 // profitable to move it before OldTop, except the following case: 1994 // 1995 // -->OldTop<- 1996 // | . | 1997 // | . | 1998 // | . | 1999 // ---Pred | 2000 // | | 2001 // BB----- 2002 // 2003 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't 2004 // layout the other successor below it, so it can't reduce taken branch. 2005 // In this case we keep its original layout. 2006 bool MachineBlockPlacement::canMoveBottomBlockToTop( 2007 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) { 2008 if (BottomBlock->pred_size() != 1) 2009 return true; 2010 MachineBasicBlock *Pred = *BottomBlock->pred_begin(); 2011 if (Pred->succ_size() != 2) 2012 return true; 2013 2014 MachineBasicBlock *OtherBB = *Pred->succ_begin(); 2015 if (OtherBB == BottomBlock) 2016 OtherBB = *Pred->succ_rbegin(); 2017 if (OtherBB == OldTop) 2018 return false; 2019 2020 return true; 2021 } 2022 2023 // Find out the possible fall through frequence to the top of a loop. 2024 BlockFrequency 2025 MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top, 2026 const BlockFilterSet &LoopBlockSet) { 2027 BlockFrequency MaxFreq = BlockFrequency(0); 2028 for (MachineBasicBlock *Pred : Top->predecessors()) { 2029 BlockChain *PredChain = BlockToChain[Pred]; 2030 if (!LoopBlockSet.count(Pred) && 2031 (!PredChain || Pred == *std::prev(PredChain->end()))) { 2032 // Found a Pred block can be placed before Top. 2033 // Check if Top is the best successor of Pred. 2034 auto TopProb = MBPI->getEdgeProbability(Pred, Top); 2035 bool TopOK = true; 2036 for (MachineBasicBlock *Succ : Pred->successors()) { 2037 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ); 2038 BlockChain *SuccChain = BlockToChain[Succ]; 2039 // Check if Succ can be placed after Pred. 2040 // Succ should not be in any chain, or it is the head of some chain. 2041 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) && 2042 (!SuccChain || Succ == *SuccChain->begin())) { 2043 TopOK = false; 2044 break; 2045 } 2046 } 2047 if (TopOK) { 2048 BlockFrequency EdgeFreq = 2049 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top); 2050 if (EdgeFreq > MaxFreq) 2051 MaxFreq = EdgeFreq; 2052 } 2053 } 2054 } 2055 return MaxFreq; 2056 } 2057 2058 // Compute the fall through gains when move NewTop before OldTop. 2059 // 2060 // In following diagram, edges marked as "-" are reduced fallthrough, edges 2061 // marked as "+" are increased fallthrough, this function computes 2062 // 2063 // SUM(increased fallthrough) - SUM(decreased fallthrough) 2064 // 2065 // | 2066 // | - 2067 // V 2068 // --->OldTop 2069 // | . 2070 // | . 2071 // +| . + 2072 // | Pred ---> 2073 // | |- 2074 // | V 2075 // --- NewTop <--- 2076 // |- 2077 // V 2078 // 2079 BlockFrequency MachineBlockPlacement::FallThroughGains( 2080 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop, 2081 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) { 2082 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet); 2083 BlockFrequency FallThrough2Exit = BlockFrequency(0); 2084 if (ExitBB) 2085 FallThrough2Exit = 2086 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB); 2087 BlockFrequency BackEdgeFreq = 2088 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop); 2089 2090 // Find the best Pred of NewTop. 2091 MachineBasicBlock *BestPred = nullptr; 2092 BlockFrequency FallThroughFromPred = BlockFrequency(0); 2093 for (MachineBasicBlock *Pred : NewTop->predecessors()) { 2094 if (!LoopBlockSet.count(Pred)) 2095 continue; 2096 BlockChain *PredChain = BlockToChain[Pred]; 2097 if (!PredChain || Pred == *std::prev(PredChain->end())) { 2098 BlockFrequency EdgeFreq = 2099 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop); 2100 if (EdgeFreq > FallThroughFromPred) { 2101 FallThroughFromPred = EdgeFreq; 2102 BestPred = Pred; 2103 } 2104 } 2105 } 2106 2107 // If NewTop is not placed after Pred, another successor can be placed 2108 // after Pred. 2109 BlockFrequency NewFreq = BlockFrequency(0); 2110 if (BestPred) { 2111 for (MachineBasicBlock *Succ : BestPred->successors()) { 2112 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ)) 2113 continue; 2114 if (ComputedEdges.contains(Succ)) 2115 continue; 2116 BlockChain *SuccChain = BlockToChain[Succ]; 2117 if ((SuccChain && (Succ != *SuccChain->begin())) || 2118 (SuccChain == BlockToChain[BestPred])) 2119 continue; 2120 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) * 2121 MBPI->getEdgeProbability(BestPred, Succ); 2122 if (EdgeFreq > NewFreq) 2123 NewFreq = EdgeFreq; 2124 } 2125 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) * 2126 MBPI->getEdgeProbability(BestPred, NewTop); 2127 if (NewFreq > OrigEdgeFreq) { 2128 // If NewTop is not the best successor of Pred, then Pred doesn't 2129 // fallthrough to NewTop. So there is no FallThroughFromPred and 2130 // NewFreq. 2131 NewFreq = BlockFrequency(0); 2132 FallThroughFromPred = BlockFrequency(0); 2133 } 2134 } 2135 2136 BlockFrequency Result = BlockFrequency(0); 2137 BlockFrequency Gains = BackEdgeFreq + NewFreq; 2138 BlockFrequency Lost = 2139 FallThrough2Top + FallThrough2Exit + FallThroughFromPred; 2140 if (Gains > Lost) 2141 Result = Gains - Lost; 2142 return Result; 2143 } 2144 2145 /// Helper function of findBestLoopTop. Find the best loop top block 2146 /// from predecessors of old top. 2147 /// 2148 /// Look for a block which is strictly better than the old top for laying 2149 /// out before the old top of the loop. This looks for only two patterns: 2150 /// 2151 /// 1. a block has only one successor, the old loop top 2152 /// 2153 /// Because such a block will always result in an unconditional jump, 2154 /// rotating it in front of the old top is always profitable. 2155 /// 2156 /// 2. a block has two successors, one is old top, another is exit 2157 /// and it has more than one predecessors 2158 /// 2159 /// If it is below one of its predecessors P, only P can fall through to 2160 /// it, all other predecessors need a jump to it, and another conditional 2161 /// jump to loop header. If it is moved before loop header, all its 2162 /// predecessors jump to it, then fall through to loop header. So all its 2163 /// predecessors except P can reduce one taken branch. 2164 /// At the same time, move it before old top increases the taken branch 2165 /// to loop exit block, so the reduced taken branch will be compared with 2166 /// the increased taken branch to the loop exit block. 2167 MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper( 2168 MachineBasicBlock *OldTop, const MachineLoop &L, 2169 const BlockFilterSet &LoopBlockSet) { 2170 // Check that the header hasn't been fused with a preheader block due to 2171 // crazy branches. If it has, we need to start with the header at the top to 2172 // prevent pulling the preheader into the loop body. 2173 BlockChain &HeaderChain = *BlockToChain[OldTop]; 2174 if (!LoopBlockSet.count(*HeaderChain.begin())) 2175 return OldTop; 2176 if (OldTop != *HeaderChain.begin()) 2177 return OldTop; 2178 2179 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop) 2180 << "\n"); 2181 2182 BlockFrequency BestGains = BlockFrequency(0); 2183 MachineBasicBlock *BestPred = nullptr; 2184 for (MachineBasicBlock *Pred : OldTop->predecessors()) { 2185 if (!LoopBlockSet.count(Pred)) 2186 continue; 2187 if (Pred == L.getHeader()) 2188 continue; 2189 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has " 2190 << Pred->succ_size() << " successors, " 2191 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n"); 2192 if (Pred->succ_size() > 2) 2193 continue; 2194 2195 MachineBasicBlock *OtherBB = nullptr; 2196 if (Pred->succ_size() == 2) { 2197 OtherBB = *Pred->succ_begin(); 2198 if (OtherBB == OldTop) 2199 OtherBB = *Pred->succ_rbegin(); 2200 } 2201 2202 if (!canMoveBottomBlockToTop(Pred, OldTop)) 2203 continue; 2204 2205 BlockFrequency Gains = 2206 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet); 2207 if ((Gains > BlockFrequency(0)) && 2208 (Gains > BestGains || 2209 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) { 2210 BestPred = Pred; 2211 BestGains = Gains; 2212 } 2213 } 2214 2215 // If no direct predecessor is fine, just use the loop header. 2216 if (!BestPred) { 2217 LLVM_DEBUG(dbgs() << " final top unchanged\n"); 2218 return OldTop; 2219 } 2220 2221 // Walk backwards through any straight line of predecessors. 2222 while (BestPred->pred_size() == 1 && 2223 (*BestPred->pred_begin())->succ_size() == 1 && 2224 *BestPred->pred_begin() != L.getHeader()) 2225 BestPred = *BestPred->pred_begin(); 2226 2227 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n"); 2228 return BestPred; 2229 } 2230 2231 /// Find the best loop top block for layout. 2232 /// 2233 /// This function iteratively calls findBestLoopTopHelper, until no new better 2234 /// BB can be found. 2235 MachineBasicBlock * 2236 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, 2237 const BlockFilterSet &LoopBlockSet) { 2238 // Placing the latch block before the header may introduce an extra branch 2239 // that skips this block the first time the loop is executed, which we want 2240 // to avoid when optimising for size. 2241 // FIXME: in theory there is a case that does not introduce a new branch, 2242 // i.e. when the layout predecessor does not fallthrough to the loop header. 2243 // In practice this never happens though: there always seems to be a preheader 2244 // that can fallthrough and that is also placed before the header. 2245 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get())) 2246 return L.getHeader(); 2247 2248 MachineBasicBlock *OldTop = nullptr; 2249 MachineBasicBlock *NewTop = L.getHeader(); 2250 while (NewTop != OldTop) { 2251 OldTop = NewTop; 2252 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet); 2253 if (NewTop != OldTop) 2254 ComputedEdges[NewTop] = {OldTop, false}; 2255 } 2256 return NewTop; 2257 } 2258 2259 /// Find the best loop exiting block for layout. 2260 /// 2261 /// This routine implements the logic to analyze the loop looking for the best 2262 /// block to layout at the top of the loop. Typically this is done to maximize 2263 /// fallthrough opportunities. 2264 MachineBasicBlock * 2265 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, 2266 const BlockFilterSet &LoopBlockSet, 2267 BlockFrequency &ExitFreq) { 2268 // We don't want to layout the loop linearly in all cases. If the loop header 2269 // is just a normal basic block in the loop, we want to look for what block 2270 // within the loop is the best one to layout at the top. However, if the loop 2271 // header has be pre-merged into a chain due to predecessors not having 2272 // analyzable branches, *and* the predecessor it is merged with is *not* part 2273 // of the loop, rotating the header into the middle of the loop will create 2274 // a non-contiguous range of blocks which is Very Bad. So start with the 2275 // header and only rotate if safe. 2276 BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; 2277 if (!LoopBlockSet.count(*HeaderChain.begin())) 2278 return nullptr; 2279 2280 BlockFrequency BestExitEdgeFreq; 2281 unsigned BestExitLoopDepth = 0; 2282 MachineBasicBlock *ExitingBB = nullptr; 2283 // If there are exits to outer loops, loop rotation can severely limit 2284 // fallthrough opportunities unless it selects such an exit. Keep a set of 2285 // blocks where rotating to exit with that block will reach an outer loop. 2286 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop; 2287 2288 LLVM_DEBUG(dbgs() << "Finding best loop exit for: " 2289 << getBlockName(L.getHeader()) << "\n"); 2290 for (MachineBasicBlock *MBB : L.getBlocks()) { 2291 BlockChain &Chain = *BlockToChain[MBB]; 2292 // Ensure that this block is at the end of a chain; otherwise it could be 2293 // mid-way through an inner loop or a successor of an unanalyzable branch. 2294 if (MBB != *std::prev(Chain.end())) 2295 continue; 2296 2297 // Now walk the successors. We need to establish whether this has a viable 2298 // exiting successor and whether it has a viable non-exiting successor. 2299 // We store the old exiting state and restore it if a viable looping 2300 // successor isn't found. 2301 MachineBasicBlock *OldExitingBB = ExitingBB; 2302 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; 2303 bool HasLoopingSucc = false; 2304 for (MachineBasicBlock *Succ : MBB->successors()) { 2305 if (Succ->isEHPad()) 2306 continue; 2307 if (Succ == MBB) 2308 continue; 2309 BlockChain &SuccChain = *BlockToChain[Succ]; 2310 // Don't split chains, either this chain or the successor's chain. 2311 if (&Chain == &SuccChain) { 2312 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " 2313 << getBlockName(Succ) << " (chain conflict)\n"); 2314 continue; 2315 } 2316 2317 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); 2318 if (LoopBlockSet.count(Succ)) { 2319 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " 2320 << getBlockName(Succ) << " (" << SuccProb << ")\n"); 2321 HasLoopingSucc = true; 2322 continue; 2323 } 2324 2325 unsigned SuccLoopDepth = 0; 2326 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) { 2327 SuccLoopDepth = ExitLoop->getLoopDepth(); 2328 if (ExitLoop->contains(&L)) 2329 BlocksExitingToOuterLoop.insert(MBB); 2330 } 2331 2332 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; 2333 LLVM_DEBUG( 2334 dbgs() << " exiting: " << getBlockName(MBB) << " -> " 2335 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (" 2336 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n"); 2337 // Note that we bias this toward an existing layout successor to retain 2338 // incoming order in the absence of better information. The exit must have 2339 // a frequency higher than the current exit before we consider breaking 2340 // the layout. 2341 BranchProbability Bias(100 - ExitBlockBias, 100); 2342 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth || 2343 ExitEdgeFreq > BestExitEdgeFreq || 2344 (MBB->isLayoutSuccessor(Succ) && 2345 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) { 2346 BestExitEdgeFreq = ExitEdgeFreq; 2347 ExitingBB = MBB; 2348 } 2349 } 2350 2351 if (!HasLoopingSucc) { 2352 // Restore the old exiting state, no viable looping successor was found. 2353 ExitingBB = OldExitingBB; 2354 BestExitEdgeFreq = OldBestExitEdgeFreq; 2355 } 2356 } 2357 // Without a candidate exiting block or with only a single block in the 2358 // loop, just use the loop header to layout the loop. 2359 if (!ExitingBB) { 2360 LLVM_DEBUG( 2361 dbgs() << " No other candidate exit blocks, using loop header\n"); 2362 return nullptr; 2363 } 2364 if (L.getNumBlocks() == 1) { 2365 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n"); 2366 return nullptr; 2367 } 2368 2369 // Also, if we have exit blocks which lead to outer loops but didn't select 2370 // one of them as the exiting block we are rotating toward, disable loop 2371 // rotation altogether. 2372 if (!BlocksExitingToOuterLoop.empty() && 2373 !BlocksExitingToOuterLoop.count(ExitingBB)) 2374 return nullptr; 2375 2376 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) 2377 << "\n"); 2378 ExitFreq = BestExitEdgeFreq; 2379 return ExitingBB; 2380 } 2381 2382 /// Check if there is a fallthrough to loop header Top. 2383 /// 2384 /// 1. Look for a Pred that can be layout before Top. 2385 /// 2. Check if Top is the most possible successor of Pred. 2386 bool MachineBlockPlacement::hasViableTopFallthrough( 2387 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) { 2388 for (MachineBasicBlock *Pred : Top->predecessors()) { 2389 BlockChain *PredChain = BlockToChain[Pred]; 2390 if (!LoopBlockSet.count(Pred) && 2391 (!PredChain || Pred == *std::prev(PredChain->end()))) { 2392 // Found a Pred block can be placed before Top. 2393 // Check if Top is the best successor of Pred. 2394 auto TopProb = MBPI->getEdgeProbability(Pred, Top); 2395 bool TopOK = true; 2396 for (MachineBasicBlock *Succ : Pred->successors()) { 2397 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ); 2398 BlockChain *SuccChain = BlockToChain[Succ]; 2399 // Check if Succ can be placed after Pred. 2400 // Succ should not be in any chain, or it is the head of some chain. 2401 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) { 2402 TopOK = false; 2403 break; 2404 } 2405 } 2406 if (TopOK) 2407 return true; 2408 } 2409 } 2410 return false; 2411 } 2412 2413 /// Attempt to rotate an exiting block to the bottom of the loop. 2414 /// 2415 /// Once we have built a chain, try to rotate it to line up the hot exit block 2416 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary 2417 /// branches. For example, if the loop has fallthrough into its header and out 2418 /// of its bottom already, don't rotate it. 2419 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, 2420 const MachineBasicBlock *ExitingBB, 2421 BlockFrequency ExitFreq, 2422 const BlockFilterSet &LoopBlockSet) { 2423 if (!ExitingBB) 2424 return; 2425 2426 MachineBasicBlock *Top = *LoopChain.begin(); 2427 MachineBasicBlock *Bottom = *std::prev(LoopChain.end()); 2428 2429 // If ExitingBB is already the last one in a chain then nothing to do. 2430 if (Bottom == ExitingBB) 2431 return; 2432 2433 // The entry block should always be the first BB in a function. 2434 if (Top->isEntryBlock()) 2435 return; 2436 2437 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet); 2438 2439 // If the header has viable fallthrough, check whether the current loop 2440 // bottom is a viable exiting block. If so, bail out as rotating will 2441 // introduce an unnecessary branch. 2442 if (ViableTopFallthrough) { 2443 for (MachineBasicBlock *Succ : Bottom->successors()) { 2444 BlockChain *SuccChain = BlockToChain[Succ]; 2445 if (!LoopBlockSet.count(Succ) && 2446 (!SuccChain || Succ == *SuccChain->begin())) 2447 return; 2448 } 2449 2450 // Rotate will destroy the top fallthrough, we need to ensure the new exit 2451 // frequency is larger than top fallthrough. 2452 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet); 2453 if (FallThrough2Top >= ExitFreq) 2454 return; 2455 } 2456 2457 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB); 2458 if (ExitIt == LoopChain.end()) 2459 return; 2460 2461 // Rotating a loop exit to the bottom when there is a fallthrough to top 2462 // trades the entry fallthrough for an exit fallthrough. 2463 // If there is no bottom->top edge, but the chosen exit block does have 2464 // a fallthrough, we break that fallthrough for nothing in return. 2465 2466 // Let's consider an example. We have a built chain of basic blocks 2467 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block. 2468 // By doing a rotation we get 2469 // Bk+1, ..., Bn, B1, ..., Bk 2470 // Break of fallthrough to B1 is compensated by a fallthrough from Bk. 2471 // If we had a fallthrough Bk -> Bk+1 it is broken now. 2472 // It might be compensated by fallthrough Bn -> B1. 2473 // So we have a condition to avoid creation of extra branch by loop rotation. 2474 // All below must be true to avoid loop rotation: 2475 // If there is a fallthrough to top (B1) 2476 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1) 2477 // There is no fallthrough from bottom (Bn) to top (B1). 2478 // Please note that there is no exit fallthrough from Bn because we checked it 2479 // above. 2480 if (ViableTopFallthrough) { 2481 assert(std::next(ExitIt) != LoopChain.end() && 2482 "Exit should not be last BB"); 2483 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt); 2484 if (ExitingBB->isSuccessor(NextBlockInChain)) 2485 if (!Bottom->isSuccessor(Top)) 2486 return; 2487 } 2488 2489 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB) 2490 << " at bottom\n"); 2491 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); 2492 } 2493 2494 /// Attempt to rotate a loop based on profile data to reduce branch cost. 2495 /// 2496 /// With profile data, we can determine the cost in terms of missed fall through 2497 /// opportunities when rotating a loop chain and select the best rotation. 2498 /// Basically, there are three kinds of cost to consider for each rotation: 2499 /// 1. The possibly missed fall through edge (if it exists) from BB out of 2500 /// the loop to the loop header. 2501 /// 2. The possibly missed fall through edges (if they exist) from the loop 2502 /// exits to BB out of the loop. 2503 /// 3. The missed fall through edge (if it exists) from the last BB to the 2504 /// first BB in the loop chain. 2505 /// Therefore, the cost for a given rotation is the sum of costs listed above. 2506 /// We select the best rotation with the smallest cost. 2507 void MachineBlockPlacement::rotateLoopWithProfile( 2508 BlockChain &LoopChain, const MachineLoop &L, 2509 const BlockFilterSet &LoopBlockSet) { 2510 auto RotationPos = LoopChain.end(); 2511 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin(); 2512 2513 // The entry block should always be the first BB in a function. 2514 if (ChainHeaderBB->isEntryBlock()) 2515 return; 2516 2517 BlockFrequency SmallestRotationCost = BlockFrequency::max(); 2518 2519 // A utility lambda that scales up a block frequency by dividing it by a 2520 // branch probability which is the reciprocal of the scale. 2521 auto ScaleBlockFrequency = [](BlockFrequency Freq, 2522 unsigned Scale) -> BlockFrequency { 2523 if (Scale == 0) 2524 return BlockFrequency(0); 2525 // Use operator / between BlockFrequency and BranchProbability to implement 2526 // saturating multiplication. 2527 return Freq / BranchProbability(1, Scale); 2528 }; 2529 2530 // Compute the cost of the missed fall-through edge to the loop header if the 2531 // chain head is not the loop header. As we only consider natural loops with 2532 // single header, this computation can be done only once. 2533 BlockFrequency HeaderFallThroughCost(0); 2534 for (auto *Pred : ChainHeaderBB->predecessors()) { 2535 BlockChain *PredChain = BlockToChain[Pred]; 2536 if (!LoopBlockSet.count(Pred) && 2537 (!PredChain || Pred == *std::prev(PredChain->end()))) { 2538 auto EdgeFreq = MBFI->getBlockFreq(Pred) * 2539 MBPI->getEdgeProbability(Pred, ChainHeaderBB); 2540 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); 2541 // If the predecessor has only an unconditional jump to the header, we 2542 // need to consider the cost of this jump. 2543 if (Pred->succ_size() == 1) 2544 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); 2545 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); 2546 } 2547 } 2548 2549 // Here we collect all exit blocks in the loop, and for each exit we find out 2550 // its hottest exit edge. For each loop rotation, we define the loop exit cost 2551 // as the sum of frequencies of exit edges we collect here, excluding the exit 2552 // edge from the tail of the loop chain. 2553 SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq; 2554 for (auto *BB : LoopChain) { 2555 auto LargestExitEdgeProb = BranchProbability::getZero(); 2556 for (auto *Succ : BB->successors()) { 2557 BlockChain *SuccChain = BlockToChain[Succ]; 2558 if (!LoopBlockSet.count(Succ) && 2559 (!SuccChain || Succ == *SuccChain->begin())) { 2560 auto SuccProb = MBPI->getEdgeProbability(BB, Succ); 2561 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); 2562 } 2563 } 2564 if (LargestExitEdgeProb > BranchProbability::getZero()) { 2565 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; 2566 ExitsWithFreq.emplace_back(BB, ExitFreq); 2567 } 2568 } 2569 2570 // In this loop we iterate every block in the loop chain and calculate the 2571 // cost assuming the block is the head of the loop chain. When the loop ends, 2572 // we should have found the best candidate as the loop chain's head. 2573 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), 2574 EndIter = LoopChain.end(); 2575 Iter != EndIter; Iter++, TailIter++) { 2576 // TailIter is used to track the tail of the loop chain if the block we are 2577 // checking (pointed by Iter) is the head of the chain. 2578 if (TailIter == LoopChain.end()) 2579 TailIter = LoopChain.begin(); 2580 2581 auto TailBB = *TailIter; 2582 2583 // Calculate the cost by putting this BB to the top. 2584 BlockFrequency Cost = BlockFrequency(0); 2585 2586 // If the current BB is the loop header, we need to take into account the 2587 // cost of the missed fall through edge from outside of the loop to the 2588 // header. 2589 if (Iter != LoopChain.begin()) 2590 Cost += HeaderFallThroughCost; 2591 2592 // Collect the loop exit cost by summing up frequencies of all exit edges 2593 // except the one from the chain tail. 2594 for (auto &ExitWithFreq : ExitsWithFreq) 2595 if (TailBB != ExitWithFreq.first) 2596 Cost += ExitWithFreq.second; 2597 2598 // The cost of breaking the once fall-through edge from the tail to the top 2599 // of the loop chain. Here we need to consider three cases: 2600 // 1. If the tail node has only one successor, then we will get an 2601 // additional jmp instruction. So the cost here is (MisfetchCost + 2602 // JumpInstCost) * tail node frequency. 2603 // 2. If the tail node has two successors, then we may still get an 2604 // additional jmp instruction if the layout successor after the loop 2605 // chain is not its CFG successor. Note that the more frequently executed 2606 // jmp instruction will be put ahead of the other one. Assume the 2607 // frequency of those two branches are x and y, where x is the frequency 2608 // of the edge to the chain head, then the cost will be 2609 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency. 2610 // 3. If the tail node has more than two successors (this rarely happens), 2611 // we won't consider any additional cost. 2612 if (TailBB->isSuccessor(*Iter)) { 2613 auto TailBBFreq = MBFI->getBlockFreq(TailBB); 2614 if (TailBB->succ_size() == 1) 2615 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost); 2616 else if (TailBB->succ_size() == 2) { 2617 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); 2618 auto TailToHeadFreq = TailBBFreq * TailToHeadProb; 2619 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) 2620 ? TailBBFreq * TailToHeadProb.getCompl() 2621 : TailToHeadFreq; 2622 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + 2623 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); 2624 } 2625 } 2626 2627 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making " 2628 << getBlockName(*Iter) << " to the top: " 2629 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n"); 2630 2631 if (Cost < SmallestRotationCost) { 2632 SmallestRotationCost = Cost; 2633 RotationPos = Iter; 2634 } 2635 } 2636 2637 if (RotationPos != LoopChain.end()) { 2638 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos) 2639 << " to the top\n"); 2640 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); 2641 } 2642 } 2643 2644 /// Collect blocks in the given loop that are to be placed. 2645 /// 2646 /// When profile data is available, exclude cold blocks from the returned set; 2647 /// otherwise, collect all blocks in the loop. 2648 MachineBlockPlacement::BlockFilterSet 2649 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { 2650 // Collect the blocks in a set ordered by block number, as this gives the same 2651 // order as they appear in the function. 2652 struct MBBCompare { 2653 bool operator()(const MachineBasicBlock *X, 2654 const MachineBasicBlock *Y) const { 2655 return X->getNumber() < Y->getNumber(); 2656 } 2657 }; 2658 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet; 2659 2660 // Filter cold blocks off from LoopBlockSet when profile data is available. 2661 // Collect the sum of frequencies of incoming edges to the loop header from 2662 // outside. If we treat the loop as a super block, this is the frequency of 2663 // the loop. Then for each block in the loop, we calculate the ratio between 2664 // its frequency and the frequency of the loop block. When it is too small, 2665 // don't add it to the loop chain. If there are outer loops, then this block 2666 // will be merged into the first outer loop chain for which this block is not 2667 // cold anymore. This needs precise profile data and we only do this when 2668 // profile data is available. 2669 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) { 2670 BlockFrequency LoopFreq(0); 2671 for (auto *LoopPred : L.getHeader()->predecessors()) 2672 if (!L.contains(LoopPred)) 2673 LoopFreq += MBFI->getBlockFreq(LoopPred) * 2674 MBPI->getEdgeProbability(LoopPred, L.getHeader()); 2675 2676 for (MachineBasicBlock *LoopBB : L.getBlocks()) { 2677 if (LoopBlockSet.count(LoopBB)) 2678 continue; 2679 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); 2680 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) 2681 continue; 2682 BlockChain *Chain = BlockToChain[LoopBB]; 2683 for (MachineBasicBlock *ChainBB : *Chain) 2684 LoopBlockSet.insert(ChainBB); 2685 } 2686 } else 2687 LoopBlockSet.insert(L.block_begin(), L.block_end()); 2688 2689 // Copy the blocks into a BlockFilterSet, as iterating it is faster than 2690 // std::set. We will only remove blocks and never insert them, which will 2691 // preserve the ordering. 2692 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end()); 2693 return Ret; 2694 } 2695 2696 /// Forms basic block chains from the natural loop structures. 2697 /// 2698 /// These chains are designed to preserve the existing *structure* of the code 2699 /// as much as possible. We can then stitch the chains together in a way which 2700 /// both preserves the topological structure and minimizes taken conditional 2701 /// branches. 2702 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { 2703 // First recurse through any nested loops, building chains for those inner 2704 // loops. 2705 for (const MachineLoop *InnerLoop : L) 2706 buildLoopChains(*InnerLoop); 2707 2708 assert(BlockWorkList.empty() && 2709 "BlockWorkList not empty when starting to build loop chains."); 2710 assert(EHPadWorkList.empty() && 2711 "EHPadWorkList not empty when starting to build loop chains."); 2712 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); 2713 2714 // Check if we have profile data for this function. If yes, we will rotate 2715 // this loop by modeling costs more precisely which requires the profile data 2716 // for better layout. 2717 bool RotateLoopWithProfile = 2718 ForcePreciseRotationCost || 2719 (PreciseRotationCost && F->getFunction().hasProfileData()); 2720 2721 // First check to see if there is an obviously preferable top block for the 2722 // loop. This will default to the header, but may end up as one of the 2723 // predecessors to the header if there is one which will result in strictly 2724 // fewer branches in the loop body. 2725 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); 2726 2727 // If we selected just the header for the loop top, look for a potentially 2728 // profitable exit block in the event that rotating the loop can eliminate 2729 // branches by placing an exit edge at the bottom. 2730 // 2731 // Loops are processed innermost to uttermost, make sure we clear 2732 // PreferredLoopExit before processing a new loop. 2733 PreferredLoopExit = nullptr; 2734 BlockFrequency ExitFreq; 2735 if (!RotateLoopWithProfile && LoopTop == L.getHeader()) 2736 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq); 2737 2738 BlockChain &LoopChain = *BlockToChain[LoopTop]; 2739 2740 // FIXME: This is a really lame way of walking the chains in the loop: we 2741 // walk the blocks, and use a set to prevent visiting a particular chain 2742 // twice. 2743 SmallPtrSet<BlockChain *, 4> UpdatedPreds; 2744 assert(LoopChain.UnscheduledPredecessors == 0 && 2745 "LoopChain should not have unscheduled predecessors."); 2746 UpdatedPreds.insert(&LoopChain); 2747 2748 for (const MachineBasicBlock *LoopBB : LoopBlockSet) 2749 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet); 2750 2751 buildChain(LoopTop, LoopChain, &LoopBlockSet); 2752 2753 if (RotateLoopWithProfile) 2754 rotateLoopWithProfile(LoopChain, L, LoopBlockSet); 2755 else 2756 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet); 2757 2758 LLVM_DEBUG({ 2759 // Crash at the end so we get all of the debugging output first. 2760 bool BadLoop = false; 2761 if (LoopChain.UnscheduledPredecessors) { 2762 BadLoop = true; 2763 dbgs() << "Loop chain contains a block without its preds placed!\n" 2764 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 2765 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"; 2766 } 2767 for (MachineBasicBlock *ChainBB : LoopChain) { 2768 dbgs() << " ... " << getBlockName(ChainBB) << "\n"; 2769 if (!LoopBlockSet.remove(ChainBB)) { 2770 // We don't mark the loop as bad here because there are real situations 2771 // where this can occur. For example, with an unanalyzable fallthrough 2772 // from a loop block to a non-loop block or vice versa. 2773 dbgs() << "Loop chain contains a block not contained by the loop!\n" 2774 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 2775 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" 2776 << " Bad block: " << getBlockName(ChainBB) << "\n"; 2777 } 2778 } 2779 2780 if (!LoopBlockSet.empty()) { 2781 BadLoop = true; 2782 for (const MachineBasicBlock *LoopBB : LoopBlockSet) 2783 dbgs() << "Loop contains blocks never placed into a chain!\n" 2784 << " Loop header: " << getBlockName(*L.block_begin()) << "\n" 2785 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" 2786 << " Bad block: " << getBlockName(LoopBB) << "\n"; 2787 } 2788 assert(!BadLoop && "Detected problems with the placement of this loop."); 2789 }); 2790 2791 BlockWorkList.clear(); 2792 EHPadWorkList.clear(); 2793 } 2794 2795 void MachineBlockPlacement::buildCFGChains() { 2796 // Ensure that every BB in the function has an associated chain to simplify 2797 // the assumptions of the remaining algorithm. 2798 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. 2799 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE; 2800 ++FI) { 2801 MachineBasicBlock *BB = &*FI; 2802 BlockChain *Chain = 2803 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); 2804 // Also, merge any blocks which we cannot reason about and must preserve 2805 // the exact fallthrough behavior for. 2806 while (true) { 2807 Cond.clear(); 2808 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. 2809 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) 2810 break; 2811 2812 MachineFunction::iterator NextFI = std::next(FI); 2813 MachineBasicBlock *NextBB = &*NextFI; 2814 // Ensure that the layout successor is a viable block, as we know that 2815 // fallthrough is a possibility. 2816 assert(NextFI != FE && "Can't fallthrough past the last block."); 2817 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: " 2818 << getBlockName(BB) << " -> " << getBlockName(NextBB) 2819 << "\n"); 2820 Chain->merge(NextBB, nullptr); 2821 #ifndef NDEBUG 2822 BlocksWithUnanalyzableExits.insert(&*BB); 2823 #endif 2824 FI = NextFI; 2825 BB = NextBB; 2826 } 2827 } 2828 2829 // Build any loop-based chains. 2830 PreferredLoopExit = nullptr; 2831 for (MachineLoop *L : *MLI) 2832 buildLoopChains(*L); 2833 2834 assert(BlockWorkList.empty() && 2835 "BlockWorkList should be empty before building final chain."); 2836 assert(EHPadWorkList.empty() && 2837 "EHPadWorkList should be empty before building final chain."); 2838 2839 SmallPtrSet<BlockChain *, 4> UpdatedPreds; 2840 for (MachineBasicBlock &MBB : *F) 2841 fillWorkLists(&MBB, UpdatedPreds); 2842 2843 BlockChain &FunctionChain = *BlockToChain[&F->front()]; 2844 buildChain(&F->front(), FunctionChain); 2845 2846 #ifndef NDEBUG 2847 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>; 2848 #endif 2849 LLVM_DEBUG({ 2850 // Crash at the end so we get all of the debugging output first. 2851 bool BadFunc = false; 2852 FunctionBlockSetType FunctionBlockSet; 2853 for (MachineBasicBlock &MBB : *F) 2854 FunctionBlockSet.insert(&MBB); 2855 2856 for (MachineBasicBlock *ChainBB : FunctionChain) 2857 if (!FunctionBlockSet.erase(ChainBB)) { 2858 BadFunc = true; 2859 dbgs() << "Function chain contains a block not in the function!\n" 2860 << " Bad block: " << getBlockName(ChainBB) << "\n"; 2861 } 2862 2863 if (!FunctionBlockSet.empty()) { 2864 BadFunc = true; 2865 for (MachineBasicBlock *RemainingBB : FunctionBlockSet) 2866 dbgs() << "Function contains blocks never placed into a chain!\n" 2867 << " Bad block: " << getBlockName(RemainingBB) << "\n"; 2868 } 2869 assert(!BadFunc && "Detected problems with the block placement."); 2870 }); 2871 2872 // Remember original layout ordering, so we can update terminators after 2873 // reordering to point to the original layout successor. 2874 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors( 2875 F->getNumBlockIDs()); 2876 { 2877 MachineBasicBlock *LastMBB = nullptr; 2878 for (auto &MBB : *F) { 2879 if (LastMBB != nullptr) 2880 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB; 2881 LastMBB = &MBB; 2882 } 2883 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr; 2884 } 2885 2886 // Splice the blocks into place. 2887 MachineFunction::iterator InsertPos = F->begin(); 2888 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n"); 2889 for (MachineBasicBlock *ChainBB : FunctionChain) { 2890 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain " 2891 : " ... ") 2892 << getBlockName(ChainBB) << "\n"); 2893 if (InsertPos != MachineFunction::iterator(ChainBB)) 2894 F->splice(InsertPos, ChainBB); 2895 else 2896 ++InsertPos; 2897 2898 // Update the terminator of the previous block. 2899 if (ChainBB == *FunctionChain.begin()) 2900 continue; 2901 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB)); 2902 2903 // FIXME: It would be awesome of updateTerminator would just return rather 2904 // than assert when the branch cannot be analyzed in order to remove this 2905 // boiler plate. 2906 Cond.clear(); 2907 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. 2908 2909 #ifndef NDEBUG 2910 if (!BlocksWithUnanalyzableExits.count(PrevBB)) { 2911 // Given the exact block placement we chose, we may actually not _need_ to 2912 // be able to edit PrevBB's terminator sequence, but not being _able_ to 2913 // do that at this point is a bug. 2914 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) || 2915 !PrevBB->canFallThrough()) && 2916 "Unexpected block with un-analyzable fallthrough!"); 2917 Cond.clear(); 2918 TBB = FBB = nullptr; 2919 } 2920 #endif 2921 2922 // The "PrevBB" is not yet updated to reflect current code layout, so, 2923 // o. it may fall-through to a block without explicit "goto" instruction 2924 // before layout, and no longer fall-through it after layout; or 2925 // o. just opposite. 2926 // 2927 // analyzeBranch() may return erroneous value for FBB when these two 2928 // situations take place. For the first scenario FBB is mistakenly set NULL; 2929 // for the 2nd scenario, the FBB, which is expected to be NULL, is 2930 // mistakenly pointing to "*BI". 2931 // Thus, if the future change needs to use FBB before the layout is set, it 2932 // has to correct FBB first by using the code similar to the following: 2933 // 2934 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) { 2935 // PrevBB->updateTerminator(); 2936 // Cond.clear(); 2937 // TBB = FBB = nullptr; 2938 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) { 2939 // // FIXME: This should never take place. 2940 // TBB = FBB = nullptr; 2941 // } 2942 // } 2943 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) { 2944 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); 2945 } 2946 } 2947 2948 // Fixup the last block. 2949 Cond.clear(); 2950 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. 2951 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) { 2952 MachineBasicBlock *PrevBB = &F->back(); 2953 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); 2954 } 2955 2956 BlockWorkList.clear(); 2957 EHPadWorkList.clear(); 2958 } 2959 2960 void MachineBlockPlacement::optimizeBranches() { 2961 BlockChain &FunctionChain = *BlockToChain[&F->front()]; 2962 SmallVector<MachineOperand, 4> Cond; 2963 2964 // Now that all the basic blocks in the chain have the proper layout, 2965 // make a final call to analyzeBranch with AllowModify set. 2966 // Indeed, the target may be able to optimize the branches in a way we 2967 // cannot because all branches may not be analyzable. 2968 // E.g., the target may be able to remove an unconditional branch to 2969 // a fallthrough when it occurs after predicated terminators. 2970 for (MachineBasicBlock *ChainBB : FunctionChain) { 2971 Cond.clear(); 2972 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; 2973 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) 2974 continue; 2975 if (!TBB || !FBB || Cond.empty()) 2976 continue; 2977 // If we are optimizing for size we do not consider the runtime performance. 2978 // Instead, we retain the original branch condition so we have more uniform 2979 // instructions which will benefit ICF. 2980 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get())) 2981 continue; 2982 // If ChainBB has a two-way branch, try to re-order the branches 2983 // such that we branch to the successor with higher probability first. 2984 if (MBPI->getEdgeProbability(ChainBB, TBB) >= 2985 MBPI->getEdgeProbability(ChainBB, FBB)) 2986 continue; 2987 if (TII->reverseBranchCondition(Cond)) 2988 continue; 2989 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: " 2990 << getBlockName(ChainBB) << "\n"); 2991 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB) 2992 << "\n"); 2993 auto Dl = ChainBB->findBranchDebugLoc(); 2994 TII->removeBranch(*ChainBB); 2995 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl); 2996 } 2997 } 2998 2999 void MachineBlockPlacement::alignBlocks() { 3000 // Walk through the backedges of the function now that we have fully laid out 3001 // the basic blocks and align the destination of each backedge. We don't rely 3002 // exclusively on the loop info here so that we can align backedges in 3003 // unnatural CFGs and backedges that were introduced purely because of the 3004 // loop rotations done during this layout pass. 3005 if (!AlignAllBlock && !AlignAllNonFallThruBlocks) { 3006 if (F->getFunction().hasMinSize() || 3007 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize())) 3008 return; 3009 } 3010 3011 BlockChain &FunctionChain = *BlockToChain[&F->front()]; 3012 // Empty chain. 3013 if (FunctionChain.begin() == FunctionChain.end()) 3014 return; 3015 3016 const BranchProbability ColdProb(1, 5); // 20% 3017 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front()); 3018 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; 3019 for (MachineBasicBlock *ChainBB : FunctionChain) { 3020 if (ChainBB == *FunctionChain.begin()) 3021 continue; 3022 3023 // Don't align non-looping basic blocks. These are unlikely to execute 3024 // enough times to matter in practice. Note that we'll still handle 3025 // unnatural CFGs inside of a natural outer loop (the common case) and 3026 // rotated loops. 3027 MachineLoop *L = MLI->getLoopFor(ChainBB); 3028 if (!L) 3029 continue; 3030 3031 const Align TLIAlign = TLI->getPrefLoopAlignment(L); 3032 unsigned MDAlign = 1; 3033 MDNode *LoopID = L->getLoopID(); 3034 if (LoopID) { 3035 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) { 3036 MDNode *MD = dyn_cast<MDNode>(MDO); 3037 if (MD == nullptr) 3038 continue; 3039 MDString *S = dyn_cast<MDString>(MD->getOperand(0)); 3040 if (S == nullptr) 3041 continue; 3042 if (S->getString() == "llvm.loop.align") { 3043 assert(MD->getNumOperands() == 2 && 3044 "per-loop align metadata should have two operands."); 3045 MDAlign = 3046 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); 3047 assert(MDAlign >= 1 && "per-loop align value must be positive."); 3048 } 3049 } 3050 } 3051 3052 // Use max of the TLIAlign and MDAlign 3053 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign)); 3054 if (LoopAlign == 1) 3055 continue; // Don't care about loop alignment. 3056 3057 // If the block is cold relative to the function entry don't waste space 3058 // aligning it. 3059 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB); 3060 if (Freq < WeightedEntryFreq) 3061 continue; 3062 3063 // If the block is cold relative to its loop header, don't align it 3064 // regardless of what edges into the block exist. 3065 MachineBasicBlock *LoopHeader = L->getHeader(); 3066 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader); 3067 if (Freq < (LoopHeaderFreq * ColdProb)) 3068 continue; 3069 3070 // If the global profiles indicates so, don't align it. 3071 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) && 3072 !TLI->alignLoopsWithOptSize()) 3073 continue; 3074 3075 // Check for the existence of a non-layout predecessor which would benefit 3076 // from aligning this block. 3077 MachineBasicBlock *LayoutPred = 3078 &*std::prev(MachineFunction::iterator(ChainBB)); 3079 3080 auto DetermineMaxAlignmentPadding = [&]() { 3081 // Set the maximum bytes allowed to be emitted for alignment. 3082 unsigned MaxBytes; 3083 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0) 3084 MaxBytes = MaxBytesForAlignmentOverride; 3085 else 3086 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB); 3087 ChainBB->setMaxBytesForAlignment(MaxBytes); 3088 }; 3089 3090 // Force alignment if all the predecessors are jumps. We already checked 3091 // that the block isn't cold above. 3092 if (!LayoutPred->isSuccessor(ChainBB)) { 3093 ChainBB->setAlignment(LoopAlign); 3094 DetermineMaxAlignmentPadding(); 3095 continue; 3096 } 3097 3098 // Align this block if the layout predecessor's edge into this block is 3099 // cold relative to the block. When this is true, other predecessors make up 3100 // all of the hot entries into the block and thus alignment is likely to be 3101 // important. 3102 BranchProbability LayoutProb = 3103 MBPI->getEdgeProbability(LayoutPred, ChainBB); 3104 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; 3105 if (LayoutEdgeFreq <= (Freq * ColdProb)) { 3106 ChainBB->setAlignment(LoopAlign); 3107 DetermineMaxAlignmentPadding(); 3108 } 3109 } 3110 3111 const bool HasMaxBytesOverride = 3112 MaxBytesForAlignmentOverride.getNumOccurrences() > 0; 3113 3114 if (AlignAllBlock) 3115 // Align all of the blocks in the function to a specific alignment. 3116 for (MachineBasicBlock &MBB : *F) { 3117 if (HasMaxBytesOverride) 3118 MBB.setAlignment(Align(1ULL << AlignAllBlock), 3119 MaxBytesForAlignmentOverride); 3120 else 3121 MBB.setAlignment(Align(1ULL << AlignAllBlock)); 3122 } 3123 else if (AlignAllNonFallThruBlocks) { 3124 // Align all of the blocks that have no fall-through predecessors to a 3125 // specific alignment. 3126 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) { 3127 auto LayoutPred = std::prev(MBI); 3128 if (!LayoutPred->isSuccessor(&*MBI)) { 3129 if (HasMaxBytesOverride) 3130 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), 3131 MaxBytesForAlignmentOverride); 3132 else 3133 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); 3134 } 3135 } 3136 } 3137 } 3138 3139 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if 3140 /// it was duplicated into its chain predecessor and removed. 3141 /// \p BB - Basic block that may be duplicated. 3142 /// 3143 /// \p LPred - Chosen layout predecessor of \p BB. 3144 /// Updated to be the chain end if LPred is removed. 3145 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. 3146 /// \p BlockFilter - Set of blocks that belong to the loop being laid out. 3147 /// Used to identify which blocks to update predecessor 3148 /// counts. 3149 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was 3150 /// chosen in the given order due to unnatural CFG 3151 /// only needed if \p BB is removed and 3152 /// \p PrevUnplacedBlockIt pointed to \p BB. 3153 /// @return true if \p BB was removed. 3154 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( 3155 MachineBasicBlock *BB, MachineBasicBlock *&LPred, 3156 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, 3157 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, 3158 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) { 3159 bool Removed, DuplicatedToLPred; 3160 bool DuplicatedToOriginalLPred; 3161 Removed = maybeTailDuplicateBlock( 3162 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt, 3163 PrevUnplacedBlockInFilterIt, DuplicatedToLPred); 3164 if (!Removed) 3165 return false; 3166 DuplicatedToOriginalLPred = DuplicatedToLPred; 3167 // Iteratively try to duplicate again. It can happen that a block that is 3168 // duplicated into is still small enough to be duplicated again. 3169 // No need to call markBlockSuccessors in this case, as the blocks being 3170 // duplicated from here on are already scheduled. 3171 while (DuplicatedToLPred && Removed) { 3172 MachineBasicBlock *DupBB, *DupPred; 3173 // The removal callback causes Chain.end() to be updated when a block is 3174 // removed. On the first pass through the loop, the chain end should be the 3175 // same as it was on function entry. On subsequent passes, because we are 3176 // duplicating the block at the end of the chain, if it is removed the 3177 // chain will have shrunk by one block. 3178 BlockChain::iterator ChainEnd = Chain.end(); 3179 DupBB = *(--ChainEnd); 3180 // Now try to duplicate again. 3181 if (ChainEnd == Chain.begin()) 3182 break; 3183 DupPred = *std::prev(ChainEnd); 3184 Removed = maybeTailDuplicateBlock( 3185 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt, 3186 PrevUnplacedBlockInFilterIt, DuplicatedToLPred); 3187 } 3188 // If BB was duplicated into LPred, it is now scheduled. But because it was 3189 // removed, markChainSuccessors won't be called for its chain. Instead we 3190 // call markBlockSuccessors for LPred to achieve the same effect. This must go 3191 // at the end because repeating the tail duplication can increase the number 3192 // of unscheduled predecessors. 3193 LPred = *std::prev(Chain.end()); 3194 if (DuplicatedToOriginalLPred) 3195 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter); 3196 return true; 3197 } 3198 3199 /// Tail duplicate \p BB into (some) predecessors if profitable. 3200 /// \p BB - Basic block that may be duplicated 3201 /// \p LPred - Chosen layout predecessor of \p BB 3202 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. 3203 /// \p BlockFilter - Set of blocks that belong to the loop being laid out. 3204 /// Used to identify which blocks to update predecessor 3205 /// counts. 3206 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was 3207 /// chosen in the given order due to unnatural CFG 3208 /// only needed if \p BB is removed and 3209 /// \p PrevUnplacedBlockIt pointed to \p BB. 3210 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. 3211 /// \return - True if the block was duplicated into all preds and removed. 3212 bool MachineBlockPlacement::maybeTailDuplicateBlock( 3213 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain, 3214 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, 3215 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, 3216 bool &DuplicatedToLPred) { 3217 DuplicatedToLPred = false; 3218 if (!shouldTailDuplicate(BB)) 3219 return false; 3220 3221 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber() 3222 << "\n"); 3223 3224 // This has to be a callback because none of it can be done after 3225 // BB is deleted. 3226 bool Removed = false; 3227 auto RemovalCallback = [&](MachineBasicBlock *RemBB) { 3228 // Signal to outer function 3229 Removed = true; 3230 3231 // Remove from the Chain and Chain Map 3232 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) { 3233 It->second->remove(RemBB); 3234 BlockToChain.erase(It); 3235 } 3236 3237 // Handle the unplaced block iterator 3238 if (&(*PrevUnplacedBlockIt) == RemBB) { 3239 PrevUnplacedBlockIt++; 3240 } 3241 3242 // Handle the Work Lists 3243 if (RemBB->isEHPad()) { 3244 llvm::erase(EHPadWorkList, RemBB); 3245 } else { 3246 llvm::erase(BlockWorkList, RemBB); 3247 } 3248 3249 // Handle the filter set 3250 if (BlockFilter) { 3251 auto It = llvm::find(*BlockFilter, RemBB); 3252 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt 3253 // pointing to the same element as before. 3254 if (It != BlockFilter->end()) { 3255 if (It < PrevUnplacedBlockInFilterIt) { 3256 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt; 3257 // BlockFilter is a SmallVector so all elements after RemBB are 3258 // shifted to the front by 1 after its deletion. 3259 auto Distance = PrevUnplacedBlockInFilterIt - It - 1; 3260 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance; 3261 assert(*PrevUnplacedBlockInFilterIt == PrevBB); 3262 (void)PrevBB; 3263 } else if (It == PrevUnplacedBlockInFilterIt) 3264 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we 3265 // have to set it to the next element. 3266 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It); 3267 else 3268 BlockFilter->erase(It); 3269 } 3270 } 3271 3272 // Remove the block from loop info. 3273 MLI->removeBlock(RemBB); 3274 if (RemBB == PreferredLoopExit) 3275 PreferredLoopExit = nullptr; 3276 3277 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB) 3278 << "\n"); 3279 }; 3280 auto RemovalCallbackRef = 3281 function_ref<void(MachineBasicBlock *)>(RemovalCallback); 3282 3283 SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; 3284 bool IsSimple = TailDup.isSimpleBB(BB); 3285 SmallVector<MachineBasicBlock *, 8> CandidatePreds; 3286 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr; 3287 if (F->getFunction().hasProfileData()) { 3288 // We can do partial duplication with precise profile information. 3289 findDuplicateCandidates(CandidatePreds, BB, BlockFilter); 3290 if (CandidatePreds.size() == 0) 3291 return false; 3292 if (CandidatePreds.size() < BB->pred_size()) 3293 CandidatePtr = &CandidatePreds; 3294 } 3295 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, 3296 &RemovalCallbackRef, CandidatePtr); 3297 3298 // Update UnscheduledPredecessors to reflect tail-duplication. 3299 DuplicatedToLPred = false; 3300 for (MachineBasicBlock *Pred : DuplicatedPreds) { 3301 // We're only looking for unscheduled predecessors that match the filter. 3302 BlockChain *PredChain = BlockToChain[Pred]; 3303 if (Pred == LPred) 3304 DuplicatedToLPred = true; 3305 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) || 3306 PredChain == &Chain) 3307 continue; 3308 for (MachineBasicBlock *NewSucc : Pred->successors()) { 3309 if (BlockFilter && !BlockFilter->count(NewSucc)) 3310 continue; 3311 BlockChain *NewChain = BlockToChain[NewSucc]; 3312 if (NewChain != &Chain && NewChain != PredChain) 3313 NewChain->UnscheduledPredecessors++; 3314 } 3315 } 3316 return Removed; 3317 } 3318 3319 // Count the number of actual machine instructions. 3320 static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { 3321 uint64_t InstrCount = 0; 3322 for (MachineInstr &MI : *MBB) { 3323 if (!MI.isPHI() && !MI.isMetaInstruction()) 3324 InstrCount += 1; 3325 } 3326 return InstrCount; 3327 } 3328 3329 // The size cost of duplication is the instruction size of the duplicated block. 3330 // So we should scale the threshold accordingly. But the instruction size is not 3331 // available on all targets, so we use the number of instructions instead. 3332 BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) { 3333 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB)); 3334 } 3335 3336 // Returns true if BB is Pred's best successor. 3337 bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, 3338 MachineBasicBlock *Pred, 3339 BlockFilterSet *BlockFilter) { 3340 if (BB == Pred) 3341 return false; 3342 if (BlockFilter && !BlockFilter->count(Pred)) 3343 return false; 3344 BlockChain *PredChain = BlockToChain[Pred]; 3345 if (PredChain && (Pred != *std::prev(PredChain->end()))) 3346 return false; 3347 3348 // Find the successor with largest probability excluding BB. 3349 BranchProbability BestProb = BranchProbability::getZero(); 3350 for (MachineBasicBlock *Succ : Pred->successors()) 3351 if (Succ != BB) { 3352 if (BlockFilter && !BlockFilter->count(Succ)) 3353 continue; 3354 BlockChain *SuccChain = BlockToChain[Succ]; 3355 if (SuccChain && (Succ != *SuccChain->begin())) 3356 continue; 3357 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ); 3358 if (SuccProb > BestProb) 3359 BestProb = SuccProb; 3360 } 3361 3362 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB); 3363 if (BBProb <= BestProb) 3364 return false; 3365 3366 // Compute the number of reduced taken branches if Pred falls through to BB 3367 // instead of another successor. Then compare it with threshold. 3368 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred); 3369 BlockFrequency Gain = PredFreq * (BBProb - BestProb); 3370 return Gain > scaleThreshold(BB); 3371 } 3372 3373 // Find out the predecessors of BB and BB can be beneficially duplicated into 3374 // them. 3375 void MachineBlockPlacement::findDuplicateCandidates( 3376 SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB, 3377 BlockFilterSet *BlockFilter) { 3378 MachineBasicBlock *Fallthrough = nullptr; 3379 BranchProbability DefaultBranchProb = BranchProbability::getZero(); 3380 BlockFrequency BBDupThreshold(scaleThreshold(BB)); 3381 SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors()); 3382 SmallVector<MachineBasicBlock *, 8> Succs(BB->successors()); 3383 3384 // Sort for highest frequency. 3385 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { 3386 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B); 3387 }; 3388 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) { 3389 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); 3390 }; 3391 llvm::stable_sort(Succs, CmpSucc); 3392 llvm::stable_sort(Preds, CmpPred); 3393 3394 auto SuccIt = Succs.begin(); 3395 if (SuccIt != Succs.end()) { 3396 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl(); 3397 } 3398 3399 // For each predecessors of BB, compute the benefit of duplicating BB, 3400 // if it is larger than the threshold, add it into Candidates. 3401 // 3402 // If we have following control flow. 3403 // 3404 // PB1 PB2 PB3 PB4 3405 // \ | / /\ 3406 // \ | / / \ 3407 // \ |/ / \ 3408 // BB----/ OB 3409 // /\ 3410 // / \ 3411 // SB1 SB2 3412 // 3413 // And it can be partially duplicated as 3414 // 3415 // PB2+BB 3416 // | PB1 PB3 PB4 3417 // | | / /\ 3418 // | | / / \ 3419 // | |/ / \ 3420 // | BB----/ OB 3421 // |\ /| 3422 // | X | 3423 // |/ \| 3424 // SB2 SB1 3425 // 3426 // The benefit of duplicating into a predecessor is defined as 3427 // Orig_taken_branch - Duplicated_taken_branch 3428 // 3429 // The Orig_taken_branch is computed with the assumption that predecessor 3430 // jumps to BB and the most possible successor is laid out after BB. 3431 // 3432 // The Duplicated_taken_branch is computed with the assumption that BB is 3433 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and 3434 // SB2 for PB2 in our case). If there is no available successor, the combined 3435 // block jumps to all BB's successor, like PB3 in this example. 3436 // 3437 // If a predecessor has multiple successors, so BB can't be duplicated into 3438 // it. But it can beneficially fall through to BB, and duplicate BB into other 3439 // predecessors. 3440 for (MachineBasicBlock *Pred : Preds) { 3441 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred); 3442 3443 if (!TailDup.canTailDuplicate(BB, Pred)) { 3444 // BB can't be duplicated into Pred, but it is possible to be layout 3445 // below Pred. 3446 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) { 3447 Fallthrough = Pred; 3448 if (SuccIt != Succs.end()) 3449 SuccIt++; 3450 } 3451 continue; 3452 } 3453 3454 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb; 3455 BlockFrequency DupCost; 3456 if (SuccIt == Succs.end()) { 3457 // Jump to all successors; 3458 if (Succs.size() > 0) 3459 DupCost += PredFreq; 3460 } else { 3461 // Fallthrough to *SuccIt, jump to all other successors; 3462 DupCost += PredFreq; 3463 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt); 3464 } 3465 3466 assert(OrigCost >= DupCost); 3467 OrigCost -= DupCost; 3468 if (OrigCost > BBDupThreshold) { 3469 Candidates.push_back(Pred); 3470 if (SuccIt != Succs.end()) 3471 SuccIt++; 3472 } 3473 } 3474 3475 // No predecessors can optimally fallthrough to BB. 3476 // So we can change one duplication into fallthrough. 3477 if (!Fallthrough) { 3478 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) { 3479 Candidates[0] = Candidates.back(); 3480 Candidates.pop_back(); 3481 } 3482 } 3483 } 3484 3485 void MachineBlockPlacement::initTailDupThreshold() { 3486 DupThreshold = BlockFrequency(0); 3487 if (F->getFunction().hasProfileData()) { 3488 // We prefer to use prifile count. 3489 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold(); 3490 if (HotThreshold != UINT64_MAX) { 3491 UseProfileCount = true; 3492 DupThreshold = 3493 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100); 3494 } else { 3495 // Profile count is not available, we can use block frequency instead. 3496 BlockFrequency MaxFreq = BlockFrequency(0); 3497 for (MachineBasicBlock &MBB : *F) { 3498 BlockFrequency Freq = MBFI->getBlockFreq(&MBB); 3499 if (Freq > MaxFreq) 3500 MaxFreq = Freq; 3501 } 3502 3503 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); 3504 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb); 3505 UseProfileCount = false; 3506 } 3507 } 3508 3509 TailDupSize = TailDupPlacementThreshold; 3510 // If only the aggressive threshold is explicitly set, use it. 3511 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && 3512 TailDupPlacementThreshold.getNumOccurrences() == 0) 3513 TailDupSize = TailDupPlacementAggressiveThreshold; 3514 3515 // For aggressive optimization, we can adjust some thresholds to be less 3516 // conservative. 3517 if (OptLevel >= CodeGenOptLevel::Aggressive) { 3518 // At O3 we should be more willing to copy blocks for tail duplication. This 3519 // increases size pressure, so we only do it at O3 3520 // Do this unless only the regular threshold is explicitly set. 3521 if (TailDupPlacementThreshold.getNumOccurrences() == 0 || 3522 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) 3523 TailDupSize = TailDupPlacementAggressiveThreshold; 3524 } 3525 3526 // If there's no threshold provided through options, query the target 3527 // information for a threshold instead. 3528 if (TailDupPlacementThreshold.getNumOccurrences() == 0 && 3529 (OptLevel < CodeGenOptLevel::Aggressive || 3530 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0)) 3531 TailDupSize = TII->getTailDuplicateSize(OptLevel); 3532 } 3533 3534 PreservedAnalyses 3535 MachineBlockPlacementPass::run(MachineFunction &MF, 3536 MachineFunctionAnalysisManager &MFAM) { 3537 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF); 3538 auto MBFI = std::make_unique<MBFIWrapper>( 3539 MFAM.getResult<MachineBlockFrequencyAnalysis>(MF)); 3540 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF); 3541 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF) 3542 ? &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF) 3543 : nullptr; 3544 auto *PSI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) 3545 .getCachedResult<ProfileSummaryAnalysis>( 3546 *MF.getFunction().getParent()); 3547 if (!PSI) 3548 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis", 3549 false); 3550 3551 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT, 3552 AllowTailMerge); 3553 3554 if (!MBP.run(MF)) 3555 return PreservedAnalyses::all(); 3556 3557 return getMachineFunctionPassPreservedAnalyses(); 3558 } 3559 3560 void MachineBlockPlacementPass::printPipeline( 3561 raw_ostream &OS, 3562 function_ref<StringRef(StringRef)> MapClassName2PassName) const { 3563 OS << MapClassName2PassName(name()); 3564 if (!AllowTailMerge) 3565 OS << "<no-tail-merge>"; 3566 } 3567 3568 bool MachineBlockPlacement::run(MachineFunction &MF) { 3569 3570 // Check for single-block functions and skip them. 3571 if (std::next(MF.begin()) == MF.end()) 3572 return false; 3573 3574 F = &MF; 3575 OptLevel = F->getTarget().getOptLevel(); 3576 3577 TII = MF.getSubtarget().getInstrInfo(); 3578 TLI = MF.getSubtarget().getTargetLowering(); 3579 3580 // Initialize PreferredLoopExit to nullptr here since it may never be set if 3581 // there are no MachineLoops. 3582 PreferredLoopExit = nullptr; 3583 3584 assert(BlockToChain.empty() && 3585 "BlockToChain map should be empty before starting placement."); 3586 assert(ComputedEdges.empty() && 3587 "Computed Edge map should be empty before starting placement."); 3588 3589 // Initialize tail duplication thresholds. 3590 initTailDupThreshold(); 3591 3592 const bool OptForSize = 3593 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); 3594 // Determine whether to use ext-tsp for perf/size optimization. The method 3595 // is beneficial only for instances with at least 3 basic blocks and it can be 3596 // disabled for huge functions (exceeding a certain size). 3597 bool UseExtTspForPerf = false; 3598 bool UseExtTspForSize = false; 3599 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) { 3600 UseExtTspForPerf = 3601 EnableExtTspBlockPlacement && 3602 (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()); 3603 UseExtTspForSize = OptForSize && ApplyExtTspForSize; 3604 } 3605 3606 // Apply tail duplication. 3607 if (allowTailDupPlacement(*F)) { 3608 if (OptForSize) 3609 TailDupSize = 1; 3610 const bool PreRegAlloc = false; 3611 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI, 3612 /* LayoutMode */ true, TailDupSize); 3613 if (!UseExtTspForSize) 3614 precomputeTriangleChains(); 3615 } 3616 3617 // Run the main block placement. 3618 if (!UseExtTspForSize) 3619 buildCFGChains(); 3620 3621 // Changing the layout can create new tail merging opportunities. 3622 // TailMerge can create jump into if branches that make CFG irreducible for 3623 // HW that requires structured CFG. 3624 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && 3625 AllowTailMerge && BranchFoldPlacement && 3626 MF.size() > 3; 3627 // No tail merging opportunities if the block number is less than four. 3628 if (EnableTailMerge) { 3629 const unsigned TailMergeSize = TailDupSize + 1; 3630 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false, 3631 *MBFI, *MBPI, PSI, TailMergeSize); 3632 3633 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI, 3634 /*AfterPlacement=*/true)) { 3635 // Must redo the post-dominator tree if blocks were changed. 3636 if (MPDT) 3637 MPDT->recalculate(MF); 3638 if (!UseExtTspForSize) { 3639 // Redo the layout if tail merging creates/removes/moves blocks. 3640 BlockToChain.clear(); 3641 ComputedEdges.clear(); 3642 ChainAllocator.DestroyAll(); 3643 buildCFGChains(); 3644 } 3645 } 3646 } 3647 3648 // Apply a post-processing optimizing block placement: 3649 // - find a new placement and modify the layout of the blocks in the function; 3650 // - re-create CFG chains so that we can optimizeBranches and alignBlocks. 3651 if (UseExtTspForPerf || UseExtTspForSize) { 3652 assert( 3653 !(UseExtTspForPerf && UseExtTspForSize) && 3654 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously"); 3655 applyExtTsp(/*OptForSize=*/UseExtTspForSize); 3656 createCFGChainExtTsp(); 3657 } 3658 3659 optimizeBranches(); 3660 alignBlocks(); 3661 3662 BlockToChain.clear(); 3663 ComputedEdges.clear(); 3664 ChainAllocator.DestroyAll(); 3665 3666 // View the function. 3667 if (ViewBlockLayoutWithBFI != GVDT_None && 3668 (ViewBlockFreqFuncName.empty() || 3669 F->getFunction().getName() == ViewBlockFreqFuncName)) { 3670 if (RenumberBlocksBeforeView) 3671 MF.RenumberBlocks(); 3672 MBFI->view("MBP." + MF.getName(), false); 3673 } 3674 3675 // We always return true as we have no way to track whether the final order 3676 // differs from the original order. 3677 return true; 3678 } 3679 3680 void MachineBlockPlacement::applyExtTsp(bool OptForSize) { 3681 // Prepare data; blocks are indexed by their index in the current ordering. 3682 DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex; 3683 BlockIndex.reserve(F->size()); 3684 std::vector<const MachineBasicBlock *> CurrentBlockOrder; 3685 CurrentBlockOrder.reserve(F->size()); 3686 size_t NumBlocks = 0; 3687 for (const MachineBasicBlock &MBB : *F) { 3688 BlockIndex[&MBB] = NumBlocks++; 3689 CurrentBlockOrder.push_back(&MBB); 3690 } 3691 3692 SmallVector<uint64_t, 0> BlockCounts(F->size()); 3693 SmallVector<uint64_t, 0> BlockSizes(F->size()); 3694 SmallVector<codelayout::EdgeCount, 0> JumpCounts; 3695 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch. 3696 SmallVector<const MachineBasicBlock *, 4> Succs; 3697 for (MachineBasicBlock &MBB : *F) { 3698 // Getting the block frequency. 3699 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); 3700 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency(); 3701 // Getting the block size: 3702 // - approximate the size of an instruction by 4 bytes, and 3703 // - ignore debug instructions. 3704 // Note: getting the exact size of each block is target-dependent and can be 3705 // done by extending the interface of MCCodeEmitter. Experimentally we do 3706 // not see a perf improvement with the exact block sizes. 3707 auto NonDbgInsts = 3708 instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); 3709 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); 3710 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts; 3711 3712 // Getting jump frequencies. 3713 if (OptForSize) { 3714 Cond.clear(); 3715 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. 3716 if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) 3717 continue; 3718 3719 const MachineBasicBlock *FTB = MBB.getFallThrough(); 3720 // Succs is a collection of distinct destinations of the block reachable 3721 // from MBB via a jump instruction; initialize the list using the three 3722 // (non-necessarily distinct) blocks, FTB, TBB, and FBB. 3723 Succs.clear(); 3724 if (TBB && TBB != FTB) 3725 Succs.push_back(TBB); 3726 if (FBB && FBB != FTB) 3727 Succs.push_back(FBB); 3728 if (FTB) 3729 Succs.push_back(FTB); 3730 // Absolute magnitude of non-zero counts does not matter for the 3731 // optimization; prioritize slightly jumps with a single successor, since 3732 // the corresponding jump instruction will be removed from the binary. 3733 const uint64_t Freq = Succs.size() == 1 ? 110 : 100; 3734 for (const MachineBasicBlock *Succ : Succs) 3735 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq}); 3736 } else { 3737 for (MachineBasicBlock *Succ : MBB.successors()) { 3738 auto EP = MBPI->getEdgeProbability(&MBB, Succ); 3739 BlockFrequency JumpFreq = BlockFreq * EP; 3740 JumpCounts.push_back( 3741 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); 3742 } 3743 } 3744 } 3745 3746 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size() 3747 << " with profile = " << F->getFunction().hasProfileData() 3748 << " (" << F->getName() << ")" << "\n"); 3749 3750 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts); 3751 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore)); 3752 3753 // Run the layout algorithm. 3754 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); 3755 std::vector<const MachineBasicBlock *> NewBlockOrder; 3756 NewBlockOrder.reserve(F->size()); 3757 for (uint64_t Node : NewOrder) { 3758 NewBlockOrder.push_back(CurrentBlockOrder[Node]); 3759 } 3760 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts); 3761 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore)); 3762 3763 // If the optimization is unsuccessful, fall back to the original block order. 3764 if (OptForSize && OrgScore > OptScore) 3765 assignBlockOrder(CurrentBlockOrder); 3766 else 3767 assignBlockOrder(NewBlockOrder); 3768 } 3769 3770 void MachineBlockPlacement::assignBlockOrder( 3771 const std::vector<const MachineBasicBlock *> &NewBlockOrder) { 3772 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order"); 3773 F->RenumberBlocks(); 3774 // At this point, we possibly removed blocks from the function, so we can't 3775 // renumber the domtree. At this point, we don't need it anymore, though. 3776 // TODO: move this to the point where the dominator tree is actually 3777 // invalidated (i.e., where blocks are removed without updating the domtree). 3778 MPDT = nullptr; 3779 3780 bool HasChanges = false; 3781 for (size_t I = 0; I < NewBlockOrder.size(); I++) { 3782 if (NewBlockOrder[I] != F->getBlockNumbered(I)) { 3783 HasChanges = true; 3784 break; 3785 } 3786 } 3787 // Stop early if the new block order is identical to the existing one. 3788 if (!HasChanges) 3789 return; 3790 3791 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs()); 3792 for (auto &MBB : *F) { 3793 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough(); 3794 } 3795 3796 // Sort basic blocks in the function according to the computed order. 3797 DenseMap<const MachineBasicBlock *, size_t> NewIndex; 3798 for (const MachineBasicBlock *MBB : NewBlockOrder) { 3799 NewIndex[MBB] = NewIndex.size(); 3800 } 3801 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) { 3802 return NewIndex[&L] < NewIndex[&R]; 3803 }); 3804 3805 // Update basic block branches by inserting explicit fallthrough branches 3806 // when required and re-optimize branches when possible. 3807 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); 3808 SmallVector<MachineOperand, 4> Cond; 3809 for (auto &MBB : *F) { 3810 MachineFunction::iterator NextMBB = std::next(MBB.getIterator()); 3811 MachineFunction::iterator EndIt = MBB.getParent()->end(); 3812 auto *FTMBB = PrevFallThroughs[MBB.getNumber()]; 3813 // If this block had a fallthrough before we need an explicit unconditional 3814 // branch to that block if the fallthrough block is not adjacent to the 3815 // block in the new order. 3816 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) { 3817 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc()); 3818 } 3819 3820 // It might be possible to optimize branches by flipping the condition. 3821 Cond.clear(); 3822 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; 3823 if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) 3824 continue; 3825 MBB.updateTerminator(FTMBB); 3826 } 3827 } 3828 3829 void MachineBlockPlacement::createCFGChainExtTsp() { 3830 BlockToChain.clear(); 3831 ComputedEdges.clear(); 3832 ChainAllocator.DestroyAll(); 3833 3834 MachineBasicBlock *HeadBB = &F->front(); 3835 BlockChain *FunctionChain = 3836 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB); 3837 3838 for (MachineBasicBlock &MBB : *F) { 3839 if (HeadBB == &MBB) 3840 continue; // Ignore head of the chain 3841 FunctionChain->merge(&MBB, nullptr); 3842 } 3843 } 3844 3845 namespace { 3846 3847 /// A pass to compute block placement statistics. 3848 /// 3849 /// A separate pass to compute interesting statistics for evaluating block 3850 /// placement. This is separate from the actual placement pass so that they can 3851 /// be computed in the absence of any placement transformations or when using 3852 /// alternative placement strategies. 3853 class MachineBlockPlacementStats { 3854 /// A handle to the branch probability pass. 3855 const MachineBranchProbabilityInfo *MBPI; 3856 3857 /// A handle to the function-wide block frequency pass. 3858 const MachineBlockFrequencyInfo *MBFI; 3859 3860 public: 3861 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI, 3862 const MachineBlockFrequencyInfo *MBFI) 3863 : MBPI(MBPI), MBFI(MBFI) {} 3864 bool run(MachineFunction &MF); 3865 }; 3866 3867 class MachineBlockPlacementStatsLegacy : public MachineFunctionPass { 3868 public: 3869 static char ID; // Pass identification, replacement for typeid 3870 3871 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) { 3872 initializeMachineBlockPlacementStatsLegacyPass( 3873 *PassRegistry::getPassRegistry()); 3874 } 3875 3876 bool runOnMachineFunction(MachineFunction &F) override { 3877 auto *MBPI = 3878 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI(); 3879 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(); 3880 return MachineBlockPlacementStats(MBPI, MBFI).run(F); 3881 } 3882 3883 void getAnalysisUsage(AnalysisUsage &AU) const override { 3884 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>(); 3885 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>(); 3886 AU.setPreservesAll(); 3887 MachineFunctionPass::getAnalysisUsage(AU); 3888 } 3889 }; 3890 3891 } // end anonymous namespace 3892 3893 char MachineBlockPlacementStatsLegacy::ID = 0; 3894 3895 char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID; 3896 3897 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats", 3898 "Basic Block Placement Stats", false, false) 3899 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) 3900 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) 3901 INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats", 3902 "Basic Block Placement Stats", false, false) 3903 3904 PreservedAnalyses 3905 MachineBlockPlacementStatsPass::run(MachineFunction &MF, 3906 MachineFunctionAnalysisManager &MFAM) { 3907 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF); 3908 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF); 3909 3910 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF); 3911 return PreservedAnalyses::all(); 3912 } 3913 3914 bool MachineBlockPlacementStats::run(MachineFunction &F) { 3915 // Check for single-block functions and skip them. 3916 if (std::next(F.begin()) == F.end()) 3917 return false; 3918 3919 if (!isFunctionInPrintList(F.getName())) 3920 return false; 3921 3922 for (MachineBasicBlock &MBB : F) { 3923 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); 3924 Statistic &NumBranches = 3925 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches; 3926 Statistic &BranchTakenFreq = 3927 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq; 3928 for (MachineBasicBlock *Succ : MBB.successors()) { 3929 // Skip if this successor is a fallthrough. 3930 if (MBB.isLayoutSuccessor(Succ)) 3931 continue; 3932 3933 BlockFrequency EdgeFreq = 3934 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ); 3935 ++NumBranches; 3936 BranchTakenFreq += EdgeFreq.getFrequency(); 3937 } 3938 } 3939 3940 return false; 3941 } 3942