1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/DependenceAnalysis.h" 50 #include "llvm/Analysis/DomTreeUpdater.h" 51 #include "llvm/Analysis/LoopInfo.h" 52 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 53 #include "llvm/Analysis/PostDominators.h" 54 #include "llvm/Analysis/ScalarEvolution.h" 55 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 56 #include "llvm/IR/Function.h" 57 #include "llvm/IR/Verifier.h" 58 #include "llvm/InitializePasses.h" 59 #include "llvm/Pass.h" 60 #include "llvm/Support/CommandLine.h" 61 #include "llvm/Support/Debug.h" 62 #include "llvm/Support/raw_ostream.h" 63 #include "llvm/Transforms/Scalar.h" 64 #include "llvm/Transforms/Utils.h" 65 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 66 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 67 68 using namespace llvm; 69 70 #define DEBUG_TYPE "loop-fusion" 71 72 STATISTIC(FuseCounter, "Loops fused"); 73 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 74 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 75 STATISTIC(InvalidHeader, "Loop has invalid header"); 76 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 77 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 78 STATISTIC(InvalidLatch, "Loop has invalid latch"); 79 STATISTIC(InvalidLoop, "Loop is invalid"); 80 STATISTIC(AddressTakenBB, "Basic block has address taken"); 81 STATISTIC(MayThrowException, "Loop may throw an exception"); 82 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 83 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 84 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 85 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 86 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 87 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 88 STATISTIC(NonAdjacent, "Loops are not adjacent"); 89 STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); 90 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 91 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 92 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); 93 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); 94 STATISTIC(NotRotated, "Candidate is not rotated"); 95 96 enum FusionDependenceAnalysisChoice { 97 FUSION_DEPENDENCE_ANALYSIS_SCEV, 98 FUSION_DEPENDENCE_ANALYSIS_DA, 99 FUSION_DEPENDENCE_ANALYSIS_ALL, 100 }; 101 102 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 103 "loop-fusion-dependence-analysis", 104 cl::desc("Which dependence analysis should loop fusion use?"), 105 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 106 "Use the scalar evolution interface"), 107 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 108 "Use the dependence analysis interface"), 109 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 110 "Use all available analyses")), 111 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); 112 113 #ifndef NDEBUG 114 static cl::opt<bool> 115 VerboseFusionDebugging("loop-fusion-verbose-debug", 116 cl::desc("Enable verbose debugging for Loop Fusion"), 117 cl::Hidden, cl::init(false), cl::ZeroOrMore); 118 #endif 119 120 namespace { 121 /// This class is used to represent a candidate for loop fusion. When it is 122 /// constructed, it checks the conditions for loop fusion to ensure that it 123 /// represents a valid candidate. It caches several parts of a loop that are 124 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 125 /// of continually querying the underlying Loop to retrieve these values. It is 126 /// assumed these will not change throughout loop fusion. 127 /// 128 /// The invalidate method should be used to indicate that the FusionCandidate is 129 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 130 /// be used to ensure that the FusionCandidate is still valid for fusion. 131 struct FusionCandidate { 132 /// Cache of parts of the loop used throughout loop fusion. These should not 133 /// need to change throughout the analysis and transformation. 134 /// These parts are cached to avoid repeatedly looking up in the Loop class. 135 136 /// Preheader of the loop this candidate represents 137 BasicBlock *Preheader; 138 /// Header of the loop this candidate represents 139 BasicBlock *Header; 140 /// Blocks in the loop that exit the loop 141 BasicBlock *ExitingBlock; 142 /// The successor block of this loop (where the exiting blocks go to) 143 BasicBlock *ExitBlock; 144 /// Latch of the loop 145 BasicBlock *Latch; 146 /// The loop that this fusion candidate represents 147 Loop *L; 148 /// Vector of instructions in this loop that read from memory 149 SmallVector<Instruction *, 16> MemReads; 150 /// Vector of instructions in this loop that write to memory 151 SmallVector<Instruction *, 16> MemWrites; 152 /// Are all of the members of this fusion candidate still valid 153 bool Valid; 154 /// Guard branch of the loop, if it exists 155 BranchInst *GuardBranch; 156 157 /// Dominator and PostDominator trees are needed for the 158 /// FusionCandidateCompare function, required by FusionCandidateSet to 159 /// determine where the FusionCandidate should be inserted into the set. These 160 /// are used to establish ordering of the FusionCandidates based on dominance. 161 const DominatorTree *DT; 162 const PostDominatorTree *PDT; 163 164 OptimizationRemarkEmitter &ORE; 165 166 FusionCandidate(Loop *L, const DominatorTree *DT, 167 const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) 168 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 169 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 170 Latch(L->getLoopLatch()), L(L), Valid(true), 171 GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) { 172 173 // Walk over all blocks in the loop and check for conditions that may 174 // prevent fusion. For each block, walk over all instructions and collect 175 // the memory reads and writes If any instructions that prevent fusion are 176 // found, invalidate this object and return. 177 for (BasicBlock *BB : L->blocks()) { 178 if (BB->hasAddressTaken()) { 179 invalidate(); 180 reportInvalidCandidate(AddressTakenBB); 181 return; 182 } 183 184 for (Instruction &I : *BB) { 185 if (I.mayThrow()) { 186 invalidate(); 187 reportInvalidCandidate(MayThrowException); 188 return; 189 } 190 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 191 if (SI->isVolatile()) { 192 invalidate(); 193 reportInvalidCandidate(ContainsVolatileAccess); 194 return; 195 } 196 } 197 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 198 if (LI->isVolatile()) { 199 invalidate(); 200 reportInvalidCandidate(ContainsVolatileAccess); 201 return; 202 } 203 } 204 if (I.mayWriteToMemory()) 205 MemWrites.push_back(&I); 206 if (I.mayReadFromMemory()) 207 MemReads.push_back(&I); 208 } 209 } 210 } 211 212 /// Check if all members of the class are valid. 213 bool isValid() const { 214 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 215 !L->isInvalid() && Valid; 216 } 217 218 /// Verify that all members are in sync with the Loop object. 219 void verify() const { 220 assert(isValid() && "Candidate is not valid!!"); 221 assert(!L->isInvalid() && "Loop is invalid!"); 222 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 223 assert(Header == L->getHeader() && "Header is out of sync"); 224 assert(ExitingBlock == L->getExitingBlock() && 225 "Exiting Blocks is out of sync"); 226 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 227 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 228 } 229 230 /// Get the entry block for this fusion candidate. 231 /// 232 /// If this fusion candidate represents a guarded loop, the entry block is the 233 /// loop guard block. If it represents an unguarded loop, the entry block is 234 /// the preheader of the loop. 235 BasicBlock *getEntryBlock() const { 236 if (GuardBranch) 237 return GuardBranch->getParent(); 238 else 239 return Preheader; 240 } 241 242 /// Given a guarded loop, get the successor of the guard that is not in the 243 /// loop. 244 /// 245 /// This method returns the successor of the loop guard that is not located 246 /// within the loop (i.e., the successor of the guard that is not the 247 /// preheader). 248 /// This method is only valid for guarded loops. 249 BasicBlock *getNonLoopBlock() const { 250 assert(GuardBranch && "Only valid on guarded loops."); 251 assert(GuardBranch->isConditional() && 252 "Expecting guard to be a conditional branch."); 253 return (GuardBranch->getSuccessor(0) == Preheader) 254 ? GuardBranch->getSuccessor(1) 255 : GuardBranch->getSuccessor(0); 256 } 257 258 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 259 LLVM_DUMP_METHOD void dump() const { 260 dbgs() << "\tGuardBranch: "; 261 if (GuardBranch) 262 dbgs() << *GuardBranch; 263 else 264 dbgs() << "nullptr"; 265 dbgs() << "\n" 266 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 267 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 268 << "\n" 269 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 270 << "\tExitingBB: " 271 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 272 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 273 << "\n" 274 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 275 << "\tEntryBlock: " 276 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 277 << "\n"; 278 } 279 #endif 280 281 /// Determine if a fusion candidate (representing a loop) is eligible for 282 /// fusion. Note that this only checks whether a single loop can be fused - it 283 /// does not check whether it is *legal* to fuse two loops together. 284 bool isEligibleForFusion(ScalarEvolution &SE) const { 285 if (!isValid()) { 286 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 287 if (!Preheader) 288 ++InvalidPreheader; 289 if (!Header) 290 ++InvalidHeader; 291 if (!ExitingBlock) 292 ++InvalidExitingBlock; 293 if (!ExitBlock) 294 ++InvalidExitBlock; 295 if (!Latch) 296 ++InvalidLatch; 297 if (L->isInvalid()) 298 ++InvalidLoop; 299 300 return false; 301 } 302 303 // Require ScalarEvolution to be able to determine a trip count. 304 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 305 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 306 << " trip count not computable!\n"); 307 return reportInvalidCandidate(UnknownTripCount); 308 } 309 310 if (!L->isLoopSimplifyForm()) { 311 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 312 << " is not in simplified form!\n"); 313 return reportInvalidCandidate(NotSimplifiedForm); 314 } 315 316 if (!L->isRotatedForm()) { 317 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 318 return reportInvalidCandidate(NotRotated); 319 } 320 321 return true; 322 } 323 324 private: 325 // This is only used internally for now, to clear the MemWrites and MemReads 326 // list and setting Valid to false. I can't envision other uses of this right 327 // now, since once FusionCandidates are put into the FusionCandidateSet they 328 // are immutable. Thus, any time we need to change/update a FusionCandidate, 329 // we must create a new one and insert it into the FusionCandidateSet to 330 // ensure the FusionCandidateSet remains ordered correctly. 331 void invalidate() { 332 MemWrites.clear(); 333 MemReads.clear(); 334 Valid = false; 335 } 336 337 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 338 using namespace ore; 339 assert(L && Preheader && "Fusion candidate not initialized properly!"); 340 ++Stat; 341 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 342 L->getStartLoc(), Preheader) 343 << "[" << Preheader->getParent()->getName() << "]: " 344 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 345 return false; 346 } 347 }; 348 349 struct FusionCandidateCompare { 350 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 351 /// into dominance order. 352 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 353 /// IF RHS dominates LHS and LHS post-dominates RHS, return false; 354 bool operator()(const FusionCandidate &LHS, 355 const FusionCandidate &RHS) const { 356 const DominatorTree *DT = LHS.DT; 357 358 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 359 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 360 361 // Do not save PDT to local variable as it is only used in asserts and thus 362 // will trigger an unused variable warning if building without asserts. 363 assert(DT && LHS.PDT && "Expecting valid dominator tree"); 364 365 // Do this compare first so if LHS == RHS, function returns false. 366 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 367 // RHS dominates LHS 368 // Verify LHS post-dominates RHS 369 assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 370 return false; 371 } 372 373 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 374 // Verify RHS Postdominates LHS 375 assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 376 return true; 377 } 378 379 // If LHS does not dominate RHS and RHS does not dominate LHS then there is 380 // no dominance relationship between the two FusionCandidates. Thus, they 381 // should not be in the same set together. 382 llvm_unreachable( 383 "No dominance relationship between these fusion candidates!"); 384 } 385 }; 386 387 using LoopVector = SmallVector<Loop *, 4>; 388 389 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 390 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 391 // dominates FC1 and FC1 post-dominates FC0. 392 // std::set was chosen because we want a sorted data structure with stable 393 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent 394 // loops by moving intervening code around. When this intervening code contains 395 // loops, those loops will be moved also. The corresponding FusionCandidates 396 // will also need to be moved accordingly. As this is done, having stable 397 // iterators will simplify the logic. Similarly, having an efficient insert that 398 // keeps the FusionCandidateSet sorted will also simplify the implementation. 399 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 400 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 401 402 #if !defined(NDEBUG) 403 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 404 const FusionCandidate &FC) { 405 if (FC.isValid()) 406 OS << FC.Preheader->getName(); 407 else 408 OS << "<Invalid>"; 409 410 return OS; 411 } 412 413 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 414 const FusionCandidateSet &CandSet) { 415 for (const FusionCandidate &FC : CandSet) 416 OS << FC << '\n'; 417 418 return OS; 419 } 420 421 static void 422 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 423 dbgs() << "Fusion Candidates: \n"; 424 for (const auto &CandidateSet : FusionCandidates) { 425 dbgs() << "*** Fusion Candidate Set ***\n"; 426 dbgs() << CandidateSet; 427 dbgs() << "****************************\n"; 428 } 429 } 430 #endif 431 432 /// Collect all loops in function at the same nest level, starting at the 433 /// outermost level. 434 /// 435 /// This data structure collects all loops at the same nest level for a 436 /// given function (specified by the LoopInfo object). It starts at the 437 /// outermost level. 438 struct LoopDepthTree { 439 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 440 using iterator = LoopsOnLevelTy::iterator; 441 using const_iterator = LoopsOnLevelTy::const_iterator; 442 443 LoopDepthTree(LoopInfo &LI) : Depth(1) { 444 if (!LI.empty()) 445 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 446 } 447 448 /// Test whether a given loop has been removed from the function, and thus is 449 /// no longer valid. 450 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 451 452 /// Record that a given loop has been removed from the function and is no 453 /// longer valid. 454 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 455 456 /// Descend the tree to the next (inner) nesting level 457 void descend() { 458 LoopsOnLevelTy LoopsOnNextLevel; 459 460 for (const LoopVector &LV : *this) 461 for (Loop *L : LV) 462 if (!isRemovedLoop(L) && L->begin() != L->end()) 463 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 464 465 LoopsOnLevel = LoopsOnNextLevel; 466 RemovedLoops.clear(); 467 Depth++; 468 } 469 470 bool empty() const { return size() == 0; } 471 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 472 unsigned getDepth() const { return Depth; } 473 474 iterator begin() { return LoopsOnLevel.begin(); } 475 iterator end() { return LoopsOnLevel.end(); } 476 const_iterator begin() const { return LoopsOnLevel.begin(); } 477 const_iterator end() const { return LoopsOnLevel.end(); } 478 479 private: 480 /// Set of loops that have been removed from the function and are no longer 481 /// valid. 482 SmallPtrSet<const Loop *, 8> RemovedLoops; 483 484 /// Depth of the current level, starting at 1 (outermost loops). 485 unsigned Depth; 486 487 /// Vector of loops at the current depth level that have the same parent loop 488 LoopsOnLevelTy LoopsOnLevel; 489 }; 490 491 #ifndef NDEBUG 492 static void printLoopVector(const LoopVector &LV) { 493 dbgs() << "****************************\n"; 494 for (auto L : LV) 495 printLoop(*L, dbgs()); 496 dbgs() << "****************************\n"; 497 } 498 #endif 499 500 struct LoopFuser { 501 private: 502 // Sets of control flow equivalent fusion candidates for a given nest level. 503 FusionCandidateCollection FusionCandidates; 504 505 LoopDepthTree LDT; 506 DomTreeUpdater DTU; 507 508 LoopInfo &LI; 509 DominatorTree &DT; 510 DependenceInfo &DI; 511 ScalarEvolution &SE; 512 PostDominatorTree &PDT; 513 OptimizationRemarkEmitter &ORE; 514 515 public: 516 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 517 ScalarEvolution &SE, PostDominatorTree &PDT, 518 OptimizationRemarkEmitter &ORE, const DataLayout &DL) 519 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 520 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {} 521 522 /// This is the main entry point for loop fusion. It will traverse the 523 /// specified function and collect candidate loops to fuse, starting at the 524 /// outermost nesting level and working inwards. 525 bool fuseLoops(Function &F) { 526 #ifndef NDEBUG 527 if (VerboseFusionDebugging) { 528 LI.print(dbgs()); 529 } 530 #endif 531 532 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 533 << "\n"); 534 bool Changed = false; 535 536 while (!LDT.empty()) { 537 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 538 << LDT.getDepth() << "\n";); 539 540 for (const LoopVector &LV : LDT) { 541 assert(LV.size() > 0 && "Empty loop set was build!"); 542 543 // Skip singleton loop sets as they do not offer fusion opportunities on 544 // this level. 545 if (LV.size() == 1) 546 continue; 547 #ifndef NDEBUG 548 if (VerboseFusionDebugging) { 549 LLVM_DEBUG({ 550 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 551 printLoopVector(LV); 552 }); 553 } 554 #endif 555 556 collectFusionCandidates(LV); 557 Changed |= fuseCandidates(); 558 } 559 560 // Finished analyzing candidates at this level. 561 // Descend to the next level and clear all of the candidates currently 562 // collected. Note that it will not be possible to fuse any of the 563 // existing candidates with new candidates because the new candidates will 564 // be at a different nest level and thus not be control flow equivalent 565 // with all of the candidates collected so far. 566 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 567 LDT.descend(); 568 FusionCandidates.clear(); 569 } 570 571 if (Changed) 572 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 573 574 #ifndef NDEBUG 575 assert(DT.verify()); 576 assert(PDT.verify()); 577 LI.verify(DT); 578 SE.verify(); 579 #endif 580 581 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 582 return Changed; 583 } 584 585 private: 586 /// Determine if two fusion candidates are control flow equivalent. 587 /// 588 /// Two fusion candidates are control flow equivalent if when one executes, 589 /// the other is guaranteed to execute. This is determined using dominators 590 /// and post-dominators: if A dominates B and B post-dominates A then A and B 591 /// are control-flow equivalent. 592 bool isControlFlowEquivalent(const FusionCandidate &FC0, 593 const FusionCandidate &FC1) const { 594 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 595 596 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 597 DT, PDT); 598 } 599 600 /// Iterate over all loops in the given loop set and identify the loops that 601 /// are eligible for fusion. Place all eligible fusion candidates into Control 602 /// Flow Equivalent sets, sorted by dominance. 603 void collectFusionCandidates(const LoopVector &LV) { 604 for (Loop *L : LV) { 605 FusionCandidate CurrCand(L, &DT, &PDT, ORE); 606 if (!CurrCand.isEligibleForFusion(SE)) 607 continue; 608 609 // Go through each list in FusionCandidates and determine if L is control 610 // flow equivalent with the first loop in that list. If it is, append LV. 611 // If not, go to the next list. 612 // If no suitable list is found, start another list and add it to 613 // FusionCandidates. 614 bool FoundSet = false; 615 616 for (auto &CurrCandSet : FusionCandidates) { 617 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 618 CurrCandSet.insert(CurrCand); 619 FoundSet = true; 620 #ifndef NDEBUG 621 if (VerboseFusionDebugging) 622 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 623 << " to existing candidate set\n"); 624 #endif 625 break; 626 } 627 } 628 if (!FoundSet) { 629 // No set was found. Create a new set and add to FusionCandidates 630 #ifndef NDEBUG 631 if (VerboseFusionDebugging) 632 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 633 #endif 634 FusionCandidateSet NewCandSet; 635 NewCandSet.insert(CurrCand); 636 FusionCandidates.push_back(NewCandSet); 637 } 638 NumFusionCandidates++; 639 } 640 } 641 642 /// Determine if it is beneficial to fuse two loops. 643 /// 644 /// For now, this method simply returns true because we want to fuse as much 645 /// as possible (primarily to test the pass). This method will evolve, over 646 /// time, to add heuristics for profitability of fusion. 647 bool isBeneficialFusion(const FusionCandidate &FC0, 648 const FusionCandidate &FC1) { 649 return true; 650 } 651 652 /// Determine if two fusion candidates have the same trip count (i.e., they 653 /// execute the same number of iterations). 654 /// 655 /// Note that for now this method simply returns a boolean value because there 656 /// are no mechanisms in loop fusion to handle different trip counts. In the 657 /// future, this behaviour can be extended to adjust one of the loops to make 658 /// the trip counts equal (e.g., loop peeling). When this is added, this 659 /// interface may need to change to return more information than just a 660 /// boolean value. 661 bool identicalTripCounts(const FusionCandidate &FC0, 662 const FusionCandidate &FC1) const { 663 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 664 if (isa<SCEVCouldNotCompute>(TripCount0)) { 665 UncomputableTripCount++; 666 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 667 return false; 668 } 669 670 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 671 if (isa<SCEVCouldNotCompute>(TripCount1)) { 672 UncomputableTripCount++; 673 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 674 return false; 675 } 676 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 677 << *TripCount1 << " are " 678 << (TripCount0 == TripCount1 ? "identical" : "different") 679 << "\n"); 680 681 return (TripCount0 == TripCount1); 682 } 683 684 /// Walk each set of control flow equivalent fusion candidates and attempt to 685 /// fuse them. This does a single linear traversal of all candidates in the 686 /// set. The conditions for legal fusion are checked at this point. If a pair 687 /// of fusion candidates passes all legality checks, they are fused together 688 /// and a new fusion candidate is created and added to the FusionCandidateSet. 689 /// The original fusion candidates are then removed, as they are no longer 690 /// valid. 691 bool fuseCandidates() { 692 bool Fused = false; 693 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 694 for (auto &CandidateSet : FusionCandidates) { 695 if (CandidateSet.size() < 2) 696 continue; 697 698 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 699 << CandidateSet << "\n"); 700 701 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 702 assert(!LDT.isRemovedLoop(FC0->L) && 703 "Should not have removed loops in CandidateSet!"); 704 auto FC1 = FC0; 705 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 706 assert(!LDT.isRemovedLoop(FC1->L) && 707 "Should not have removed loops in CandidateSet!"); 708 709 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 710 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 711 712 FC0->verify(); 713 FC1->verify(); 714 715 if (!identicalTripCounts(*FC0, *FC1)) { 716 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 717 "counts. Not fusing.\n"); 718 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 719 NonEqualTripCount); 720 continue; 721 } 722 723 if (!isAdjacent(*FC0, *FC1)) { 724 LLVM_DEBUG(dbgs() 725 << "Fusion candidates are not adjacent. Not fusing.\n"); 726 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 727 continue; 728 } 729 730 // Ensure that FC0 and FC1 have identical guards. 731 // If one (or both) are not guarded, this check is not necessary. 732 if (FC0->GuardBranch && FC1->GuardBranch && 733 !haveIdenticalGuards(*FC0, *FC1)) { 734 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 735 "guards. Not Fusing.\n"); 736 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 737 NonIdenticalGuards); 738 continue; 739 } 740 741 // The following three checks look for empty blocks in FC0 and FC1. If 742 // any of these blocks are non-empty, we do not fuse. This is done 743 // because we currently do not have the safety checks to determine if 744 // it is safe to move the blocks past other blocks in the loop. Once 745 // these checks are added, these conditions can be relaxed. 746 if (!isEmptyPreheader(*FC1)) { 747 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " 748 "preheader. Not fusing.\n"); 749 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 750 NonEmptyPreheader); 751 continue; 752 } 753 754 if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { 755 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " 756 "block. Not fusing.\n"); 757 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 758 NonEmptyExitBlock); 759 continue; 760 } 761 762 if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { 763 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " 764 "block. Not fusing.\n"); 765 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 766 NonEmptyGuardBlock); 767 continue; 768 } 769 770 // Check the dependencies across the loops and do not fuse if it would 771 // violate them. 772 if (!dependencesAllowFusion(*FC0, *FC1)) { 773 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 774 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 775 InvalidDependencies); 776 continue; 777 } 778 779 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 780 LLVM_DEBUG(dbgs() 781 << "\tFusion appears to be " 782 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 783 if (!BeneficialToFuse) { 784 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 785 FusionNotBeneficial); 786 continue; 787 } 788 // All analysis has completed and has determined that fusion is legal 789 // and profitable. At this point, start transforming the code and 790 // perform fusion. 791 792 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 793 << *FC1 << "\n"); 794 795 // Report fusion to the Optimization Remarks. 796 // Note this needs to be done *before* performFusion because 797 // performFusion will change the original loops, making it not 798 // possible to identify them after fusion is complete. 799 reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter); 800 801 FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); 802 FusedCand.verify(); 803 assert(FusedCand.isEligibleForFusion(SE) && 804 "Fused candidate should be eligible for fusion!"); 805 806 // Notify the loop-depth-tree that these loops are not valid objects 807 LDT.removeLoop(FC1->L); 808 809 CandidateSet.erase(FC0); 810 CandidateSet.erase(FC1); 811 812 auto InsertPos = CandidateSet.insert(FusedCand); 813 814 assert(InsertPos.second && 815 "Unable to insert TargetCandidate in CandidateSet!"); 816 817 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 818 // of the FC1 loop will attempt to fuse the new (fused) loop with the 819 // remaining candidates in the current candidate set. 820 FC0 = FC1 = InsertPos.first; 821 822 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 823 << "\n"); 824 825 Fused = true; 826 } 827 } 828 } 829 return Fused; 830 } 831 832 /// Rewrite all additive recurrences in a SCEV to use a new loop. 833 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 834 public: 835 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 836 bool UseMax = true) 837 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 838 NewL(NewL) {} 839 840 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 841 const Loop *ExprL = Expr->getLoop(); 842 SmallVector<const SCEV *, 2> Operands; 843 if (ExprL == &OldL) { 844 Operands.append(Expr->op_begin(), Expr->op_end()); 845 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 846 } 847 848 if (OldL.contains(ExprL)) { 849 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 850 if (!UseMax || !Pos || !Expr->isAffine()) { 851 Valid = false; 852 return Expr; 853 } 854 return visit(Expr->getStart()); 855 } 856 857 for (const SCEV *Op : Expr->operands()) 858 Operands.push_back(visit(Op)); 859 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 860 } 861 862 bool wasValidSCEV() const { return Valid; } 863 864 private: 865 bool Valid, UseMax; 866 const Loop &OldL, &NewL; 867 }; 868 869 /// Return false if the access functions of \p I0 and \p I1 could cause 870 /// a negative dependence. 871 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 872 Instruction &I1, bool EqualIsInvalid) { 873 Value *Ptr0 = getLoadStorePointerOperand(&I0); 874 Value *Ptr1 = getLoadStorePointerOperand(&I1); 875 if (!Ptr0 || !Ptr1) 876 return false; 877 878 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 879 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 880 #ifndef NDEBUG 881 if (VerboseFusionDebugging) 882 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 883 << *SCEVPtr1 << "\n"); 884 #endif 885 AddRecLoopReplacer Rewriter(SE, L0, L1); 886 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 887 #ifndef NDEBUG 888 if (VerboseFusionDebugging) 889 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 890 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 891 #endif 892 if (!Rewriter.wasValidSCEV()) 893 return false; 894 895 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 896 // L0) and the other is not. We could check if it is monotone and test 897 // the beginning and end value instead. 898 899 BasicBlock *L0Header = L0.getHeader(); 900 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 901 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 902 if (!AddRec) 903 return false; 904 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 905 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 906 }; 907 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 908 return false; 909 910 ICmpInst::Predicate Pred = 911 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 912 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 913 #ifndef NDEBUG 914 if (VerboseFusionDebugging) 915 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 916 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 917 << "\n"); 918 #endif 919 return IsAlwaysGE; 920 } 921 922 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 923 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 924 /// specified by @p DepChoice are used to determine this. 925 bool dependencesAllowFusion(const FusionCandidate &FC0, 926 const FusionCandidate &FC1, Instruction &I0, 927 Instruction &I1, bool AnyDep, 928 FusionDependenceAnalysisChoice DepChoice) { 929 #ifndef NDEBUG 930 if (VerboseFusionDebugging) { 931 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 932 << DepChoice << "\n"); 933 } 934 #endif 935 switch (DepChoice) { 936 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 937 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 938 case FUSION_DEPENDENCE_ANALYSIS_DA: { 939 auto DepResult = DI.depends(&I0, &I1, true); 940 if (!DepResult) 941 return true; 942 #ifndef NDEBUG 943 if (VerboseFusionDebugging) { 944 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 945 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 946 << (DepResult->isOrdered() ? "true" : "false") 947 << "]\n"); 948 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 949 << "\n"); 950 } 951 #endif 952 953 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 954 LLVM_DEBUG( 955 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 956 957 // TODO: Can we actually use the dependence info analysis here? 958 return false; 959 } 960 961 case FUSION_DEPENDENCE_ANALYSIS_ALL: 962 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 963 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 964 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 965 FUSION_DEPENDENCE_ANALYSIS_DA); 966 } 967 968 llvm_unreachable("Unknown fusion dependence analysis choice!"); 969 } 970 971 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 972 bool dependencesAllowFusion(const FusionCandidate &FC0, 973 const FusionCandidate &FC1) { 974 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 975 << "\n"); 976 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 977 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 978 979 for (Instruction *WriteL0 : FC0.MemWrites) { 980 for (Instruction *WriteL1 : FC1.MemWrites) 981 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 982 /* AnyDep */ false, 983 FusionDependenceAnalysis)) { 984 InvalidDependencies++; 985 return false; 986 } 987 for (Instruction *ReadL1 : FC1.MemReads) 988 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 989 /* AnyDep */ false, 990 FusionDependenceAnalysis)) { 991 InvalidDependencies++; 992 return false; 993 } 994 } 995 996 for (Instruction *WriteL1 : FC1.MemWrites) { 997 for (Instruction *WriteL0 : FC0.MemWrites) 998 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 999 /* AnyDep */ false, 1000 FusionDependenceAnalysis)) { 1001 InvalidDependencies++; 1002 return false; 1003 } 1004 for (Instruction *ReadL0 : FC0.MemReads) 1005 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1006 /* AnyDep */ false, 1007 FusionDependenceAnalysis)) { 1008 InvalidDependencies++; 1009 return false; 1010 } 1011 } 1012 1013 // Walk through all uses in FC1. For each use, find the reaching def. If the 1014 // def is located in FC0 then it is is not safe to fuse. 1015 for (BasicBlock *BB : FC1.L->blocks()) 1016 for (Instruction &I : *BB) 1017 for (auto &Op : I.operands()) 1018 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1019 if (FC0.L->contains(Def->getParent())) { 1020 InvalidDependencies++; 1021 return false; 1022 } 1023 1024 return true; 1025 } 1026 1027 /// Determine if two fusion candidates are adjacent in the CFG. 1028 /// 1029 /// This method will determine if there are additional basic blocks in the CFG 1030 /// between the exit of \p FC0 and the entry of \p FC1. 1031 /// If the two candidates are guarded loops, then it checks whether the 1032 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1033 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1034 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1035 /// preheader of \p FC1. 1036 bool isAdjacent(const FusionCandidate &FC0, 1037 const FusionCandidate &FC1) const { 1038 // If the successor of the guard branch is FC1, then the loops are adjacent 1039 if (FC0.GuardBranch) 1040 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1041 else 1042 return FC0.ExitBlock == FC1.getEntryBlock(); 1043 } 1044 1045 /// Determine if two fusion candidates have identical guards 1046 /// 1047 /// This method will determine if two fusion candidates have the same guards. 1048 /// The guards are considered the same if: 1049 /// 1. The instructions to compute the condition used in the compare are 1050 /// identical. 1051 /// 2. The successors of the guard have the same flow into/around the loop. 1052 /// If the compare instructions are identical, then the first successor of the 1053 /// guard must go to the same place (either the preheader of the loop or the 1054 /// NonLoopBlock). In other words, the the first successor of both loops must 1055 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1056 /// the NonLoopBlock). The same must be true for the second successor. 1057 bool haveIdenticalGuards(const FusionCandidate &FC0, 1058 const FusionCandidate &FC1) const { 1059 assert(FC0.GuardBranch && FC1.GuardBranch && 1060 "Expecting FC0 and FC1 to be guarded loops."); 1061 1062 if (auto FC0CmpInst = 1063 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1064 if (auto FC1CmpInst = 1065 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1066 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1067 return false; 1068 1069 // The compare instructions are identical. 1070 // Now make sure the successor of the guards have the same flow into/around 1071 // the loop 1072 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1073 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1074 else 1075 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1076 } 1077 1078 /// Check that the guard for \p FC *only* contains the cmp/branch for the 1079 /// guard. 1080 /// Once we are able to handle intervening code, any code in the guard block 1081 /// for FC1 will need to be treated as intervening code and checked whether 1082 /// it can safely move around the loops. 1083 bool isEmptyGuardBlock(const FusionCandidate &FC) const { 1084 assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); 1085 if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) { 1086 auto *GuardBlock = FC.GuardBranch->getParent(); 1087 // If the generation of the cmp value is in GuardBlock, then the size of 1088 // the guard block should be 2 (cmp + branch). If the generation of the 1089 // cmp value is in a different block, then the size of the guard block 1090 // should only be 1. 1091 if (CmpInst->getParent() == GuardBlock) 1092 return GuardBlock->size() == 2; 1093 else 1094 return GuardBlock->size() == 1; 1095 } 1096 1097 return false; 1098 } 1099 1100 bool isEmptyPreheader(const FusionCandidate &FC) const { 1101 assert(FC.Preheader && "Expecting a valid preheader"); 1102 return FC.Preheader->size() == 1; 1103 } 1104 1105 bool isEmptyExitBlock(const FusionCandidate &FC) const { 1106 assert(FC.ExitBlock && "Expecting a valid exit block"); 1107 return FC.ExitBlock->size() == 1; 1108 } 1109 1110 /// Simplify the condition of the latch branch of \p FC to true, when both of 1111 /// its successors are the same. 1112 void simplifyLatchBranch(const FusionCandidate &FC) const { 1113 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1114 if (FCLatchBranch) { 1115 assert(FCLatchBranch->isConditional() && 1116 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1117 "Expecting the two successors of FCLatchBranch to be the same"); 1118 FCLatchBranch->setCondition( 1119 llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType())); 1120 } 1121 } 1122 1123 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1124 /// successor, then merge FC0.Latch with its unique successor. 1125 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1126 moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1127 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1128 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1129 DTU.flush(); 1130 } 1131 } 1132 1133 /// Fuse two fusion candidates, creating a new fused loop. 1134 /// 1135 /// This method contains the mechanics of fusing two loops, represented by \p 1136 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1137 /// postdominates \p FC0 (making them control flow equivalent). It also 1138 /// assumes that the other conditions for fusion have been met: adjacent, 1139 /// identical trip counts, and no negative distance dependencies exist that 1140 /// would prevent fusion. Thus, there is no checking for these conditions in 1141 /// this method. 1142 /// 1143 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1144 /// components of tho loop. Specifically, the following changes are done: 1145 /// 1146 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1147 /// (because it is currently only a single statement block). 1148 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1149 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1150 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1151 /// 1152 /// All of these modifications are done with dominator tree updates, thus 1153 /// keeping the dominator (and post dominator) information up-to-date. 1154 /// 1155 /// This can be improved in the future by actually merging blocks during 1156 /// fusion. For example, the preheader of \p FC1 can be merged with the 1157 /// preheader of \p FC0. This would allow loops with more than a single 1158 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1159 /// two loops could also be fused into a single block. This will require 1160 /// analysis to prove it is safe to move the contents of the block past 1161 /// existing code, which currently has not been implemented. 1162 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1163 assert(FC0.isValid() && FC1.isValid() && 1164 "Expecting valid fusion candidates"); 1165 1166 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1167 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1168 1169 // Fusing guarded loops is handled slightly differently than non-guarded 1170 // loops and has been broken out into a separate method instead of trying to 1171 // intersperse the logic within a single method. 1172 if (FC0.GuardBranch) 1173 return fuseGuardedLoops(FC0, FC1); 1174 1175 assert(FC1.Preheader == FC0.ExitBlock); 1176 assert(FC1.Preheader->size() == 1 && 1177 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1178 1179 // Remember the phi nodes originally in the header of FC0 in order to rewire 1180 // them later. However, this is only necessary if the new loop carried 1181 // values might not dominate the exiting branch. While we do not generally 1182 // test if this is the case but simply insert intermediate phi nodes, we 1183 // need to make sure these intermediate phi nodes have different 1184 // predecessors. To this end, we filter the special case where the exiting 1185 // block is the latch block of the first loop. Nothing needs to be done 1186 // anyway as all loop carried values dominate the latch and thereby also the 1187 // exiting branch. 1188 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1189 if (FC0.ExitingBlock != FC0.Latch) 1190 for (PHINode &PHI : FC0.Header->phis()) 1191 OriginalFC0PHIs.push_back(&PHI); 1192 1193 // Replace incoming blocks for header PHIs first. 1194 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1195 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1196 1197 // Then modify the control flow and update DT and PDT. 1198 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1199 1200 // The old exiting block of the first loop (FC0) has to jump to the header 1201 // of the second as we need to execute the code in the second header block 1202 // regardless of the trip count. That is, if the trip count is 0, so the 1203 // back edge is never taken, we still have to execute both loop headers, 1204 // especially (but not only!) if the second is a do-while style loop. 1205 // However, doing so might invalidate the phi nodes of the first loop as 1206 // the new values do only need to dominate their latch and not the exiting 1207 // predicate. To remedy this potential problem we always introduce phi 1208 // nodes in the header of the second loop later that select the loop carried 1209 // value, if the second header was reached through an old latch of the 1210 // first, or undef otherwise. This is sound as exiting the first implies the 1211 // second will exit too, __without__ taking the back-edge. [Their 1212 // trip-counts are equal after all. 1213 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1214 // to FC1.Header? I think this is basically what the three sequences are 1215 // trying to accomplish; however, doing this directly in the CFG may mean 1216 // the DT/PDT becomes invalid 1217 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1218 FC1.Header); 1219 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1220 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1221 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1222 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1223 1224 // The pre-header of L1 is not necessary anymore. 1225 assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); 1226 FC1.Preheader->getTerminator()->eraseFromParent(); 1227 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1228 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1229 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1230 1231 // Moves the phi nodes from the second to the first loops header block. 1232 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1233 if (SE.isSCEVable(PHI->getType())) 1234 SE.forgetValue(PHI); 1235 if (PHI->hasNUsesOrMore(1)) 1236 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1237 else 1238 PHI->eraseFromParent(); 1239 } 1240 1241 // Introduce new phi nodes in the second loop header to ensure 1242 // exiting the first and jumping to the header of the second does not break 1243 // the SSA property of the phis originally in the first loop. See also the 1244 // comment above. 1245 Instruction *L1HeaderIP = &FC1.Header->front(); 1246 for (PHINode *LCPHI : OriginalFC0PHIs) { 1247 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1248 assert(L1LatchBBIdx >= 0 && 1249 "Expected loop carried value to be rewired at this point!"); 1250 1251 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1252 1253 PHINode *L1HeaderPHI = PHINode::Create( 1254 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1255 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1256 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1257 FC0.ExitingBlock); 1258 1259 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1260 } 1261 1262 // Replace latch terminator destinations. 1263 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1264 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1265 1266 // Change the condition of FC0 latch branch to true, as both successors of 1267 // the branch are the same. 1268 simplifyLatchBranch(FC0); 1269 1270 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1271 // performed the updates above. 1272 if (FC0.Latch != FC0.ExitingBlock) 1273 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1274 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1275 1276 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1277 FC0.Latch, FC0.Header)); 1278 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1279 FC1.Latch, FC0.Header)); 1280 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1281 FC1.Latch, FC1.Header)); 1282 1283 // Update DT/PDT 1284 DTU.applyUpdates(TreeUpdates); 1285 1286 LI.removeBlock(FC1.Preheader); 1287 DTU.deleteBB(FC1.Preheader); 1288 DTU.flush(); 1289 1290 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1291 // and rebuild the information in subsequent passes of fusion? 1292 // Note: Need to forget the loops before merging the loop latches, as 1293 // mergeLatch may remove the only block in FC1. 1294 SE.forgetLoop(FC1.L); 1295 SE.forgetLoop(FC0.L); 1296 1297 // Move instructions from FC0.Latch to FC1.Latch. 1298 // Note: mergeLatch requires an updated DT. 1299 mergeLatch(FC0, FC1); 1300 1301 // Merge the loops. 1302 SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), 1303 FC1.L->block_end()); 1304 for (BasicBlock *BB : Blocks) { 1305 FC0.L->addBlockEntry(BB); 1306 FC1.L->removeBlockFromLoop(BB); 1307 if (LI.getLoopFor(BB) != FC1.L) 1308 continue; 1309 LI.changeLoopFor(BB, FC0.L); 1310 } 1311 while (!FC1.L->empty()) { 1312 const auto &ChildLoopIt = FC1.L->begin(); 1313 Loop *ChildLoop = *ChildLoopIt; 1314 FC1.L->removeChildLoop(ChildLoopIt); 1315 FC0.L->addChildLoop(ChildLoop); 1316 } 1317 1318 // Delete the now empty loop L1. 1319 LI.erase(FC1.L); 1320 1321 #ifndef NDEBUG 1322 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1323 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1324 assert(PDT.verify()); 1325 LI.verify(DT); 1326 SE.verify(); 1327 #endif 1328 1329 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1330 1331 return FC0.L; 1332 } 1333 1334 /// Report details on loop fusion opportunities. 1335 /// 1336 /// This template function can be used to report both successful and missed 1337 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1338 /// be one of: 1339 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1340 /// given two valid fusion candidates. 1341 /// - OptimizationRemark to report successful fusion of two fusion 1342 /// candidates. 1343 /// The remarks will be printed using the form: 1344 /// <path/filename>:<line number>:<column number>: [<function name>]: 1345 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1346 template <typename RemarkKind> 1347 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1348 llvm::Statistic &Stat) { 1349 assert(FC0.Preheader && FC1.Preheader && 1350 "Expecting valid fusion candidates"); 1351 using namespace ore; 1352 ++Stat; 1353 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1354 FC0.Preheader) 1355 << "[" << FC0.Preheader->getParent()->getName() 1356 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1357 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1358 << ": " << Stat.getDesc()); 1359 } 1360 1361 /// Fuse two guarded fusion candidates, creating a new fused loop. 1362 /// 1363 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1364 /// loops. The rewiring of the CFG is slightly different though, because of 1365 /// the presence of the guards around the loops and the exit blocks after the 1366 /// loop body. As such, the new loop is rewired as follows: 1367 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1368 /// from the FC1 guard branch. 1369 /// 2. Remove the exit block from FC0 (this exit block should be empty 1370 /// right now). 1371 /// 3. Remove the guard branch for FC1 1372 /// 4. Remove the preheader for FC1. 1373 /// The exit block successor for the latch of FC0 is updated to be the header 1374 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1375 /// be the header of FC0, thus creating the fused loop. 1376 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1377 const FusionCandidate &FC1) { 1378 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1379 1380 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1381 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1382 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1383 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1384 1385 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1386 1387 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1388 1389 //////////////////////////////////////////////////////////////////////////// 1390 // Update the Loop Guard 1391 //////////////////////////////////////////////////////////////////////////// 1392 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1393 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1394 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1395 // executes the new fused loop) and the other path goes to the NonLoopBlock 1396 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1397 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1398 FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, 1399 FC1.Header); 1400 1401 // The guard of FC1 is not necessary anymore. 1402 FC1.GuardBranch->eraseFromParent(); 1403 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1404 1405 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1406 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1407 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1408 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1409 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1410 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1411 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1412 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1413 1414 assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && 1415 "Expecting guard block to have no predecessors"); 1416 assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && 1417 "Expecting guard block to have no successors"); 1418 1419 // Remember the phi nodes originally in the header of FC0 in order to rewire 1420 // them later. However, this is only necessary if the new loop carried 1421 // values might not dominate the exiting branch. While we do not generally 1422 // test if this is the case but simply insert intermediate phi nodes, we 1423 // need to make sure these intermediate phi nodes have different 1424 // predecessors. To this end, we filter the special case where the exiting 1425 // block is the latch block of the first loop. Nothing needs to be done 1426 // anyway as all loop carried values dominate the latch and thereby also the 1427 // exiting branch. 1428 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1429 // (because the loops are rotated. Thus, nothing will ever be added to 1430 // OriginalFC0PHIs. 1431 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1432 if (FC0.ExitingBlock != FC0.Latch) 1433 for (PHINode &PHI : FC0.Header->phis()) 1434 OriginalFC0PHIs.push_back(&PHI); 1435 1436 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1437 1438 // Replace incoming blocks for header PHIs first. 1439 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1440 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1441 1442 // The old exiting block of the first loop (FC0) has to jump to the header 1443 // of the second as we need to execute the code in the second header block 1444 // regardless of the trip count. That is, if the trip count is 0, so the 1445 // back edge is never taken, we still have to execute both loop headers, 1446 // especially (but not only!) if the second is a do-while style loop. 1447 // However, doing so might invalidate the phi nodes of the first loop as 1448 // the new values do only need to dominate their latch and not the exiting 1449 // predicate. To remedy this potential problem we always introduce phi 1450 // nodes in the header of the second loop later that select the loop carried 1451 // value, if the second header was reached through an old latch of the 1452 // first, or undef otherwise. This is sound as exiting the first implies the 1453 // second will exit too, __without__ taking the back-edge (their 1454 // trip-counts are equal after all). 1455 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1456 FC1.Header); 1457 1458 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1459 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1460 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1461 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1462 1463 // Remove FC0 Exit Block 1464 // The exit block for FC0 is no longer needed since control will flow 1465 // directly to the header of FC1. Since it is an empty block, it can be 1466 // removed at this point. 1467 // TODO: In the future, we can handle non-empty exit blocks my merging any 1468 // instructions from FC0 exit block into FC1 exit block prior to removing 1469 // the block. 1470 assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) && 1471 "Expecting exit block to be empty"); 1472 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1473 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1474 1475 // Remove FC1 Preheader 1476 // The pre-header of L1 is not necessary anymore. 1477 assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); 1478 FC1.Preheader->getTerminator()->eraseFromParent(); 1479 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1480 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1481 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1482 1483 // Moves the phi nodes from the second to the first loops header block. 1484 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1485 if (SE.isSCEVable(PHI->getType())) 1486 SE.forgetValue(PHI); 1487 if (PHI->hasNUsesOrMore(1)) 1488 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1489 else 1490 PHI->eraseFromParent(); 1491 } 1492 1493 // Introduce new phi nodes in the second loop header to ensure 1494 // exiting the first and jumping to the header of the second does not break 1495 // the SSA property of the phis originally in the first loop. See also the 1496 // comment above. 1497 Instruction *L1HeaderIP = &FC1.Header->front(); 1498 for (PHINode *LCPHI : OriginalFC0PHIs) { 1499 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1500 assert(L1LatchBBIdx >= 0 && 1501 "Expected loop carried value to be rewired at this point!"); 1502 1503 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1504 1505 PHINode *L1HeaderPHI = PHINode::Create( 1506 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1507 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1508 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1509 FC0.ExitingBlock); 1510 1511 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1512 } 1513 1514 // Update the latches 1515 1516 // Replace latch terminator destinations. 1517 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1518 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1519 1520 // Change the condition of FC0 latch branch to true, as both successors of 1521 // the branch are the same. 1522 simplifyLatchBranch(FC0); 1523 1524 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1525 // performed the updates above. 1526 if (FC0.Latch != FC0.ExitingBlock) 1527 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1528 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1529 1530 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1531 FC0.Latch, FC0.Header)); 1532 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1533 FC1.Latch, FC0.Header)); 1534 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1535 FC1.Latch, FC1.Header)); 1536 1537 // All done 1538 // Apply the updates to the Dominator Tree and cleanup. 1539 1540 assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && 1541 "FC1GuardBlock has successors!!"); 1542 assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && 1543 "FC1GuardBlock has predecessors!!"); 1544 1545 // Update DT/PDT 1546 DTU.applyUpdates(TreeUpdates); 1547 1548 LI.removeBlock(FC1.Preheader); 1549 DTU.deleteBB(FC1.Preheader); 1550 DTU.deleteBB(FC0.ExitBlock); 1551 DTU.flush(); 1552 1553 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1554 // and rebuild the information in subsequent passes of fusion? 1555 // Note: Need to forget the loops before merging the loop latches, as 1556 // mergeLatch may remove the only block in FC1. 1557 SE.forgetLoop(FC1.L); 1558 SE.forgetLoop(FC0.L); 1559 1560 // Move instructions from FC0.Latch to FC1.Latch. 1561 // Note: mergeLatch requires an updated DT. 1562 mergeLatch(FC0, FC1); 1563 1564 // Merge the loops. 1565 SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), 1566 FC1.L->block_end()); 1567 for (BasicBlock *BB : Blocks) { 1568 FC0.L->addBlockEntry(BB); 1569 FC1.L->removeBlockFromLoop(BB); 1570 if (LI.getLoopFor(BB) != FC1.L) 1571 continue; 1572 LI.changeLoopFor(BB, FC0.L); 1573 } 1574 while (!FC1.L->empty()) { 1575 const auto &ChildLoopIt = FC1.L->begin(); 1576 Loop *ChildLoop = *ChildLoopIt; 1577 FC1.L->removeChildLoop(ChildLoopIt); 1578 FC0.L->addChildLoop(ChildLoop); 1579 } 1580 1581 // Delete the now empty loop L1. 1582 LI.erase(FC1.L); 1583 1584 #ifndef NDEBUG 1585 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1586 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1587 assert(PDT.verify()); 1588 LI.verify(DT); 1589 SE.verify(); 1590 #endif 1591 1592 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1593 1594 return FC0.L; 1595 } 1596 }; 1597 1598 struct LoopFuseLegacy : public FunctionPass { 1599 1600 static char ID; 1601 1602 LoopFuseLegacy() : FunctionPass(ID) { 1603 initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); 1604 } 1605 1606 void getAnalysisUsage(AnalysisUsage &AU) const override { 1607 AU.addRequiredID(LoopSimplifyID); 1608 AU.addRequired<ScalarEvolutionWrapperPass>(); 1609 AU.addRequired<LoopInfoWrapperPass>(); 1610 AU.addRequired<DominatorTreeWrapperPass>(); 1611 AU.addRequired<PostDominatorTreeWrapperPass>(); 1612 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1613 AU.addRequired<DependenceAnalysisWrapperPass>(); 1614 1615 AU.addPreserved<ScalarEvolutionWrapperPass>(); 1616 AU.addPreserved<LoopInfoWrapperPass>(); 1617 AU.addPreserved<DominatorTreeWrapperPass>(); 1618 AU.addPreserved<PostDominatorTreeWrapperPass>(); 1619 } 1620 1621 bool runOnFunction(Function &F) override { 1622 if (skipFunction(F)) 1623 return false; 1624 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1625 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1626 auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 1627 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1628 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); 1629 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1630 1631 const DataLayout &DL = F.getParent()->getDataLayout(); 1632 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); 1633 return LF.fuseLoops(F); 1634 } 1635 }; 1636 } // namespace 1637 1638 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 1639 auto &LI = AM.getResult<LoopAnalysis>(F); 1640 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 1641 auto &DI = AM.getResult<DependenceAnalysis>(F); 1642 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 1643 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 1644 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1645 1646 const DataLayout &DL = F.getParent()->getDataLayout(); 1647 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); 1648 bool Changed = LF.fuseLoops(F); 1649 if (!Changed) 1650 return PreservedAnalyses::all(); 1651 1652 PreservedAnalyses PA; 1653 PA.preserve<DominatorTreeAnalysis>(); 1654 PA.preserve<PostDominatorTreeAnalysis>(); 1655 PA.preserve<ScalarEvolutionAnalysis>(); 1656 PA.preserve<LoopAnalysis>(); 1657 return PA; 1658 } 1659 1660 char LoopFuseLegacy::ID = 0; 1661 1662 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, 1663 false) 1664 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) 1665 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1666 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1667 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1668 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1669 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1670 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) 1671 1672 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } 1673