1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/AssumptionCache.h" 50 #include "llvm/Analysis/DependenceAnalysis.h" 51 #include "llvm/Analysis/DomTreeUpdater.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 54 #include "llvm/Analysis/PostDominators.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57 #include "llvm/Analysis/TargetTransformInfo.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/Verifier.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Pass.h" 62 #include "llvm/Support/CommandLine.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Transforms/Scalar.h" 66 #include "llvm/Transforms/Utils.h" 67 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 68 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 69 #include "llvm/Transforms/Utils/LoopPeel.h" 70 #include "llvm/Transforms/Utils/LoopSimplify.h" 71 72 using namespace llvm; 73 74 #define DEBUG_TYPE "loop-fusion" 75 76 STATISTIC(FuseCounter, "Loops fused"); 77 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 78 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 79 STATISTIC(InvalidHeader, "Loop has invalid header"); 80 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 81 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 82 STATISTIC(InvalidLatch, "Loop has invalid latch"); 83 STATISTIC(InvalidLoop, "Loop is invalid"); 84 STATISTIC(AddressTakenBB, "Basic block has address taken"); 85 STATISTIC(MayThrowException, "Loop may throw an exception"); 86 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 87 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 88 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 89 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 90 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 91 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 92 STATISTIC(NonAdjacent, "Loops are not adjacent"); 93 STATISTIC( 94 NonEmptyPreheader, 95 "Loop has a non-empty preheader with instructions that cannot be moved"); 96 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 97 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 98 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 99 "instructions that cannot be moved"); 100 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 101 "instructions that cannot be moved"); 102 STATISTIC(NotRotated, "Candidate is not rotated"); 103 STATISTIC(OnlySecondCandidateIsGuarded, 104 "The second candidate is guarded while the first one is not"); 105 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); 106 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); 107 108 enum FusionDependenceAnalysisChoice { 109 FUSION_DEPENDENCE_ANALYSIS_SCEV, 110 FUSION_DEPENDENCE_ANALYSIS_DA, 111 FUSION_DEPENDENCE_ANALYSIS_ALL, 112 }; 113 114 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 115 "loop-fusion-dependence-analysis", 116 cl::desc("Which dependence analysis should loop fusion use?"), 117 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 118 "Use the scalar evolution interface"), 119 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 120 "Use the dependence analysis interface"), 121 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 122 "Use all available analyses")), 123 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); 124 125 static cl::opt<unsigned> FusionPeelMaxCount( 126 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 127 cl::desc("Max number of iterations to be peeled from a loop, such that " 128 "fusion can take place")); 129 130 #ifndef NDEBUG 131 static cl::opt<bool> 132 VerboseFusionDebugging("loop-fusion-verbose-debug", 133 cl::desc("Enable verbose debugging for Loop Fusion"), 134 cl::Hidden, cl::init(false)); 135 #endif 136 137 namespace { 138 /// This class is used to represent a candidate for loop fusion. When it is 139 /// constructed, it checks the conditions for loop fusion to ensure that it 140 /// represents a valid candidate. It caches several parts of a loop that are 141 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 142 /// of continually querying the underlying Loop to retrieve these values. It is 143 /// assumed these will not change throughout loop fusion. 144 /// 145 /// The invalidate method should be used to indicate that the FusionCandidate is 146 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 147 /// be used to ensure that the FusionCandidate is still valid for fusion. 148 struct FusionCandidate { 149 /// Cache of parts of the loop used throughout loop fusion. These should not 150 /// need to change throughout the analysis and transformation. 151 /// These parts are cached to avoid repeatedly looking up in the Loop class. 152 153 /// Preheader of the loop this candidate represents 154 BasicBlock *Preheader; 155 /// Header of the loop this candidate represents 156 BasicBlock *Header; 157 /// Blocks in the loop that exit the loop 158 BasicBlock *ExitingBlock; 159 /// The successor block of this loop (where the exiting blocks go to) 160 BasicBlock *ExitBlock; 161 /// Latch of the loop 162 BasicBlock *Latch; 163 /// The loop that this fusion candidate represents 164 Loop *L; 165 /// Vector of instructions in this loop that read from memory 166 SmallVector<Instruction *, 16> MemReads; 167 /// Vector of instructions in this loop that write to memory 168 SmallVector<Instruction *, 16> MemWrites; 169 /// Are all of the members of this fusion candidate still valid 170 bool Valid; 171 /// Guard branch of the loop, if it exists 172 BranchInst *GuardBranch; 173 /// Peeling Paramaters of the Loop. 174 TTI::PeelingPreferences PP; 175 /// Can you Peel this Loop? 176 bool AbleToPeel; 177 /// Has this loop been Peeled 178 bool Peeled; 179 180 /// Dominator and PostDominator trees are needed for the 181 /// FusionCandidateCompare function, required by FusionCandidateSet to 182 /// determine where the FusionCandidate should be inserted into the set. These 183 /// are used to establish ordering of the FusionCandidates based on dominance. 184 DominatorTree &DT; 185 const PostDominatorTree *PDT; 186 187 OptimizationRemarkEmitter &ORE; 188 189 FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, 190 OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) 191 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 192 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 193 Latch(L->getLoopLatch()), L(L), Valid(true), 194 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 195 Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 196 197 // Walk over all blocks in the loop and check for conditions that may 198 // prevent fusion. For each block, walk over all instructions and collect 199 // the memory reads and writes If any instructions that prevent fusion are 200 // found, invalidate this object and return. 201 for (BasicBlock *BB : L->blocks()) { 202 if (BB->hasAddressTaken()) { 203 invalidate(); 204 reportInvalidCandidate(AddressTakenBB); 205 return; 206 } 207 208 for (Instruction &I : *BB) { 209 if (I.mayThrow()) { 210 invalidate(); 211 reportInvalidCandidate(MayThrowException); 212 return; 213 } 214 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 215 if (SI->isVolatile()) { 216 invalidate(); 217 reportInvalidCandidate(ContainsVolatileAccess); 218 return; 219 } 220 } 221 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 222 if (LI->isVolatile()) { 223 invalidate(); 224 reportInvalidCandidate(ContainsVolatileAccess); 225 return; 226 } 227 } 228 if (I.mayWriteToMemory()) 229 MemWrites.push_back(&I); 230 if (I.mayReadFromMemory()) 231 MemReads.push_back(&I); 232 } 233 } 234 } 235 236 /// Check if all members of the class are valid. 237 bool isValid() const { 238 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 239 !L->isInvalid() && Valid; 240 } 241 242 /// Verify that all members are in sync with the Loop object. 243 void verify() const { 244 assert(isValid() && "Candidate is not valid!!"); 245 assert(!L->isInvalid() && "Loop is invalid!"); 246 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 247 assert(Header == L->getHeader() && "Header is out of sync"); 248 assert(ExitingBlock == L->getExitingBlock() && 249 "Exiting Blocks is out of sync"); 250 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 251 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 252 } 253 254 /// Get the entry block for this fusion candidate. 255 /// 256 /// If this fusion candidate represents a guarded loop, the entry block is the 257 /// loop guard block. If it represents an unguarded loop, the entry block is 258 /// the preheader of the loop. 259 BasicBlock *getEntryBlock() const { 260 if (GuardBranch) 261 return GuardBranch->getParent(); 262 else 263 return Preheader; 264 } 265 266 /// After Peeling the loop is modified quite a bit, hence all of the Blocks 267 /// need to be updated accordingly. 268 void updateAfterPeeling() { 269 Preheader = L->getLoopPreheader(); 270 Header = L->getHeader(); 271 ExitingBlock = L->getExitingBlock(); 272 ExitBlock = L->getExitBlock(); 273 Latch = L->getLoopLatch(); 274 verify(); 275 } 276 277 /// Given a guarded loop, get the successor of the guard that is not in the 278 /// loop. 279 /// 280 /// This method returns the successor of the loop guard that is not located 281 /// within the loop (i.e., the successor of the guard that is not the 282 /// preheader). 283 /// This method is only valid for guarded loops. 284 BasicBlock *getNonLoopBlock() const { 285 assert(GuardBranch && "Only valid on guarded loops."); 286 assert(GuardBranch->isConditional() && 287 "Expecting guard to be a conditional branch."); 288 if (Peeled) 289 return GuardBranch->getSuccessor(1); 290 return (GuardBranch->getSuccessor(0) == Preheader) 291 ? GuardBranch->getSuccessor(1) 292 : GuardBranch->getSuccessor(0); 293 } 294 295 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 296 LLVM_DUMP_METHOD void dump() const { 297 dbgs() << "\tGuardBranch: "; 298 if (GuardBranch) 299 dbgs() << *GuardBranch; 300 else 301 dbgs() << "nullptr"; 302 dbgs() << "\n" 303 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 304 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 305 << "\n" 306 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 307 << "\tExitingBB: " 308 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 309 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 310 << "\n" 311 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 312 << "\tEntryBlock: " 313 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 314 << "\n"; 315 } 316 #endif 317 318 /// Determine if a fusion candidate (representing a loop) is eligible for 319 /// fusion. Note that this only checks whether a single loop can be fused - it 320 /// does not check whether it is *legal* to fuse two loops together. 321 bool isEligibleForFusion(ScalarEvolution &SE) const { 322 if (!isValid()) { 323 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 324 if (!Preheader) 325 ++InvalidPreheader; 326 if (!Header) 327 ++InvalidHeader; 328 if (!ExitingBlock) 329 ++InvalidExitingBlock; 330 if (!ExitBlock) 331 ++InvalidExitBlock; 332 if (!Latch) 333 ++InvalidLatch; 334 if (L->isInvalid()) 335 ++InvalidLoop; 336 337 return false; 338 } 339 340 // Require ScalarEvolution to be able to determine a trip count. 341 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 342 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 343 << " trip count not computable!\n"); 344 return reportInvalidCandidate(UnknownTripCount); 345 } 346 347 if (!L->isLoopSimplifyForm()) { 348 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 349 << " is not in simplified form!\n"); 350 return reportInvalidCandidate(NotSimplifiedForm); 351 } 352 353 if (!L->isRotatedForm()) { 354 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 355 return reportInvalidCandidate(NotRotated); 356 } 357 358 return true; 359 } 360 361 private: 362 // This is only used internally for now, to clear the MemWrites and MemReads 363 // list and setting Valid to false. I can't envision other uses of this right 364 // now, since once FusionCandidates are put into the FusionCandidateSet they 365 // are immutable. Thus, any time we need to change/update a FusionCandidate, 366 // we must create a new one and insert it into the FusionCandidateSet to 367 // ensure the FusionCandidateSet remains ordered correctly. 368 void invalidate() { 369 MemWrites.clear(); 370 MemReads.clear(); 371 Valid = false; 372 } 373 374 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 375 using namespace ore; 376 assert(L && Preheader && "Fusion candidate not initialized properly!"); 377 #if LLVM_ENABLE_STATS 378 ++Stat; 379 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 380 L->getStartLoc(), Preheader) 381 << "[" << Preheader->getParent()->getName() << "]: " 382 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 383 #endif 384 return false; 385 } 386 }; 387 388 struct FusionCandidateCompare { 389 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 390 /// into dominance order. 391 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 392 /// If RHS dominates LHS and LHS post-dominates RHS, return false; 393 /// If both LHS and RHS are not dominating each other then, non-strictly 394 /// post dominate check will decide the order of candidates. If RHS 395 /// non-strictly post dominates LHS then, return true. If LHS non-strictly 396 /// post dominates RHS then, return false. If both are non-strictly post 397 /// dominate each other then, level in the post dominator tree will decide 398 /// the order of candidates. 399 bool operator()(const FusionCandidate &LHS, 400 const FusionCandidate &RHS) const { 401 const DominatorTree *DT = &(LHS.DT); 402 403 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 404 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 405 406 // Do not save PDT to local variable as it is only used in asserts and thus 407 // will trigger an unused variable warning if building without asserts. 408 assert(DT && LHS.PDT && "Expecting valid dominator tree"); 409 410 // Do this compare first so if LHS == RHS, function returns false. 411 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 412 // RHS dominates LHS 413 // Verify LHS post-dominates RHS 414 assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 415 return false; 416 } 417 418 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 419 // Verify RHS Postdominates LHS 420 assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 421 return true; 422 } 423 424 // If two FusionCandidates are in the same level of dominator tree, 425 // they will not dominate each other, but may still be control flow 426 // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() 427 // function is needed. 428 bool WrongOrder = 429 nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT); 430 bool RightOrder = 431 nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT); 432 if (WrongOrder && RightOrder) { 433 // If common predecessor of LHS and RHS post dominates both 434 // FusionCandidates then, Order of FusionCandidate can be 435 // identified by its level in post dominator tree. 436 DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock); 437 DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock); 438 return LNode->getLevel() > RNode->getLevel(); 439 } else if (WrongOrder) 440 return false; 441 else if (RightOrder) 442 return true; 443 444 // If LHS does not non-strict Postdominate RHS and RHS does not non-strict 445 // Postdominate LHS then, there is no dominance relationship between the 446 // two FusionCandidates. Thus, they should not be in the same set together. 447 llvm_unreachable( 448 "No dominance relationship between these fusion candidates!"); 449 } 450 }; 451 452 using LoopVector = SmallVector<Loop *, 4>; 453 454 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 455 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 456 // dominates FC1 and FC1 post-dominates FC0. 457 // std::set was chosen because we want a sorted data structure with stable 458 // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent 459 // loops by moving intervening code around. When this intervening code contains 460 // loops, those loops will be moved also. The corresponding FusionCandidates 461 // will also need to be moved accordingly. As this is done, having stable 462 // iterators will simplify the logic. Similarly, having an efficient insert that 463 // keeps the FusionCandidateSet sorted will also simplify the implementation. 464 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 465 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 466 467 #if !defined(NDEBUG) 468 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 469 const FusionCandidate &FC) { 470 if (FC.isValid()) 471 OS << FC.Preheader->getName(); 472 else 473 OS << "<Invalid>"; 474 475 return OS; 476 } 477 478 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 479 const FusionCandidateSet &CandSet) { 480 for (const FusionCandidate &FC : CandSet) 481 OS << FC << '\n'; 482 483 return OS; 484 } 485 486 static void 487 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 488 dbgs() << "Fusion Candidates: \n"; 489 for (const auto &CandidateSet : FusionCandidates) { 490 dbgs() << "*** Fusion Candidate Set ***\n"; 491 dbgs() << CandidateSet; 492 dbgs() << "****************************\n"; 493 } 494 } 495 #endif 496 497 /// Collect all loops in function at the same nest level, starting at the 498 /// outermost level. 499 /// 500 /// This data structure collects all loops at the same nest level for a 501 /// given function (specified by the LoopInfo object). It starts at the 502 /// outermost level. 503 struct LoopDepthTree { 504 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 505 using iterator = LoopsOnLevelTy::iterator; 506 using const_iterator = LoopsOnLevelTy::const_iterator; 507 508 LoopDepthTree(LoopInfo &LI) : Depth(1) { 509 if (!LI.empty()) 510 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 511 } 512 513 /// Test whether a given loop has been removed from the function, and thus is 514 /// no longer valid. 515 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 516 517 /// Record that a given loop has been removed from the function and is no 518 /// longer valid. 519 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 520 521 /// Descend the tree to the next (inner) nesting level 522 void descend() { 523 LoopsOnLevelTy LoopsOnNextLevel; 524 525 for (const LoopVector &LV : *this) 526 for (Loop *L : LV) 527 if (!isRemovedLoop(L) && L->begin() != L->end()) 528 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 529 530 LoopsOnLevel = LoopsOnNextLevel; 531 RemovedLoops.clear(); 532 Depth++; 533 } 534 535 bool empty() const { return size() == 0; } 536 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 537 unsigned getDepth() const { return Depth; } 538 539 iterator begin() { return LoopsOnLevel.begin(); } 540 iterator end() { return LoopsOnLevel.end(); } 541 const_iterator begin() const { return LoopsOnLevel.begin(); } 542 const_iterator end() const { return LoopsOnLevel.end(); } 543 544 private: 545 /// Set of loops that have been removed from the function and are no longer 546 /// valid. 547 SmallPtrSet<const Loop *, 8> RemovedLoops; 548 549 /// Depth of the current level, starting at 1 (outermost loops). 550 unsigned Depth; 551 552 /// Vector of loops at the current depth level that have the same parent loop 553 LoopsOnLevelTy LoopsOnLevel; 554 }; 555 556 #ifndef NDEBUG 557 static void printLoopVector(const LoopVector &LV) { 558 dbgs() << "****************************\n"; 559 for (auto *L : LV) 560 printLoop(*L, dbgs()); 561 dbgs() << "****************************\n"; 562 } 563 #endif 564 565 struct LoopFuser { 566 private: 567 // Sets of control flow equivalent fusion candidates for a given nest level. 568 FusionCandidateCollection FusionCandidates; 569 570 LoopDepthTree LDT; 571 DomTreeUpdater DTU; 572 573 LoopInfo &LI; 574 DominatorTree &DT; 575 DependenceInfo &DI; 576 ScalarEvolution &SE; 577 PostDominatorTree &PDT; 578 OptimizationRemarkEmitter &ORE; 579 AssumptionCache &AC; 580 const TargetTransformInfo &TTI; 581 582 public: 583 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 584 ScalarEvolution &SE, PostDominatorTree &PDT, 585 OptimizationRemarkEmitter &ORE, const DataLayout &DL, 586 AssumptionCache &AC, const TargetTransformInfo &TTI) 587 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 588 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 589 590 /// This is the main entry point for loop fusion. It will traverse the 591 /// specified function and collect candidate loops to fuse, starting at the 592 /// outermost nesting level and working inwards. 593 bool fuseLoops(Function &F) { 594 #ifndef NDEBUG 595 if (VerboseFusionDebugging) { 596 LI.print(dbgs()); 597 } 598 #endif 599 600 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 601 << "\n"); 602 bool Changed = false; 603 604 while (!LDT.empty()) { 605 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 606 << LDT.getDepth() << "\n";); 607 608 for (const LoopVector &LV : LDT) { 609 assert(LV.size() > 0 && "Empty loop set was build!"); 610 611 // Skip singleton loop sets as they do not offer fusion opportunities on 612 // this level. 613 if (LV.size() == 1) 614 continue; 615 #ifndef NDEBUG 616 if (VerboseFusionDebugging) { 617 LLVM_DEBUG({ 618 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 619 printLoopVector(LV); 620 }); 621 } 622 #endif 623 624 collectFusionCandidates(LV); 625 Changed |= fuseCandidates(); 626 } 627 628 // Finished analyzing candidates at this level. 629 // Descend to the next level and clear all of the candidates currently 630 // collected. Note that it will not be possible to fuse any of the 631 // existing candidates with new candidates because the new candidates will 632 // be at a different nest level and thus not be control flow equivalent 633 // with all of the candidates collected so far. 634 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 635 LDT.descend(); 636 FusionCandidates.clear(); 637 } 638 639 if (Changed) 640 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 641 642 #ifndef NDEBUG 643 assert(DT.verify()); 644 assert(PDT.verify()); 645 LI.verify(DT); 646 SE.verify(); 647 #endif 648 649 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 650 return Changed; 651 } 652 653 private: 654 /// Determine if two fusion candidates are control flow equivalent. 655 /// 656 /// Two fusion candidates are control flow equivalent if when one executes, 657 /// the other is guaranteed to execute. This is determined using dominators 658 /// and post-dominators: if A dominates B and B post-dominates A then A and B 659 /// are control-flow equivalent. 660 bool isControlFlowEquivalent(const FusionCandidate &FC0, 661 const FusionCandidate &FC1) const { 662 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 663 664 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 665 DT, PDT); 666 } 667 668 /// Iterate over all loops in the given loop set and identify the loops that 669 /// are eligible for fusion. Place all eligible fusion candidates into Control 670 /// Flow Equivalent sets, sorted by dominance. 671 void collectFusionCandidates(const LoopVector &LV) { 672 for (Loop *L : LV) { 673 TTI::PeelingPreferences PP = 674 gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt); 675 FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); 676 if (!CurrCand.isEligibleForFusion(SE)) 677 continue; 678 679 // Go through each list in FusionCandidates and determine if L is control 680 // flow equivalent with the first loop in that list. If it is, append LV. 681 // If not, go to the next list. 682 // If no suitable list is found, start another list and add it to 683 // FusionCandidates. 684 bool FoundSet = false; 685 686 for (auto &CurrCandSet : FusionCandidates) { 687 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 688 CurrCandSet.insert(CurrCand); 689 FoundSet = true; 690 #ifndef NDEBUG 691 if (VerboseFusionDebugging) 692 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 693 << " to existing candidate set\n"); 694 #endif 695 break; 696 } 697 } 698 if (!FoundSet) { 699 // No set was found. Create a new set and add to FusionCandidates 700 #ifndef NDEBUG 701 if (VerboseFusionDebugging) 702 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 703 #endif 704 FusionCandidateSet NewCandSet; 705 NewCandSet.insert(CurrCand); 706 FusionCandidates.push_back(NewCandSet); 707 } 708 NumFusionCandidates++; 709 } 710 } 711 712 /// Determine if it is beneficial to fuse two loops. 713 /// 714 /// For now, this method simply returns true because we want to fuse as much 715 /// as possible (primarily to test the pass). This method will evolve, over 716 /// time, to add heuristics for profitability of fusion. 717 bool isBeneficialFusion(const FusionCandidate &FC0, 718 const FusionCandidate &FC1) { 719 return true; 720 } 721 722 /// Determine if two fusion candidates have the same trip count (i.e., they 723 /// execute the same number of iterations). 724 /// 725 /// This function will return a pair of values. The first is a boolean, 726 /// stating whether or not the two candidates are known at compile time to 727 /// have the same TripCount. The second is the difference in the two 728 /// TripCounts. This information can be used later to determine whether or not 729 /// peeling can be performed on either one of the candidates. 730 std::pair<bool, std::optional<unsigned>> 731 haveIdenticalTripCounts(const FusionCandidate &FC0, 732 const FusionCandidate &FC1) const { 733 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 734 if (isa<SCEVCouldNotCompute>(TripCount0)) { 735 UncomputableTripCount++; 736 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 737 return {false, std::nullopt}; 738 } 739 740 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 741 if (isa<SCEVCouldNotCompute>(TripCount1)) { 742 UncomputableTripCount++; 743 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 744 return {false, std::nullopt}; 745 } 746 747 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 748 << *TripCount1 << " are " 749 << (TripCount0 == TripCount1 ? "identical" : "different") 750 << "\n"); 751 752 if (TripCount0 == TripCount1) 753 return {true, 0}; 754 755 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 756 "determining the difference between trip counts\n"); 757 758 // Currently only considering loops with a single exit point 759 // and a non-constant trip count. 760 const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 761 const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 762 763 // If any of the tripcounts are zero that means that loop(s) do not have 764 // a single exit or a constant tripcount. 765 if (TC0 == 0 || TC1 == 0) { 766 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 767 "have a constant number of iterations. Peeling " 768 "is not benefical\n"); 769 return {false, std::nullopt}; 770 } 771 772 std::optional<unsigned> Difference; 773 int Diff = TC0 - TC1; 774 775 if (Diff > 0) 776 Difference = Diff; 777 else { 778 LLVM_DEBUG( 779 dbgs() << "Difference is less than 0. FC1 (second loop) has more " 780 "iterations than the first one. Currently not supported\n"); 781 } 782 783 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 784 << "\n"); 785 786 return {false, Difference}; 787 } 788 789 void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 790 unsigned PeelCount) { 791 assert(FC0.AbleToPeel && "Should be able to peel loop"); 792 793 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 794 << " iterations of the first loop. \n"); 795 796 ValueToValueMapTy VMap; 797 FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap); 798 if (FC0.Peeled) { 799 LLVM_DEBUG(dbgs() << "Done Peeling\n"); 800 801 #ifndef NDEBUG 802 auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 803 804 assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 805 "Loops should have identical trip counts after peeling"); 806 #endif 807 808 FC0.PP.PeelCount += PeelCount; 809 810 // Peeling does not update the PDT 811 PDT.recalculate(*FC0.Preheader->getParent()); 812 813 FC0.updateAfterPeeling(); 814 815 // In this case the iterations of the loop are constant, so the first 816 // loop will execute completely (will not jump from one of 817 // the peeled blocks to the second loop). Here we are updating the 818 // branch conditions of each of the peeled blocks, such that it will 819 // branch to its successor which is not the preheader of the second loop 820 // in the case of unguarded loops, or the succesors of the exit block of 821 // the first loop otherwise. Doing this update will ensure that the entry 822 // block of the first loop dominates the entry block of the second loop. 823 BasicBlock *BB = 824 FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 825 if (BB) { 826 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 827 SmallVector<Instruction *, 8> WorkList; 828 for (BasicBlock *Pred : predecessors(BB)) { 829 if (Pred != FC0.ExitBlock) { 830 WorkList.emplace_back(Pred->getTerminator()); 831 TreeUpdates.emplace_back( 832 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 833 } 834 } 835 // Cannot modify the predecessors inside the above loop as it will cause 836 // the iterators to be nullptrs, causing memory errors. 837 for (Instruction *CurrentBranch : WorkList) { 838 BasicBlock *Succ = CurrentBranch->getSuccessor(0); 839 if (Succ == BB) 840 Succ = CurrentBranch->getSuccessor(1); 841 ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); 842 } 843 844 DTU.applyUpdates(TreeUpdates); 845 DTU.flush(); 846 } 847 LLVM_DEBUG( 848 dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 849 << " iterations from the first loop.\n" 850 "Both Loops have the same number of iterations now.\n"); 851 } 852 } 853 854 /// Walk each set of control flow equivalent fusion candidates and attempt to 855 /// fuse them. This does a single linear traversal of all candidates in the 856 /// set. The conditions for legal fusion are checked at this point. If a pair 857 /// of fusion candidates passes all legality checks, they are fused together 858 /// and a new fusion candidate is created and added to the FusionCandidateSet. 859 /// The original fusion candidates are then removed, as they are no longer 860 /// valid. 861 bool fuseCandidates() { 862 bool Fused = false; 863 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 864 for (auto &CandidateSet : FusionCandidates) { 865 if (CandidateSet.size() < 2) 866 continue; 867 868 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 869 << CandidateSet << "\n"); 870 871 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 872 assert(!LDT.isRemovedLoop(FC0->L) && 873 "Should not have removed loops in CandidateSet!"); 874 auto FC1 = FC0; 875 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 876 assert(!LDT.isRemovedLoop(FC1->L) && 877 "Should not have removed loops in CandidateSet!"); 878 879 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 880 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 881 882 FC0->verify(); 883 FC1->verify(); 884 885 // Check if the candidates have identical tripcounts (first value of 886 // pair), and if not check the difference in the tripcounts between 887 // the loops (second value of pair). The difference is not equal to 888 // std::nullopt iff the loops iterate a constant number of times, and 889 // have a single exit. 890 std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = 891 haveIdenticalTripCounts(*FC0, *FC1); 892 bool SameTripCount = IdenticalTripCountRes.first; 893 std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; 894 895 // Here we are checking that FC0 (the first loop) can be peeled, and 896 // both loops have different tripcounts. 897 if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 898 if (*TCDifference > FusionPeelMaxCount) { 899 LLVM_DEBUG(dbgs() 900 << "Difference in loop trip counts: " << *TCDifference 901 << " is greater than maximum peel count specificed: " 902 << FusionPeelMaxCount << "\n"); 903 } else { 904 // Dependent on peeling being performed on the first loop, and 905 // assuming all other conditions for fusion return true. 906 SameTripCount = true; 907 } 908 } 909 910 if (!SameTripCount) { 911 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 912 "counts. Not fusing.\n"); 913 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 914 NonEqualTripCount); 915 continue; 916 } 917 918 if (!isAdjacent(*FC0, *FC1)) { 919 LLVM_DEBUG(dbgs() 920 << "Fusion candidates are not adjacent. Not fusing.\n"); 921 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 922 continue; 923 } 924 925 if ((!FC0->GuardBranch && FC1->GuardBranch) || 926 (FC0->GuardBranch && !FC1->GuardBranch)) { 927 LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " 928 "another one is not. Not fusing.\n"); 929 reportLoopFusion<OptimizationRemarkMissed>( 930 *FC0, *FC1, OnlySecondCandidateIsGuarded); 931 continue; 932 } 933 934 // Ensure that FC0 and FC1 have identical guards. 935 // If one (or both) are not guarded, this check is not necessary. 936 if (FC0->GuardBranch && FC1->GuardBranch && 937 !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 938 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 939 "guards. Not Fusing.\n"); 940 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 941 NonIdenticalGuards); 942 continue; 943 } 944 945 if (FC0->GuardBranch) { 946 assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 947 948 if (!isSafeToMoveBefore(*FC0->ExitBlock, 949 *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 950 &PDT, &DI)) { 951 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 952 "instructions in exit block. Not fusing.\n"); 953 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 954 NonEmptyExitBlock); 955 continue; 956 } 957 958 if (!isSafeToMoveBefore( 959 *FC1->GuardBranch->getParent(), 960 *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 961 &DI)) { 962 LLVM_DEBUG(dbgs() 963 << "Fusion candidate contains unsafe " 964 "instructions in guard block. Not fusing.\n"); 965 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 966 NonEmptyGuardBlock); 967 continue; 968 } 969 } 970 971 // Check the dependencies across the loops and do not fuse if it would 972 // violate them. 973 if (!dependencesAllowFusion(*FC0, *FC1)) { 974 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 975 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 976 InvalidDependencies); 977 continue; 978 } 979 980 // If the second loop has instructions in the pre-header, attempt to 981 // hoist them up to the first loop's pre-header or sink them into the 982 // body of the second loop. 983 SmallVector<Instruction *, 4> SafeToHoist; 984 SmallVector<Instruction *, 4> SafeToSink; 985 // At this point, this is the last remaining legality check. 986 // Which means if we can make this pre-header empty, we can fuse 987 // these loops 988 if (!isEmptyPreheader(*FC1)) { 989 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " 990 "preheader.\n"); 991 992 // If it is not safe to hoist/sink all instructions in the 993 // pre-header, we cannot fuse these loops. 994 if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, 995 SafeToSink)) { 996 LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " 997 "Fusion Candidate Pre-header.\n" 998 << "Not Fusing.\n"); 999 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 1000 NonEmptyPreheader); 1001 continue; 1002 } 1003 } 1004 1005 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 1006 LLVM_DEBUG(dbgs() 1007 << "\tFusion appears to be " 1008 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 1009 if (!BeneficialToFuse) { 1010 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 1011 FusionNotBeneficial); 1012 continue; 1013 } 1014 // All analysis has completed and has determined that fusion is legal 1015 // and profitable. At this point, start transforming the code and 1016 // perform fusion. 1017 1018 // Execute the hoist/sink operations on preheader instructions 1019 movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); 1020 1021 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 1022 << *FC1 << "\n"); 1023 1024 FusionCandidate FC0Copy = *FC0; 1025 // Peel the loop after determining that fusion is legal. The Loops 1026 // will still be safe to fuse after the peeling is performed. 1027 bool Peel = TCDifference && *TCDifference > 0; 1028 if (Peel) 1029 peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 1030 1031 // Report fusion to the Optimization Remarks. 1032 // Note this needs to be done *before* performFusion because 1033 // performFusion will change the original loops, making it not 1034 // possible to identify them after fusion is complete. 1035 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 1036 FuseCounter); 1037 1038 FusionCandidate FusedCand( 1039 performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE, 1040 FC0Copy.PP); 1041 FusedCand.verify(); 1042 assert(FusedCand.isEligibleForFusion(SE) && 1043 "Fused candidate should be eligible for fusion!"); 1044 1045 // Notify the loop-depth-tree that these loops are not valid objects 1046 LDT.removeLoop(FC1->L); 1047 1048 CandidateSet.erase(FC0); 1049 CandidateSet.erase(FC1); 1050 1051 auto InsertPos = CandidateSet.insert(FusedCand); 1052 1053 assert(InsertPos.second && 1054 "Unable to insert TargetCandidate in CandidateSet!"); 1055 1056 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 1057 // of the FC1 loop will attempt to fuse the new (fused) loop with the 1058 // remaining candidates in the current candidate set. 1059 FC0 = FC1 = InsertPos.first; 1060 1061 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 1062 << "\n"); 1063 1064 Fused = true; 1065 } 1066 } 1067 } 1068 return Fused; 1069 } 1070 1071 // Returns true if the instruction \p I can be hoisted to the end of the 1072 // preheader of \p FC0. \p SafeToHoist contains the instructions that are 1073 // known to be safe to hoist. The instructions encountered that cannot be 1074 // hoisted are in \p NotHoisting. 1075 // TODO: Move functionality into CodeMoverUtils 1076 bool canHoistInst(Instruction &I, 1077 const SmallVector<Instruction *, 4> &SafeToHoist, 1078 const SmallVector<Instruction *, 4> &NotHoisting, 1079 const FusionCandidate &FC0) const { 1080 const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); 1081 assert(FC0PreheaderTarget && 1082 "Expected single successor for loop preheader."); 1083 1084 for (Use &Op : I.operands()) { 1085 if (auto *OpInst = dyn_cast<Instruction>(Op)) { 1086 bool OpHoisted = is_contained(SafeToHoist, OpInst); 1087 // Check if we have already decided to hoist this operand. In this 1088 // case, it does not dominate FC0 *yet*, but will after we hoist it. 1089 if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { 1090 return false; 1091 } 1092 } 1093 } 1094 1095 // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs 1096 // cannot be hoisted and should be sunk to the exit of the fused loop. 1097 if (isa<PHINode>(I)) 1098 return false; 1099 1100 // If this isn't a memory inst, hoisting is safe 1101 if (!I.mayReadOrWriteMemory()) 1102 return true; 1103 1104 LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); 1105 for (Instruction *NotHoistedInst : NotHoisting) { 1106 if (auto D = DI.depends(&I, NotHoistedInst, true)) { 1107 // Dependency is not read-before-write, write-before-read or 1108 // write-before-write 1109 if (D->isFlow() || D->isAnti() || D->isOutput()) { 1110 LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " 1111 "preheader that is not being hoisted.\n"); 1112 return false; 1113 } 1114 } 1115 } 1116 1117 for (Instruction *ReadInst : FC0.MemReads) { 1118 if (auto D = DI.depends(ReadInst, &I, true)) { 1119 // Dependency is not read-before-write 1120 if (D->isAnti()) { 1121 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); 1122 return false; 1123 } 1124 } 1125 } 1126 1127 for (Instruction *WriteInst : FC0.MemWrites) { 1128 if (auto D = DI.depends(WriteInst, &I, true)) { 1129 // Dependency is not write-before-read or write-before-write 1130 if (D->isFlow() || D->isOutput()) { 1131 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); 1132 return false; 1133 } 1134 } 1135 } 1136 return true; 1137 } 1138 1139 // Returns true if the instruction \p I can be sunk to the top of the exit 1140 // block of \p FC1. 1141 // TODO: Move functionality into CodeMoverUtils 1142 bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { 1143 for (User *U : I.users()) { 1144 if (auto *UI{dyn_cast<Instruction>(U)}) { 1145 // Cannot sink if user in loop 1146 // If FC1 has phi users of this value, we cannot sink it into FC1. 1147 if (FC1.L->contains(UI)) { 1148 // Cannot hoist or sink this instruction. No hoisting/sinking 1149 // should take place, loops should not fuse 1150 return false; 1151 } 1152 } 1153 } 1154 1155 // If this isn't a memory inst, sinking is safe 1156 if (!I.mayReadOrWriteMemory()) 1157 return true; 1158 1159 for (Instruction *ReadInst : FC1.MemReads) { 1160 if (auto D = DI.depends(&I, ReadInst, true)) { 1161 // Dependency is not write-before-read 1162 if (D->isFlow()) { 1163 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); 1164 return false; 1165 } 1166 } 1167 } 1168 1169 for (Instruction *WriteInst : FC1.MemWrites) { 1170 if (auto D = DI.depends(&I, WriteInst, true)) { 1171 // Dependency is not write-before-write or read-before-write 1172 if (D->isOutput() || D->isAnti()) { 1173 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); 1174 return false; 1175 } 1176 } 1177 } 1178 1179 return true; 1180 } 1181 1182 /// Collect instructions in the \p FC1 Preheader that can be hoisted 1183 /// to the \p FC0 Preheader or sunk into the \p FC1 Body 1184 bool collectMovablePreheaderInsts( 1185 const FusionCandidate &FC0, const FusionCandidate &FC1, 1186 SmallVector<Instruction *, 4> &SafeToHoist, 1187 SmallVector<Instruction *, 4> &SafeToSink) const { 1188 BasicBlock *FC1Preheader = FC1.Preheader; 1189 // Save the instructions that are not being hoisted, so we know not to hoist 1190 // mem insts that they dominate. 1191 SmallVector<Instruction *, 4> NotHoisting; 1192 1193 for (Instruction &I : *FC1Preheader) { 1194 // Can't move a branch 1195 if (&I == FC1Preheader->getTerminator()) 1196 continue; 1197 // If the instruction has side-effects, give up. 1198 // TODO: The case of mayReadFromMemory we can handle but requires 1199 // additional work with a dependence analysis so for now we give 1200 // up on memory reads. 1201 if (I.mayThrow() || !I.willReturn()) { 1202 LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); 1203 return false; 1204 } 1205 1206 LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); 1207 1208 if (I.isAtomic() || I.isVolatile()) { 1209 LLVM_DEBUG( 1210 dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n"); 1211 return false; 1212 } 1213 1214 if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { 1215 SafeToHoist.push_back(&I); 1216 LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); 1217 } else { 1218 LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); 1219 NotHoisting.push_back(&I); 1220 1221 if (canSinkInst(I, FC1)) { 1222 SafeToSink.push_back(&I); 1223 LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); 1224 } else { 1225 LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); 1226 return false; 1227 } 1228 } 1229 } 1230 LLVM_DEBUG( 1231 dbgs() << "All preheader instructions could be sunk or hoisted!\n"); 1232 return true; 1233 } 1234 1235 /// Rewrite all additive recurrences in a SCEV to use a new loop. 1236 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 1237 public: 1238 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 1239 bool UseMax = true) 1240 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 1241 NewL(NewL) {} 1242 1243 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 1244 const Loop *ExprL = Expr->getLoop(); 1245 SmallVector<const SCEV *, 2> Operands; 1246 if (ExprL == &OldL) { 1247 append_range(Operands, Expr->operands()); 1248 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 1249 } 1250 1251 if (OldL.contains(ExprL)) { 1252 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 1253 if (!UseMax || !Pos || !Expr->isAffine()) { 1254 Valid = false; 1255 return Expr; 1256 } 1257 return visit(Expr->getStart()); 1258 } 1259 1260 for (const SCEV *Op : Expr->operands()) 1261 Operands.push_back(visit(Op)); 1262 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 1263 } 1264 1265 bool wasValidSCEV() const { return Valid; } 1266 1267 private: 1268 bool Valid, UseMax; 1269 const Loop &OldL, &NewL; 1270 }; 1271 1272 /// Return false if the access functions of \p I0 and \p I1 could cause 1273 /// a negative dependence. 1274 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 1275 Instruction &I1, bool EqualIsInvalid) { 1276 Value *Ptr0 = getLoadStorePointerOperand(&I0); 1277 Value *Ptr1 = getLoadStorePointerOperand(&I1); 1278 if (!Ptr0 || !Ptr1) 1279 return false; 1280 1281 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 1282 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 1283 #ifndef NDEBUG 1284 if (VerboseFusionDebugging) 1285 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 1286 << *SCEVPtr1 << "\n"); 1287 #endif 1288 AddRecLoopReplacer Rewriter(SE, L0, L1); 1289 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 1290 #ifndef NDEBUG 1291 if (VerboseFusionDebugging) 1292 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 1293 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 1294 #endif 1295 if (!Rewriter.wasValidSCEV()) 1296 return false; 1297 1298 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 1299 // L0) and the other is not. We could check if it is monotone and test 1300 // the beginning and end value instead. 1301 1302 BasicBlock *L0Header = L0.getHeader(); 1303 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 1304 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 1305 if (!AddRec) 1306 return false; 1307 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 1308 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 1309 }; 1310 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 1311 return false; 1312 1313 ICmpInst::Predicate Pred = 1314 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 1315 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 1316 #ifndef NDEBUG 1317 if (VerboseFusionDebugging) 1318 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 1319 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 1320 << "\n"); 1321 #endif 1322 return IsAlwaysGE; 1323 } 1324 1325 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 1326 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 1327 /// specified by @p DepChoice are used to determine this. 1328 bool dependencesAllowFusion(const FusionCandidate &FC0, 1329 const FusionCandidate &FC1, Instruction &I0, 1330 Instruction &I1, bool AnyDep, 1331 FusionDependenceAnalysisChoice DepChoice) { 1332 #ifndef NDEBUG 1333 if (VerboseFusionDebugging) { 1334 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 1335 << DepChoice << "\n"); 1336 } 1337 #endif 1338 switch (DepChoice) { 1339 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 1340 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 1341 case FUSION_DEPENDENCE_ANALYSIS_DA: { 1342 auto DepResult = DI.depends(&I0, &I1, true); 1343 if (!DepResult) 1344 return true; 1345 #ifndef NDEBUG 1346 if (VerboseFusionDebugging) { 1347 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 1348 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 1349 << (DepResult->isOrdered() ? "true" : "false") 1350 << "]\n"); 1351 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 1352 << "\n"); 1353 } 1354 #endif 1355 1356 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 1357 LLVM_DEBUG( 1358 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 1359 1360 // TODO: Can we actually use the dependence info analysis here? 1361 return false; 1362 } 1363 1364 case FUSION_DEPENDENCE_ANALYSIS_ALL: 1365 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1366 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 1367 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1368 FUSION_DEPENDENCE_ANALYSIS_DA); 1369 } 1370 1371 llvm_unreachable("Unknown fusion dependence analysis choice!"); 1372 } 1373 1374 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 1375 bool dependencesAllowFusion(const FusionCandidate &FC0, 1376 const FusionCandidate &FC1) { 1377 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 1378 << "\n"); 1379 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 1380 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 1381 1382 for (Instruction *WriteL0 : FC0.MemWrites) { 1383 for (Instruction *WriteL1 : FC1.MemWrites) 1384 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1385 /* AnyDep */ false, 1386 FusionDependenceAnalysis)) { 1387 InvalidDependencies++; 1388 return false; 1389 } 1390 for (Instruction *ReadL1 : FC1.MemReads) 1391 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 1392 /* AnyDep */ false, 1393 FusionDependenceAnalysis)) { 1394 InvalidDependencies++; 1395 return false; 1396 } 1397 } 1398 1399 for (Instruction *WriteL1 : FC1.MemWrites) { 1400 for (Instruction *WriteL0 : FC0.MemWrites) 1401 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1402 /* AnyDep */ false, 1403 FusionDependenceAnalysis)) { 1404 InvalidDependencies++; 1405 return false; 1406 } 1407 for (Instruction *ReadL0 : FC0.MemReads) 1408 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1409 /* AnyDep */ false, 1410 FusionDependenceAnalysis)) { 1411 InvalidDependencies++; 1412 return false; 1413 } 1414 } 1415 1416 // Walk through all uses in FC1. For each use, find the reaching def. If the 1417 // def is located in FC0 then it is is not safe to fuse. 1418 for (BasicBlock *BB : FC1.L->blocks()) 1419 for (Instruction &I : *BB) 1420 for (auto &Op : I.operands()) 1421 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1422 if (FC0.L->contains(Def->getParent())) { 1423 InvalidDependencies++; 1424 return false; 1425 } 1426 1427 return true; 1428 } 1429 1430 /// Determine if two fusion candidates are adjacent in the CFG. 1431 /// 1432 /// This method will determine if there are additional basic blocks in the CFG 1433 /// between the exit of \p FC0 and the entry of \p FC1. 1434 /// If the two candidates are guarded loops, then it checks whether the 1435 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1436 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1437 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1438 /// preheader of \p FC1. 1439 bool isAdjacent(const FusionCandidate &FC0, 1440 const FusionCandidate &FC1) const { 1441 // If the successor of the guard branch is FC1, then the loops are adjacent 1442 if (FC0.GuardBranch) 1443 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1444 else 1445 return FC0.ExitBlock == FC1.getEntryBlock(); 1446 } 1447 1448 bool isEmptyPreheader(const FusionCandidate &FC) const { 1449 return FC.Preheader->size() == 1; 1450 } 1451 1452 /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader 1453 /// and sink others into the body of \p FC1. 1454 void movePreheaderInsts(const FusionCandidate &FC0, 1455 const FusionCandidate &FC1, 1456 SmallVector<Instruction *, 4> &HoistInsts, 1457 SmallVector<Instruction *, 4> &SinkInsts) const { 1458 // All preheader instructions except the branch must be hoisted or sunk 1459 assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && 1460 "Attempting to sink and hoist preheader instructions, but not all " 1461 "the preheader instructions are accounted for."); 1462 1463 NumHoistedInsts += HoistInsts.size(); 1464 NumSunkInsts += SinkInsts.size(); 1465 1466 LLVM_DEBUG(if (VerboseFusionDebugging) { 1467 if (!HoistInsts.empty()) 1468 dbgs() << "Hoisting: \n"; 1469 for (Instruction *I : HoistInsts) 1470 dbgs() << *I << "\n"; 1471 if (!SinkInsts.empty()) 1472 dbgs() << "Sinking: \n"; 1473 for (Instruction *I : SinkInsts) 1474 dbgs() << *I << "\n"; 1475 }); 1476 1477 for (Instruction *I : HoistInsts) { 1478 assert(I->getParent() == FC1.Preheader); 1479 I->moveBefore(FC0.Preheader->getTerminator()); 1480 } 1481 // insert instructions in reverse order to maintain dominance relationship 1482 for (Instruction *I : reverse(SinkInsts)) { 1483 assert(I->getParent() == FC1.Preheader); 1484 I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); 1485 } 1486 } 1487 1488 /// Determine if two fusion candidates have identical guards 1489 /// 1490 /// This method will determine if two fusion candidates have the same guards. 1491 /// The guards are considered the same if: 1492 /// 1. The instructions to compute the condition used in the compare are 1493 /// identical. 1494 /// 2. The successors of the guard have the same flow into/around the loop. 1495 /// If the compare instructions are identical, then the first successor of the 1496 /// guard must go to the same place (either the preheader of the loop or the 1497 /// NonLoopBlock). In other words, the the first successor of both loops must 1498 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1499 /// the NonLoopBlock). The same must be true for the second successor. 1500 bool haveIdenticalGuards(const FusionCandidate &FC0, 1501 const FusionCandidate &FC1) const { 1502 assert(FC0.GuardBranch && FC1.GuardBranch && 1503 "Expecting FC0 and FC1 to be guarded loops."); 1504 1505 if (auto FC0CmpInst = 1506 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1507 if (auto FC1CmpInst = 1508 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1509 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1510 return false; 1511 1512 // The compare instructions are identical. 1513 // Now make sure the successor of the guards have the same flow into/around 1514 // the loop 1515 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1516 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1517 else 1518 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1519 } 1520 1521 /// Modify the latch branch of FC to be unconditional since successors of the 1522 /// branch are the same. 1523 void simplifyLatchBranch(const FusionCandidate &FC) const { 1524 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1525 if (FCLatchBranch) { 1526 assert(FCLatchBranch->isConditional() && 1527 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1528 "Expecting the two successors of FCLatchBranch to be the same"); 1529 BranchInst *NewBranch = 1530 BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1531 ReplaceInstWithInst(FCLatchBranch, NewBranch); 1532 } 1533 } 1534 1535 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1536 /// successor, then merge FC0.Latch with its unique successor. 1537 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1538 moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1539 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1540 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1541 DTU.flush(); 1542 } 1543 } 1544 1545 /// Fuse two fusion candidates, creating a new fused loop. 1546 /// 1547 /// This method contains the mechanics of fusing two loops, represented by \p 1548 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1549 /// postdominates \p FC0 (making them control flow equivalent). It also 1550 /// assumes that the other conditions for fusion have been met: adjacent, 1551 /// identical trip counts, and no negative distance dependencies exist that 1552 /// would prevent fusion. Thus, there is no checking for these conditions in 1553 /// this method. 1554 /// 1555 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1556 /// components of tho loop. Specifically, the following changes are done: 1557 /// 1558 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1559 /// (because it is currently only a single statement block). 1560 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1561 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1562 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1563 /// 1564 /// All of these modifications are done with dominator tree updates, thus 1565 /// keeping the dominator (and post dominator) information up-to-date. 1566 /// 1567 /// This can be improved in the future by actually merging blocks during 1568 /// fusion. For example, the preheader of \p FC1 can be merged with the 1569 /// preheader of \p FC0. This would allow loops with more than a single 1570 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1571 /// two loops could also be fused into a single block. This will require 1572 /// analysis to prove it is safe to move the contents of the block past 1573 /// existing code, which currently has not been implemented. 1574 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1575 assert(FC0.isValid() && FC1.isValid() && 1576 "Expecting valid fusion candidates"); 1577 1578 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1579 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1580 1581 // Move instructions from the preheader of FC1 to the end of the preheader 1582 // of FC0. 1583 moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 1584 1585 // Fusing guarded loops is handled slightly differently than non-guarded 1586 // loops and has been broken out into a separate method instead of trying to 1587 // intersperse the logic within a single method. 1588 if (FC0.GuardBranch) 1589 return fuseGuardedLoops(FC0, FC1); 1590 1591 assert(FC1.Preheader == 1592 (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 1593 assert(FC1.Preheader->size() == 1 && 1594 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1595 1596 // Remember the phi nodes originally in the header of FC0 in order to rewire 1597 // them later. However, this is only necessary if the new loop carried 1598 // values might not dominate the exiting branch. While we do not generally 1599 // test if this is the case but simply insert intermediate phi nodes, we 1600 // need to make sure these intermediate phi nodes have different 1601 // predecessors. To this end, we filter the special case where the exiting 1602 // block is the latch block of the first loop. Nothing needs to be done 1603 // anyway as all loop carried values dominate the latch and thereby also the 1604 // exiting branch. 1605 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1606 if (FC0.ExitingBlock != FC0.Latch) 1607 for (PHINode &PHI : FC0.Header->phis()) 1608 OriginalFC0PHIs.push_back(&PHI); 1609 1610 // Replace incoming blocks for header PHIs first. 1611 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1612 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1613 1614 // Then modify the control flow and update DT and PDT. 1615 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1616 1617 // The old exiting block of the first loop (FC0) has to jump to the header 1618 // of the second as we need to execute the code in the second header block 1619 // regardless of the trip count. That is, if the trip count is 0, so the 1620 // back edge is never taken, we still have to execute both loop headers, 1621 // especially (but not only!) if the second is a do-while style loop. 1622 // However, doing so might invalidate the phi nodes of the first loop as 1623 // the new values do only need to dominate their latch and not the exiting 1624 // predicate. To remedy this potential problem we always introduce phi 1625 // nodes in the header of the second loop later that select the loop carried 1626 // value, if the second header was reached through an old latch of the 1627 // first, or undef otherwise. This is sound as exiting the first implies the 1628 // second will exit too, __without__ taking the back-edge. [Their 1629 // trip-counts are equal after all. 1630 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1631 // to FC1.Header? I think this is basically what the three sequences are 1632 // trying to accomplish; however, doing this directly in the CFG may mean 1633 // the DT/PDT becomes invalid 1634 if (!FC0.Peeled) { 1635 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1636 FC1.Header); 1637 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1638 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1639 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1640 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1641 } else { 1642 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1643 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1644 1645 // Remove the ExitBlock of the first Loop (also not needed) 1646 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1647 FC1.Header); 1648 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1649 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1650 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1651 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1652 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1653 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1654 } 1655 1656 // The pre-header of L1 is not necessary anymore. 1657 assert(pred_empty(FC1.Preheader)); 1658 FC1.Preheader->getTerminator()->eraseFromParent(); 1659 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1660 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1661 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1662 1663 // Moves the phi nodes from the second to the first loops header block. 1664 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1665 if (SE.isSCEVable(PHI->getType())) 1666 SE.forgetValue(PHI); 1667 if (PHI->hasNUsesOrMore(1)) 1668 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1669 else 1670 PHI->eraseFromParent(); 1671 } 1672 1673 // Introduce new phi nodes in the second loop header to ensure 1674 // exiting the first and jumping to the header of the second does not break 1675 // the SSA property of the phis originally in the first loop. See also the 1676 // comment above. 1677 Instruction *L1HeaderIP = &FC1.Header->front(); 1678 for (PHINode *LCPHI : OriginalFC0PHIs) { 1679 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1680 assert(L1LatchBBIdx >= 0 && 1681 "Expected loop carried value to be rewired at this point!"); 1682 1683 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1684 1685 PHINode *L1HeaderPHI = PHINode::Create( 1686 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1687 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1688 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1689 FC0.ExitingBlock); 1690 1691 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1692 } 1693 1694 // Replace latch terminator destinations. 1695 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1696 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1697 1698 // Modify the latch branch of FC0 to be unconditional as both successors of 1699 // the branch are the same. 1700 simplifyLatchBranch(FC0); 1701 1702 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1703 // performed the updates above. 1704 if (FC0.Latch != FC0.ExitingBlock) 1705 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1706 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1707 1708 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1709 FC0.Latch, FC0.Header)); 1710 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1711 FC1.Latch, FC0.Header)); 1712 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1713 FC1.Latch, FC1.Header)); 1714 1715 // Update DT/PDT 1716 DTU.applyUpdates(TreeUpdates); 1717 1718 LI.removeBlock(FC1.Preheader); 1719 DTU.deleteBB(FC1.Preheader); 1720 if (FC0.Peeled) { 1721 LI.removeBlock(FC0.ExitBlock); 1722 DTU.deleteBB(FC0.ExitBlock); 1723 } 1724 1725 DTU.flush(); 1726 1727 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1728 // and rebuild the information in subsequent passes of fusion? 1729 // Note: Need to forget the loops before merging the loop latches, as 1730 // mergeLatch may remove the only block in FC1. 1731 SE.forgetLoop(FC1.L); 1732 SE.forgetLoop(FC0.L); 1733 SE.forgetLoopDispositions(); 1734 1735 // Move instructions from FC0.Latch to FC1.Latch. 1736 // Note: mergeLatch requires an updated DT. 1737 mergeLatch(FC0, FC1); 1738 1739 // Merge the loops. 1740 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 1741 for (BasicBlock *BB : Blocks) { 1742 FC0.L->addBlockEntry(BB); 1743 FC1.L->removeBlockFromLoop(BB); 1744 if (LI.getLoopFor(BB) != FC1.L) 1745 continue; 1746 LI.changeLoopFor(BB, FC0.L); 1747 } 1748 while (!FC1.L->isInnermost()) { 1749 const auto &ChildLoopIt = FC1.L->begin(); 1750 Loop *ChildLoop = *ChildLoopIt; 1751 FC1.L->removeChildLoop(ChildLoopIt); 1752 FC0.L->addChildLoop(ChildLoop); 1753 } 1754 1755 // Delete the now empty loop L1. 1756 LI.erase(FC1.L); 1757 1758 #ifndef NDEBUG 1759 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1760 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1761 assert(PDT.verify()); 1762 LI.verify(DT); 1763 SE.verify(); 1764 #endif 1765 1766 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1767 1768 return FC0.L; 1769 } 1770 1771 /// Report details on loop fusion opportunities. 1772 /// 1773 /// This template function can be used to report both successful and missed 1774 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1775 /// be one of: 1776 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1777 /// given two valid fusion candidates. 1778 /// - OptimizationRemark to report successful fusion of two fusion 1779 /// candidates. 1780 /// The remarks will be printed using the form: 1781 /// <path/filename>:<line number>:<column number>: [<function name>]: 1782 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1783 template <typename RemarkKind> 1784 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1785 llvm::Statistic &Stat) { 1786 assert(FC0.Preheader && FC1.Preheader && 1787 "Expecting valid fusion candidates"); 1788 using namespace ore; 1789 #if LLVM_ENABLE_STATS 1790 ++Stat; 1791 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1792 FC0.Preheader) 1793 << "[" << FC0.Preheader->getParent()->getName() 1794 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1795 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1796 << ": " << Stat.getDesc()); 1797 #endif 1798 } 1799 1800 /// Fuse two guarded fusion candidates, creating a new fused loop. 1801 /// 1802 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1803 /// loops. The rewiring of the CFG is slightly different though, because of 1804 /// the presence of the guards around the loops and the exit blocks after the 1805 /// loop body. As such, the new loop is rewired as follows: 1806 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1807 /// from the FC1 guard branch. 1808 /// 2. Remove the exit block from FC0 (this exit block should be empty 1809 /// right now). 1810 /// 3. Remove the guard branch for FC1 1811 /// 4. Remove the preheader for FC1. 1812 /// The exit block successor for the latch of FC0 is updated to be the header 1813 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1814 /// be the header of FC0, thus creating the fused loop. 1815 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1816 const FusionCandidate &FC1) { 1817 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1818 1819 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1820 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1821 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1822 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1823 BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 1824 1825 // Move instructions from the exit block of FC0 to the beginning of the exit 1826 // block of FC1, in the case that the FC0 loop has not been peeled. In the 1827 // case that FC0 loop is peeled, then move the instructions of the successor 1828 // of the FC0 Exit block to the beginning of the exit block of FC1. 1829 moveInstructionsToTheBeginning( 1830 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1831 DT, PDT, DI); 1832 1833 // Move instructions from the guard block of FC1 to the end of the guard 1834 // block of FC0. 1835 moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 1836 1837 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1838 1839 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1840 1841 //////////////////////////////////////////////////////////////////////////// 1842 // Update the Loop Guard 1843 //////////////////////////////////////////////////////////////////////////// 1844 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1845 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1846 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1847 // executes the new fused loop) and the other path goes to the NonLoopBlock 1848 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1849 FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 1850 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1851 1852 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1853 BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 1854 1855 // The guard of FC1 is not necessary anymore. 1856 FC1.GuardBranch->eraseFromParent(); 1857 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1858 1859 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1860 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1861 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1862 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1863 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1864 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1865 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1866 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1867 1868 if (FC0.Peeled) { 1869 // Remove the Block after the ExitBlock of FC0 1870 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1871 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1872 FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1873 new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1874 FC0ExitBlockSuccessor); 1875 } 1876 1877 assert(pred_empty(FC1GuardBlock) && 1878 "Expecting guard block to have no predecessors"); 1879 assert(succ_empty(FC1GuardBlock) && 1880 "Expecting guard block to have no successors"); 1881 1882 // Remember the phi nodes originally in the header of FC0 in order to rewire 1883 // them later. However, this is only necessary if the new loop carried 1884 // values might not dominate the exiting branch. While we do not generally 1885 // test if this is the case but simply insert intermediate phi nodes, we 1886 // need to make sure these intermediate phi nodes have different 1887 // predecessors. To this end, we filter the special case where the exiting 1888 // block is the latch block of the first loop. Nothing needs to be done 1889 // anyway as all loop carried values dominate the latch and thereby also the 1890 // exiting branch. 1891 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1892 // (because the loops are rotated. Thus, nothing will ever be added to 1893 // OriginalFC0PHIs. 1894 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1895 if (FC0.ExitingBlock != FC0.Latch) 1896 for (PHINode &PHI : FC0.Header->phis()) 1897 OriginalFC0PHIs.push_back(&PHI); 1898 1899 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1900 1901 // Replace incoming blocks for header PHIs first. 1902 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1903 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1904 1905 // The old exiting block of the first loop (FC0) has to jump to the header 1906 // of the second as we need to execute the code in the second header block 1907 // regardless of the trip count. That is, if the trip count is 0, so the 1908 // back edge is never taken, we still have to execute both loop headers, 1909 // especially (but not only!) if the second is a do-while style loop. 1910 // However, doing so might invalidate the phi nodes of the first loop as 1911 // the new values do only need to dominate their latch and not the exiting 1912 // predicate. To remedy this potential problem we always introduce phi 1913 // nodes in the header of the second loop later that select the loop carried 1914 // value, if the second header was reached through an old latch of the 1915 // first, or undef otherwise. This is sound as exiting the first implies the 1916 // second will exit too, __without__ taking the back-edge (their 1917 // trip-counts are equal after all). 1918 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1919 FC1.Header); 1920 1921 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1922 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1923 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1924 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1925 1926 // Remove FC0 Exit Block 1927 // The exit block for FC0 is no longer needed since control will flow 1928 // directly to the header of FC1. Since it is an empty block, it can be 1929 // removed at this point. 1930 // TODO: In the future, we can handle non-empty exit blocks my merging any 1931 // instructions from FC0 exit block into FC1 exit block prior to removing 1932 // the block. 1933 assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); 1934 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1935 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1936 1937 // Remove FC1 Preheader 1938 // The pre-header of L1 is not necessary anymore. 1939 assert(pred_empty(FC1.Preheader)); 1940 FC1.Preheader->getTerminator()->eraseFromParent(); 1941 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1942 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1943 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1944 1945 // Moves the phi nodes from the second to the first loops header block. 1946 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1947 if (SE.isSCEVable(PHI->getType())) 1948 SE.forgetValue(PHI); 1949 if (PHI->hasNUsesOrMore(1)) 1950 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1951 else 1952 PHI->eraseFromParent(); 1953 } 1954 1955 // Introduce new phi nodes in the second loop header to ensure 1956 // exiting the first and jumping to the header of the second does not break 1957 // the SSA property of the phis originally in the first loop. See also the 1958 // comment above. 1959 Instruction *L1HeaderIP = &FC1.Header->front(); 1960 for (PHINode *LCPHI : OriginalFC0PHIs) { 1961 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1962 assert(L1LatchBBIdx >= 0 && 1963 "Expected loop carried value to be rewired at this point!"); 1964 1965 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1966 1967 PHINode *L1HeaderPHI = PHINode::Create( 1968 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1969 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1970 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1971 FC0.ExitingBlock); 1972 1973 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1974 } 1975 1976 // Update the latches 1977 1978 // Replace latch terminator destinations. 1979 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1980 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1981 1982 // Modify the latch branch of FC0 to be unconditional as both successors of 1983 // the branch are the same. 1984 simplifyLatchBranch(FC0); 1985 1986 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1987 // performed the updates above. 1988 if (FC0.Latch != FC0.ExitingBlock) 1989 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1990 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1991 1992 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1993 FC0.Latch, FC0.Header)); 1994 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1995 FC1.Latch, FC0.Header)); 1996 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1997 FC1.Latch, FC1.Header)); 1998 1999 // All done 2000 // Apply the updates to the Dominator Tree and cleanup. 2001 2002 assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); 2003 assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); 2004 2005 // Update DT/PDT 2006 DTU.applyUpdates(TreeUpdates); 2007 2008 LI.removeBlock(FC1GuardBlock); 2009 LI.removeBlock(FC1.Preheader); 2010 LI.removeBlock(FC0.ExitBlock); 2011 if (FC0.Peeled) { 2012 LI.removeBlock(FC0ExitBlockSuccessor); 2013 DTU.deleteBB(FC0ExitBlockSuccessor); 2014 } 2015 DTU.deleteBB(FC1GuardBlock); 2016 DTU.deleteBB(FC1.Preheader); 2017 DTU.deleteBB(FC0.ExitBlock); 2018 DTU.flush(); 2019 2020 // Is there a way to keep SE up-to-date so we don't need to forget the loops 2021 // and rebuild the information in subsequent passes of fusion? 2022 // Note: Need to forget the loops before merging the loop latches, as 2023 // mergeLatch may remove the only block in FC1. 2024 SE.forgetLoop(FC1.L); 2025 SE.forgetLoop(FC0.L); 2026 SE.forgetLoopDispositions(); 2027 2028 // Move instructions from FC0.Latch to FC1.Latch. 2029 // Note: mergeLatch requires an updated DT. 2030 mergeLatch(FC0, FC1); 2031 2032 // Merge the loops. 2033 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 2034 for (BasicBlock *BB : Blocks) { 2035 FC0.L->addBlockEntry(BB); 2036 FC1.L->removeBlockFromLoop(BB); 2037 if (LI.getLoopFor(BB) != FC1.L) 2038 continue; 2039 LI.changeLoopFor(BB, FC0.L); 2040 } 2041 while (!FC1.L->isInnermost()) { 2042 const auto &ChildLoopIt = FC1.L->begin(); 2043 Loop *ChildLoop = *ChildLoopIt; 2044 FC1.L->removeChildLoop(ChildLoopIt); 2045 FC0.L->addChildLoop(ChildLoop); 2046 } 2047 2048 // Delete the now empty loop L1. 2049 LI.erase(FC1.L); 2050 2051 #ifndef NDEBUG 2052 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 2053 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 2054 assert(PDT.verify()); 2055 LI.verify(DT); 2056 SE.verify(); 2057 #endif 2058 2059 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 2060 2061 return FC0.L; 2062 } 2063 }; 2064 2065 struct LoopFuseLegacy : public FunctionPass { 2066 2067 static char ID; 2068 2069 LoopFuseLegacy() : FunctionPass(ID) { 2070 initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); 2071 } 2072 2073 void getAnalysisUsage(AnalysisUsage &AU) const override { 2074 AU.addRequiredID(LoopSimplifyID); 2075 AU.addRequired<ScalarEvolutionWrapperPass>(); 2076 AU.addRequired<LoopInfoWrapperPass>(); 2077 AU.addRequired<DominatorTreeWrapperPass>(); 2078 AU.addRequired<PostDominatorTreeWrapperPass>(); 2079 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2080 AU.addRequired<DependenceAnalysisWrapperPass>(); 2081 AU.addRequired<AssumptionCacheTracker>(); 2082 AU.addRequired<TargetTransformInfoWrapperPass>(); 2083 2084 AU.addPreserved<ScalarEvolutionWrapperPass>(); 2085 AU.addPreserved<LoopInfoWrapperPass>(); 2086 AU.addPreserved<DominatorTreeWrapperPass>(); 2087 AU.addPreserved<PostDominatorTreeWrapperPass>(); 2088 } 2089 2090 bool runOnFunction(Function &F) override { 2091 if (skipFunction(F)) 2092 return false; 2093 2094 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2095 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2096 auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 2097 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2098 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); 2099 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2100 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2101 const TargetTransformInfo &TTI = 2102 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2103 const DataLayout &DL = F.getParent()->getDataLayout(); 2104 2105 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2106 return LF.fuseLoops(F); 2107 } 2108 }; 2109 } // namespace 2110 2111 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 2112 auto &LI = AM.getResult<LoopAnalysis>(F); 2113 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 2114 auto &DI = AM.getResult<DependenceAnalysis>(F); 2115 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 2116 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 2117 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 2118 auto &AC = AM.getResult<AssumptionAnalysis>(F); 2119 const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 2120 const DataLayout &DL = F.getParent()->getDataLayout(); 2121 2122 // Ensure loops are in simplifed form which is a pre-requisite for loop fusion 2123 // pass. Added only for new PM since the legacy PM has already added 2124 // LoopSimplify pass as a dependency. 2125 bool Changed = false; 2126 for (auto &L : LI) { 2127 Changed |= 2128 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); 2129 } 2130 if (Changed) 2131 PDT.recalculate(F); 2132 2133 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2134 Changed |= LF.fuseLoops(F); 2135 if (!Changed) 2136 return PreservedAnalyses::all(); 2137 2138 PreservedAnalyses PA; 2139 PA.preserve<DominatorTreeAnalysis>(); 2140 PA.preserve<PostDominatorTreeAnalysis>(); 2141 PA.preserve<ScalarEvolutionAnalysis>(); 2142 PA.preserve<LoopAnalysis>(); 2143 return PA; 2144 } 2145 2146 char LoopFuseLegacy::ID = 0; 2147 2148 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, 2149 false) 2150 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) 2151 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 2152 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 2153 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 2154 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 2155 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 2156 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 2157 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 2158 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) 2159 2160 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } 2161