1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/AssumptionCache.h" 50 #include "llvm/Analysis/DependenceAnalysis.h" 51 #include "llvm/Analysis/DomTreeUpdater.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 54 #include "llvm/Analysis/PostDominators.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57 #include "llvm/Analysis/TargetTransformInfo.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/Verifier.h" 60 #include "llvm/Support/CommandLine.h" 61 #include "llvm/Support/Debug.h" 62 #include "llvm/Support/raw_ostream.h" 63 #include "llvm/Transforms/Utils.h" 64 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 65 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 66 #include "llvm/Transforms/Utils/LoopPeel.h" 67 #include "llvm/Transforms/Utils/LoopSimplify.h" 68 69 using namespace llvm; 70 71 #define DEBUG_TYPE "loop-fusion" 72 73 STATISTIC(FuseCounter, "Loops fused"); 74 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 75 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 76 STATISTIC(InvalidHeader, "Loop has invalid header"); 77 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 78 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 79 STATISTIC(InvalidLatch, "Loop has invalid latch"); 80 STATISTIC(InvalidLoop, "Loop is invalid"); 81 STATISTIC(AddressTakenBB, "Basic block has address taken"); 82 STATISTIC(MayThrowException, "Loop may throw an exception"); 83 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 84 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 85 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 86 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 87 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 88 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 89 STATISTIC(NonAdjacent, "Loops are not adjacent"); 90 STATISTIC( 91 NonEmptyPreheader, 92 "Loop has a non-empty preheader with instructions that cannot be moved"); 93 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 94 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 95 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 96 "instructions that cannot be moved"); 97 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 98 "instructions that cannot be moved"); 99 STATISTIC(NotRotated, "Candidate is not rotated"); 100 STATISTIC(OnlySecondCandidateIsGuarded, 101 "The second candidate is guarded while the first one is not"); 102 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); 103 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); 104 105 enum FusionDependenceAnalysisChoice { 106 FUSION_DEPENDENCE_ANALYSIS_SCEV, 107 FUSION_DEPENDENCE_ANALYSIS_DA, 108 FUSION_DEPENDENCE_ANALYSIS_ALL, 109 }; 110 111 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 112 "loop-fusion-dependence-analysis", 113 cl::desc("Which dependence analysis should loop fusion use?"), 114 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 115 "Use the scalar evolution interface"), 116 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 117 "Use the dependence analysis interface"), 118 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 119 "Use all available analyses")), 120 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); 121 122 static cl::opt<unsigned> FusionPeelMaxCount( 123 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 124 cl::desc("Max number of iterations to be peeled from a loop, such that " 125 "fusion can take place")); 126 127 #ifndef NDEBUG 128 static cl::opt<bool> 129 VerboseFusionDebugging("loop-fusion-verbose-debug", 130 cl::desc("Enable verbose debugging for Loop Fusion"), 131 cl::Hidden, cl::init(false)); 132 #endif 133 134 namespace { 135 /// This class is used to represent a candidate for loop fusion. When it is 136 /// constructed, it checks the conditions for loop fusion to ensure that it 137 /// represents a valid candidate. It caches several parts of a loop that are 138 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 139 /// of continually querying the underlying Loop to retrieve these values. It is 140 /// assumed these will not change throughout loop fusion. 141 /// 142 /// The invalidate method should be used to indicate that the FusionCandidate is 143 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 144 /// be used to ensure that the FusionCandidate is still valid for fusion. 145 struct FusionCandidate { 146 /// Cache of parts of the loop used throughout loop fusion. These should not 147 /// need to change throughout the analysis and transformation. 148 /// These parts are cached to avoid repeatedly looking up in the Loop class. 149 150 /// Preheader of the loop this candidate represents 151 BasicBlock *Preheader; 152 /// Header of the loop this candidate represents 153 BasicBlock *Header; 154 /// Blocks in the loop that exit the loop 155 BasicBlock *ExitingBlock; 156 /// The successor block of this loop (where the exiting blocks go to) 157 BasicBlock *ExitBlock; 158 /// Latch of the loop 159 BasicBlock *Latch; 160 /// The loop that this fusion candidate represents 161 Loop *L; 162 /// Vector of instructions in this loop that read from memory 163 SmallVector<Instruction *, 16> MemReads; 164 /// Vector of instructions in this loop that write to memory 165 SmallVector<Instruction *, 16> MemWrites; 166 /// Are all of the members of this fusion candidate still valid 167 bool Valid; 168 /// Guard branch of the loop, if it exists 169 BranchInst *GuardBranch; 170 /// Peeling Paramaters of the Loop. 171 TTI::PeelingPreferences PP; 172 /// Can you Peel this Loop? 173 bool AbleToPeel; 174 /// Has this loop been Peeled 175 bool Peeled; 176 177 /// Dominator and PostDominator trees are needed for the 178 /// FusionCandidateCompare function, required by FusionCandidateSet to 179 /// determine where the FusionCandidate should be inserted into the set. These 180 /// are used to establish ordering of the FusionCandidates based on dominance. 181 DominatorTree &DT; 182 const PostDominatorTree *PDT; 183 184 OptimizationRemarkEmitter &ORE; 185 186 FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, 187 OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) 188 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 189 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 190 Latch(L->getLoopLatch()), L(L), Valid(true), 191 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 192 Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 193 194 // Walk over all blocks in the loop and check for conditions that may 195 // prevent fusion. For each block, walk over all instructions and collect 196 // the memory reads and writes If any instructions that prevent fusion are 197 // found, invalidate this object and return. 198 for (BasicBlock *BB : L->blocks()) { 199 if (BB->hasAddressTaken()) { 200 invalidate(); 201 reportInvalidCandidate(AddressTakenBB); 202 return; 203 } 204 205 for (Instruction &I : *BB) { 206 if (I.mayThrow()) { 207 invalidate(); 208 reportInvalidCandidate(MayThrowException); 209 return; 210 } 211 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 212 if (SI->isVolatile()) { 213 invalidate(); 214 reportInvalidCandidate(ContainsVolatileAccess); 215 return; 216 } 217 } 218 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 219 if (LI->isVolatile()) { 220 invalidate(); 221 reportInvalidCandidate(ContainsVolatileAccess); 222 return; 223 } 224 } 225 if (I.mayWriteToMemory()) 226 MemWrites.push_back(&I); 227 if (I.mayReadFromMemory()) 228 MemReads.push_back(&I); 229 } 230 } 231 } 232 233 /// Check if all members of the class are valid. 234 bool isValid() const { 235 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 236 !L->isInvalid() && Valid; 237 } 238 239 /// Verify that all members are in sync with the Loop object. 240 void verify() const { 241 assert(isValid() && "Candidate is not valid!!"); 242 assert(!L->isInvalid() && "Loop is invalid!"); 243 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 244 assert(Header == L->getHeader() && "Header is out of sync"); 245 assert(ExitingBlock == L->getExitingBlock() && 246 "Exiting Blocks is out of sync"); 247 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 248 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 249 } 250 251 /// Get the entry block for this fusion candidate. 252 /// 253 /// If this fusion candidate represents a guarded loop, the entry block is the 254 /// loop guard block. If it represents an unguarded loop, the entry block is 255 /// the preheader of the loop. 256 BasicBlock *getEntryBlock() const { 257 if (GuardBranch) 258 return GuardBranch->getParent(); 259 else 260 return Preheader; 261 } 262 263 /// After Peeling the loop is modified quite a bit, hence all of the Blocks 264 /// need to be updated accordingly. 265 void updateAfterPeeling() { 266 Preheader = L->getLoopPreheader(); 267 Header = L->getHeader(); 268 ExitingBlock = L->getExitingBlock(); 269 ExitBlock = L->getExitBlock(); 270 Latch = L->getLoopLatch(); 271 verify(); 272 } 273 274 /// Given a guarded loop, get the successor of the guard that is not in the 275 /// loop. 276 /// 277 /// This method returns the successor of the loop guard that is not located 278 /// within the loop (i.e., the successor of the guard that is not the 279 /// preheader). 280 /// This method is only valid for guarded loops. 281 BasicBlock *getNonLoopBlock() const { 282 assert(GuardBranch && "Only valid on guarded loops."); 283 assert(GuardBranch->isConditional() && 284 "Expecting guard to be a conditional branch."); 285 if (Peeled) 286 return GuardBranch->getSuccessor(1); 287 return (GuardBranch->getSuccessor(0) == Preheader) 288 ? GuardBranch->getSuccessor(1) 289 : GuardBranch->getSuccessor(0); 290 } 291 292 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 293 LLVM_DUMP_METHOD void dump() const { 294 dbgs() << "\tGuardBranch: "; 295 if (GuardBranch) 296 dbgs() << *GuardBranch; 297 else 298 dbgs() << "nullptr"; 299 dbgs() << "\n" 300 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 301 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 302 << "\n" 303 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 304 << "\tExitingBB: " 305 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 306 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 307 << "\n" 308 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 309 << "\tEntryBlock: " 310 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 311 << "\n"; 312 } 313 #endif 314 315 /// Determine if a fusion candidate (representing a loop) is eligible for 316 /// fusion. Note that this only checks whether a single loop can be fused - it 317 /// does not check whether it is *legal* to fuse two loops together. 318 bool isEligibleForFusion(ScalarEvolution &SE) const { 319 if (!isValid()) { 320 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 321 if (!Preheader) 322 ++InvalidPreheader; 323 if (!Header) 324 ++InvalidHeader; 325 if (!ExitingBlock) 326 ++InvalidExitingBlock; 327 if (!ExitBlock) 328 ++InvalidExitBlock; 329 if (!Latch) 330 ++InvalidLatch; 331 if (L->isInvalid()) 332 ++InvalidLoop; 333 334 return false; 335 } 336 337 // Require ScalarEvolution to be able to determine a trip count. 338 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 339 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 340 << " trip count not computable!\n"); 341 return reportInvalidCandidate(UnknownTripCount); 342 } 343 344 if (!L->isLoopSimplifyForm()) { 345 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 346 << " is not in simplified form!\n"); 347 return reportInvalidCandidate(NotSimplifiedForm); 348 } 349 350 if (!L->isRotatedForm()) { 351 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 352 return reportInvalidCandidate(NotRotated); 353 } 354 355 return true; 356 } 357 358 private: 359 // This is only used internally for now, to clear the MemWrites and MemReads 360 // list and setting Valid to false. I can't envision other uses of this right 361 // now, since once FusionCandidates are put into the FusionCandidateSet they 362 // are immutable. Thus, any time we need to change/update a FusionCandidate, 363 // we must create a new one and insert it into the FusionCandidateSet to 364 // ensure the FusionCandidateSet remains ordered correctly. 365 void invalidate() { 366 MemWrites.clear(); 367 MemReads.clear(); 368 Valid = false; 369 } 370 371 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 372 using namespace ore; 373 assert(L && Preheader && "Fusion candidate not initialized properly!"); 374 #if LLVM_ENABLE_STATS 375 ++Stat; 376 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 377 L->getStartLoc(), Preheader) 378 << "[" << Preheader->getParent()->getName() << "]: " 379 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 380 #endif 381 return false; 382 } 383 }; 384 385 struct FusionCandidateCompare { 386 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 387 /// into dominance order. 388 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 389 /// If RHS dominates LHS and LHS post-dominates RHS, return false; 390 /// If both LHS and RHS are not dominating each other then, non-strictly 391 /// post dominate check will decide the order of candidates. If RHS 392 /// non-strictly post dominates LHS then, return true. If LHS non-strictly 393 /// post dominates RHS then, return false. If both are non-strictly post 394 /// dominate each other then, level in the post dominator tree will decide 395 /// the order of candidates. 396 bool operator()(const FusionCandidate &LHS, 397 const FusionCandidate &RHS) const { 398 const DominatorTree *DT = &(LHS.DT); 399 400 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 401 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 402 403 // Do not save PDT to local variable as it is only used in asserts and thus 404 // will trigger an unused variable warning if building without asserts. 405 assert(DT && LHS.PDT && "Expecting valid dominator tree"); 406 407 // Do this compare first so if LHS == RHS, function returns false. 408 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 409 // RHS dominates LHS 410 // Verify LHS post-dominates RHS 411 assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 412 return false; 413 } 414 415 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 416 // Verify RHS Postdominates LHS 417 assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 418 return true; 419 } 420 421 // If two FusionCandidates are in the same level of dominator tree, 422 // they will not dominate each other, but may still be control flow 423 // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() 424 // function is needed. 425 bool WrongOrder = 426 nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT); 427 bool RightOrder = 428 nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT); 429 if (WrongOrder && RightOrder) { 430 // If common predecessor of LHS and RHS post dominates both 431 // FusionCandidates then, Order of FusionCandidate can be 432 // identified by its level in post dominator tree. 433 DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock); 434 DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock); 435 return LNode->getLevel() > RNode->getLevel(); 436 } else if (WrongOrder) 437 return false; 438 else if (RightOrder) 439 return true; 440 441 // If LHS does not non-strict Postdominate RHS and RHS does not non-strict 442 // Postdominate LHS then, there is no dominance relationship between the 443 // two FusionCandidates. Thus, they should not be in the same set together. 444 llvm_unreachable( 445 "No dominance relationship between these fusion candidates!"); 446 } 447 }; 448 449 using LoopVector = SmallVector<Loop *, 4>; 450 451 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 452 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 453 // dominates FC1 and FC1 post-dominates FC0. 454 // std::set was chosen because we want a sorted data structure with stable 455 // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent 456 // loops by moving intervening code around. When this intervening code contains 457 // loops, those loops will be moved also. The corresponding FusionCandidates 458 // will also need to be moved accordingly. As this is done, having stable 459 // iterators will simplify the logic. Similarly, having an efficient insert that 460 // keeps the FusionCandidateSet sorted will also simplify the implementation. 461 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 462 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 463 464 #if !defined(NDEBUG) 465 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 466 const FusionCandidate &FC) { 467 if (FC.isValid()) 468 OS << FC.Preheader->getName(); 469 else 470 OS << "<Invalid>"; 471 472 return OS; 473 } 474 475 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 476 const FusionCandidateSet &CandSet) { 477 for (const FusionCandidate &FC : CandSet) 478 OS << FC << '\n'; 479 480 return OS; 481 } 482 483 static void 484 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 485 dbgs() << "Fusion Candidates: \n"; 486 for (const auto &CandidateSet : FusionCandidates) { 487 dbgs() << "*** Fusion Candidate Set ***\n"; 488 dbgs() << CandidateSet; 489 dbgs() << "****************************\n"; 490 } 491 } 492 #endif 493 494 /// Collect all loops in function at the same nest level, starting at the 495 /// outermost level. 496 /// 497 /// This data structure collects all loops at the same nest level for a 498 /// given function (specified by the LoopInfo object). It starts at the 499 /// outermost level. 500 struct LoopDepthTree { 501 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 502 using iterator = LoopsOnLevelTy::iterator; 503 using const_iterator = LoopsOnLevelTy::const_iterator; 504 505 LoopDepthTree(LoopInfo &LI) : Depth(1) { 506 if (!LI.empty()) 507 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 508 } 509 510 /// Test whether a given loop has been removed from the function, and thus is 511 /// no longer valid. 512 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 513 514 /// Record that a given loop has been removed from the function and is no 515 /// longer valid. 516 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 517 518 /// Descend the tree to the next (inner) nesting level 519 void descend() { 520 LoopsOnLevelTy LoopsOnNextLevel; 521 522 for (const LoopVector &LV : *this) 523 for (Loop *L : LV) 524 if (!isRemovedLoop(L) && L->begin() != L->end()) 525 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 526 527 LoopsOnLevel = LoopsOnNextLevel; 528 RemovedLoops.clear(); 529 Depth++; 530 } 531 532 bool empty() const { return size() == 0; } 533 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 534 unsigned getDepth() const { return Depth; } 535 536 iterator begin() { return LoopsOnLevel.begin(); } 537 iterator end() { return LoopsOnLevel.end(); } 538 const_iterator begin() const { return LoopsOnLevel.begin(); } 539 const_iterator end() const { return LoopsOnLevel.end(); } 540 541 private: 542 /// Set of loops that have been removed from the function and are no longer 543 /// valid. 544 SmallPtrSet<const Loop *, 8> RemovedLoops; 545 546 /// Depth of the current level, starting at 1 (outermost loops). 547 unsigned Depth; 548 549 /// Vector of loops at the current depth level that have the same parent loop 550 LoopsOnLevelTy LoopsOnLevel; 551 }; 552 553 #ifndef NDEBUG 554 static void printLoopVector(const LoopVector &LV) { 555 dbgs() << "****************************\n"; 556 for (auto *L : LV) 557 printLoop(*L, dbgs()); 558 dbgs() << "****************************\n"; 559 } 560 #endif 561 562 struct LoopFuser { 563 private: 564 // Sets of control flow equivalent fusion candidates for a given nest level. 565 FusionCandidateCollection FusionCandidates; 566 567 LoopDepthTree LDT; 568 DomTreeUpdater DTU; 569 570 LoopInfo &LI; 571 DominatorTree &DT; 572 DependenceInfo &DI; 573 ScalarEvolution &SE; 574 PostDominatorTree &PDT; 575 OptimizationRemarkEmitter &ORE; 576 AssumptionCache &AC; 577 const TargetTransformInfo &TTI; 578 579 public: 580 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 581 ScalarEvolution &SE, PostDominatorTree &PDT, 582 OptimizationRemarkEmitter &ORE, const DataLayout &DL, 583 AssumptionCache &AC, const TargetTransformInfo &TTI) 584 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 585 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 586 587 /// This is the main entry point for loop fusion. It will traverse the 588 /// specified function and collect candidate loops to fuse, starting at the 589 /// outermost nesting level and working inwards. 590 bool fuseLoops(Function &F) { 591 #ifndef NDEBUG 592 if (VerboseFusionDebugging) { 593 LI.print(dbgs()); 594 } 595 #endif 596 597 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 598 << "\n"); 599 bool Changed = false; 600 601 while (!LDT.empty()) { 602 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 603 << LDT.getDepth() << "\n";); 604 605 for (const LoopVector &LV : LDT) { 606 assert(LV.size() > 0 && "Empty loop set was build!"); 607 608 // Skip singleton loop sets as they do not offer fusion opportunities on 609 // this level. 610 if (LV.size() == 1) 611 continue; 612 #ifndef NDEBUG 613 if (VerboseFusionDebugging) { 614 LLVM_DEBUG({ 615 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 616 printLoopVector(LV); 617 }); 618 } 619 #endif 620 621 collectFusionCandidates(LV); 622 Changed |= fuseCandidates(); 623 } 624 625 // Finished analyzing candidates at this level. 626 // Descend to the next level and clear all of the candidates currently 627 // collected. Note that it will not be possible to fuse any of the 628 // existing candidates with new candidates because the new candidates will 629 // be at a different nest level and thus not be control flow equivalent 630 // with all of the candidates collected so far. 631 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 632 LDT.descend(); 633 FusionCandidates.clear(); 634 } 635 636 if (Changed) 637 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 638 639 #ifndef NDEBUG 640 assert(DT.verify()); 641 assert(PDT.verify()); 642 LI.verify(DT); 643 SE.verify(); 644 #endif 645 646 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 647 return Changed; 648 } 649 650 private: 651 /// Determine if two fusion candidates are control flow equivalent. 652 /// 653 /// Two fusion candidates are control flow equivalent if when one executes, 654 /// the other is guaranteed to execute. This is determined using dominators 655 /// and post-dominators: if A dominates B and B post-dominates A then A and B 656 /// are control-flow equivalent. 657 bool isControlFlowEquivalent(const FusionCandidate &FC0, 658 const FusionCandidate &FC1) const { 659 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 660 661 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 662 DT, PDT); 663 } 664 665 /// Iterate over all loops in the given loop set and identify the loops that 666 /// are eligible for fusion. Place all eligible fusion candidates into Control 667 /// Flow Equivalent sets, sorted by dominance. 668 void collectFusionCandidates(const LoopVector &LV) { 669 for (Loop *L : LV) { 670 TTI::PeelingPreferences PP = 671 gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt); 672 FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); 673 if (!CurrCand.isEligibleForFusion(SE)) 674 continue; 675 676 // Go through each list in FusionCandidates and determine if L is control 677 // flow equivalent with the first loop in that list. If it is, append LV. 678 // If not, go to the next list. 679 // If no suitable list is found, start another list and add it to 680 // FusionCandidates. 681 bool FoundSet = false; 682 683 for (auto &CurrCandSet : FusionCandidates) { 684 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 685 CurrCandSet.insert(CurrCand); 686 FoundSet = true; 687 #ifndef NDEBUG 688 if (VerboseFusionDebugging) 689 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 690 << " to existing candidate set\n"); 691 #endif 692 break; 693 } 694 } 695 if (!FoundSet) { 696 // No set was found. Create a new set and add to FusionCandidates 697 #ifndef NDEBUG 698 if (VerboseFusionDebugging) 699 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 700 #endif 701 FusionCandidateSet NewCandSet; 702 NewCandSet.insert(CurrCand); 703 FusionCandidates.push_back(NewCandSet); 704 } 705 NumFusionCandidates++; 706 } 707 } 708 709 /// Determine if it is beneficial to fuse two loops. 710 /// 711 /// For now, this method simply returns true because we want to fuse as much 712 /// as possible (primarily to test the pass). This method will evolve, over 713 /// time, to add heuristics for profitability of fusion. 714 bool isBeneficialFusion(const FusionCandidate &FC0, 715 const FusionCandidate &FC1) { 716 return true; 717 } 718 719 /// Determine if two fusion candidates have the same trip count (i.e., they 720 /// execute the same number of iterations). 721 /// 722 /// This function will return a pair of values. The first is a boolean, 723 /// stating whether or not the two candidates are known at compile time to 724 /// have the same TripCount. The second is the difference in the two 725 /// TripCounts. This information can be used later to determine whether or not 726 /// peeling can be performed on either one of the candidates. 727 std::pair<bool, std::optional<unsigned>> 728 haveIdenticalTripCounts(const FusionCandidate &FC0, 729 const FusionCandidate &FC1) const { 730 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 731 if (isa<SCEVCouldNotCompute>(TripCount0)) { 732 UncomputableTripCount++; 733 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 734 return {false, std::nullopt}; 735 } 736 737 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 738 if (isa<SCEVCouldNotCompute>(TripCount1)) { 739 UncomputableTripCount++; 740 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 741 return {false, std::nullopt}; 742 } 743 744 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 745 << *TripCount1 << " are " 746 << (TripCount0 == TripCount1 ? "identical" : "different") 747 << "\n"); 748 749 if (TripCount0 == TripCount1) 750 return {true, 0}; 751 752 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 753 "determining the difference between trip counts\n"); 754 755 // Currently only considering loops with a single exit point 756 // and a non-constant trip count. 757 const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 758 const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 759 760 // If any of the tripcounts are zero that means that loop(s) do not have 761 // a single exit or a constant tripcount. 762 if (TC0 == 0 || TC1 == 0) { 763 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 764 "have a constant number of iterations. Peeling " 765 "is not benefical\n"); 766 return {false, std::nullopt}; 767 } 768 769 std::optional<unsigned> Difference; 770 int Diff = TC0 - TC1; 771 772 if (Diff > 0) 773 Difference = Diff; 774 else { 775 LLVM_DEBUG( 776 dbgs() << "Difference is less than 0. FC1 (second loop) has more " 777 "iterations than the first one. Currently not supported\n"); 778 } 779 780 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 781 << "\n"); 782 783 return {false, Difference}; 784 } 785 786 void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 787 unsigned PeelCount) { 788 assert(FC0.AbleToPeel && "Should be able to peel loop"); 789 790 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 791 << " iterations of the first loop. \n"); 792 793 ValueToValueMapTy VMap; 794 FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap); 795 if (FC0.Peeled) { 796 LLVM_DEBUG(dbgs() << "Done Peeling\n"); 797 798 #ifndef NDEBUG 799 auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 800 801 assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 802 "Loops should have identical trip counts after peeling"); 803 #endif 804 805 FC0.PP.PeelCount += PeelCount; 806 807 // Peeling does not update the PDT 808 PDT.recalculate(*FC0.Preheader->getParent()); 809 810 FC0.updateAfterPeeling(); 811 812 // In this case the iterations of the loop are constant, so the first 813 // loop will execute completely (will not jump from one of 814 // the peeled blocks to the second loop). Here we are updating the 815 // branch conditions of each of the peeled blocks, such that it will 816 // branch to its successor which is not the preheader of the second loop 817 // in the case of unguarded loops, or the succesors of the exit block of 818 // the first loop otherwise. Doing this update will ensure that the entry 819 // block of the first loop dominates the entry block of the second loop. 820 BasicBlock *BB = 821 FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 822 if (BB) { 823 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 824 SmallVector<Instruction *, 8> WorkList; 825 for (BasicBlock *Pred : predecessors(BB)) { 826 if (Pred != FC0.ExitBlock) { 827 WorkList.emplace_back(Pred->getTerminator()); 828 TreeUpdates.emplace_back( 829 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 830 } 831 } 832 // Cannot modify the predecessors inside the above loop as it will cause 833 // the iterators to be nullptrs, causing memory errors. 834 for (Instruction *CurrentBranch : WorkList) { 835 BasicBlock *Succ = CurrentBranch->getSuccessor(0); 836 if (Succ == BB) 837 Succ = CurrentBranch->getSuccessor(1); 838 ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); 839 } 840 841 DTU.applyUpdates(TreeUpdates); 842 DTU.flush(); 843 } 844 LLVM_DEBUG( 845 dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 846 << " iterations from the first loop.\n" 847 "Both Loops have the same number of iterations now.\n"); 848 } 849 } 850 851 /// Walk each set of control flow equivalent fusion candidates and attempt to 852 /// fuse them. This does a single linear traversal of all candidates in the 853 /// set. The conditions for legal fusion are checked at this point. If a pair 854 /// of fusion candidates passes all legality checks, they are fused together 855 /// and a new fusion candidate is created and added to the FusionCandidateSet. 856 /// The original fusion candidates are then removed, as they are no longer 857 /// valid. 858 bool fuseCandidates() { 859 bool Fused = false; 860 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 861 for (auto &CandidateSet : FusionCandidates) { 862 if (CandidateSet.size() < 2) 863 continue; 864 865 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 866 << CandidateSet << "\n"); 867 868 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 869 assert(!LDT.isRemovedLoop(FC0->L) && 870 "Should not have removed loops in CandidateSet!"); 871 auto FC1 = FC0; 872 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 873 assert(!LDT.isRemovedLoop(FC1->L) && 874 "Should not have removed loops in CandidateSet!"); 875 876 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 877 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 878 879 FC0->verify(); 880 FC1->verify(); 881 882 // Check if the candidates have identical tripcounts (first value of 883 // pair), and if not check the difference in the tripcounts between 884 // the loops (second value of pair). The difference is not equal to 885 // std::nullopt iff the loops iterate a constant number of times, and 886 // have a single exit. 887 std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = 888 haveIdenticalTripCounts(*FC0, *FC1); 889 bool SameTripCount = IdenticalTripCountRes.first; 890 std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; 891 892 // Here we are checking that FC0 (the first loop) can be peeled, and 893 // both loops have different tripcounts. 894 if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 895 if (*TCDifference > FusionPeelMaxCount) { 896 LLVM_DEBUG(dbgs() 897 << "Difference in loop trip counts: " << *TCDifference 898 << " is greater than maximum peel count specificed: " 899 << FusionPeelMaxCount << "\n"); 900 } else { 901 // Dependent on peeling being performed on the first loop, and 902 // assuming all other conditions for fusion return true. 903 SameTripCount = true; 904 } 905 } 906 907 if (!SameTripCount) { 908 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 909 "counts. Not fusing.\n"); 910 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 911 NonEqualTripCount); 912 continue; 913 } 914 915 if (!isAdjacent(*FC0, *FC1)) { 916 LLVM_DEBUG(dbgs() 917 << "Fusion candidates are not adjacent. Not fusing.\n"); 918 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 919 continue; 920 } 921 922 if ((!FC0->GuardBranch && FC1->GuardBranch) || 923 (FC0->GuardBranch && !FC1->GuardBranch)) { 924 LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " 925 "another one is not. Not fusing.\n"); 926 reportLoopFusion<OptimizationRemarkMissed>( 927 *FC0, *FC1, OnlySecondCandidateIsGuarded); 928 continue; 929 } 930 931 // Ensure that FC0 and FC1 have identical guards. 932 // If one (or both) are not guarded, this check is not necessary. 933 if (FC0->GuardBranch && FC1->GuardBranch && 934 !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 935 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 936 "guards. Not Fusing.\n"); 937 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 938 NonIdenticalGuards); 939 continue; 940 } 941 942 if (FC0->GuardBranch) { 943 assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 944 945 if (!isSafeToMoveBefore(*FC0->ExitBlock, 946 *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 947 &PDT, &DI)) { 948 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 949 "instructions in exit block. Not fusing.\n"); 950 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 951 NonEmptyExitBlock); 952 continue; 953 } 954 955 if (!isSafeToMoveBefore( 956 *FC1->GuardBranch->getParent(), 957 *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 958 &DI)) { 959 LLVM_DEBUG(dbgs() 960 << "Fusion candidate contains unsafe " 961 "instructions in guard block. Not fusing.\n"); 962 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 963 NonEmptyGuardBlock); 964 continue; 965 } 966 } 967 968 // Check the dependencies across the loops and do not fuse if it would 969 // violate them. 970 if (!dependencesAllowFusion(*FC0, *FC1)) { 971 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 972 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 973 InvalidDependencies); 974 continue; 975 } 976 977 // If the second loop has instructions in the pre-header, attempt to 978 // hoist them up to the first loop's pre-header or sink them into the 979 // body of the second loop. 980 SmallVector<Instruction *, 4> SafeToHoist; 981 SmallVector<Instruction *, 4> SafeToSink; 982 // At this point, this is the last remaining legality check. 983 // Which means if we can make this pre-header empty, we can fuse 984 // these loops 985 if (!isEmptyPreheader(*FC1)) { 986 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " 987 "preheader.\n"); 988 989 // If it is not safe to hoist/sink all instructions in the 990 // pre-header, we cannot fuse these loops. 991 if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, 992 SafeToSink)) { 993 LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " 994 "Fusion Candidate Pre-header.\n" 995 << "Not Fusing.\n"); 996 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 997 NonEmptyPreheader); 998 continue; 999 } 1000 } 1001 1002 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 1003 LLVM_DEBUG(dbgs() 1004 << "\tFusion appears to be " 1005 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 1006 if (!BeneficialToFuse) { 1007 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 1008 FusionNotBeneficial); 1009 continue; 1010 } 1011 // All analysis has completed and has determined that fusion is legal 1012 // and profitable. At this point, start transforming the code and 1013 // perform fusion. 1014 1015 // Execute the hoist/sink operations on preheader instructions 1016 movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); 1017 1018 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 1019 << *FC1 << "\n"); 1020 1021 FusionCandidate FC0Copy = *FC0; 1022 // Peel the loop after determining that fusion is legal. The Loops 1023 // will still be safe to fuse after the peeling is performed. 1024 bool Peel = TCDifference && *TCDifference > 0; 1025 if (Peel) 1026 peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 1027 1028 // Report fusion to the Optimization Remarks. 1029 // Note this needs to be done *before* performFusion because 1030 // performFusion will change the original loops, making it not 1031 // possible to identify them after fusion is complete. 1032 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 1033 FuseCounter); 1034 1035 FusionCandidate FusedCand( 1036 performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE, 1037 FC0Copy.PP); 1038 FusedCand.verify(); 1039 assert(FusedCand.isEligibleForFusion(SE) && 1040 "Fused candidate should be eligible for fusion!"); 1041 1042 // Notify the loop-depth-tree that these loops are not valid objects 1043 LDT.removeLoop(FC1->L); 1044 1045 CandidateSet.erase(FC0); 1046 CandidateSet.erase(FC1); 1047 1048 auto InsertPos = CandidateSet.insert(FusedCand); 1049 1050 assert(InsertPos.second && 1051 "Unable to insert TargetCandidate in CandidateSet!"); 1052 1053 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 1054 // of the FC1 loop will attempt to fuse the new (fused) loop with the 1055 // remaining candidates in the current candidate set. 1056 FC0 = FC1 = InsertPos.first; 1057 1058 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 1059 << "\n"); 1060 1061 Fused = true; 1062 } 1063 } 1064 } 1065 return Fused; 1066 } 1067 1068 // Returns true if the instruction \p I can be hoisted to the end of the 1069 // preheader of \p FC0. \p SafeToHoist contains the instructions that are 1070 // known to be safe to hoist. The instructions encountered that cannot be 1071 // hoisted are in \p NotHoisting. 1072 // TODO: Move functionality into CodeMoverUtils 1073 bool canHoistInst(Instruction &I, 1074 const SmallVector<Instruction *, 4> &SafeToHoist, 1075 const SmallVector<Instruction *, 4> &NotHoisting, 1076 const FusionCandidate &FC0) const { 1077 const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); 1078 assert(FC0PreheaderTarget && 1079 "Expected single successor for loop preheader."); 1080 1081 for (Use &Op : I.operands()) { 1082 if (auto *OpInst = dyn_cast<Instruction>(Op)) { 1083 bool OpHoisted = is_contained(SafeToHoist, OpInst); 1084 // Check if we have already decided to hoist this operand. In this 1085 // case, it does not dominate FC0 *yet*, but will after we hoist it. 1086 if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { 1087 return false; 1088 } 1089 } 1090 } 1091 1092 // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs 1093 // cannot be hoisted and should be sunk to the exit of the fused loop. 1094 if (isa<PHINode>(I)) 1095 return false; 1096 1097 // If this isn't a memory inst, hoisting is safe 1098 if (!I.mayReadOrWriteMemory()) 1099 return true; 1100 1101 LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); 1102 for (Instruction *NotHoistedInst : NotHoisting) { 1103 if (auto D = DI.depends(&I, NotHoistedInst, true)) { 1104 // Dependency is not read-before-write, write-before-read or 1105 // write-before-write 1106 if (D->isFlow() || D->isAnti() || D->isOutput()) { 1107 LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " 1108 "preheader that is not being hoisted.\n"); 1109 return false; 1110 } 1111 } 1112 } 1113 1114 for (Instruction *ReadInst : FC0.MemReads) { 1115 if (auto D = DI.depends(ReadInst, &I, true)) { 1116 // Dependency is not read-before-write 1117 if (D->isAnti()) { 1118 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); 1119 return false; 1120 } 1121 } 1122 } 1123 1124 for (Instruction *WriteInst : FC0.MemWrites) { 1125 if (auto D = DI.depends(WriteInst, &I, true)) { 1126 // Dependency is not write-before-read or write-before-write 1127 if (D->isFlow() || D->isOutput()) { 1128 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); 1129 return false; 1130 } 1131 } 1132 } 1133 return true; 1134 } 1135 1136 // Returns true if the instruction \p I can be sunk to the top of the exit 1137 // block of \p FC1. 1138 // TODO: Move functionality into CodeMoverUtils 1139 bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { 1140 for (User *U : I.users()) { 1141 if (auto *UI{dyn_cast<Instruction>(U)}) { 1142 // Cannot sink if user in loop 1143 // If FC1 has phi users of this value, we cannot sink it into FC1. 1144 if (FC1.L->contains(UI)) { 1145 // Cannot hoist or sink this instruction. No hoisting/sinking 1146 // should take place, loops should not fuse 1147 return false; 1148 } 1149 } 1150 } 1151 1152 // If this isn't a memory inst, sinking is safe 1153 if (!I.mayReadOrWriteMemory()) 1154 return true; 1155 1156 for (Instruction *ReadInst : FC1.MemReads) { 1157 if (auto D = DI.depends(&I, ReadInst, true)) { 1158 // Dependency is not write-before-read 1159 if (D->isFlow()) { 1160 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); 1161 return false; 1162 } 1163 } 1164 } 1165 1166 for (Instruction *WriteInst : FC1.MemWrites) { 1167 if (auto D = DI.depends(&I, WriteInst, true)) { 1168 // Dependency is not write-before-write or read-before-write 1169 if (D->isOutput() || D->isAnti()) { 1170 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); 1171 return false; 1172 } 1173 } 1174 } 1175 1176 return true; 1177 } 1178 1179 /// Collect instructions in the \p FC1 Preheader that can be hoisted 1180 /// to the \p FC0 Preheader or sunk into the \p FC1 Body 1181 bool collectMovablePreheaderInsts( 1182 const FusionCandidate &FC0, const FusionCandidate &FC1, 1183 SmallVector<Instruction *, 4> &SafeToHoist, 1184 SmallVector<Instruction *, 4> &SafeToSink) const { 1185 BasicBlock *FC1Preheader = FC1.Preheader; 1186 // Save the instructions that are not being hoisted, so we know not to hoist 1187 // mem insts that they dominate. 1188 SmallVector<Instruction *, 4> NotHoisting; 1189 1190 for (Instruction &I : *FC1Preheader) { 1191 // Can't move a branch 1192 if (&I == FC1Preheader->getTerminator()) 1193 continue; 1194 // If the instruction has side-effects, give up. 1195 // TODO: The case of mayReadFromMemory we can handle but requires 1196 // additional work with a dependence analysis so for now we give 1197 // up on memory reads. 1198 if (I.mayThrow() || !I.willReturn()) { 1199 LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); 1200 return false; 1201 } 1202 1203 LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); 1204 1205 if (I.isAtomic() || I.isVolatile()) { 1206 LLVM_DEBUG( 1207 dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n"); 1208 return false; 1209 } 1210 1211 if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { 1212 SafeToHoist.push_back(&I); 1213 LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); 1214 } else { 1215 LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); 1216 NotHoisting.push_back(&I); 1217 1218 if (canSinkInst(I, FC1)) { 1219 SafeToSink.push_back(&I); 1220 LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); 1221 } else { 1222 LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); 1223 return false; 1224 } 1225 } 1226 } 1227 LLVM_DEBUG( 1228 dbgs() << "All preheader instructions could be sunk or hoisted!\n"); 1229 return true; 1230 } 1231 1232 /// Rewrite all additive recurrences in a SCEV to use a new loop. 1233 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 1234 public: 1235 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 1236 bool UseMax = true) 1237 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 1238 NewL(NewL) {} 1239 1240 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 1241 const Loop *ExprL = Expr->getLoop(); 1242 SmallVector<const SCEV *, 2> Operands; 1243 if (ExprL == &OldL) { 1244 append_range(Operands, Expr->operands()); 1245 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 1246 } 1247 1248 if (OldL.contains(ExprL)) { 1249 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 1250 if (!UseMax || !Pos || !Expr->isAffine()) { 1251 Valid = false; 1252 return Expr; 1253 } 1254 return visit(Expr->getStart()); 1255 } 1256 1257 for (const SCEV *Op : Expr->operands()) 1258 Operands.push_back(visit(Op)); 1259 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 1260 } 1261 1262 bool wasValidSCEV() const { return Valid; } 1263 1264 private: 1265 bool Valid, UseMax; 1266 const Loop &OldL, &NewL; 1267 }; 1268 1269 /// Return false if the access functions of \p I0 and \p I1 could cause 1270 /// a negative dependence. 1271 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 1272 Instruction &I1, bool EqualIsInvalid) { 1273 Value *Ptr0 = getLoadStorePointerOperand(&I0); 1274 Value *Ptr1 = getLoadStorePointerOperand(&I1); 1275 if (!Ptr0 || !Ptr1) 1276 return false; 1277 1278 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 1279 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 1280 #ifndef NDEBUG 1281 if (VerboseFusionDebugging) 1282 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 1283 << *SCEVPtr1 << "\n"); 1284 #endif 1285 AddRecLoopReplacer Rewriter(SE, L0, L1); 1286 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 1287 #ifndef NDEBUG 1288 if (VerboseFusionDebugging) 1289 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 1290 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 1291 #endif 1292 if (!Rewriter.wasValidSCEV()) 1293 return false; 1294 1295 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 1296 // L0) and the other is not. We could check if it is monotone and test 1297 // the beginning and end value instead. 1298 1299 BasicBlock *L0Header = L0.getHeader(); 1300 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 1301 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 1302 if (!AddRec) 1303 return false; 1304 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 1305 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 1306 }; 1307 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 1308 return false; 1309 1310 ICmpInst::Predicate Pred = 1311 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 1312 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 1313 #ifndef NDEBUG 1314 if (VerboseFusionDebugging) 1315 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 1316 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 1317 << "\n"); 1318 #endif 1319 return IsAlwaysGE; 1320 } 1321 1322 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 1323 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 1324 /// specified by @p DepChoice are used to determine this. 1325 bool dependencesAllowFusion(const FusionCandidate &FC0, 1326 const FusionCandidate &FC1, Instruction &I0, 1327 Instruction &I1, bool AnyDep, 1328 FusionDependenceAnalysisChoice DepChoice) { 1329 #ifndef NDEBUG 1330 if (VerboseFusionDebugging) { 1331 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 1332 << DepChoice << "\n"); 1333 } 1334 #endif 1335 switch (DepChoice) { 1336 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 1337 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 1338 case FUSION_DEPENDENCE_ANALYSIS_DA: { 1339 auto DepResult = DI.depends(&I0, &I1, true); 1340 if (!DepResult) 1341 return true; 1342 #ifndef NDEBUG 1343 if (VerboseFusionDebugging) { 1344 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 1345 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 1346 << (DepResult->isOrdered() ? "true" : "false") 1347 << "]\n"); 1348 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 1349 << "\n"); 1350 } 1351 #endif 1352 1353 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 1354 LLVM_DEBUG( 1355 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 1356 1357 // TODO: Can we actually use the dependence info analysis here? 1358 return false; 1359 } 1360 1361 case FUSION_DEPENDENCE_ANALYSIS_ALL: 1362 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1363 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 1364 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1365 FUSION_DEPENDENCE_ANALYSIS_DA); 1366 } 1367 1368 llvm_unreachable("Unknown fusion dependence analysis choice!"); 1369 } 1370 1371 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 1372 bool dependencesAllowFusion(const FusionCandidate &FC0, 1373 const FusionCandidate &FC1) { 1374 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 1375 << "\n"); 1376 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 1377 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 1378 1379 for (Instruction *WriteL0 : FC0.MemWrites) { 1380 for (Instruction *WriteL1 : FC1.MemWrites) 1381 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1382 /* AnyDep */ false, 1383 FusionDependenceAnalysis)) { 1384 InvalidDependencies++; 1385 return false; 1386 } 1387 for (Instruction *ReadL1 : FC1.MemReads) 1388 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 1389 /* AnyDep */ false, 1390 FusionDependenceAnalysis)) { 1391 InvalidDependencies++; 1392 return false; 1393 } 1394 } 1395 1396 for (Instruction *WriteL1 : FC1.MemWrites) { 1397 for (Instruction *WriteL0 : FC0.MemWrites) 1398 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1399 /* AnyDep */ false, 1400 FusionDependenceAnalysis)) { 1401 InvalidDependencies++; 1402 return false; 1403 } 1404 for (Instruction *ReadL0 : FC0.MemReads) 1405 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1406 /* AnyDep */ false, 1407 FusionDependenceAnalysis)) { 1408 InvalidDependencies++; 1409 return false; 1410 } 1411 } 1412 1413 // Walk through all uses in FC1. For each use, find the reaching def. If the 1414 // def is located in FC0 then it is is not safe to fuse. 1415 for (BasicBlock *BB : FC1.L->blocks()) 1416 for (Instruction &I : *BB) 1417 for (auto &Op : I.operands()) 1418 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1419 if (FC0.L->contains(Def->getParent())) { 1420 InvalidDependencies++; 1421 return false; 1422 } 1423 1424 return true; 1425 } 1426 1427 /// Determine if two fusion candidates are adjacent in the CFG. 1428 /// 1429 /// This method will determine if there are additional basic blocks in the CFG 1430 /// between the exit of \p FC0 and the entry of \p FC1. 1431 /// If the two candidates are guarded loops, then it checks whether the 1432 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1433 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1434 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1435 /// preheader of \p FC1. 1436 bool isAdjacent(const FusionCandidate &FC0, 1437 const FusionCandidate &FC1) const { 1438 // If the successor of the guard branch is FC1, then the loops are adjacent 1439 if (FC0.GuardBranch) 1440 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1441 else 1442 return FC0.ExitBlock == FC1.getEntryBlock(); 1443 } 1444 1445 bool isEmptyPreheader(const FusionCandidate &FC) const { 1446 return FC.Preheader->size() == 1; 1447 } 1448 1449 /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader 1450 /// and sink others into the body of \p FC1. 1451 void movePreheaderInsts(const FusionCandidate &FC0, 1452 const FusionCandidate &FC1, 1453 SmallVector<Instruction *, 4> &HoistInsts, 1454 SmallVector<Instruction *, 4> &SinkInsts) const { 1455 // All preheader instructions except the branch must be hoisted or sunk 1456 assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && 1457 "Attempting to sink and hoist preheader instructions, but not all " 1458 "the preheader instructions are accounted for."); 1459 1460 NumHoistedInsts += HoistInsts.size(); 1461 NumSunkInsts += SinkInsts.size(); 1462 1463 LLVM_DEBUG(if (VerboseFusionDebugging) { 1464 if (!HoistInsts.empty()) 1465 dbgs() << "Hoisting: \n"; 1466 for (Instruction *I : HoistInsts) 1467 dbgs() << *I << "\n"; 1468 if (!SinkInsts.empty()) 1469 dbgs() << "Sinking: \n"; 1470 for (Instruction *I : SinkInsts) 1471 dbgs() << *I << "\n"; 1472 }); 1473 1474 for (Instruction *I : HoistInsts) { 1475 assert(I->getParent() == FC1.Preheader); 1476 I->moveBefore(FC0.Preheader->getTerminator()); 1477 } 1478 // insert instructions in reverse order to maintain dominance relationship 1479 for (Instruction *I : reverse(SinkInsts)) { 1480 assert(I->getParent() == FC1.Preheader); 1481 I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); 1482 } 1483 } 1484 1485 /// Determine if two fusion candidates have identical guards 1486 /// 1487 /// This method will determine if two fusion candidates have the same guards. 1488 /// The guards are considered the same if: 1489 /// 1. The instructions to compute the condition used in the compare are 1490 /// identical. 1491 /// 2. The successors of the guard have the same flow into/around the loop. 1492 /// If the compare instructions are identical, then the first successor of the 1493 /// guard must go to the same place (either the preheader of the loop or the 1494 /// NonLoopBlock). In other words, the the first successor of both loops must 1495 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1496 /// the NonLoopBlock). The same must be true for the second successor. 1497 bool haveIdenticalGuards(const FusionCandidate &FC0, 1498 const FusionCandidate &FC1) const { 1499 assert(FC0.GuardBranch && FC1.GuardBranch && 1500 "Expecting FC0 and FC1 to be guarded loops."); 1501 1502 if (auto FC0CmpInst = 1503 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1504 if (auto FC1CmpInst = 1505 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1506 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1507 return false; 1508 1509 // The compare instructions are identical. 1510 // Now make sure the successor of the guards have the same flow into/around 1511 // the loop 1512 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1513 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1514 else 1515 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1516 } 1517 1518 /// Modify the latch branch of FC to be unconditional since successors of the 1519 /// branch are the same. 1520 void simplifyLatchBranch(const FusionCandidate &FC) const { 1521 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1522 if (FCLatchBranch) { 1523 assert(FCLatchBranch->isConditional() && 1524 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1525 "Expecting the two successors of FCLatchBranch to be the same"); 1526 BranchInst *NewBranch = 1527 BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1528 ReplaceInstWithInst(FCLatchBranch, NewBranch); 1529 } 1530 } 1531 1532 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1533 /// successor, then merge FC0.Latch with its unique successor. 1534 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1535 moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1536 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1537 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1538 DTU.flush(); 1539 } 1540 } 1541 1542 /// Fuse two fusion candidates, creating a new fused loop. 1543 /// 1544 /// This method contains the mechanics of fusing two loops, represented by \p 1545 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1546 /// postdominates \p FC0 (making them control flow equivalent). It also 1547 /// assumes that the other conditions for fusion have been met: adjacent, 1548 /// identical trip counts, and no negative distance dependencies exist that 1549 /// would prevent fusion. Thus, there is no checking for these conditions in 1550 /// this method. 1551 /// 1552 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1553 /// components of tho loop. Specifically, the following changes are done: 1554 /// 1555 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1556 /// (because it is currently only a single statement block). 1557 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1558 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1559 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1560 /// 1561 /// All of these modifications are done with dominator tree updates, thus 1562 /// keeping the dominator (and post dominator) information up-to-date. 1563 /// 1564 /// This can be improved in the future by actually merging blocks during 1565 /// fusion. For example, the preheader of \p FC1 can be merged with the 1566 /// preheader of \p FC0. This would allow loops with more than a single 1567 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1568 /// two loops could also be fused into a single block. This will require 1569 /// analysis to prove it is safe to move the contents of the block past 1570 /// existing code, which currently has not been implemented. 1571 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1572 assert(FC0.isValid() && FC1.isValid() && 1573 "Expecting valid fusion candidates"); 1574 1575 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1576 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1577 1578 // Move instructions from the preheader of FC1 to the end of the preheader 1579 // of FC0. 1580 moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 1581 1582 // Fusing guarded loops is handled slightly differently than non-guarded 1583 // loops and has been broken out into a separate method instead of trying to 1584 // intersperse the logic within a single method. 1585 if (FC0.GuardBranch) 1586 return fuseGuardedLoops(FC0, FC1); 1587 1588 assert(FC1.Preheader == 1589 (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 1590 assert(FC1.Preheader->size() == 1 && 1591 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1592 1593 // Remember the phi nodes originally in the header of FC0 in order to rewire 1594 // them later. However, this is only necessary if the new loop carried 1595 // values might not dominate the exiting branch. While we do not generally 1596 // test if this is the case but simply insert intermediate phi nodes, we 1597 // need to make sure these intermediate phi nodes have different 1598 // predecessors. To this end, we filter the special case where the exiting 1599 // block is the latch block of the first loop. Nothing needs to be done 1600 // anyway as all loop carried values dominate the latch and thereby also the 1601 // exiting branch. 1602 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1603 if (FC0.ExitingBlock != FC0.Latch) 1604 for (PHINode &PHI : FC0.Header->phis()) 1605 OriginalFC0PHIs.push_back(&PHI); 1606 1607 // Replace incoming blocks for header PHIs first. 1608 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1609 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1610 1611 // Then modify the control flow and update DT and PDT. 1612 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1613 1614 // The old exiting block of the first loop (FC0) has to jump to the header 1615 // of the second as we need to execute the code in the second header block 1616 // regardless of the trip count. That is, if the trip count is 0, so the 1617 // back edge is never taken, we still have to execute both loop headers, 1618 // especially (but not only!) if the second is a do-while style loop. 1619 // However, doing so might invalidate the phi nodes of the first loop as 1620 // the new values do only need to dominate their latch and not the exiting 1621 // predicate. To remedy this potential problem we always introduce phi 1622 // nodes in the header of the second loop later that select the loop carried 1623 // value, if the second header was reached through an old latch of the 1624 // first, or undef otherwise. This is sound as exiting the first implies the 1625 // second will exit too, __without__ taking the back-edge. [Their 1626 // trip-counts are equal after all. 1627 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1628 // to FC1.Header? I think this is basically what the three sequences are 1629 // trying to accomplish; however, doing this directly in the CFG may mean 1630 // the DT/PDT becomes invalid 1631 if (!FC0.Peeled) { 1632 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1633 FC1.Header); 1634 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1635 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1636 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1637 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1638 } else { 1639 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1640 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1641 1642 // Remove the ExitBlock of the first Loop (also not needed) 1643 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1644 FC1.Header); 1645 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1646 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1647 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1648 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1649 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1650 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1651 } 1652 1653 // The pre-header of L1 is not necessary anymore. 1654 assert(pred_empty(FC1.Preheader)); 1655 FC1.Preheader->getTerminator()->eraseFromParent(); 1656 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1657 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1658 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1659 1660 // Moves the phi nodes from the second to the first loops header block. 1661 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1662 if (SE.isSCEVable(PHI->getType())) 1663 SE.forgetValue(PHI); 1664 if (PHI->hasNUsesOrMore(1)) 1665 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1666 else 1667 PHI->eraseFromParent(); 1668 } 1669 1670 // Introduce new phi nodes in the second loop header to ensure 1671 // exiting the first and jumping to the header of the second does not break 1672 // the SSA property of the phis originally in the first loop. See also the 1673 // comment above. 1674 Instruction *L1HeaderIP = &FC1.Header->front(); 1675 for (PHINode *LCPHI : OriginalFC0PHIs) { 1676 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1677 assert(L1LatchBBIdx >= 0 && 1678 "Expected loop carried value to be rewired at this point!"); 1679 1680 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1681 1682 PHINode *L1HeaderPHI = PHINode::Create( 1683 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1684 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1685 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1686 FC0.ExitingBlock); 1687 1688 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1689 } 1690 1691 // Replace latch terminator destinations. 1692 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1693 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1694 1695 // Modify the latch branch of FC0 to be unconditional as both successors of 1696 // the branch are the same. 1697 simplifyLatchBranch(FC0); 1698 1699 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1700 // performed the updates above. 1701 if (FC0.Latch != FC0.ExitingBlock) 1702 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1703 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1704 1705 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1706 FC0.Latch, FC0.Header)); 1707 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1708 FC1.Latch, FC0.Header)); 1709 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1710 FC1.Latch, FC1.Header)); 1711 1712 // Update DT/PDT 1713 DTU.applyUpdates(TreeUpdates); 1714 1715 LI.removeBlock(FC1.Preheader); 1716 DTU.deleteBB(FC1.Preheader); 1717 if (FC0.Peeled) { 1718 LI.removeBlock(FC0.ExitBlock); 1719 DTU.deleteBB(FC0.ExitBlock); 1720 } 1721 1722 DTU.flush(); 1723 1724 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1725 // and rebuild the information in subsequent passes of fusion? 1726 // Note: Need to forget the loops before merging the loop latches, as 1727 // mergeLatch may remove the only block in FC1. 1728 SE.forgetLoop(FC1.L); 1729 SE.forgetLoop(FC0.L); 1730 SE.forgetLoopDispositions(); 1731 1732 // Move instructions from FC0.Latch to FC1.Latch. 1733 // Note: mergeLatch requires an updated DT. 1734 mergeLatch(FC0, FC1); 1735 1736 // Merge the loops. 1737 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 1738 for (BasicBlock *BB : Blocks) { 1739 FC0.L->addBlockEntry(BB); 1740 FC1.L->removeBlockFromLoop(BB); 1741 if (LI.getLoopFor(BB) != FC1.L) 1742 continue; 1743 LI.changeLoopFor(BB, FC0.L); 1744 } 1745 while (!FC1.L->isInnermost()) { 1746 const auto &ChildLoopIt = FC1.L->begin(); 1747 Loop *ChildLoop = *ChildLoopIt; 1748 FC1.L->removeChildLoop(ChildLoopIt); 1749 FC0.L->addChildLoop(ChildLoop); 1750 } 1751 1752 // Delete the now empty loop L1. 1753 LI.erase(FC1.L); 1754 1755 #ifndef NDEBUG 1756 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1757 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1758 assert(PDT.verify()); 1759 LI.verify(DT); 1760 SE.verify(); 1761 #endif 1762 1763 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1764 1765 return FC0.L; 1766 } 1767 1768 /// Report details on loop fusion opportunities. 1769 /// 1770 /// This template function can be used to report both successful and missed 1771 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1772 /// be one of: 1773 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1774 /// given two valid fusion candidates. 1775 /// - OptimizationRemark to report successful fusion of two fusion 1776 /// candidates. 1777 /// The remarks will be printed using the form: 1778 /// <path/filename>:<line number>:<column number>: [<function name>]: 1779 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1780 template <typename RemarkKind> 1781 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1782 llvm::Statistic &Stat) { 1783 assert(FC0.Preheader && FC1.Preheader && 1784 "Expecting valid fusion candidates"); 1785 using namespace ore; 1786 #if LLVM_ENABLE_STATS 1787 ++Stat; 1788 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1789 FC0.Preheader) 1790 << "[" << FC0.Preheader->getParent()->getName() 1791 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1792 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1793 << ": " << Stat.getDesc()); 1794 #endif 1795 } 1796 1797 /// Fuse two guarded fusion candidates, creating a new fused loop. 1798 /// 1799 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1800 /// loops. The rewiring of the CFG is slightly different though, because of 1801 /// the presence of the guards around the loops and the exit blocks after the 1802 /// loop body. As such, the new loop is rewired as follows: 1803 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1804 /// from the FC1 guard branch. 1805 /// 2. Remove the exit block from FC0 (this exit block should be empty 1806 /// right now). 1807 /// 3. Remove the guard branch for FC1 1808 /// 4. Remove the preheader for FC1. 1809 /// The exit block successor for the latch of FC0 is updated to be the header 1810 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1811 /// be the header of FC0, thus creating the fused loop. 1812 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1813 const FusionCandidate &FC1) { 1814 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1815 1816 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1817 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1818 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1819 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1820 BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 1821 1822 // Move instructions from the exit block of FC0 to the beginning of the exit 1823 // block of FC1, in the case that the FC0 loop has not been peeled. In the 1824 // case that FC0 loop is peeled, then move the instructions of the successor 1825 // of the FC0 Exit block to the beginning of the exit block of FC1. 1826 moveInstructionsToTheBeginning( 1827 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1828 DT, PDT, DI); 1829 1830 // Move instructions from the guard block of FC1 to the end of the guard 1831 // block of FC0. 1832 moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 1833 1834 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1835 1836 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1837 1838 //////////////////////////////////////////////////////////////////////////// 1839 // Update the Loop Guard 1840 //////////////////////////////////////////////////////////////////////////// 1841 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1842 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1843 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1844 // executes the new fused loop) and the other path goes to the NonLoopBlock 1845 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1846 FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 1847 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1848 1849 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1850 BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 1851 1852 // The guard of FC1 is not necessary anymore. 1853 FC1.GuardBranch->eraseFromParent(); 1854 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1855 1856 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1857 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1858 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1859 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1860 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1861 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1862 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1863 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1864 1865 if (FC0.Peeled) { 1866 // Remove the Block after the ExitBlock of FC0 1867 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1868 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1869 FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1870 new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1871 FC0ExitBlockSuccessor); 1872 } 1873 1874 assert(pred_empty(FC1GuardBlock) && 1875 "Expecting guard block to have no predecessors"); 1876 assert(succ_empty(FC1GuardBlock) && 1877 "Expecting guard block to have no successors"); 1878 1879 // Remember the phi nodes originally in the header of FC0 in order to rewire 1880 // them later. However, this is only necessary if the new loop carried 1881 // values might not dominate the exiting branch. While we do not generally 1882 // test if this is the case but simply insert intermediate phi nodes, we 1883 // need to make sure these intermediate phi nodes have different 1884 // predecessors. To this end, we filter the special case where the exiting 1885 // block is the latch block of the first loop. Nothing needs to be done 1886 // anyway as all loop carried values dominate the latch and thereby also the 1887 // exiting branch. 1888 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1889 // (because the loops are rotated. Thus, nothing will ever be added to 1890 // OriginalFC0PHIs. 1891 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1892 if (FC0.ExitingBlock != FC0.Latch) 1893 for (PHINode &PHI : FC0.Header->phis()) 1894 OriginalFC0PHIs.push_back(&PHI); 1895 1896 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1897 1898 // Replace incoming blocks for header PHIs first. 1899 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1900 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1901 1902 // The old exiting block of the first loop (FC0) has to jump to the header 1903 // of the second as we need to execute the code in the second header block 1904 // regardless of the trip count. That is, if the trip count is 0, so the 1905 // back edge is never taken, we still have to execute both loop headers, 1906 // especially (but not only!) if the second is a do-while style loop. 1907 // However, doing so might invalidate the phi nodes of the first loop as 1908 // the new values do only need to dominate their latch and not the exiting 1909 // predicate. To remedy this potential problem we always introduce phi 1910 // nodes in the header of the second loop later that select the loop carried 1911 // value, if the second header was reached through an old latch of the 1912 // first, or undef otherwise. This is sound as exiting the first implies the 1913 // second will exit too, __without__ taking the back-edge (their 1914 // trip-counts are equal after all). 1915 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1916 FC1.Header); 1917 1918 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1919 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1920 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1921 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1922 1923 // Remove FC0 Exit Block 1924 // The exit block for FC0 is no longer needed since control will flow 1925 // directly to the header of FC1. Since it is an empty block, it can be 1926 // removed at this point. 1927 // TODO: In the future, we can handle non-empty exit blocks my merging any 1928 // instructions from FC0 exit block into FC1 exit block prior to removing 1929 // the block. 1930 assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); 1931 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1932 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1933 1934 // Remove FC1 Preheader 1935 // The pre-header of L1 is not necessary anymore. 1936 assert(pred_empty(FC1.Preheader)); 1937 FC1.Preheader->getTerminator()->eraseFromParent(); 1938 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1939 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1940 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1941 1942 // Moves the phi nodes from the second to the first loops header block. 1943 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1944 if (SE.isSCEVable(PHI->getType())) 1945 SE.forgetValue(PHI); 1946 if (PHI->hasNUsesOrMore(1)) 1947 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1948 else 1949 PHI->eraseFromParent(); 1950 } 1951 1952 // Introduce new phi nodes in the second loop header to ensure 1953 // exiting the first and jumping to the header of the second does not break 1954 // the SSA property of the phis originally in the first loop. See also the 1955 // comment above. 1956 Instruction *L1HeaderIP = &FC1.Header->front(); 1957 for (PHINode *LCPHI : OriginalFC0PHIs) { 1958 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1959 assert(L1LatchBBIdx >= 0 && 1960 "Expected loop carried value to be rewired at this point!"); 1961 1962 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1963 1964 PHINode *L1HeaderPHI = PHINode::Create( 1965 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1966 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1967 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1968 FC0.ExitingBlock); 1969 1970 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1971 } 1972 1973 // Update the latches 1974 1975 // Replace latch terminator destinations. 1976 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1977 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1978 1979 // Modify the latch branch of FC0 to be unconditional as both successors of 1980 // the branch are the same. 1981 simplifyLatchBranch(FC0); 1982 1983 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1984 // performed the updates above. 1985 if (FC0.Latch != FC0.ExitingBlock) 1986 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1987 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1988 1989 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1990 FC0.Latch, FC0.Header)); 1991 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1992 FC1.Latch, FC0.Header)); 1993 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1994 FC1.Latch, FC1.Header)); 1995 1996 // All done 1997 // Apply the updates to the Dominator Tree and cleanup. 1998 1999 assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); 2000 assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); 2001 2002 // Update DT/PDT 2003 DTU.applyUpdates(TreeUpdates); 2004 2005 LI.removeBlock(FC1GuardBlock); 2006 LI.removeBlock(FC1.Preheader); 2007 LI.removeBlock(FC0.ExitBlock); 2008 if (FC0.Peeled) { 2009 LI.removeBlock(FC0ExitBlockSuccessor); 2010 DTU.deleteBB(FC0ExitBlockSuccessor); 2011 } 2012 DTU.deleteBB(FC1GuardBlock); 2013 DTU.deleteBB(FC1.Preheader); 2014 DTU.deleteBB(FC0.ExitBlock); 2015 DTU.flush(); 2016 2017 // Is there a way to keep SE up-to-date so we don't need to forget the loops 2018 // and rebuild the information in subsequent passes of fusion? 2019 // Note: Need to forget the loops before merging the loop latches, as 2020 // mergeLatch may remove the only block in FC1. 2021 SE.forgetLoop(FC1.L); 2022 SE.forgetLoop(FC0.L); 2023 SE.forgetLoopDispositions(); 2024 2025 // Move instructions from FC0.Latch to FC1.Latch. 2026 // Note: mergeLatch requires an updated DT. 2027 mergeLatch(FC0, FC1); 2028 2029 // Merge the loops. 2030 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 2031 for (BasicBlock *BB : Blocks) { 2032 FC0.L->addBlockEntry(BB); 2033 FC1.L->removeBlockFromLoop(BB); 2034 if (LI.getLoopFor(BB) != FC1.L) 2035 continue; 2036 LI.changeLoopFor(BB, FC0.L); 2037 } 2038 while (!FC1.L->isInnermost()) { 2039 const auto &ChildLoopIt = FC1.L->begin(); 2040 Loop *ChildLoop = *ChildLoopIt; 2041 FC1.L->removeChildLoop(ChildLoopIt); 2042 FC0.L->addChildLoop(ChildLoop); 2043 } 2044 2045 // Delete the now empty loop L1. 2046 LI.erase(FC1.L); 2047 2048 #ifndef NDEBUG 2049 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 2050 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 2051 assert(PDT.verify()); 2052 LI.verify(DT); 2053 SE.verify(); 2054 #endif 2055 2056 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 2057 2058 return FC0.L; 2059 } 2060 }; 2061 } // namespace 2062 2063 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 2064 auto &LI = AM.getResult<LoopAnalysis>(F); 2065 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 2066 auto &DI = AM.getResult<DependenceAnalysis>(F); 2067 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 2068 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 2069 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 2070 auto &AC = AM.getResult<AssumptionAnalysis>(F); 2071 const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 2072 const DataLayout &DL = F.getParent()->getDataLayout(); 2073 2074 // Ensure loops are in simplifed form which is a pre-requisite for loop fusion 2075 // pass. Added only for new PM since the legacy PM has already added 2076 // LoopSimplify pass as a dependency. 2077 bool Changed = false; 2078 for (auto &L : LI) { 2079 Changed |= 2080 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); 2081 } 2082 if (Changed) 2083 PDT.recalculate(F); 2084 2085 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 2086 Changed |= LF.fuseLoops(F); 2087 if (!Changed) 2088 return PreservedAnalyses::all(); 2089 2090 PreservedAnalyses PA; 2091 PA.preserve<DominatorTreeAnalysis>(); 2092 PA.preserve<PostDominatorTreeAnalysis>(); 2093 PA.preserve<ScalarEvolutionAnalysis>(); 2094 PA.preserve<LoopAnalysis>(); 2095 return PA; 2096 } 2097