1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This Pass handles loop interchange transform. 10 // This pass interchanges loops to provide a more cache-friendly memory access 11 // patterns. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/Scalar/LoopInterchange.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallSet.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/StringSet.h" 22 #include "llvm/Analysis/DependenceAnalysis.h" 23 #include "llvm/Analysis/LoopCacheAnalysis.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/LoopNestAnalysis.h" 26 #include "llvm/Analysis/LoopPass.h" 27 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 28 #include "llvm/Analysis/ScalarEvolution.h" 29 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 30 #include "llvm/IR/BasicBlock.h" 31 #include "llvm/IR/DiagnosticInfo.h" 32 #include "llvm/IR/Dominators.h" 33 #include "llvm/IR/Function.h" 34 #include "llvm/IR/InstrTypes.h" 35 #include "llvm/IR/Instruction.h" 36 #include "llvm/IR/Instructions.h" 37 #include "llvm/IR/User.h" 38 #include "llvm/IR/Value.h" 39 #include "llvm/Support/Casting.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Debug.h" 42 #include "llvm/Support/ErrorHandling.h" 43 #include "llvm/Support/raw_ostream.h" 44 #include "llvm/Transforms/Scalar/LoopPassManager.h" 45 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 46 #include "llvm/Transforms/Utils/LoopUtils.h" 47 #include <cassert> 48 #include <utility> 49 #include <vector> 50 51 using namespace llvm; 52 53 #define DEBUG_TYPE "loop-interchange" 54 55 STATISTIC(LoopsInterchanged, "Number of loops interchanged"); 56 57 static cl::opt<int> LoopInterchangeCostThreshold( 58 "loop-interchange-threshold", cl::init(0), cl::Hidden, 59 cl::desc("Interchange if you gain more than this number")); 60 61 // Maximum number of load-stores that can be handled in the dependency matrix. 62 static cl::opt<unsigned int> MaxMemInstrCount( 63 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, 64 cl::desc( 65 "Maximum number of load-store instructions that should be handled " 66 "in the dependency matrix. Higher value may lead to more interchanges " 67 "at the cost of compile-time")); 68 69 namespace { 70 71 using LoopVector = SmallVector<Loop *, 8>; 72 73 // TODO: Check if we can use a sparse matrix here. 74 using CharMatrix = std::vector<std::vector<char>>; 75 76 /// Types of rules used in profitability check. 77 enum class RuleTy { 78 PerLoopCacheAnalysis, 79 PerInstrOrderCost, 80 ForVectorization, 81 }; 82 83 } // end anonymous namespace 84 85 // Minimum loop depth supported. 86 static cl::opt<unsigned int> MinLoopNestDepth( 87 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, 88 cl::desc("Minimum depth of loop nest considered for the transform")); 89 90 // Maximum loop depth supported. 91 static cl::opt<unsigned int> MaxLoopNestDepth( 92 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, 93 cl::desc("Maximum depth of loop nest considered for the transform")); 94 95 // We prefer cache cost to vectorization by default. 96 static cl::list<RuleTy> Profitabilities( 97 "loop-interchange-profitabilities", cl::ZeroOrMore, 98 cl::MiscFlags::CommaSeparated, cl::Hidden, 99 cl::desc("List of profitability heuristics to be used. They are applied in " 100 "the given order"), 101 cl::list_init<RuleTy>({RuleTy::PerLoopCacheAnalysis, 102 RuleTy::PerInstrOrderCost, 103 RuleTy::ForVectorization}), 104 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", 105 "Prioritize loop cache cost"), 106 clEnumValN(RuleTy::PerInstrOrderCost, "instorder", 107 "Prioritize the IVs order of each instruction"), 108 clEnumValN(RuleTy::ForVectorization, "vectorize", 109 "Prioritize vectorization"))); 110 111 #ifndef NDEBUG 112 static bool noDuplicateRules(ArrayRef<RuleTy> Rules) { 113 SmallSet<RuleTy, 4> Set; 114 for (RuleTy Rule : Rules) 115 if (!Set.insert(Rule).second) 116 return false; 117 return true; 118 } 119 120 static void printDepMatrix(CharMatrix &DepMatrix) { 121 for (auto &Row : DepMatrix) { 122 for (auto D : Row) 123 LLVM_DEBUG(dbgs() << D << " "); 124 LLVM_DEBUG(dbgs() << "\n"); 125 } 126 } 127 #endif 128 129 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 130 Loop *L, DependenceInfo *DI, 131 ScalarEvolution *SE, 132 OptimizationRemarkEmitter *ORE) { 133 using ValueVector = SmallVector<Value *, 16>; 134 135 ValueVector MemInstr; 136 137 // For each block. 138 for (BasicBlock *BB : L->blocks()) { 139 // Scan the BB and collect legal loads and stores. 140 for (Instruction &I : *BB) { 141 if (!isa<Instruction>(I)) 142 return false; 143 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 144 if (!Ld->isSimple()) 145 return false; 146 MemInstr.push_back(&I); 147 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 148 if (!St->isSimple()) 149 return false; 150 MemInstr.push_back(&I); 151 } 152 } 153 } 154 155 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() 156 << " Loads and Stores to analyze\n"); 157 if (MemInstr.size() > MaxMemInstrCount) { 158 LLVM_DEBUG(dbgs() << "The transform doesn't support more than " 159 << MaxMemInstrCount << " load/stores in a loop\n"); 160 ORE->emit([&]() { 161 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop", 162 L->getStartLoc(), L->getHeader()) 163 << "Number of loads/stores exceeded, the supported maximum " 164 "can be increased with option " 165 "-loop-interchange-maxmeminstr-count."; 166 }); 167 return false; 168 } 169 ValueVector::iterator I, IE, J, JE; 170 StringSet<> Seen; 171 172 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 173 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 174 std::vector<char> Dep; 175 Instruction *Src = cast<Instruction>(*I); 176 Instruction *Dst = cast<Instruction>(*J); 177 // Ignore Input dependencies. 178 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 179 continue; 180 // Track Output, Flow, and Anti dependencies. 181 if (auto D = DI->depends(Src, Dst)) { 182 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 183 // If the direction vector is negative, normalize it to 184 // make it non-negative. 185 if (D->normalize(SE)) 186 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n"); 187 LLVM_DEBUG(StringRef DepType = 188 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 189 dbgs() << "Found " << DepType 190 << " dependency between Src and Dst\n" 191 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 192 unsigned Levels = D->getLevels(); 193 char Direction; 194 for (unsigned II = 1; II <= Levels; ++II) { 195 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<` 196 // or `=`, for which we don't have an equivalent representation, so 197 // that the conservative approximation is necessary. The same goes for 198 // `DVEntry::GE`. 199 // TODO: Use of fine-grained expressions allows for more accurate 200 // analysis. 201 unsigned Dir = D->getDirection(II); 202 if (Dir == Dependence::DVEntry::LT) 203 Direction = '<'; 204 else if (Dir == Dependence::DVEntry::GT) 205 Direction = '>'; 206 else if (Dir == Dependence::DVEntry::EQ) 207 Direction = '='; 208 else 209 Direction = '*'; 210 Dep.push_back(Direction); 211 } 212 213 // If the Dependence object doesn't have any information, fill the 214 // dependency vector with '*'. 215 if (D->isConfused()) { 216 assert(Dep.empty() && "Expected empty dependency vector"); 217 Dep.assign(Level, '*'); 218 } 219 220 while (Dep.size() != Level) { 221 Dep.push_back('I'); 222 } 223 224 // Make sure we only add unique entries to the dependency matrix. 225 if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) 226 DepMatrix.push_back(Dep); 227 } 228 } 229 } 230 231 return true; 232 } 233 234 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 235 // matrix by exchanging the two columns. 236 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 237 unsigned ToIndx) { 238 for (auto &Row : DepMatrix) 239 std::swap(Row[ToIndx], Row[FromIndx]); 240 } 241 242 // Check if a direction vector is lexicographically positive. Return true if it 243 // is positive, nullopt if it is "zero", otherwise false. 244 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 245 // if the direction matrix, after the same permutation is applied to its 246 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 247 static std::optional<bool> 248 isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) { 249 for (unsigned char Direction : DV.slice(Begin, End - Begin)) { 250 if (Direction == '<') 251 return true; 252 if (Direction == '>' || Direction == '*') 253 return false; 254 } 255 return std::nullopt; 256 } 257 258 // Checks if it is legal to interchange 2 loops. 259 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 260 unsigned InnerLoopId, 261 unsigned OuterLoopId) { 262 unsigned NumRows = DepMatrix.size(); 263 std::vector<char> Cur; 264 // For each row check if it is valid to interchange. 265 for (unsigned Row = 0; Row < NumRows; ++Row) { 266 // Create temporary DepVector check its lexicographical order 267 // before and after swapping OuterLoop vs InnerLoop 268 Cur = DepMatrix[Row]; 269 270 // If the surrounding loops already ensure that the direction vector is 271 // lexicographically positive, nothing within the loop will be able to break 272 // the dependence. In such a case we can skip the subsequent check. 273 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true) 274 continue; 275 276 // Check if the direction vector is lexicographically positive (or zero) 277 // for both before/after exchanged. 278 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) 279 return false; 280 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); 281 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) 282 return false; 283 } 284 return true; 285 } 286 287 static void populateWorklist(Loop &L, LoopVector &LoopList) { 288 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " 289 << L.getHeader()->getParent()->getName() << " Loop: %" 290 << L.getHeader()->getName() << '\n'); 291 assert(LoopList.empty() && "LoopList should initially be empty!"); 292 Loop *CurrentLoop = &L; 293 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 294 while (!Vec->empty()) { 295 // The current loop has multiple subloops in it hence it is not tightly 296 // nested. 297 // Discard all loops above it added into Worklist. 298 if (Vec->size() != 1) { 299 LoopList = {}; 300 return; 301 } 302 303 LoopList.push_back(CurrentLoop); 304 CurrentLoop = Vec->front(); 305 Vec = &CurrentLoop->getSubLoops(); 306 } 307 LoopList.push_back(CurrentLoop); 308 } 309 310 static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList, 311 OptimizationRemarkEmitter &ORE) { 312 unsigned LoopNestDepth = LoopList.size(); 313 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) { 314 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth 315 << ", the supported range is [" << MinLoopNestDepth 316 << ", " << MaxLoopNestDepth << "].\n"); 317 Loop *OuterLoop = LoopList.front(); 318 ORE.emit([&]() { 319 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth", 320 OuterLoop->getStartLoc(), 321 OuterLoop->getHeader()) 322 << "Unsupported depth of loop nest, the supported range is [" 323 << std::to_string(MinLoopNestDepth) << ", " 324 << std::to_string(MaxLoopNestDepth) << "].\n"; 325 }); 326 return false; 327 } 328 return true; 329 } 330 331 static bool isComputableLoopNest(ScalarEvolution *SE, 332 ArrayRef<Loop *> LoopList) { 333 for (Loop *L : LoopList) { 334 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 335 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) { 336 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); 337 return false; 338 } 339 if (L->getNumBackEdges() != 1) { 340 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 341 return false; 342 } 343 if (!L->getExitingBlock()) { 344 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 345 return false; 346 } 347 } 348 return true; 349 } 350 351 namespace { 352 353 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 354 class LoopInterchangeLegality { 355 public: 356 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 357 OptimizationRemarkEmitter *ORE) 358 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 359 360 /// Check if the loops can be interchanged. 361 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 362 CharMatrix &DepMatrix); 363 364 /// Discover induction PHIs in the header of \p L. Induction 365 /// PHIs are added to \p Inductions. 366 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); 367 368 /// Check if the loop structure is understood. We do not handle triangular 369 /// loops for now. 370 bool isLoopStructureUnderstood(); 371 372 bool currentLimitations(); 373 374 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const { 375 return OuterInnerReductions; 376 } 377 378 const ArrayRef<PHINode *> getInnerLoopInductions() const { 379 return InnerLoopInductions; 380 } 381 382 ArrayRef<Instruction *> getHasNoWrapReductions() const { 383 return HasNoWrapReductions; 384 } 385 386 private: 387 bool tightlyNested(Loop *Outer, Loop *Inner); 388 bool containsUnsafeInstructions(BasicBlock *BB); 389 390 /// Discover induction and reduction PHIs in the header of \p L. Induction 391 /// PHIs are added to \p Inductions, reductions are added to 392 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs 393 /// to be passed as \p InnerLoop. 394 bool findInductionAndReductions(Loop *L, 395 SmallVector<PHINode *, 8> &Inductions, 396 Loop *InnerLoop); 397 398 Loop *OuterLoop; 399 Loop *InnerLoop; 400 401 ScalarEvolution *SE; 402 403 /// Interface to emit optimization remarks. 404 OptimizationRemarkEmitter *ORE; 405 406 /// Set of reduction PHIs taking part of a reduction across the inner and 407 /// outer loop. 408 SmallPtrSet<PHINode *, 4> OuterInnerReductions; 409 410 /// Set of inner loop induction PHIs 411 SmallVector<PHINode *, 8> InnerLoopInductions; 412 413 /// Hold instructions that have nuw/nsw flags and involved in reductions, 414 /// like integer addition/multiplication. Those flags must be dropped when 415 /// interchanging the loops. 416 SmallVector<Instruction *, 4> HasNoWrapReductions; 417 }; 418 419 /// Manages information utilized by the profitability check for cache. The main 420 /// purpose of this class is to delay the computation of CacheCost until it is 421 /// actually needed. 422 class CacheCostManager { 423 Loop *OutermostLoop; 424 LoopStandardAnalysisResults *AR; 425 DependenceInfo *DI; 426 427 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note 428 /// that the result can be nullptr. 429 std::optional<std::unique_ptr<CacheCost>> CC; 430 431 /// Maps each loop to an index representing the optimal position within the 432 /// loop-nest, as determined by the cache cost analysis. 433 DenseMap<const Loop *, unsigned> CostMap; 434 435 void computeIfUnitinialized(); 436 437 public: 438 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR, 439 DependenceInfo *DI) 440 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {} 441 CacheCost *getCacheCost(); 442 const DenseMap<const Loop *, unsigned> &getCostMap(); 443 }; 444 445 /// LoopInterchangeProfitability checks if it is profitable to interchange the 446 /// loop. 447 class LoopInterchangeProfitability { 448 public: 449 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 450 OptimizationRemarkEmitter *ORE) 451 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 452 453 /// Check if the loop interchange is profitable. 454 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, 455 unsigned InnerLoopId, unsigned OuterLoopId, 456 CharMatrix &DepMatrix, CacheCostManager &CCM); 457 458 private: 459 int getInstrOrderCost(); 460 std::optional<bool> isProfitablePerLoopCacheAnalysis( 461 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC); 462 std::optional<bool> isProfitablePerInstrOrderCost(); 463 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId, 464 unsigned OuterLoopId, 465 CharMatrix &DepMatrix); 466 Loop *OuterLoop; 467 Loop *InnerLoop; 468 469 /// Scev analysis. 470 ScalarEvolution *SE; 471 472 /// Interface to emit optimization remarks. 473 OptimizationRemarkEmitter *ORE; 474 }; 475 476 /// LoopInterchangeTransform interchanges the loop. 477 class LoopInterchangeTransform { 478 public: 479 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 480 LoopInfo *LI, DominatorTree *DT, 481 const LoopInterchangeLegality &LIL) 482 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} 483 484 /// Interchange OuterLoop and InnerLoop. 485 bool transform(ArrayRef<Instruction *> DropNoWrapInsts); 486 void restructureLoops(Loop *NewInner, Loop *NewOuter, 487 BasicBlock *OrigInnerPreHeader, 488 BasicBlock *OrigOuterPreHeader); 489 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 490 491 private: 492 bool adjustLoopLinks(); 493 bool adjustLoopBranches(); 494 495 Loop *OuterLoop; 496 Loop *InnerLoop; 497 498 /// Scev analysis. 499 ScalarEvolution *SE; 500 501 LoopInfo *LI; 502 DominatorTree *DT; 503 504 const LoopInterchangeLegality &LIL; 505 }; 506 507 struct LoopInterchange { 508 ScalarEvolution *SE = nullptr; 509 LoopInfo *LI = nullptr; 510 DependenceInfo *DI = nullptr; 511 DominatorTree *DT = nullptr; 512 LoopStandardAnalysisResults *AR = nullptr; 513 514 /// Interface to emit optimization remarks. 515 OptimizationRemarkEmitter *ORE; 516 517 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, 518 DominatorTree *DT, LoopStandardAnalysisResults *AR, 519 OptimizationRemarkEmitter *ORE) 520 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {} 521 522 bool run(Loop *L) { 523 if (L->getParentLoop()) 524 return false; 525 SmallVector<Loop *, 8> LoopList; 526 populateWorklist(*L, LoopList); 527 return processLoopList(LoopList); 528 } 529 530 bool run(LoopNest &LN) { 531 SmallVector<Loop *, 8> LoopList(LN.getLoops()); 532 for (unsigned I = 1; I < LoopList.size(); ++I) 533 if (LoopList[I]->getParentLoop() != LoopList[I - 1]) 534 return false; 535 return processLoopList(LoopList); 536 } 537 538 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) { 539 // TODO: Add a better heuristic to select the loop to be interchanged based 540 // on the dependence matrix. Currently we select the innermost loop. 541 return LoopList.size() - 1; 542 } 543 544 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { 545 bool Changed = false; 546 547 // Ensure proper loop nest depth. 548 assert(hasSupportedLoopDepth(LoopList, *ORE) && 549 "Unsupported depth of loop nest."); 550 551 unsigned LoopNestDepth = LoopList.size(); 552 553 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth 554 << "\n"); 555 556 CharMatrix DependencyMatrix; 557 Loop *OuterMostLoop = *(LoopList.begin()); 558 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 559 OuterMostLoop, DI, SE, ORE)) { 560 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); 561 return false; 562 } 563 564 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n"; 565 printDepMatrix(DependencyMatrix)); 566 567 // Get the Outermost loop exit. 568 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); 569 if (!LoopNestExit) { 570 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block"); 571 return false; 572 } 573 574 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 575 CacheCostManager CCM(LoopList[0], AR, DI); 576 // We try to achieve the globally optimal memory access for the loopnest, 577 // and do interchange based on a bubble-sort fasion. We start from 578 // the innermost loop, move it outwards to the best possible position 579 // and repeat this process. 580 for (unsigned j = SelecLoopId; j > 0; j--) { 581 bool ChangedPerIter = false; 582 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { 583 bool Interchanged = 584 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM); 585 ChangedPerIter |= Interchanged; 586 Changed |= Interchanged; 587 } 588 // Early abort if there was no interchange during an entire round of 589 // moving loops outwards. 590 if (!ChangedPerIter) 591 break; 592 } 593 return Changed; 594 } 595 596 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId, 597 unsigned OuterLoopId, 598 std::vector<std::vector<char>> &DependencyMatrix, 599 CacheCostManager &CCM) { 600 Loop *OuterLoop = LoopList[OuterLoopId]; 601 Loop *InnerLoop = LoopList[InnerLoopId]; 602 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId 603 << " and OuterLoopId = " << OuterLoopId << "\n"); 604 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); 605 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 606 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); 607 return false; 608 } 609 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); 610 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 611 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, 612 DependencyMatrix, CCM)) { 613 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); 614 return false; 615 } 616 617 ORE->emit([&]() { 618 return OptimizationRemark(DEBUG_TYPE, "Interchanged", 619 InnerLoop->getStartLoc(), 620 InnerLoop->getHeader()) 621 << "Loop interchanged with enclosing loop."; 622 }); 623 624 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL); 625 LIT.transform(LIL.getHasNoWrapReductions()); 626 LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); 627 LoopsInterchanged++; 628 629 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE); 630 631 // Loops interchanged, update LoopList accordingly. 632 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]); 633 // Update the DependencyMatrix 634 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId); 635 636 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n"; 637 printDepMatrix(DependencyMatrix)); 638 639 return true; 640 } 641 }; 642 643 } // end anonymous namespace 644 645 bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { 646 return any_of(*BB, [](const Instruction &I) { 647 return I.mayHaveSideEffects() || I.mayReadFromMemory(); 648 }); 649 } 650 651 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 652 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 653 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 654 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 655 656 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 657 658 // A perfectly nested loop will not have any branch in between the outer and 659 // inner block i.e. outer header will branch to either inner preheader and 660 // outerloop latch. 661 BranchInst *OuterLoopHeaderBI = 662 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 663 if (!OuterLoopHeaderBI) 664 return false; 665 666 for (BasicBlock *Succ : successors(OuterLoopHeaderBI)) 667 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && 668 Succ != OuterLoopLatch) 669 return false; 670 671 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 672 // We do not have any basic block in between now make sure the outer header 673 // and outer loop latch doesn't contain any unsafe instructions. 674 if (containsUnsafeInstructions(OuterLoopHeader) || 675 containsUnsafeInstructions(OuterLoopLatch)) 676 return false; 677 678 // Also make sure the inner loop preheader does not contain any unsafe 679 // instructions. Note that all instructions in the preheader will be moved to 680 // the outer loop header when interchanging. 681 if (InnerLoopPreHeader != OuterLoopHeader && 682 containsUnsafeInstructions(InnerLoopPreHeader)) 683 return false; 684 685 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); 686 // Ensure the inner loop exit block flows to the outer loop latch possibly 687 // through empty blocks. 688 const BasicBlock &SuccInner = 689 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch); 690 if (&SuccInner != OuterLoopLatch) { 691 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit 692 << " does not lead to the outer loop latch.\n";); 693 return false; 694 } 695 // The inner loop exit block does flow to the outer loop latch and not some 696 // other BBs, now make sure it contains safe instructions, since it will be 697 // moved into the (new) inner loop after interchange. 698 if (containsUnsafeInstructions(InnerLoopExit)) 699 return false; 700 701 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n"); 702 // We have a perfect loop nest. 703 return true; 704 } 705 706 bool LoopInterchangeLegality::isLoopStructureUnderstood() { 707 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 708 for (PHINode *InnerInduction : InnerLoopInductions) { 709 unsigned Num = InnerInduction->getNumOperands(); 710 for (unsigned i = 0; i < Num; ++i) { 711 Value *Val = InnerInduction->getOperand(i); 712 if (isa<Constant>(Val)) 713 continue; 714 Instruction *I = dyn_cast<Instruction>(Val); 715 if (!I) 716 return false; 717 // TODO: Handle triangular loops. 718 // e.g. for(int i=0;i<N;i++) 719 // for(int j=i;j<N;j++) 720 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 721 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 722 InnerLoopPreheader && 723 !OuterLoop->isLoopInvariant(I)) { 724 return false; 725 } 726 } 727 } 728 729 // TODO: Handle triangular loops of another form. 730 // e.g. for(int i=0;i<N;i++) 731 // for(int j=0;j<i;j++) 732 // or, 733 // for(int i=0;i<N;i++) 734 // for(int j=0;j*i<N;j++) 735 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 736 BranchInst *InnerLoopLatchBI = 737 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 738 if (!InnerLoopLatchBI->isConditional()) 739 return false; 740 if (CmpInst *InnerLoopCmp = 741 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) { 742 Value *Op0 = InnerLoopCmp->getOperand(0); 743 Value *Op1 = InnerLoopCmp->getOperand(1); 744 745 // LHS and RHS of the inner loop exit condition, e.g., 746 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i. 747 Value *Left = nullptr; 748 Value *Right = nullptr; 749 750 // Check if V only involves inner loop induction variable. 751 // Return true if V is InnerInduction, or a cast from 752 // InnerInduction, or a binary operator that involves 753 // InnerInduction and a constant. 754 std::function<bool(Value *)> IsPathToInnerIndVar; 755 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool { 756 if (llvm::is_contained(InnerLoopInductions, V)) 757 return true; 758 if (isa<Constant>(V)) 759 return true; 760 const Instruction *I = dyn_cast<Instruction>(V); 761 if (!I) 762 return false; 763 if (isa<CastInst>(I)) 764 return IsPathToInnerIndVar(I->getOperand(0)); 765 if (isa<BinaryOperator>(I)) 766 return IsPathToInnerIndVar(I->getOperand(0)) && 767 IsPathToInnerIndVar(I->getOperand(1)); 768 return false; 769 }; 770 771 // In case of multiple inner loop indvars, it is okay if LHS and RHS 772 // are both inner indvar related variables. 773 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1)) 774 return true; 775 776 // Otherwise we check if the cmp instruction compares an inner indvar 777 // related variable (Left) with a outer loop invariant (Right). 778 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) { 779 Left = Op0; 780 Right = Op1; 781 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) { 782 Left = Op1; 783 Right = Op0; 784 } 785 786 if (Left == nullptr) 787 return false; 788 789 const SCEV *S = SE->getSCEV(Right); 790 if (!SE->isLoopInvariant(S, OuterLoop)) 791 return false; 792 } 793 794 return true; 795 } 796 797 // If SV is a LCSSA PHI node with a single incoming value, return the incoming 798 // value. 799 static Value *followLCSSA(Value *SV) { 800 PHINode *PHI = dyn_cast<PHINode>(SV); 801 if (!PHI) 802 return SV; 803 804 if (PHI->getNumIncomingValues() != 1) 805 return SV; 806 return followLCSSA(PHI->getIncomingValue(0)); 807 } 808 809 // Check V's users to see if it is involved in a reduction in L. 810 static PHINode * 811 findInnerReductionPhi(Loop *L, Value *V, 812 SmallVectorImpl<Instruction *> &HasNoWrapInsts) { 813 // Reduction variables cannot be constants. 814 if (isa<Constant>(V)) 815 return nullptr; 816 817 for (Value *User : V->users()) { 818 if (PHINode *PHI = dyn_cast<PHINode>(User)) { 819 if (PHI->getNumIncomingValues() == 1) 820 continue; 821 RecurrenceDescriptor RD; 822 if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { 823 // Detect floating point reduction only when it can be reordered. 824 if (RD.getExactFPMathInst() != nullptr) 825 return nullptr; 826 827 RecurKind RK = RD.getRecurrenceKind(); 828 switch (RK) { 829 case RecurKind::Or: 830 case RecurKind::And: 831 case RecurKind::Xor: 832 case RecurKind::SMin: 833 case RecurKind::SMax: 834 case RecurKind::UMin: 835 case RecurKind::UMax: 836 case RecurKind::FAdd: 837 case RecurKind::FMul: 838 case RecurKind::FMin: 839 case RecurKind::FMax: 840 case RecurKind::FMinimum: 841 case RecurKind::FMaximum: 842 case RecurKind::FMinimumNum: 843 case RecurKind::FMaximumNum: 844 case RecurKind::FMulAdd: 845 case RecurKind::AnyOf: 846 return PHI; 847 848 // Change the order of integer addition/multiplication may change the 849 // semantics. Consider the following case: 850 // 851 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }}; 852 // int sum = 0; 853 // for (int i = 0; i < 2; i++) 854 // for (int j = 0; j < 2; j++) 855 // sum += A[j][i]; 856 // 857 // If the above loops are exchanged, the addition will cause an 858 // overflow. To prevent this, we must drop the nuw/nsw flags from the 859 // addition/multiplication instructions when we actually exchanges the 860 // loops. 861 case RecurKind::Add: 862 case RecurKind::Mul: { 863 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK); 864 SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L); 865 866 // Bail out when we fail to collect reduction instructions chain. 867 if (Ops.empty()) 868 return nullptr; 869 870 for (Instruction *I : Ops) { 871 assert(I->getOpcode() == OpCode && 872 "Expected the instruction to be the reduction operation"); 873 874 // If the instruction has nuw/nsw flags, we must drop them when the 875 // transformation is actually performed. 876 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap()) 877 HasNoWrapInsts.push_back(I); 878 } 879 return PHI; 880 } 881 882 default: 883 return nullptr; 884 } 885 } 886 return nullptr; 887 } 888 } 889 890 return nullptr; 891 } 892 893 bool LoopInterchangeLegality::findInductionAndReductions( 894 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) { 895 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 896 return false; 897 for (PHINode &PHI : L->getHeader()->phis()) { 898 InductionDescriptor ID; 899 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) 900 Inductions.push_back(&PHI); 901 else { 902 // PHIs in inner loops need to be part of a reduction in the outer loop, 903 // discovered when checking the PHIs of the outer loop earlier. 904 if (!InnerLoop) { 905 if (!OuterInnerReductions.count(&PHI)) { 906 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " 907 "across the outer loop.\n"); 908 return false; 909 } 910 } else { 911 assert(PHI.getNumIncomingValues() == 2 && 912 "Phis in loop header should have exactly 2 incoming values"); 913 // Check if we have a PHI node in the outer loop that has a reduction 914 // result from the inner loop as an incoming value. 915 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); 916 PHINode *InnerRedPhi = 917 findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions); 918 if (!InnerRedPhi || 919 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) { 920 LLVM_DEBUG( 921 dbgs() 922 << "Failed to recognize PHI as an induction or reduction.\n"); 923 return false; 924 } 925 OuterInnerReductions.insert(&PHI); 926 OuterInnerReductions.insert(InnerRedPhi); 927 } 928 } 929 } 930 return true; 931 } 932 933 // This function indicates the current limitations in the transform as a result 934 // of which we do not proceed. 935 bool LoopInterchangeLegality::currentLimitations() { 936 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 937 938 // transform currently expects the loop latches to also be the exiting 939 // blocks. 940 if (InnerLoop->getExitingBlock() != InnerLoopLatch || 941 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || 942 !isa<BranchInst>(InnerLoopLatch->getTerminator()) || 943 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) { 944 LLVM_DEBUG( 945 dbgs() << "Loops where the latch is not the exiting block are not" 946 << " supported currently.\n"); 947 ORE->emit([&]() { 948 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch", 949 OuterLoop->getStartLoc(), 950 OuterLoop->getHeader()) 951 << "Loops where the latch is not the exiting block cannot be" 952 " interchange currently."; 953 }); 954 return true; 955 } 956 957 SmallVector<PHINode *, 8> Inductions; 958 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { 959 LLVM_DEBUG( 960 dbgs() << "Only outer loops with induction or reduction PHI nodes " 961 << "are supported currently.\n"); 962 ORE->emit([&]() { 963 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", 964 OuterLoop->getStartLoc(), 965 OuterLoop->getHeader()) 966 << "Only outer loops with induction or reduction PHI nodes can be" 967 " interchanged currently."; 968 }); 969 return true; 970 } 971 972 Inductions.clear(); 973 // For multi-level loop nests, make sure that all phi nodes for inner loops 974 // at all levels can be recognized as a induction or reduction phi. Bail out 975 // if a phi node at a certain nesting level cannot be properly recognized. 976 Loop *CurLevelLoop = OuterLoop; 977 while (!CurLevelLoop->getSubLoops().empty()) { 978 // We already made sure that the loop nest is tightly nested. 979 CurLevelLoop = CurLevelLoop->getSubLoops().front(); 980 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) { 981 LLVM_DEBUG( 982 dbgs() << "Only inner loops with induction or reduction PHI nodes " 983 << "are supported currently.\n"); 984 ORE->emit([&]() { 985 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", 986 CurLevelLoop->getStartLoc(), 987 CurLevelLoop->getHeader()) 988 << "Only inner loops with induction or reduction PHI nodes can be" 989 " interchange currently."; 990 }); 991 return true; 992 } 993 } 994 995 // TODO: Triangular loops are not handled for now. 996 if (!isLoopStructureUnderstood()) { 997 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); 998 ORE->emit([&]() { 999 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", 1000 InnerLoop->getStartLoc(), 1001 InnerLoop->getHeader()) 1002 << "Inner loop structure not understood currently."; 1003 }); 1004 return true; 1005 } 1006 1007 return false; 1008 } 1009 1010 bool LoopInterchangeLegality::findInductions( 1011 Loop *L, SmallVectorImpl<PHINode *> &Inductions) { 1012 for (PHINode &PHI : L->getHeader()->phis()) { 1013 InductionDescriptor ID; 1014 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) 1015 Inductions.push_back(&PHI); 1016 } 1017 return !Inductions.empty(); 1018 } 1019 1020 // We currently only support LCSSA PHI nodes in the inner loop exit, if their 1021 // users are either reduction PHIs or PHIs outside the outer loop (which means 1022 // the we are only interested in the final value after the loop). 1023 static bool 1024 areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, 1025 SmallPtrSetImpl<PHINode *> &Reductions) { 1026 BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); 1027 for (PHINode &PHI : InnerExit->phis()) { 1028 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 1029 if (PHI.getNumIncomingValues() > 1) 1030 return false; 1031 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { 1032 PHINode *PN = dyn_cast<PHINode>(U); 1033 return !PN || 1034 (!Reductions.count(PN) && OuterL->contains(PN->getParent())); 1035 })) { 1036 return false; 1037 } 1038 } 1039 return true; 1040 } 1041 1042 // We currently support LCSSA PHI nodes in the outer loop exit, if their 1043 // incoming values do not come from the outer loop latch or if the 1044 // outer loop latch has a single predecessor. In that case, the value will 1045 // be available if both the inner and outer loop conditions are true, which 1046 // will still be true after interchanging. If we have multiple predecessor, 1047 // that may not be the case, e.g. because the outer loop latch may be executed 1048 // if the inner loop is not executed. 1049 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { 1050 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); 1051 for (PHINode &PHI : LoopNestExit->phis()) { 1052 for (Value *Incoming : PHI.incoming_values()) { 1053 Instruction *IncomingI = dyn_cast<Instruction>(Incoming); 1054 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) 1055 continue; 1056 1057 // The incoming value is defined in the outer loop latch. Currently we 1058 // only support that in case the outer loop latch has a single predecessor. 1059 // This guarantees that the outer loop latch is executed if and only if 1060 // the inner loop is executed (because tightlyNested() guarantees that the 1061 // outer loop header only branches to the inner loop or the outer loop 1062 // latch). 1063 // FIXME: We could weaken this logic and allow multiple predecessors, 1064 // if the values are produced outside the loop latch. We would need 1065 // additional logic to update the PHI nodes in the exit block as 1066 // well. 1067 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) 1068 return false; 1069 } 1070 } 1071 return true; 1072 } 1073 1074 // In case of multi-level nested loops, it may occur that lcssa phis exist in 1075 // the latch of InnerLoop, i.e., when defs of the incoming values are further 1076 // inside the loopnest. Sometimes those incoming values are not available 1077 // after interchange, since the original inner latch will become the new outer 1078 // latch which may have predecessor paths that do not include those incoming 1079 // values. 1080 // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of 1081 // multi-level loop nests. 1082 static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { 1083 if (InnerLoop->getSubLoops().empty()) 1084 return true; 1085 // If the original outer latch has only one predecessor, then values defined 1086 // further inside the looploop, e.g., in the innermost loop, will be available 1087 // at the new outer latch after interchange. 1088 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr) 1089 return true; 1090 1091 // The outer latch has more than one predecessors, i.e., the inner 1092 // exit and the inner header. 1093 // PHI nodes in the inner latch are lcssa phis where the incoming values 1094 // are defined further inside the loopnest. Check if those phis are used 1095 // in the original inner latch. If that is the case then bail out since 1096 // those incoming values may not be available at the new outer latch. 1097 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1098 for (PHINode &PHI : InnerLoopLatch->phis()) { 1099 for (auto *U : PHI.users()) { 1100 Instruction *UI = cast<Instruction>(U); 1101 if (InnerLoopLatch == UI->getParent()) 1102 return false; 1103 } 1104 } 1105 return true; 1106 } 1107 1108 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 1109 unsigned OuterLoopId, 1110 CharMatrix &DepMatrix) { 1111 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 1112 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 1113 << " and OuterLoopId = " << OuterLoopId 1114 << " due to dependence\n"); 1115 ORE->emit([&]() { 1116 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", 1117 InnerLoop->getStartLoc(), 1118 InnerLoop->getHeader()) 1119 << "Cannot interchange loops due to dependences."; 1120 }); 1121 return false; 1122 } 1123 // Check if outer and inner loop contain legal instructions only. 1124 for (auto *BB : OuterLoop->blocks()) 1125 for (Instruction &I : BB->instructionsWithoutDebug()) 1126 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 1127 // readnone functions do not prevent interchanging. 1128 if (CI->onlyWritesMemory()) 1129 continue; 1130 LLVM_DEBUG( 1131 dbgs() << "Loops with call instructions cannot be interchanged " 1132 << "safely."); 1133 ORE->emit([&]() { 1134 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", 1135 CI->getDebugLoc(), 1136 CI->getParent()) 1137 << "Cannot interchange loops due to call instruction."; 1138 }); 1139 1140 return false; 1141 } 1142 1143 if (!findInductions(InnerLoop, InnerLoopInductions)) { 1144 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n"); 1145 return false; 1146 } 1147 1148 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { 1149 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n"); 1150 ORE->emit([&]() { 1151 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI", 1152 InnerLoop->getStartLoc(), 1153 InnerLoop->getHeader()) 1154 << "Cannot interchange loops because unsupported PHI nodes found " 1155 "in inner loop latch."; 1156 }); 1157 return false; 1158 } 1159 1160 // TODO: The loops could not be interchanged due to current limitations in the 1161 // transform module. 1162 if (currentLimitations()) { 1163 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 1164 return false; 1165 } 1166 1167 // Check if the loops are tightly nested. 1168 if (!tightlyNested(OuterLoop, InnerLoop)) { 1169 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n"); 1170 ORE->emit([&]() { 1171 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", 1172 InnerLoop->getStartLoc(), 1173 InnerLoop->getHeader()) 1174 << "Cannot interchange loops because they are not tightly " 1175 "nested."; 1176 }); 1177 return false; 1178 } 1179 1180 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, 1181 OuterInnerReductions)) { 1182 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); 1183 ORE->emit([&]() { 1184 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", 1185 InnerLoop->getStartLoc(), 1186 InnerLoop->getHeader()) 1187 << "Found unsupported PHI node in loop exit."; 1188 }); 1189 return false; 1190 } 1191 1192 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { 1193 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); 1194 ORE->emit([&]() { 1195 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", 1196 OuterLoop->getStartLoc(), 1197 OuterLoop->getHeader()) 1198 << "Found unsupported PHI node in loop exit."; 1199 }); 1200 return false; 1201 } 1202 1203 return true; 1204 } 1205 1206 void CacheCostManager::computeIfUnitinialized() { 1207 if (CC.has_value()) 1208 return; 1209 1210 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n"); 1211 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI); 1212 // Obtain the loop vector returned from loop cache analysis beforehand, 1213 // and put each <Loop, index> pair into a map for constant time query 1214 // later. Indices in loop vector reprsent the optimal order of the 1215 // corresponding loop, e.g., given a loopnest with depth N, index 0 1216 // indicates the loop should be placed as the outermost loop and index N 1217 // indicates the loop should be placed as the innermost loop. 1218 // 1219 // For the old pass manager CacheCost would be null. 1220 if (*CC != nullptr) 1221 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts())) 1222 CostMap[Cost.first] = Idx; 1223 } 1224 1225 CacheCost *CacheCostManager::getCacheCost() { 1226 computeIfUnitinialized(); 1227 return CC->get(); 1228 } 1229 1230 const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() { 1231 computeIfUnitinialized(); 1232 return CostMap; 1233 } 1234 1235 int LoopInterchangeProfitability::getInstrOrderCost() { 1236 unsigned GoodOrder, BadOrder; 1237 BadOrder = GoodOrder = 0; 1238 for (BasicBlock *BB : InnerLoop->blocks()) { 1239 for (Instruction &Ins : *BB) { 1240 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1241 bool FoundInnerInduction = false; 1242 bool FoundOuterInduction = false; 1243 for (Value *Op : GEP->operands()) { 1244 // Skip operands that are not SCEV-able. 1245 if (!SE->isSCEVable(Op->getType())) 1246 continue; 1247 1248 const SCEV *OperandVal = SE->getSCEV(Op); 1249 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1250 if (!AR) 1251 continue; 1252 1253 // If we find the inner induction after an outer induction e.g. 1254 // for(int i=0;i<N;i++) 1255 // for(int j=0;j<N;j++) 1256 // A[i][j] = A[i-1][j-1]+k; 1257 // then it is a good order. 1258 if (AR->getLoop() == InnerLoop) { 1259 // We found an InnerLoop induction after OuterLoop induction. It is 1260 // a good order. 1261 FoundInnerInduction = true; 1262 if (FoundOuterInduction) { 1263 GoodOrder++; 1264 break; 1265 } 1266 } 1267 // If we find the outer induction after an inner induction e.g. 1268 // for(int i=0;i<N;i++) 1269 // for(int j=0;j<N;j++) 1270 // A[j][i] = A[j-1][i-1]+k; 1271 // then it is a bad order. 1272 if (AR->getLoop() == OuterLoop) { 1273 // We found an OuterLoop induction after InnerLoop induction. It is 1274 // a bad order. 1275 FoundOuterInduction = true; 1276 if (FoundInnerInduction) { 1277 BadOrder++; 1278 break; 1279 } 1280 } 1281 } 1282 } 1283 } 1284 } 1285 return GoodOrder - BadOrder; 1286 } 1287 1288 std::optional<bool> 1289 LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( 1290 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) { 1291 // This is the new cost model returned from loop cache analysis. 1292 // A smaller index means the loop should be placed an outer loop, and vice 1293 // versa. 1294 auto InnerLoopIt = CostMap.find(InnerLoop); 1295 if (InnerLoopIt == CostMap.end()) 1296 return std::nullopt; 1297 auto OuterLoopIt = CostMap.find(OuterLoop); 1298 if (OuterLoopIt == CostMap.end()) 1299 return std::nullopt; 1300 1301 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop)) 1302 return std::nullopt; 1303 unsigned InnerIndex = InnerLoopIt->second; 1304 unsigned OuterIndex = OuterLoopIt->second; 1305 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex 1306 << ", OuterIndex = " << OuterIndex << "\n"); 1307 assert(InnerIndex != OuterIndex && "CostMap should assign unique " 1308 "numbers to each loop"); 1309 return std::optional<bool>(InnerIndex < OuterIndex); 1310 } 1311 1312 std::optional<bool> 1313 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { 1314 // Legacy cost model: this is rough cost estimation algorithm. It counts the 1315 // good and bad order of induction variables in the instruction and allows 1316 // reordering if number of bad orders is more than good. 1317 int Cost = getInstrOrderCost(); 1318 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1319 if (Cost < 0 && Cost < LoopInterchangeCostThreshold) 1320 return std::optional<bool>(true); 1321 1322 return std::nullopt; 1323 } 1324 1325 /// Return true if we can vectorize the loop specified by \p LoopId. 1326 static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { 1327 for (const auto &Dep : DepMatrix) { 1328 char Dir = Dep[LoopId]; 1329 if (Dir != 'I' && Dir != '=') 1330 return false; 1331 } 1332 return true; 1333 } 1334 1335 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( 1336 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { 1337 // If the outer loop is not loop independent it is not profitable to move 1338 // this to inner position, since doing so would not enable inner loop 1339 // parallelism. 1340 if (!canVectorize(DepMatrix, OuterLoopId)) 1341 return false; 1342 1343 // If inner loop has dependence and outer loop is loop independent then it is 1344 // profitable to interchange to enable inner loop parallelism. 1345 if (!canVectorize(DepMatrix, InnerLoopId)) 1346 return true; 1347 1348 // If both the inner and the outer loop can be vectorized, it is necessary to 1349 // check the cost of each vectorized loop for profitability decision. At this 1350 // time we do not have a cost model to estimate them, so return nullopt. 1351 // TODO: Estimate the cost of vectorized loop when both the outer and the 1352 // inner loop can be vectorized. 1353 return std::nullopt; 1354 } 1355 1356 bool LoopInterchangeProfitability::isProfitable( 1357 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, 1358 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { 1359 // isProfitable() is structured to avoid endless loop interchange. If the 1360 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could 1361 // decide the profitability then, profitability check will stop and return the 1362 // analysis result. If it failed to determine it (e.g., cache analysis failed 1363 // to analyze the loopnest due to delinearization issues) then go ahead the 1364 // second highest priority rule (isProfitablePerInstrOrderCost by default). 1365 // Likewise, if it failed to analysis the profitability then only, the last 1366 // rule (isProfitableForVectorization by default) will decide. 1367 assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules"); 1368 std::optional<bool> shouldInterchange; 1369 for (RuleTy RT : Profitabilities) { 1370 switch (RT) { 1371 case RuleTy::PerLoopCacheAnalysis: { 1372 CacheCost *CC = CCM.getCacheCost(); 1373 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap(); 1374 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC); 1375 break; 1376 } 1377 case RuleTy::PerInstrOrderCost: 1378 shouldInterchange = isProfitablePerInstrOrderCost(); 1379 break; 1380 case RuleTy::ForVectorization: 1381 shouldInterchange = 1382 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); 1383 break; 1384 } 1385 1386 // If this rule could determine the profitability, don't call subsequent 1387 // rules. 1388 if (shouldInterchange.has_value()) 1389 break; 1390 } 1391 1392 if (!shouldInterchange.has_value()) { 1393 ORE->emit([&]() { 1394 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1395 InnerLoop->getStartLoc(), 1396 InnerLoop->getHeader()) 1397 << "Insufficient information to calculate the cost of loop for " 1398 "interchange."; 1399 }); 1400 return false; 1401 } else if (!shouldInterchange.value()) { 1402 ORE->emit([&]() { 1403 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1404 InnerLoop->getStartLoc(), 1405 InnerLoop->getHeader()) 1406 << "Interchanging loops is not considered to improve cache " 1407 "locality nor vectorization."; 1408 }); 1409 return false; 1410 } 1411 return true; 1412 } 1413 1414 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1415 Loop *InnerLoop) { 1416 for (Loop *L : *OuterLoop) 1417 if (L == InnerLoop) { 1418 OuterLoop->removeChildLoop(L); 1419 return; 1420 } 1421 llvm_unreachable("Couldn't find loop"); 1422 } 1423 1424 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the 1425 /// new inner and outer loop after interchanging: NewInner is the original 1426 /// outer loop and NewOuter is the original inner loop. 1427 /// 1428 /// Before interchanging, we have the following structure 1429 /// Outer preheader 1430 // Outer header 1431 // Inner preheader 1432 // Inner header 1433 // Inner body 1434 // Inner latch 1435 // outer bbs 1436 // Outer latch 1437 // 1438 // After interchanging: 1439 // Inner preheader 1440 // Inner header 1441 // Outer preheader 1442 // Outer header 1443 // Inner body 1444 // outer bbs 1445 // Outer latch 1446 // Inner latch 1447 void LoopInterchangeTransform::restructureLoops( 1448 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader, 1449 BasicBlock *OrigOuterPreHeader) { 1450 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1451 // The original inner loop preheader moves from the new inner loop to 1452 // the parent loop, if there is one. 1453 NewInner->removeBlockFromLoop(OrigInnerPreHeader); 1454 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent); 1455 1456 // Switch the loop levels. 1457 if (OuterLoopParent) { 1458 // Remove the loop from its parent loop. 1459 removeChildLoop(OuterLoopParent, NewInner); 1460 removeChildLoop(NewInner, NewOuter); 1461 OuterLoopParent->addChildLoop(NewOuter); 1462 } else { 1463 removeChildLoop(NewInner, NewOuter); 1464 LI->changeTopLevelLoop(NewInner, NewOuter); 1465 } 1466 while (!NewOuter->isInnermost()) 1467 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin())); 1468 NewOuter->addChildLoop(NewInner); 1469 1470 // BBs from the original inner loop. 1471 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); 1472 1473 // Add BBs from the original outer loop to the original inner loop (excluding 1474 // BBs already in inner loop) 1475 for (BasicBlock *BB : NewInner->blocks()) 1476 if (LI->getLoopFor(BB) == NewInner) 1477 NewOuter->addBlockEntry(BB); 1478 1479 // Now remove inner loop header and latch from the new inner loop and move 1480 // other BBs (the loop body) to the new inner loop. 1481 BasicBlock *OuterHeader = NewOuter->getHeader(); 1482 BasicBlock *OuterLatch = NewOuter->getLoopLatch(); 1483 for (BasicBlock *BB : OrigInnerBBs) { 1484 // Nothing will change for BBs in child loops. 1485 if (LI->getLoopFor(BB) != NewOuter) 1486 continue; 1487 // Remove the new outer loop header and latch from the new inner loop. 1488 if (BB == OuterHeader || BB == OuterLatch) 1489 NewInner->removeBlockFromLoop(BB); 1490 else 1491 LI->changeLoopFor(BB, NewInner); 1492 } 1493 1494 // The preheader of the original outer loop becomes part of the new 1495 // outer loop. 1496 NewOuter->addBlockEntry(OrigOuterPreHeader); 1497 LI->changeLoopFor(OrigOuterPreHeader, NewOuter); 1498 1499 // Tell SE that we move the loops around. 1500 SE->forgetLoop(NewOuter); 1501 } 1502 1503 bool LoopInterchangeTransform::transform( 1504 ArrayRef<Instruction *> DropNoWrapInsts) { 1505 bool Transformed = false; 1506 1507 if (InnerLoop->getSubLoops().empty()) { 1508 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1509 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); 1510 auto &InductionPHIs = LIL.getInnerLoopInductions(); 1511 if (InductionPHIs.empty()) { 1512 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1513 return false; 1514 } 1515 1516 SmallVector<Instruction *, 8> InnerIndexVarList; 1517 for (PHINode *CurInductionPHI : InductionPHIs) { 1518 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1519 InnerIndexVarList.push_back( 1520 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1))); 1521 else 1522 InnerIndexVarList.push_back( 1523 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0))); 1524 } 1525 1526 // Create a new latch block for the inner loop. We split at the 1527 // current latch's terminator and then move the condition and all 1528 // operands that are not either loop-invariant or the induction PHI into the 1529 // new latch block. 1530 BasicBlock *NewLatch = 1531 SplitBlock(InnerLoop->getLoopLatch(), 1532 InnerLoop->getLoopLatch()->getTerminator(), DT, LI); 1533 1534 SmallSetVector<Instruction *, 4> WorkList; 1535 unsigned i = 0; 1536 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() { 1537 for (; i < WorkList.size(); i++) { 1538 // Duplicate instruction and move it the new latch. Update uses that 1539 // have been moved. 1540 Instruction *NewI = WorkList[i]->clone(); 1541 NewI->insertBefore(NewLatch->getFirstNonPHIIt()); 1542 assert(!NewI->mayHaveSideEffects() && 1543 "Moving instructions with side-effects may change behavior of " 1544 "the loop nest!"); 1545 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) { 1546 Instruction *UserI = cast<Instruction>(U.getUser()); 1547 if (!InnerLoop->contains(UserI->getParent()) || 1548 UserI->getParent() == NewLatch || 1549 llvm::is_contained(InductionPHIs, UserI)) 1550 U.set(NewI); 1551 } 1552 // Add operands of moved instruction to the worklist, except if they are 1553 // outside the inner loop or are the induction PHI. 1554 for (Value *Op : WorkList[i]->operands()) { 1555 Instruction *OpI = dyn_cast<Instruction>(Op); 1556 if (!OpI || 1557 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || 1558 llvm::is_contained(InductionPHIs, OpI)) 1559 continue; 1560 WorkList.insert(OpI); 1561 } 1562 } 1563 }; 1564 1565 // FIXME: Should we interchange when we have a constant condition? 1566 Instruction *CondI = dyn_cast<Instruction>( 1567 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) 1568 ->getCondition()); 1569 if (CondI) 1570 WorkList.insert(CondI); 1571 MoveInstructions(); 1572 for (Instruction *InnerIndexVar : InnerIndexVarList) 1573 WorkList.insert(cast<Instruction>(InnerIndexVar)); 1574 MoveInstructions(); 1575 } 1576 1577 // Ensure the inner loop phi nodes have a separate basic block. 1578 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1579 if (&*InnerLoopHeader->getFirstNonPHIIt() != 1580 InnerLoopHeader->getTerminator()) { 1581 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI); 1582 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); 1583 } 1584 1585 // Instructions in the original inner loop preheader may depend on values 1586 // defined in the outer loop header. Move them there, because the original 1587 // inner loop preheader will become the entry into the interchanged loop nest. 1588 // Currently we move all instructions and rely on LICM to move invariant 1589 // instructions outside the loop nest. 1590 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1591 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1592 if (InnerLoopPreHeader != OuterLoopHeader) { 1593 for (Instruction &I : 1594 make_early_inc_range(make_range(InnerLoopPreHeader->begin(), 1595 std::prev(InnerLoopPreHeader->end())))) 1596 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator()); 1597 } 1598 1599 Transformed |= adjustLoopLinks(); 1600 if (!Transformed) { 1601 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1602 return false; 1603 } 1604 1605 // Finally, drop the nsw/nuw flags from the instructions for reduction 1606 // calculations. 1607 for (Instruction *Reduction : DropNoWrapInsts) { 1608 Reduction->setHasNoSignedWrap(false); 1609 Reduction->setHasNoUnsignedWrap(false); 1610 } 1611 1612 return true; 1613 } 1614 1615 /// \brief Move all instructions except the terminator from FromBB right before 1616 /// InsertBefore 1617 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1618 BasicBlock *ToBB = InsertBefore->getParent(); 1619 1620 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(), 1621 FromBB->getTerminator()->getIterator()); 1622 } 1623 1624 /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. 1625 static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { 1626 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them 1627 // from BB1 afterwards. 1628 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; }); 1629 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end())); 1630 for (Instruction *I : TempInstrs) 1631 I->removeFromParent(); 1632 1633 // Move instructions from BB2 to BB1. 1634 moveBBContents(BB2, BB1->getTerminator()); 1635 1636 // Move instructions from TempInstrs to BB2. 1637 for (Instruction *I : TempInstrs) 1638 I->insertBefore(BB2->getTerminator()->getIterator()); 1639 } 1640 1641 // Update BI to jump to NewBB instead of OldBB. Records updates to the 1642 // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that 1643 // \p OldBB is exactly once in BI's successor list. 1644 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, 1645 BasicBlock *NewBB, 1646 std::vector<DominatorTree::UpdateType> &DTUpdates, 1647 bool MustUpdateOnce = true) { 1648 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) && 1649 "BI must jump to OldBB exactly once."); 1650 bool Changed = false; 1651 for (Use &Op : BI->operands()) 1652 if (Op == OldBB) { 1653 Op.set(NewBB); 1654 Changed = true; 1655 } 1656 1657 if (Changed) { 1658 DTUpdates.push_back( 1659 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); 1660 DTUpdates.push_back( 1661 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); 1662 } 1663 assert(Changed && "Expected a successor to be updated"); 1664 } 1665 1666 // Move Lcssa PHIs to the right place. 1667 static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, 1668 BasicBlock *InnerLatch, BasicBlock *OuterHeader, 1669 BasicBlock *OuterLatch, BasicBlock *OuterExit, 1670 Loop *InnerLoop, LoopInfo *LI) { 1671 1672 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are 1673 // defined either in the header or latch. Those blocks will become header and 1674 // latch of the new outer loop, and the only possible users can PHI nodes 1675 // in the exit block of the loop nest or the outer loop header (reduction 1676 // PHIs, in that case, the incoming value must be defined in the inner loop 1677 // header). We can just substitute the user with the incoming value and remove 1678 // the PHI. 1679 for (PHINode &P : make_early_inc_range(InnerExit->phis())) { 1680 assert(P.getNumIncomingValues() == 1 && 1681 "Only loops with a single exit are supported!"); 1682 1683 // Incoming values are guaranteed be instructions currently. 1684 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); 1685 // In case of multi-level nested loops, follow LCSSA to find the incoming 1686 // value defined from the innermost loop. 1687 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI)); 1688 // Skip phis with incoming values from the inner loop body, excluding the 1689 // header and latch. 1690 if (IncIInnerMost->getParent() != InnerLatch && 1691 IncIInnerMost->getParent() != InnerHeader) 1692 continue; 1693 1694 assert(all_of(P.users(), 1695 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) { 1696 return (cast<PHINode>(U)->getParent() == OuterHeader && 1697 IncI->getParent() == InnerHeader) || 1698 cast<PHINode>(U)->getParent() == OuterExit; 1699 }) && 1700 "Can only replace phis iff the uses are in the loop nest exit or " 1701 "the incoming value is defined in the inner header (it will " 1702 "dominate all loop blocks after interchanging)"); 1703 P.replaceAllUsesWith(IncI); 1704 P.eraseFromParent(); 1705 } 1706 1707 SmallVector<PHINode *, 8> LcssaInnerExit( 1708 llvm::make_pointer_range(InnerExit->phis())); 1709 1710 SmallVector<PHINode *, 8> LcssaInnerLatch( 1711 llvm::make_pointer_range(InnerLatch->phis())); 1712 1713 // Lcssa PHIs for values used outside the inner loop are in InnerExit. 1714 // If a PHI node has users outside of InnerExit, it has a use outside the 1715 // interchanged loop and we have to preserve it. We move these to 1716 // InnerLatch, which will become the new exit block for the innermost 1717 // loop after interchanging. 1718 for (PHINode *P : LcssaInnerExit) 1719 P->moveBefore(InnerLatch->getFirstNonPHIIt()); 1720 1721 // If the inner loop latch contains LCSSA PHIs, those come from a child loop 1722 // and we have to move them to the new inner latch. 1723 for (PHINode *P : LcssaInnerLatch) 1724 P->moveBefore(InnerExit->getFirstNonPHIIt()); 1725 1726 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have 1727 // incoming values defined in the outer loop, we have to add a new PHI 1728 // in the inner loop latch, which became the exit block of the outer loop, 1729 // after interchanging. 1730 if (OuterExit) { 1731 for (PHINode &P : OuterExit->phis()) { 1732 if (P.getNumIncomingValues() != 1) 1733 continue; 1734 // Skip Phis with incoming values defined in the inner loop. Those should 1735 // already have been updated. 1736 auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); 1737 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) 1738 continue; 1739 1740 PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); 1741 NewPhi->setIncomingValue(0, P.getIncomingValue(0)); 1742 NewPhi->setIncomingBlock(0, OuterLatch); 1743 // We might have incoming edges from other BBs, i.e., the original outer 1744 // header. 1745 for (auto *Pred : predecessors(InnerLatch)) { 1746 if (Pred == OuterLatch) 1747 continue; 1748 NewPhi->addIncoming(P.getIncomingValue(0), Pred); 1749 } 1750 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt()); 1751 P.setIncomingValue(0, NewPhi); 1752 } 1753 } 1754 1755 // Now adjust the incoming blocks for the LCSSA PHIs. 1756 // For PHIs moved from Inner's exit block, we need to replace Inner's latch 1757 // with the new latch. 1758 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch); 1759 } 1760 1761 bool LoopInterchangeTransform::adjustLoopBranches() { 1762 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); 1763 std::vector<DominatorTree::UpdateType> DTUpdates; 1764 1765 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1766 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1767 1768 assert(OuterLoopPreHeader != OuterLoop->getHeader() && 1769 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && 1770 InnerLoopPreHeader && "Guaranteed by loop-simplify form"); 1771 // Ensure that both preheaders do not contain PHI nodes and have single 1772 // predecessors. This allows us to move them easily. We use 1773 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing 1774 // preheaders do not satisfy those conditions. 1775 if (isa<PHINode>(OuterLoopPreHeader->begin()) || 1776 !OuterLoopPreHeader->getUniquePredecessor()) 1777 OuterLoopPreHeader = 1778 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true); 1779 if (InnerLoopPreHeader == OuterLoop->getHeader()) 1780 InnerLoopPreHeader = 1781 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true); 1782 1783 // Adjust the loop preheader 1784 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1785 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1786 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1787 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1788 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1789 BasicBlock *InnerLoopLatchPredecessor = 1790 InnerLoopLatch->getUniquePredecessor(); 1791 BasicBlock *InnerLoopLatchSuccessor; 1792 BasicBlock *OuterLoopLatchSuccessor; 1793 1794 BranchInst *OuterLoopLatchBI = 1795 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1796 BranchInst *InnerLoopLatchBI = 1797 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1798 BranchInst *OuterLoopHeaderBI = 1799 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1800 BranchInst *InnerLoopHeaderBI = 1801 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1802 1803 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1804 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1805 !InnerLoopHeaderBI) 1806 return false; 1807 1808 BranchInst *InnerLoopLatchPredecessorBI = 1809 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1810 BranchInst *OuterLoopPredecessorBI = 1811 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1812 1813 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1814 return false; 1815 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1816 if (!InnerLoopHeaderSuccessor) 1817 return false; 1818 1819 // Adjust Loop Preheader and headers. 1820 // The branches in the outer loop predecessor and the outer loop header can 1821 // be unconditional branches or conditional branches with duplicates. Consider 1822 // this when updating the successors. 1823 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, 1824 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); 1825 // The outer loop header might or might not branch to the outer latch. 1826 // We are guaranteed to branch to the inner loop preheader. 1827 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) { 1828 // In this case the outerLoopHeader should branch to the InnerLoopLatch. 1829 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch, 1830 DTUpdates, 1831 /*MustUpdateOnce=*/false); 1832 } 1833 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, 1834 InnerLoopHeaderSuccessor, DTUpdates, 1835 /*MustUpdateOnce=*/false); 1836 1837 // Adjust reduction PHI's now that the incoming block has changed. 1838 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, 1839 OuterLoopHeader); 1840 1841 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, 1842 OuterLoopPreHeader, DTUpdates); 1843 1844 // -------------Adjust loop latches----------- 1845 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1846 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1847 else 1848 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1849 1850 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, 1851 InnerLoopLatchSuccessor, DTUpdates); 1852 1853 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1854 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1855 else 1856 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1857 1858 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, 1859 OuterLoopLatchSuccessor, DTUpdates); 1860 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, 1861 DTUpdates); 1862 1863 DT->applyUpdates(DTUpdates); 1864 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, 1865 OuterLoopPreHeader); 1866 1867 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, 1868 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), 1869 InnerLoop, LI); 1870 // For PHIs in the exit block of the outer loop, outer's latch has been 1871 // replaced by Inners'. 1872 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); 1873 1874 auto &OuterInnerReductions = LIL.getOuterInnerReductions(); 1875 // Now update the reduction PHIs in the inner and outer loop headers. 1876 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; 1877 for (PHINode &PHI : InnerLoopHeader->phis()) 1878 if (OuterInnerReductions.contains(&PHI)) 1879 InnerLoopPHIs.push_back(&PHI); 1880 1881 for (PHINode &PHI : OuterLoopHeader->phis()) 1882 if (OuterInnerReductions.contains(&PHI)) 1883 OuterLoopPHIs.push_back(&PHI); 1884 1885 // Now move the remaining reduction PHIs from outer to inner loop header and 1886 // vice versa. The PHI nodes must be part of a reduction across the inner and 1887 // outer loop and all the remains to do is and updating the incoming blocks. 1888 for (PHINode *PHI : OuterLoopPHIs) { 1889 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); 1890 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt()); 1891 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); 1892 } 1893 for (PHINode *PHI : InnerLoopPHIs) { 1894 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); 1895 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt()); 1896 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); 1897 } 1898 1899 // Update the incoming blocks for moved PHI nodes. 1900 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader); 1901 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch); 1902 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader); 1903 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); 1904 1905 // Values defined in the outer loop header could be used in the inner loop 1906 // latch. In that case, we need to create LCSSA phis for them, because after 1907 // interchanging they will be defined in the new inner loop and used in the 1908 // new outer loop. 1909 SmallVector<Instruction *, 4> MayNeedLCSSAPhis; 1910 for (Instruction &I : 1911 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end()))) 1912 MayNeedLCSSAPhis.push_back(&I); 1913 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE); 1914 1915 return true; 1916 } 1917 1918 bool LoopInterchangeTransform::adjustLoopLinks() { 1919 // Adjust all branches in the inner and outer loop. 1920 bool Changed = adjustLoopBranches(); 1921 if (Changed) { 1922 // We have interchanged the preheaders so we need to interchange the data in 1923 // the preheaders as well. This is because the content of the inner 1924 // preheader was previously executed inside the outer loop. 1925 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1926 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1927 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader); 1928 } 1929 return Changed; 1930 } 1931 1932 PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, 1933 LoopAnalysisManager &AM, 1934 LoopStandardAnalysisResults &AR, 1935 LPMUpdater &U) { 1936 Function &F = *LN.getParent(); 1937 SmallVector<Loop *, 8> LoopList(LN.getLoops()); 1938 1939 if (MaxMemInstrCount < 1) { 1940 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1"); 1941 return PreservedAnalyses::all(); 1942 } 1943 OptimizationRemarkEmitter ORE(&F); 1944 1945 // Ensure minimum depth of the loop nest to do the interchange. 1946 if (!hasSupportedLoopDepth(LoopList, ORE)) 1947 return PreservedAnalyses::all(); 1948 // Ensure computable loop nest. 1949 if (!isComputableLoopNest(&AR.SE, LoopList)) { 1950 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 1951 return PreservedAnalyses::all(); 1952 } 1953 1954 ORE.emit([&]() { 1955 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence", 1956 LN.getOutermostLoop().getStartLoc(), 1957 LN.getOutermostLoop().getHeader()) 1958 << "Computed dependence info, invoking the transform."; 1959 }); 1960 1961 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); 1962 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN)) 1963 return PreservedAnalyses::all(); 1964 U.markLoopNestChanged(true); 1965 return getLoopPassPreservedAnalyses(); 1966 } 1967