1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam as a routine, much like 10 // LoopUnroll.cpp implements loop unroll. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/SmallPtrSet.h" 15 #include "llvm/ADT/Statistic.h" 16 #include "llvm/Analysis/AssumptionCache.h" 17 #include "llvm/Analysis/DependenceAnalysis.h" 18 #include "llvm/Analysis/InstructionSimplify.h" 19 #include "llvm/Analysis/LoopAnalysisManager.h" 20 #include "llvm/Analysis/LoopIterator.h" 21 #include "llvm/Analysis/LoopPass.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ScalarEvolution.h" 24 #include "llvm/Analysis/Utils/Local.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/DataLayout.h" 27 #include "llvm/IR/DebugInfoMetadata.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/IntrinsicInst.h" 30 #include "llvm/IR/LLVMContext.h" 31 #include "llvm/Support/Debug.h" 32 #include "llvm/Support/raw_ostream.h" 33 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 34 #include "llvm/Transforms/Utils/Cloning.h" 35 #include "llvm/Transforms/Utils/LoopSimplify.h" 36 #include "llvm/Transforms/Utils/LoopUtils.h" 37 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 38 #include "llvm/Transforms/Utils/UnrollLoop.h" 39 using namespace llvm; 40 41 #define DEBUG_TYPE "loop-unroll-and-jam" 42 43 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed"); 44 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed"); 45 46 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; 47 48 // Partition 
blocks in an outer/inner loop pair into blocks before and after 49 // the loop 50 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 51 BasicBlockSet &ForeBlocks, 52 BasicBlockSet &SubLoopBlocks, 53 BasicBlockSet &AftBlocks, 54 DominatorTree *DT) { 55 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); 56 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); 57 58 for (BasicBlock *BB : L->blocks()) { 59 if (!SubLoop->contains(BB)) { 60 if (DT->dominates(SubLoopLatch, BB)) 61 AftBlocks.insert(BB); 62 else 63 ForeBlocks.insert(BB); 64 } 65 } 66 67 // Check that all blocks in ForeBlocks together dominate the subloop 68 // TODO: This might ideally be done better with a dominator/postdominators. 69 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader(); 70 for (BasicBlock *BB : ForeBlocks) { 71 if (BB == SubLoopPreHeader) 72 continue; 73 Instruction *TI = BB->getTerminator(); 74 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) 75 if (!ForeBlocks.count(TI->getSuccessor(i))) 76 return false; 77 } 78 79 return true; 80 } 81 82 // Looks at the phi nodes in Header for values coming from Latch. For these 83 // instructions and all their operands calls Visit on them, keeping going for 84 // all the operands in AftBlocks. Returns false if Visit returns false, 85 // otherwise returns true. This is used to process the instructions in the 86 // Aft blocks that need to be moved before the subloop. It is used in two 87 // places. One to check that the required set of instructions can be moved 88 // before the loop. Then to collect the instructions to actually move in 89 // moveHeaderPhiOperandsToForeBlocks. 
// Walk the values feeding Header's phis from Latch, calling Visit on each
// reached instruction and chasing operands only while they remain inside
// AftBlocks. Stops early (returning false) as soon as Visit rejects an
// instruction. See the comment above for the two ways this is used.
template <typename T>
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
                                     BasicBlockSet &AftBlocks, T Visit) {
  SmallVector<Instruction *, 8> Worklist;
  // Seed the worklist with the latch-incoming value of every header phi.
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (!Visit(I))
      return false;

    // Only chase operand chains while we are still inside the Aft blocks;
    // values defined elsewhere do not need to be moved.
    if (AftBlocks.count(I->getParent()))
      for (auto &U : I->operands())
        if (Instruction *II = dyn_cast<Instruction>(U))
          Worklist.push_back(II);
  }

  return true;
}

// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
                                              BasicBlock *Latch,
                                              Instruction *InsertLoc,
                                              BasicBlockSet &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  // processHeaderPhiOperands visits defs before their operands, so Visited
  // ends up in reverse program order; we iterate it reversed below.
  std::vector<Instruction *> Visited;
  processHeaderPhiOperands(Header, Latch, AftBlocks,
                           [&Visited, &AftBlocks](Instruction *I) {
                             if (AftBlocks.count(I->getParent()))
                               Visited.push_back(I);
                             return true;
                           });

  // Move all instructions in program order to before the InsertLoc
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (Instruction *I : reverse(Visited)) {
    if (I->getParent() != InsertLocBB)
      I->moveBefore(InsertLoc);
  }
}

/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and not fully unrolled).
*/
LoopUnrollResult llvm::UnrollAndJamLoop(
    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // Scale debug-location duplication factors so sample-based profiling can
  // still attribute counts to the (now duplicated) source lines.
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Record the clone in the right Fore/Sub/Aft First/Last vector so the
      // stitching code below can find the iteration boundaries.
      if (ForeBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        // Cloned assumes must be registered so the AssumptionCache stays
        // consistent with the IR.
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      // Loop-carried values now flow in straight-line from the previous
      // iteration's Fore blocks, so the phi collapses to a single incoming
      // edge.
      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    // No back edge remains: header phis take their preheader value directly.
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    // All subloop phis must live in the (single, jammed) subloop header.
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration aft block with
    // an unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DTU.applyUpdatesPermissive(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      BasicBlock *Fold = Dest->getUniquePredecessor();
      if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }
  // Apply updates to the DomTree.
  DT = &DTU.getDomTree();

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

// Scan the BBs and collect legal loads and stores into MemInstr.
// Returns false if non-simple (volatile/atomic) loads/stores are found, or if
// any other instruction may read or write memory (calls, fences, etc.).
static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Value *, 4> &MemInstr) {
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        // Anything else touching memory is conservatively rejected.
        return false;
      }
    }
  }
  return true;
}

// Check every (Earlier, Later) pair of memory accesses for dependencies that
// would make the unroll-and-jam reordering invalid. Returns true if all pairs
// are safe.
static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                              SmallVector<Value *, 4> &Later,
                              unsigned LoopDepth, bool InnerLoop,
                              DependenceInfo &DI) {
  // Use DA to check for dependencies between loads and stores that make unroll
  // and jam invalid
  for (Value *I : Earlier) {
    for (Value *J : Later) {
      Instruction *Src = cast<Instruction>(I);
      Instruction *Dst = cast<Instruction>(J);
      if (Src == Dst)
        continue;
      // Ignore Input dependencies.
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
        continue;

      // Track dependencies, and if we find them take a conservative approach
      // by allowing only = or < (not >), although some > would be safe
      // (depending upon unroll width).
      // For the inner loop, we need to disallow any (> <) dependencies
      // FIXME: Allow > so long as distance is less than unroll width
      if (auto D = DI.depends(Src, Dst, true)) {
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");

        if (D->isConfused()) {
          // DA could not classify the dependence; assume the worst.
          LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
                            << "  " << *Src << "\n"
                            << "  " << *Dst << "\n");
          return false;
        }
        if (!InnerLoop) {
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
            LLVM_DEBUG(dbgs() << "  > dependency between:\n"
                              << "  " << *Src << "\n"
                              << "  " << *Dst << "\n");
            return false;
          }
        } else {
          assert(LoopDepth + 1 <= D->getLevels());
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
            LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
                              << "  " << *Src << "\n"
                              << "  " << *Dst << "\n");
            return false;
          }
        }
      }
    }
  }
  return true;
}

// Collect the loads/stores of the Fore, SubLoop and Aft block sets and check
// every pair of sets whose relative order unroll-and-jam changes. Returns true
// if the transformation is safe from a memory-dependence standpoint.
static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
                              BasicBlockSet &SubLoopBlocks,
                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
  // Get all loads/store pairs for each blocks
  SmallVector<Value *, 4> ForeMemInstr;
  SmallVector<Value *, 4> SubLoopMemInstr;
  SmallVector<Value *, 4> AftMemInstr;
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
      !getLoadsAndStores(AftBlocks, AftMemInstr))
    return false;

  // Check for dependencies between any blocks that may change order
  unsigned LoopDepth = L->getLoopDepth();
  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
                           DI);
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
        |
    ForeFirst    <----\    }
     Blocks           |    } ForeBlocks
    ForeLast          |    }
        |             |
    SubLoopFirst  <\  |    }
     Blocks        |  |    } SubLoopBlocks
    SubLoopLast   -/  |    }
        |             |
    AftFirst          |    }
     Blocks           |    } AftBlocks
    AftLast     ------/    }
        |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
    and AftBlocks, providing that there is one edge from Fores to SubLoops,
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
    In practice we currently limit Aft blocks to a single block, and limit
    things further in the profitability checks of the unroll and jam pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks on all unrolled iterations are safe to move before the
    SubLoop blocks of all iterations. So we require that the phi node looping
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
    correctly.

    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Require a simplified outer loop containing exactly one inner loop.
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  // Both loops must exit from their latch block.
  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
    return false;
  }

  // Split blocks into Fore/SubLoop/Aft based on dominators
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
    return false;
  }

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                         "multiple blocks after the loop\n");
    return false;
  }

  // Check inner loop backedge count is consistent on all iterations of the
  // outer loop
  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                         "not consistent on each iteration\n");
    return false;
  }

  // Check the loop safety info for exceptions.
  SimpleLoopSafetyInfo LSI;
  LSI.computeLoopSafetyInfo(L);
  if (LSI.anyBlockMayThrow()) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
    return false;
  }

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
                         "instructions after subloop to before it\n");
    return false;
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
    return false;
  }

  return true;
}