1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam as a routine, much like 10 // LoopUnroll.cpp implements loop unroll. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallPtrSet.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/ADT/iterator_range.h" 23 #include "llvm/Analysis/AssumptionCache.h" 24 #include "llvm/Analysis/DependenceAnalysis.h" 25 #include "llvm/Analysis/DomTreeUpdater.h" 26 #include "llvm/Analysis/LoopInfo.h" 27 #include "llvm/Analysis/LoopIterator.h" 28 #include "llvm/Analysis/MustExecute.h" 29 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 30 #include "llvm/Analysis/ScalarEvolution.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/DebugInfoMetadata.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Dominators.h" 36 #include "llvm/IR/Function.h" 37 #include "llvm/IR/Instruction.h" 38 #include "llvm/IR/Instructions.h" 39 #include "llvm/IR/IntrinsicInst.h" 40 #include "llvm/IR/User.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/IR/ValueHandle.h" 43 #include "llvm/IR/ValueMap.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/GenericDomTree.h" 48 #include "llvm/Support/raw_ostream.h" 49 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 50 #include "llvm/Transforms/Utils/Cloning.h" 51 #include "llvm/Transforms/Utils/LoopUtils.h" 52 #include "llvm/Transforms/Utils/UnrollLoop.h" 53 #include "llvm/Transforms/Utils/ValueMapper.h" 54 #include <assert.h> 55 #include <memory> 56 #include <type_traits> 57 #include <vector> 58 59 using namespace llvm; 60 61 #define DEBUG_TYPE "loop-unroll-and-jam" 62 63 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed"); 64 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed"); 65 66 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; 67 68 // Partition blocks in an outer/inner loop pair into blocks before and after 69 // the loop 70 static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks, 71 BasicBlockSet &AftBlocks, DominatorTree &DT) { 72 Loop *SubLoop = L.getSubLoops()[0]; 73 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); 74 75 for (BasicBlock *BB : L.blocks()) { 76 if (!SubLoop->contains(BB)) { 77 if (DT.dominates(SubLoopLatch, BB)) 78 AftBlocks.insert(BB); 79 else 80 ForeBlocks.insert(BB); 81 } 82 } 83 84 // Check that all blocks in ForeBlocks together dominate the subloop 85 // TODO: This might ideally be done better with a dominator/postdominators. 86 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader(); 87 for (BasicBlock *BB : ForeBlocks) { 88 if (BB == SubLoopPreHeader) 89 continue; 90 Instruction *TI = BB->getTerminator(); 91 for (BasicBlock *Succ : successors(TI)) 92 if (!ForeBlocks.count(Succ)) 93 return false; 94 } 95 96 return true; 97 } 98 99 /// Partition blocks in a loop nest into blocks before and after each inner 100 /// loop. 101 static bool partitionOuterLoopBlocks( 102 Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, 103 DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, 104 DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) { 105 JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); 106 107 for (Loop *L : Root.getLoopsInPreorder()) { 108 if (L == &JamLoop) 109 break; 110 111 if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) 112 return false; 113 } 114 115 return true; 116 } 117 118 // TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more 119 // than 2 levels loop. 120 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 121 BasicBlockSet &ForeBlocks, 122 BasicBlockSet &SubLoopBlocks, 123 BasicBlockSet &AftBlocks, 124 DominatorTree *DT) { 125 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); 126 return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); 127 } 128 129 // Looks at the phi nodes in Header for values coming from Latch. For these 130 // instructions and all their operands calls Visit on them, keeping going for 131 // all the operands in AftBlocks. Returns false if Visit returns false, 132 // otherwise returns true. This is used to process the instructions in the 133 // Aft blocks that need to be moved before the subloop. It is used in two 134 // places. One to check that the required set of instructions can be moved 135 // before the loop. Then to collect the instructions to actually move in 136 // moveHeaderPhiOperandsToForeBlocks. 137 template <typename T> 138 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, 139 BasicBlockSet &AftBlocks, T Visit) { 140 SmallPtrSet<Instruction *, 8> VisitedInstr; 141 142 std::function<bool(Instruction * I)> ProcessInstr = [&](Instruction *I) { 143 if (VisitedInstr.count(I)) 144 return true; 145 146 VisitedInstr.insert(I); 147 148 if (AftBlocks.count(I->getParent())) 149 for (auto &U : I->operands()) 150 if (Instruction *II = dyn_cast<Instruction>(U)) 151 if (!ProcessInstr(II)) 152 return false; 153 154 return Visit(I); 155 }; 156 157 for (auto &Phi : Header->phis()) { 158 Value *V = Phi.getIncomingValueForBlock(Latch); 159 if (Instruction *I = dyn_cast<Instruction>(V)) 160 if (!ProcessInstr(I)) 161 return false; 162 } 163 164 return true; 165 } 166 167 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. 168 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, 169 BasicBlock *Latch, 170 Instruction *InsertLoc, 171 BasicBlockSet &AftBlocks) { 172 // We need to ensure we move the instructions in the correct order, 173 // starting with the earliest required instruction and moving forward. 174 processHeaderPhiOperands(Header, Latch, AftBlocks, 175 [&AftBlocks, &InsertLoc](Instruction *I) { 176 if (AftBlocks.count(I->getParent())) 177 I->moveBefore(InsertLoc); 178 return true; 179 }); 180 } 181 182 /* 183 This method performs Unroll and Jam. For a simple loop like: 184 for (i = ..) 185 Fore(i) 186 for (j = ..) 187 SubLoop(i, j) 188 Aft(i) 189 190 Instead of doing normal inner or outer unrolling, we do: 191 for (i = .., i+=2) 192 Fore(i) 193 Fore(i+1) 194 for (j = ..) 195 SubLoop(i, j) 196 SubLoop(i+1, j) 197 Aft(i) 198 Aft(i+1) 199 200 So the outer loop is essetially unrolled and then the inner loops are fused 201 ("jammed") together into a single loop. This can increase speed when there 202 are loads in SubLoop that are invariant to i, as they become shared between 203 the now jammed inner loops. 204 205 We do this by spliting the blocks in the loop into Fore, Subloop and Aft. 206 Fore blocks are those before the inner loop, Aft are those after. Normal 207 Unroll code is used to copy each of these sets of blocks and the results are 208 combined together into the final form above. 209 210 isSafeToUnrollAndJam should be used prior to calling this to make sure the 211 unrolling will be valid. Checking profitablility is also advisable. 212 213 If EpilogueLoop is non-null, it receives the epilogue loop (if it was 214 necessary to create one and not fully unrolled). 215 */ 216 LoopUnrollResult 217 llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, 218 unsigned TripMultiple, bool UnrollRemainder, 219 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 220 AssumptionCache *AC, const TargetTransformInfo *TTI, 221 OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { 222 223 // When we enter here we should have already checked that it is safe 224 BasicBlock *Header = L->getHeader(); 225 assert(Header && "No header."); 226 assert(L->getSubLoops().size() == 1); 227 Loop *SubLoop = *L->begin(); 228 229 // Don't enter the unroll code if there is nothing to do. 230 if (TripCount == 0 && Count < 2) { 231 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n"); 232 return LoopUnrollResult::Unmodified; 233 } 234 235 assert(Count > 0); 236 assert(TripMultiple > 0); 237 assert(TripCount == 0 || TripCount % TripMultiple == 0); 238 239 // Are we eliminating the loop control altogether? 240 bool CompletelyUnroll = (Count == TripCount); 241 242 // We use the runtime remainder in cases where we don't know trip multiple 243 if (TripMultiple % Count != 0) { 244 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, 245 /*UseEpilogRemainder*/ true, 246 UnrollRemainder, /*ForgetAllSCEV*/ false, 247 LI, SE, DT, AC, TTI, true, EpilogueLoop)) { 248 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " 249 "generated when assuming runtime trip count\n"); 250 return LoopUnrollResult::Unmodified; 251 } 252 } 253 254 // Notify ScalarEvolution that the loop will be substantially changed, 255 // if not outright eliminated. 256 if (SE) { 257 SE->forgetLoop(L); 258 SE->forgetBlockAndLoopDispositions(); 259 } 260 261 using namespace ore; 262 // Report the unrolling decision. 263 if (CompletelyUnroll) { 264 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %" 265 << Header->getName() << " with trip count " << TripCount 266 << "!\n"); 267 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(), 268 L->getHeader()) 269 << "completely unroll and jammed loop with " 270 << NV("UnrollCount", TripCount) << " iterations"); 271 } else { 272 auto DiagBuilder = [&]() { 273 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(), 274 L->getHeader()); 275 return Diag << "unroll and jammed loop by a factor of " 276 << NV("UnrollCount", Count); 277 }; 278 279 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName() 280 << " by " << Count); 281 if (TripMultiple != 1) { 282 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); 283 ORE->emit([&]() { 284 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple) 285 << " trips per branch"; 286 }); 287 } else { 288 LLVM_DEBUG(dbgs() << " with run-time trip count"); 289 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; }); 290 } 291 LLVM_DEBUG(dbgs() << "!\n"); 292 } 293 294 BasicBlock *Preheader = L->getLoopPreheader(); 295 BasicBlock *LatchBlock = L->getLoopLatch(); 296 assert(Preheader && "No preheader"); 297 assert(LatchBlock && "No latch block"); 298 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); 299 assert(BI && !BI->isUnconditional()); 300 bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); 301 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); 302 bool SubLoopContinueOnTrue = SubLoop->contains( 303 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0)); 304 305 // Partition blocks in an outer/inner loop pair into blocks before and after 306 // the loop 307 BasicBlockSet SubLoopBlocks; 308 BasicBlockSet ForeBlocks; 309 BasicBlockSet AftBlocks; 310 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks, 311 DT); 312 313 // We keep track of the entering/first and exiting/last block of each of 314 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of 315 // blocks easier. 316 std::vector<BasicBlock *> ForeBlocksFirst; 317 std::vector<BasicBlock *> ForeBlocksLast; 318 std::vector<BasicBlock *> SubLoopBlocksFirst; 319 std::vector<BasicBlock *> SubLoopBlocksLast; 320 std::vector<BasicBlock *> AftBlocksFirst; 321 std::vector<BasicBlock *> AftBlocksLast; 322 ForeBlocksFirst.push_back(Header); 323 ForeBlocksLast.push_back(SubLoop->getLoopPreheader()); 324 SubLoopBlocksFirst.push_back(SubLoop->getHeader()); 325 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock()); 326 AftBlocksFirst.push_back(SubLoop->getExitBlock()); 327 AftBlocksLast.push_back(L->getExitingBlock()); 328 // Maps Blocks[0] -> Blocks[It] 329 ValueToValueMapTy LastValueMap; 330 331 // Move any instructions from fore phi operands from AftBlocks into Fore. 332 moveHeaderPhiOperandsToForeBlocks( 333 Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks); 334 335 // The current on-the-fly SSA update requires blocks to be processed in 336 // reverse postorder so that LastValueMap contains the correct value at each 337 // exit. 338 LoopBlocksDFS DFS(L); 339 DFS.perform(LI); 340 // Stash the DFS iterators before adding blocks to the loop. 341 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); 342 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); 343 344 // When a FSDiscriminator is enabled, we don't need to add the multiply 345 // factors to the discriminators. 346 if (Header->getParent()->shouldEmitDebugInfoForProfiling() && 347 !EnableFSDiscriminator) 348 for (BasicBlock *BB : L->getBlocks()) 349 for (Instruction &I : *BB) 350 if (!I.isDebugOrPseudoInst()) 351 if (const DILocation *DIL = I.getDebugLoc()) { 352 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count); 353 if (NewDIL) 354 I.setDebugLoc(*NewDIL); 355 else 356 LLVM_DEBUG(dbgs() 357 << "Failed to create new discriminator: " 358 << DIL->getFilename() << " Line: " << DIL->getLine()); 359 } 360 361 // Copy all blocks 362 for (unsigned It = 1; It != Count; ++It) { 363 SmallVector<BasicBlock *, 8> NewBlocks; 364 // Maps Blocks[It] -> Blocks[It-1] 365 DenseMap<Value *, Value *> PrevItValueMap; 366 SmallDenseMap<const Loop *, Loop *, 4> NewLoops; 367 NewLoops[L] = L; 368 NewLoops[SubLoop] = SubLoop; 369 370 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { 371 ValueToValueMapTy VMap; 372 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); 373 Header->getParent()->insert(Header->getParent()->end(), New); 374 375 // Tell LI about New. 376 addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); 377 378 if (ForeBlocks.count(*BB)) { 379 if (*BB == ForeBlocksFirst[0]) 380 ForeBlocksFirst.push_back(New); 381 if (*BB == ForeBlocksLast[0]) 382 ForeBlocksLast.push_back(New); 383 } else if (SubLoopBlocks.count(*BB)) { 384 if (*BB == SubLoopBlocksFirst[0]) 385 SubLoopBlocksFirst.push_back(New); 386 if (*BB == SubLoopBlocksLast[0]) 387 SubLoopBlocksLast.push_back(New); 388 } else if (AftBlocks.count(*BB)) { 389 if (*BB == AftBlocksFirst[0]) 390 AftBlocksFirst.push_back(New); 391 if (*BB == AftBlocksLast[0]) 392 AftBlocksLast.push_back(New); 393 } else { 394 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft"); 395 } 396 397 // Update our running maps of newest clones 398 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]); 399 LastValueMap[*BB] = New; 400 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); 401 VI != VE; ++VI) { 402 PrevItValueMap[VI->second] = 403 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]); 404 LastValueMap[VI->first] = VI->second; 405 } 406 407 NewBlocks.push_back(New); 408 409 // Update DomTree: 410 if (*BB == ForeBlocksFirst[0]) 411 DT->addNewBlock(New, ForeBlocksLast[It - 1]); 412 else if (*BB == SubLoopBlocksFirst[0]) 413 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]); 414 else if (*BB == AftBlocksFirst[0]) 415 DT->addNewBlock(New, AftBlocksLast[It - 1]); 416 else { 417 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree 418 // structure. 419 auto BBDomNode = DT->getNode(*BB); 420 auto BBIDom = BBDomNode->getIDom(); 421 BasicBlock *OriginalBBIDom = BBIDom->getBlock(); 422 assert(OriginalBBIDom); 423 assert(LastValueMap[cast<Value>(OriginalBBIDom)]); 424 DT->addNewBlock( 425 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)])); 426 } 427 } 428 429 // Remap all instructions in the most recent iteration 430 remapInstructionsInBlocks(NewBlocks, LastValueMap); 431 for (BasicBlock *NewBlock : NewBlocks) { 432 for (Instruction &I : *NewBlock) { 433 if (auto *II = dyn_cast<AssumeInst>(&I)) 434 AC->registerAssumption(II); 435 } 436 } 437 438 // Alter the ForeBlocks phi's, pointing them at the latest version of the 439 // value from the previous iteration's phis 440 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) { 441 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]); 442 assert(OldValue && "should have incoming edge from Aft[It]"); 443 Value *NewValue = OldValue; 444 if (Value *PrevValue = PrevItValueMap[OldValue]) 445 NewValue = PrevValue; 446 447 assert(Phi.getNumOperands() == 2); 448 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]); 449 Phi.setIncomingValue(0, NewValue); 450 Phi.removeIncomingValue(1); 451 } 452 } 453 454 // Now that all the basic blocks for the unrolled iterations are in place, 455 // finish up connecting the blocks and phi nodes. At this point LastValueMap 456 // is the last unrolled iterations values. 457 458 // Update Phis in BB from OldBB to point to NewBB and use the latest value 459 // from LastValueMap 460 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB, 461 BasicBlock *NewBB, 462 ValueToValueMapTy &LastValueMap) { 463 for (PHINode &Phi : BB->phis()) { 464 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) { 465 if (Phi.getIncomingBlock(b) == OldBB) { 466 Value *OldValue = Phi.getIncomingValue(b); 467 if (Value *LastValue = LastValueMap[OldValue]) 468 Phi.setIncomingValue(b, LastValue); 469 Phi.setIncomingBlock(b, NewBB); 470 break; 471 } 472 } 473 } 474 }; 475 // Move all the phis from Src into Dest 476 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) { 477 Instruction *insertPoint = Dest->getFirstNonPHI(); 478 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin())) 479 Phi->moveBefore(insertPoint); 480 }; 481 482 // Update the PHI values outside the loop to point to the last block 483 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(), 484 LastValueMap); 485 486 // Update ForeBlocks successors and phi nodes 487 BranchInst *ForeTerm = 488 cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); 489 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); 490 ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]); 491 492 if (CompletelyUnroll) { 493 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { 494 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader)); 495 Phi->eraseFromParent(); 496 } 497 } else { 498 // Update the PHI values to point to the last aft block 499 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0], 500 AftBlocksLast.back(), LastValueMap); 501 } 502 503 for (unsigned It = 1; It != Count; It++) { 504 // Remap ForeBlock successors from previous iteration to this 505 BranchInst *ForeTerm = 506 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); 507 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); 508 ForeTerm->setSuccessor(0, ForeBlocksFirst[It]); 509 } 510 511 // Subloop successors and phis 512 BranchInst *SubTerm = 513 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator()); 514 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]); 515 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]); 516 SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0], 517 ForeBlocksLast.back()); 518 SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0], 519 SubLoopBlocksLast.back()); 520 521 for (unsigned It = 1; It != Count; It++) { 522 // Replace the conditional branch of the previous iteration subloop with an 523 // unconditional one to this one 524 BranchInst *SubTerm = 525 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator()); 526 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm); 527 SubTerm->eraseFromParent(); 528 529 SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It], 530 ForeBlocksLast.back()); 531 SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It], 532 SubLoopBlocksLast.back()); 533 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]); 534 } 535 536 // Aft blocks successors and phis 537 BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); 538 if (CompletelyUnroll) { 539 BranchInst::Create(LoopExit, AftTerm); 540 AftTerm->eraseFromParent(); 541 } else { 542 AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); 543 assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit && 544 "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit"); 545 } 546 AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0], 547 SubLoopBlocksLast.back()); 548 549 for (unsigned It = 1; It != Count; It++) { 550 // Replace the conditional branch of the previous iteration subloop with an 551 // unconditional one to this one 552 BranchInst *AftTerm = 553 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator()); 554 BranchInst::Create(AftBlocksFirst[It], AftTerm); 555 AftTerm->eraseFromParent(); 556 557 AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It], 558 SubLoopBlocksLast.back()); 559 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); 560 } 561 562 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 563 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the 564 // new ones required. 565 if (Count != 1) { 566 SmallVector<DominatorTree::UpdateType, 4> DTUpdates; 567 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0], 568 SubLoopBlocksFirst[0]); 569 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, 570 SubLoopBlocksLast[0], AftBlocksFirst[0]); 571 572 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 573 ForeBlocksLast.back(), SubLoopBlocksFirst[0]); 574 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, 575 SubLoopBlocksLast.back(), AftBlocksFirst[0]); 576 DTU.applyUpdatesPermissive(DTUpdates); 577 } 578 579 // Merge adjacent basic blocks, if possible. 580 SmallPtrSet<BasicBlock *, 16> MergeBlocks; 581 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); 582 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); 583 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); 584 585 MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI); 586 587 // Apply updates to the DomTree. 588 DT = &DTU.getDomTree(); 589 590 // At this point, the code is well formed. We now do a quick sweep over the 591 // inserted code, doing constant propagation and dead code elimination as we 592 // go. 593 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI); 594 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC, 595 TTI); 596 597 NumCompletelyUnrolledAndJammed += CompletelyUnroll; 598 ++NumUnrolledAndJammed; 599 600 // Update LoopInfo if the loop is completely removed. 601 if (CompletelyUnroll) 602 LI->erase(L); 603 604 #ifndef NDEBUG 605 // We shouldn't have done anything to break loop simplify form or LCSSA. 606 Loop *OutestLoop = SubLoop->getParentLoop() 607 ? SubLoop->getParentLoop()->getParentLoop() 608 ? SubLoop->getParentLoop()->getParentLoop() 609 : SubLoop->getParentLoop() 610 : SubLoop; 611 assert(DT->verify()); 612 LI->verify(*DT); 613 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); 614 if (!CompletelyUnroll) 615 assert(L->isLoopSimplifyForm()); 616 assert(SubLoop->isLoopSimplifyForm()); 617 SE->verify(); 618 #endif 619 620 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled 621 : LoopUnrollResult::PartiallyUnrolled; 622 } 623 624 static bool getLoadsAndStores(BasicBlockSet &Blocks, 625 SmallVector<Instruction *, 4> &MemInstr) { 626 // Scan the BBs and collect legal loads and stores. 627 // Returns false if non-simple loads/stores are found. 628 for (BasicBlock *BB : Blocks) { 629 for (Instruction &I : *BB) { 630 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 631 if (!Ld->isSimple()) 632 return false; 633 MemInstr.push_back(&I); 634 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 635 if (!St->isSimple()) 636 return false; 637 MemInstr.push_back(&I); 638 } else if (I.mayReadOrWriteMemory()) { 639 return false; 640 } 641 } 642 } 643 return true; 644 } 645 646 static bool preservesForwardDependence(Instruction *Src, Instruction *Dst, 647 unsigned UnrollLevel, unsigned JamLevel, 648 bool Sequentialized, Dependence *D) { 649 // UnrollLevel might carry the dependency Src --> Dst 650 // Does a different loop after unrolling? 651 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; 652 ++CurLoopDepth) { 653 auto JammedDir = D->getDirection(CurLoopDepth); 654 if (JammedDir == Dependence::DVEntry::LT) 655 return true; 656 657 if (JammedDir & Dependence::DVEntry::GT) 658 return false; 659 } 660 661 return true; 662 } 663 664 static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst, 665 unsigned UnrollLevel, unsigned JamLevel, 666 bool Sequentialized, Dependence *D) { 667 // UnrollLevel might carry the dependency Dst --> Src 668 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; 669 ++CurLoopDepth) { 670 auto JammedDir = D->getDirection(CurLoopDepth); 671 if (JammedDir == Dependence::DVEntry::GT) 672 return true; 673 674 if (JammedDir & Dependence::DVEntry::LT) 675 return false; 676 } 677 678 // Backward dependencies are only preserved if not interleaved. 679 return Sequentialized; 680 } 681 682 // Check whether it is semantically safe Src and Dst considering any potential 683 // dependency between them. 684 // 685 // @param UnrollLevel The level of the loop being unrolled 686 // @param JamLevel The level of the loop being jammed; if Src and Dst are on 687 // different levels, the outermost common loop counts as jammed level 688 // 689 // @return true if is safe and false if there is a dependency violation. 690 static bool checkDependency(Instruction *Src, Instruction *Dst, 691 unsigned UnrollLevel, unsigned JamLevel, 692 bool Sequentialized, DependenceInfo &DI) { 693 assert(UnrollLevel <= JamLevel && 694 "Expecting JamLevel to be at least UnrollLevel"); 695 696 if (Src == Dst) 697 return true; 698 // Ignore Input dependencies. 699 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 700 return true; 701 702 // Check whether unroll-and-jam may violate a dependency. 703 // By construction, every dependency will be lexicographically non-negative 704 // (if it was, it would violate the current execution order), such as 705 // (0,0,>,*,*) 706 // Unroll-and-jam changes the GT execution of two executions to the same 707 // iteration of the chosen unroll level. That is, a GT dependence becomes a GE 708 // dependence (or EQ, if we fully unrolled the loop) at the loop's position: 709 // (0,0,>=,*,*) 710 // Now, the dependency is not necessarily non-negative anymore, i.e. 711 // unroll-and-jam may violate correctness. 712 std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true); 713 if (!D) 714 return true; 715 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 716 717 if (D->isConfused()) { 718 LLVM_DEBUG(dbgs() << " Confused dependency between:\n" 719 << " " << *Src << "\n" 720 << " " << *Dst << "\n"); 721 return false; 722 } 723 724 // If outer levels (levels enclosing the loop being unroll-and-jammed) have a 725 // non-equal direction, then the locations accessed in the inner levels cannot 726 // overlap in memory. We assumes the indexes never overlap into neighboring 727 // dimensions. 728 for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth) 729 if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ)) 730 return true; 731 732 auto UnrollDirection = D->getDirection(UnrollLevel); 733 734 // If the distance carried by the unrolled loop is 0, then after unrolling 735 // that distance will become non-zero resulting in non-overlapping accesses in 736 // the inner loops. 737 if (UnrollDirection == Dependence::DVEntry::EQ) 738 return true; 739 740 if (UnrollDirection & Dependence::DVEntry::LT && 741 !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel, 742 Sequentialized, D.get())) 743 return false; 744 745 if (UnrollDirection & Dependence::DVEntry::GT && 746 !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel, 747 Sequentialized, D.get())) 748 return false; 749 750 return true; 751 } 752 753 static bool 754 checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks, 755 const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, 756 const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, 757 DependenceInfo &DI, LoopInfo &LI) { 758 SmallVector<BasicBlockSet, 8> AllBlocks; 759 for (Loop *L : Root.getLoopsInPreorder()) 760 if (ForeBlocksMap.contains(L)) 761 AllBlocks.push_back(ForeBlocksMap.lookup(L)); 762 AllBlocks.push_back(SubLoopBlocks); 763 for (Loop *L : Root.getLoopsInPreorder()) 764 if (AftBlocksMap.contains(L)) 765 AllBlocks.push_back(AftBlocksMap.lookup(L)); 766 767 unsigned LoopDepth = Root.getLoopDepth(); 768 SmallVector<Instruction *, 4> EarlierLoadsAndStores; 769 SmallVector<Instruction *, 4> CurrentLoadsAndStores; 770 for (BasicBlockSet &Blocks : AllBlocks) { 771 CurrentLoadsAndStores.clear(); 772 if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores)) 773 return false; 774 775 Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent()); 776 unsigned CurLoopDepth = CurLoop->getLoopDepth(); 777 778 for (auto *Earlier : EarlierLoadsAndStores) { 779 Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent()); 780 unsigned EarlierDepth = EarlierLoop->getLoopDepth(); 781 unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth); 782 for (auto *Later : CurrentLoadsAndStores) { 783 if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false, 784 DI)) 785 return false; 786 } 787 } 788 789 size_t NumInsts = CurrentLoadsAndStores.size(); 790 for (size_t I = 0; I < NumInsts; ++I) { 791 for (size_t J = I; J < NumInsts; ++J) { 792 if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J], 793 LoopDepth, CurLoopDepth, true, DI)) 794 return false; 795 } 796 } 797 798 EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(), 799 CurrentLoadsAndStores.end()); 800 } 801 return true; 802 } 803 804 static bool isEligibleLoopForm(const Loop &Root) { 805 // Root must have a child. 806 if (Root.getSubLoops().size() != 1) 807 return false; 808 809 const Loop *L = &Root; 810 do { 811 // All loops in Root need to be in simplify and rotated form. 812 if (!L->isLoopSimplifyForm()) 813 return false; 814 815 if (!L->isRotatedForm()) 816 return false; 817 818 if (L->getHeader()->hasAddressTaken()) { 819 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); 820 return false; 821 } 822 823 unsigned SubLoopsSize = L->getSubLoops().size(); 824 if (SubLoopsSize == 0) 825 return true; 826 827 // Only one child is allowed. 828 if (SubLoopsSize != 1) 829 return false; 830 831 // Only loops with a single exit block can be unrolled and jammed. 832 // The function getExitBlock() is used for this check, rather than 833 // getUniqueExitBlock() to ensure loops with mulitple exit edges are 834 // disallowed. 835 if (!L->getExitBlock()) { 836 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit " 837 "blocks can be unrolled and jammed.\n"); 838 return false; 839 } 840 841 // Only loops with a single exiting block can be unrolled and jammed. 842 if (!L->getExitingBlock()) { 843 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single " 844 "exiting blocks can be unrolled and jammed.\n"); 845 return false; 846 } 847 848 L = L->getSubLoops()[0]; 849 } while (L); 850 851 return true; 852 } 853 854 static Loop *getInnerMostLoop(Loop *L) { 855 while (!L->getSubLoops().empty()) 856 L = L->getSubLoops()[0]; 857 return L; 858 } 859 860 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, 861 DependenceInfo &DI, LoopInfo &LI) { 862 if (!isEligibleLoopForm(*L)) { 863 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n"); 864 return false; 865 } 866 867 /* We currently handle outer loops like this: 868 | 869 ForeFirst <------\ } 870 Blocks | } ForeBlocks of L 871 ForeLast | } 872 | | 873 ... | 874 | | 875 ForeFirst <----\ | } 876 Blocks | | } ForeBlocks of a inner loop of L 877 ForeLast | | } 878 | | | 879 JamLoopFirst <\ | | } 880 Blocks | | | } JamLoopBlocks of the innermost loop 881 JamLoopLast -/ | | } 882 | | | 883 AftFirst | | } 884 Blocks | | } AftBlocks of a inner loop of L 885 AftLast ------/ | } 886 | | 887 ... | 888 | | 889 AftFirst | } 890 Blocks | } AftBlocks of L 891 AftLast --------/ } 892 | 893 894 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks 895 and AftBlocks, providing that there is one edge from Fores to SubLoops, 896 one edge from SubLoops to Afts and a single outer loop exit (from Afts). 897 In practice we currently limit Aft blocks to a single block, and limit 898 things further in the profitablility checks of the unroll and jam pass. 899 900 Because of the way we rearrange basic blocks, we also require that 901 the Fore blocks of L on all unrolled iterations are safe to move before the 902 blocks of the direct child of L of all iterations. So we require that the 903 phi node looping operands of ForeHeader can be moved to at least the end of 904 ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and 905 match up Phi's correctly. 906 907 i.e. The old order of blocks used to be 908 (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2. 909 It needs to be safe to transform this to 910 (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2. 911 912 There are then a number of checks along the lines of no calls, no 913 exceptions, inner loop IV is consistent, etc. Note that for loops requiring 914 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in 915 UnrollAndJamLoop if the trip count cannot be easily calculated. 916 */ 917 918 // Split blocks into Fore/SubLoop/Aft based on dominators 919 Loop *JamLoop = getInnerMostLoop(L); 920 BasicBlockSet SubLoopBlocks; 921 DenseMap<Loop *, BasicBlockSet> ForeBlocksMap; 922 DenseMap<Loop *, BasicBlockSet> AftBlocksMap; 923 if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap, 924 AftBlocksMap, DT)) { 925 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); 926 return false; 927 } 928 929 // Aft blocks may need to move instructions to fore blocks, which becomes more 930 // difficult if there are multiple (potentially conditionally executed) 931 // blocks. For now we just exclude loops with multiple aft blocks. 932 if (AftBlocksMap[L].size() != 1) { 933 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " 934 "multiple blocks after the loop\n"); 935 return false; 936 } 937 938 // Check inner loop backedge count is consistent on all iterations of the 939 // outer loop 940 if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) { 941 return !hasIterationCountInvariantInParent(SubLoop, SE); 942 })) { 943 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " 944 "not consistent on each iteration\n"); 945 return false; 946 } 947 948 // Check the loop safety info for exceptions. 949 SimpleLoopSafetyInfo LSI; 950 LSI.computeLoopSafetyInfo(L); 951 if (LSI.anyBlockMayThrow()) { 952 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n"); 953 return false; 954 } 955 956 // We've ruled out the easy stuff and now need to check that there are no 957 // interdependencies which may prevent us from moving the: 958 // ForeBlocks before Subloop and AftBlocks. 959 // Subloop before AftBlocks. 960 // ForeBlock phi operands before the subloop 961 962 // Make sure we can move all instructions we need to before the subloop 963 BasicBlock *Header = L->getHeader(); 964 BasicBlock *Latch = L->getLoopLatch(); 965 BasicBlockSet AftBlocks = AftBlocksMap[L]; 966 Loop *SubLoop = L->getSubLoops()[0]; 967 if (!processHeaderPhiOperands( 968 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { 969 if (SubLoop->contains(I->getParent())) 970 return false; 971 if (AftBlocks.count(I->getParent())) { 972 // If we hit a phi node in afts we know we are done (probably 973 // LCSSA) 974 if (isa<PHINode>(I)) 975 return false; 976 // Can't move instructions with side effects or memory 977 // reads/writes 978 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()) 979 return false; 980 } 981 // Keep going 982 return true; 983 })) { 984 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required " 985 "instructions after subloop to before it\n"); 986 return false; 987 } 988 989 // Check for memory dependencies which prohibit the unrolling we are doing. 990 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check 991 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. 992 if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI, 993 LI)) { 994 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); 995 return false; 996 } 997 998 return true; 999 } 1000