1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
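//
// A typical way to exercise this utility is via the driving
// LoopUnrollAndJamPass, for example (a sketch; the exact flag names should be
// checked against the current tree):
//
//   opt -passes=loop-unroll-and-jam -allow-unroll-and-jam \
//       -unroll-and-jam-count=4 -S input.ll
//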
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/Optional.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallPtrSet.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/ADT/iterator_range.h"
24 #include "llvm/Analysis/AssumptionCache.h"
25 #include "llvm/Analysis/DependenceAnalysis.h"
26 #include "llvm/Analysis/DomTreeUpdater.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/LoopIterator.h"
29 #include "llvm/Analysis/MustExecute.h"
30 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
31 #include "llvm/Analysis/ScalarEvolution.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/DiagnosticInfo.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/User.h"
42 #include "llvm/IR/Value.h"
43 #include "llvm/IR/ValueHandle.h"
44 #include "llvm/IR/ValueMap.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/ErrorHandling.h"
48 #include "llvm/Support/GenericDomTree.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/LoopUtils.h"
53 #include "llvm/Transforms/Utils/UnrollLoop.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <assert.h>
56 #include <memory>
57 #include <type_traits>
58 #include <vector>
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "loop-unroll-and-jam"
63 
64 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
65 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops completely unroll and jammed");
66 
67 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
68 
69 // Partition blocks in an outer/inner loop pair into blocks before and after
70 // the inner loop.
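// For example (illustrative), given
//   for (i) { fore(i); for (j) sub(i, j); aft(i); }
// the blocks of fore(i) are not dominated by the inner loop's latch and land in
// ForeBlocks, while the blocks of aft(i) are dominated by it and land in
// AftBlocks. The function returns false if any Fore block other than the
// subloop preheader has a successor outside ForeBlocks.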
71 static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
72                                 BasicBlockSet &AftBlocks, DominatorTree &DT) {
73   Loop *SubLoop = L.getSubLoops()[0];
74   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
75 
76   for (BasicBlock *BB : L.blocks()) {
77     if (!SubLoop->contains(BB)) {
78       if (DT.dominates(SubLoopLatch, BB))
79         AftBlocks.insert(BB);
80       else
81         ForeBlocks.insert(BB);
82     }
83   }
84 
85   // Check that all blocks in ForeBlocks together dominate the subloop
86   // TODO: This might ideally be done better with dominators/postdominators.
87   BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
88   for (BasicBlock *BB : ForeBlocks) {
89     if (BB == SubLoopPreHeader)
90       continue;
91     Instruction *TI = BB->getTerminator();
92     for (BasicBlock *Succ : successors(TI))
93       if (!ForeBlocks.count(Succ))
94         return false;
95   }
96 
97   return true;
98 }
99 
100 /// Partition blocks in a loop nest into blocks before and after each inner
101 /// loop.
102 static bool partitionOuterLoopBlocks(
103     Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
104     DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
105     DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
106   JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
107 
108   for (Loop *L : Root.getLoopsInPreorder()) {
109     if (L == &JamLoop)
110       break;
111 
112     if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
113       return false;
114   }
115 
116   return true;
117 }
118 
119 // TODO: Remove when UnrollAndJamLoop is changed to support unroll and jamming
120 // more than two levels of loops.
121 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
122                                      BasicBlockSet &ForeBlocks,
123                                      BasicBlockSet &SubLoopBlocks,
124                                      BasicBlockSet &AftBlocks,
125                                      DominatorTree *DT) {
126   SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
127   return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
128 }
129 
130 // Looks at the phi nodes in Header for values coming from Latch. For each such
131 // instruction and all of its operands, calls Visit on it, continuing to follow
132 // operands that are defined in AftBlocks. Returns false if Visit returns
133 // false, otherwise returns true. This is used to process the instructions in
134 // the Aft blocks that need to be moved before the subloop. It is used in two
135 // places: first to check that the required set of instructions can be moved
136 // before the loop, then to collect the instructions to actually move in
137 // moveHeaderPhiOperandsToForeBlocks.
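// For example (illustrative IR): if the outer induction update
//   %inc = add nuw nsw i32 %i, 1
// lives in an Aft block and feeds the header phi %i along the latch edge, the
// walk starts at %inc and keeps following operands while they are defined in
// AftBlocks, visiting every instruction that would have to be moved before the
// subloop.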
138 template <typename T>
139 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
140                                      BasicBlockSet &AftBlocks, T Visit) {
141   SmallVector<Instruction *, 8> Worklist;
142   SmallPtrSet<Instruction *, 8> VisitedInstr;
143   for (auto &Phi : Header->phis()) {
144     Value *V = Phi.getIncomingValueForBlock(Latch);
145     if (Instruction *I = dyn_cast<Instruction>(V))
146       Worklist.push_back(I);
147   }
148 
149   while (!Worklist.empty()) {
150     Instruction *I = Worklist.pop_back_val();
151     if (!Visit(I))
152       return false;
153     VisitedInstr.insert(I);
154 
155     if (AftBlocks.count(I->getParent()))
156       for (auto &U : I->operands())
157         if (Instruction *II = dyn_cast<Instruction>(U))
158           if (!VisitedInstr.count(II))
159             Worklist.push_back(II);
160   }
161 
162   return true;
163 }
164 
165 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
166 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
167                                               BasicBlock *Latch,
168                                               Instruction *InsertLoc,
169                                               BasicBlockSet &AftBlocks) {
170   // We need to ensure we move the instructions in the correct order,
171   // starting with the earliest required instruction and moving forward.
172   std::vector<Instruction *> Visited;
173   processHeaderPhiOperands(Header, Latch, AftBlocks,
174                            [&Visited, &AftBlocks](Instruction *I) {
175                              if (AftBlocks.count(I->getParent()))
176                                Visited.push_back(I);
177                              return true;
178                            });
179 
180   // Move all instructions in program order to before the InsertLoc
181   BasicBlock *InsertLocBB = InsertLoc->getParent();
182   for (Instruction *I : reverse(Visited)) {
183     if (I->getParent() != InsertLocBB)
184       I->moveBefore(InsertLoc);
185   }
186 }
187 
188 /*
189   This method performs Unroll and Jam. For a simple loop like:
190   for (i = ..)
191     Fore(i)
192     for (j = ..)
193       SubLoop(i, j)
194     Aft(i)
195 
196   Instead of doing normal inner or outer unrolling, we do:
197   for (i = .., i+=2)
198     Fore(i)
199     Fore(i+1)
200     for (j = ..)
201       SubLoop(i, j)
202       SubLoop(i+1, j)
203     Aft(i)
204     Aft(i+1)
205 
206   So the outer loop is essentially unrolled and then the inner loops are fused
207   ("jammed") together into a single loop. This can increase speed when there
208   are loads in SubLoop that are invariant to i, as they become shared between
209   the now jammed inner loops.
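
  For example (purely illustrative), in
    for (i = ..)
      for (j = ..)
        sum[i] += A[j] * B[i];
  the load of A[j] is invariant to i; once the copies of the inner loop are
  jammed, a single load of A[j] per j iteration can feed both unrolled copies,
  roughly halving the loads from A.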
210 
211   We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
212   Fore blocks are those before the inner loop, Aft are those after. Normal
213   Unroll code is used to copy each of these sets of blocks and the results are
214   combined together into the final form above.
215 
216   isSafeToUnrollAndJam should be used prior to calling this to make sure the
217   unrolling will be valid. Checking profitability is also advisable.
218 
219   If EpilogueLoop is non-null, it receives the epilogue loop (if it was
220   necessary to create one and not fully unrolled).
221 */
222 LoopUnrollResult
223 llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
224                        unsigned TripMultiple, bool UnrollRemainder,
225                        LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
226                        AssumptionCache *AC, const TargetTransformInfo *TTI,
227                        OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
228 
229   // When we enter here we should have already checked that it is safe
230   BasicBlock *Header = L->getHeader();
231   assert(Header && "No header.");
232   assert(L->getSubLoops().size() == 1);
233   Loop *SubLoop = *L->begin();
234 
235   // Don't enter the unroll code if there is nothing to do.
236   if (TripCount == 0 && Count < 2) {
237     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
238     return LoopUnrollResult::Unmodified;
239   }
240 
241   assert(Count > 0);
242   assert(TripMultiple > 0);
243   assert(TripCount == 0 || TripCount % TripMultiple == 0);
244 
245   // Are we eliminating the loop control altogether?
246   bool CompletelyUnroll = (Count == TripCount);
247 
248   // We use the runtime remainder in cases where we don't know the trip multiple.
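  // For example (illustrative numbers): with Count = 4 and an unknown trip
  // count (TripMultiple = 1), 1 % 4 != 0 and an epilogue remainder loop is
  // created below; with TripCount = 16 and TripMultiple = 16, 16 % 4 == 0 and
  // no remainder is needed.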
249   if (TripMultiple % Count != 0) {
250     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
251                                     /*UseEpilogRemainder*/ true,
252                                     UnrollRemainder, /*ForgetAllSCEV*/ false,
253                                     LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
254       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
255                            "generated when assuming runtime trip count\n");
256       return LoopUnrollResult::Unmodified;
257     }
258   }
259 
260   // Notify ScalarEvolution that the loop will be substantially changed,
261   // if not outright eliminated.
262   if (SE) {
263     SE->forgetLoop(L);
264     SE->forgetLoop(SubLoop);
265   }
266 
267   using namespace ore;
268   // Report the unrolling decision.
269   if (CompletelyUnroll) {
270     LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
271                       << Header->getName() << " with trip count " << TripCount
272                       << "!\n");
273     ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
274                                  L->getHeader())
275               << "completely unroll and jammed loop with "
276               << NV("UnrollCount", TripCount) << " iterations");
277   } else {
278     auto DiagBuilder = [&]() {
279       OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
280                               L->getHeader());
281       return Diag << "unroll and jammed loop by a factor of "
282                   << NV("UnrollCount", Count);
283     };
284 
285     LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
286                       << " by " << Count);
287     if (TripMultiple != 1) {
288       LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
289       ORE->emit([&]() {
290         return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
291                              << " trips per branch";
292       });
293     } else {
294       LLVM_DEBUG(dbgs() << " with run-time trip count");
295       ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
296     }
297     LLVM_DEBUG(dbgs() << "!\n");
298   }
299 
300   BasicBlock *Preheader = L->getLoopPreheader();
301   BasicBlock *LatchBlock = L->getLoopLatch();
302   assert(Preheader && "No preheader");
303   assert(LatchBlock && "No latch block");
304   BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
305   assert(BI && !BI->isUnconditional());
306   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
307   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
308   bool SubLoopContinueOnTrue = SubLoop->contains(
309       SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
310 
311   // Partition blocks in an outer/inner loop pair into blocks before and after
312   // the inner loop.
313   BasicBlockSet SubLoopBlocks;
314   BasicBlockSet ForeBlocks;
315   BasicBlockSet AftBlocks;
316   partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
317                            DT);
318 
319   // We keep track of the entering/first and exiting/last block of each of
320   // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
321   // blocks easier.
322   std::vector<BasicBlock *> ForeBlocksFirst;
323   std::vector<BasicBlock *> ForeBlocksLast;
324   std::vector<BasicBlock *> SubLoopBlocksFirst;
325   std::vector<BasicBlock *> SubLoopBlocksLast;
326   std::vector<BasicBlock *> AftBlocksFirst;
327   std::vector<BasicBlock *> AftBlocksLast;
328   ForeBlocksFirst.push_back(Header);
329   ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
330   SubLoopBlocksFirst.push_back(SubLoop->getHeader());
331   SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
332   AftBlocksFirst.push_back(SubLoop->getExitBlock());
333   AftBlocksLast.push_back(L->getExitingBlock());
334   // Maps Blocks[0] -> Blocks[It]
335   ValueToValueMapTy LastValueMap;
336 
337   // Move any instructions required by the fore phi operands from AftBlocks into Fore.
338   moveHeaderPhiOperandsToForeBlocks(
339       Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
340 
341   // The current on-the-fly SSA update requires blocks to be processed in
342   // reverse postorder so that LastValueMap contains the correct value at each
343   // exit.
344   LoopBlocksDFS DFS(L);
345   DFS.perform(LI);
346   // Stash the DFS iterators before adding blocks to the loop.
347   LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
348   LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
349 
350   // When an FSDiscriminator is enabled, we don't need to add the multiply
351   // factors to the discriminators.
352   if (Header->getParent()->isDebugInfoForProfiling() && !EnableFSDiscriminator)
353     for (BasicBlock *BB : L->getBlocks())
354       for (Instruction &I : *BB)
355         if (!isa<DbgInfoIntrinsic>(&I))
356           if (const DILocation *DIL = I.getDebugLoc()) {
357             auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
358             if (NewDIL)
359               I.setDebugLoc(*NewDIL);
360             else
361               LLVM_DEBUG(dbgs()
362                          << "Failed to create new discriminator: "
363                          << DIL->getFilename() << " Line: " << DIL->getLine());
364           }
365 
366   // Copy all blocks
367   for (unsigned It = 1; It != Count; ++It) {
368     SmallVector<BasicBlock *, 8> NewBlocks;
369     // Maps Blocks[It] -> Blocks[It-1]
370     DenseMap<Value *, Value *> PrevItValueMap;
371     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
372     NewLoops[L] = L;
373     NewLoops[SubLoop] = SubLoop;
374 
375     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
376       ValueToValueMapTy VMap;
377       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
378       Header->getParent()->getBasicBlockList().push_back(New);
379 
380       // Tell LI about New.
381       addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
382 
383       if (ForeBlocks.count(*BB)) {
384         if (*BB == ForeBlocksFirst[0])
385           ForeBlocksFirst.push_back(New);
386         if (*BB == ForeBlocksLast[0])
387           ForeBlocksLast.push_back(New);
388       } else if (SubLoopBlocks.count(*BB)) {
389         if (*BB == SubLoopBlocksFirst[0])
390           SubLoopBlocksFirst.push_back(New);
391         if (*BB == SubLoopBlocksLast[0])
392           SubLoopBlocksLast.push_back(New);
393       } else if (AftBlocks.count(*BB)) {
394         if (*BB == AftBlocksFirst[0])
395           AftBlocksFirst.push_back(New);
396         if (*BB == AftBlocksLast[0])
397           AftBlocksLast.push_back(New);
398       } else {
399         llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
400       }
401 
402       // Update our running maps of newest clones
403       PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
404       LastValueMap[*BB] = New;
405       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
406            VI != VE; ++VI) {
407         PrevItValueMap[VI->second] =
408             const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
409         LastValueMap[VI->first] = VI->second;
410       }
411 
412       NewBlocks.push_back(New);
413 
414       // Update DomTree:
415       if (*BB == ForeBlocksFirst[0])
416         DT->addNewBlock(New, ForeBlocksLast[It - 1]);
417       else if (*BB == SubLoopBlocksFirst[0])
418         DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
419       else if (*BB == AftBlocksFirst[0])
420         DT->addNewBlock(New, AftBlocksLast[It - 1]);
421       else {
422         // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
423         // structure.
424         auto BBDomNode = DT->getNode(*BB);
425         auto BBIDom = BBDomNode->getIDom();
426         BasicBlock *OriginalBBIDom = BBIDom->getBlock();
427         assert(OriginalBBIDom);
428         assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
429         DT->addNewBlock(
430             New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
431       }
432     }
433 
434     // Remap all instructions in the most recent iteration
435     remapInstructionsInBlocks(NewBlocks, LastValueMap);
436     for (BasicBlock *NewBlock : NewBlocks) {
437       for (Instruction &I : *NewBlock) {
438         if (auto *II = dyn_cast<AssumeInst>(&I))
439           AC->registerAssumption(II);
440       }
441     }
442 
443     // Alter the ForeBlocks phis, pointing them at the latest version of the
444     // value from the previous iteration's phis.
445     for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
446       Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
447       assert(OldValue && "should have incoming edge from Aft[It]");
448       Value *NewValue = OldValue;
449       if (Value *PrevValue = PrevItValueMap[OldValue])
450         NewValue = PrevValue;
451 
452       assert(Phi.getNumOperands() == 2);
453       Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
454       Phi.setIncomingValue(0, NewValue);
455       Phi.removeIncomingValue(1);
456     }
457   }
458 
459   // Now that all the basic blocks for the unrolled iterations are in place,
460   // finish up connecting the blocks and phi nodes. At this point LastValueMap
461   // holds the values of the last unrolled iteration.
462 
463   // Update Phis in BB from OldBB to point to NewBB and use the latest value
464   // from LastValueMap
465   auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
466                                      BasicBlock *NewBB,
467                                      ValueToValueMapTy &LastValueMap) {
468     for (PHINode &Phi : BB->phis()) {
469       for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
470         if (Phi.getIncomingBlock(b) == OldBB) {
471           Value *OldValue = Phi.getIncomingValue(b);
472           if (Value *LastValue = LastValueMap[OldValue])
473             Phi.setIncomingValue(b, LastValue);
474           Phi.setIncomingBlock(b, NewBB);
475           break;
476         }
477       }
478     }
479   };
480   // Move all the phis from Src into Dest
481   auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
482     Instruction *insertPoint = Dest->getFirstNonPHI();
483     while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
484       Phi->moveBefore(insertPoint);
485   };
486 
487   // Update the PHI values outside the loop to point to the last block
488   updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
489                            LastValueMap);
490 
491   // Update ForeBlocks successors and phi nodes
492   BranchInst *ForeTerm =
493       cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
494   assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
495   ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
496 
497   if (CompletelyUnroll) {
498     while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
499       Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
500       Phi->getParent()->getInstList().erase(Phi);
501     }
502   } else {
503     // Update the PHI values to point to the last aft block
504     updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
505                              AftBlocksLast.back(), LastValueMap);
506   }
507 
508   for (unsigned It = 1; It != Count; It++) {
509     // Remap ForeBlock successors from previous iteration to this
510     BranchInst *ForeTerm =
511         cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
512     assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
513     ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
514   }
515 
516   // Subloop successors and phis
517   BranchInst *SubTerm =
518       cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
519   SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
520   SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
521   SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
522                                             ForeBlocksLast.back());
523   SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
524                                             SubLoopBlocksLast.back());
525 
526   for (unsigned It = 1; It != Count; It++) {
527     // Replace the conditional branch of the previous iteration's subloop with
528     // an unconditional branch to this one.
529     BranchInst *SubTerm =
530         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
531     BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
532     SubTerm->eraseFromParent();
533 
534     SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
535                                                ForeBlocksLast.back());
536     SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
537                                                SubLoopBlocksLast.back());
538     movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
539   }
540 
541   // Aft blocks successors and phis
542   BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
543   if (CompletelyUnroll) {
544     BranchInst::Create(LoopExit, AftTerm);
545     AftTerm->eraseFromParent();
546   } else {
547     AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
548     assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
549            "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
550   }
551   AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
552                                         SubLoopBlocksLast.back());
553 
554   for (unsigned It = 1; It != Count; It++) {
555     // Replace the conditional branch of the previous iteration's aft block with
556     // an unconditional branch to this one.
557     BranchInst *AftTerm =
558         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
559     BranchInst::Create(AftBlocksFirst[It], AftTerm);
560     AftTerm->eraseFromParent();
561 
562     AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
563                                            SubLoopBlocksLast.back());
564     movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
565   }
566 
567   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
568   // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
569   // new ones required.
570   if (Count != 1) {
571     SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
572     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
573                            SubLoopBlocksFirst[0]);
574     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
575                            SubLoopBlocksLast[0], AftBlocksFirst[0]);
576 
577     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
578                            ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
579     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
580                            SubLoopBlocksLast.back(), AftBlocksFirst[0]);
581     DTU.applyUpdatesPermissive(DTUpdates);
582   }
583 
584   // Merge adjacent basic blocks, if possible.
585   SmallPtrSet<BasicBlock *, 16> MergeBlocks;
586   MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
587   MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
588   MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
589 
590   MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
591 
592   // Apply updates to the DomTree.
593   DT = &DTU.getDomTree();
594 
595   // At this point, the code is well formed.  We now do a quick sweep over the
596   // inserted code, doing constant propagation and dead code elimination as we
597   // go.
598   simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
599   simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
600                           TTI);
601 
602   NumCompletelyUnrolledAndJammed += CompletelyUnroll;
603   ++NumUnrolledAndJammed;
604 
605   // Update LoopInfo if the loop is completely removed.
606   if (CompletelyUnroll)
607     LI->erase(L);
608 
609 #ifndef NDEBUG
610   // We shouldn't have done anything to break loop simplify form or LCSSA.
611   Loop *OutestLoop = SubLoop->getParentLoop()
612                          ? SubLoop->getParentLoop()->getParentLoop()
613                                ? SubLoop->getParentLoop()->getParentLoop()
614                                : SubLoop->getParentLoop()
615                          : SubLoop;
616   assert(DT->verify());
617   LI->verify(*DT);
618   assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
619   if (!CompletelyUnroll)
620     assert(L->isLoopSimplifyForm());
621   assert(SubLoop->isLoopSimplifyForm());
622   SE->verify();
623 #endif
624 
625   return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
626                           : LoopUnrollResult::PartiallyUnrolled;
627 }
628 
629 static bool getLoadsAndStores(BasicBlockSet &Blocks,
630                               SmallVector<Instruction *, 4> &MemInstr) {
631   // Scan the BBs and collect legal loads and stores.
632   // Returns false if non-simple loads/stores are found.
633   for (BasicBlock *BB : Blocks) {
634     for (Instruction &I : *BB) {
635       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
636         if (!Ld->isSimple())
637           return false;
638         MemInstr.push_back(&I);
639       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
640         if (!St->isSimple())
641           return false;
642         MemInstr.push_back(&I);
643       } else if (I.mayReadOrWriteMemory()) {
644         return false;
645       }
646     }
647   }
648   return true;
649 }
650 
651 static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
652                                        unsigned UnrollLevel, unsigned JamLevel,
653                                        bool Sequentialized, Dependence *D) {
654   // The loop at UnrollLevel might carry the dependency Src --> Dst.
655   // After unrolling, is it carried by a different (deeper) loop instead?
656   for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
657        ++CurLoopDepth) {
658     auto JammedDir = D->getDirection(CurLoopDepth);
659     if (JammedDir == Dependence::DVEntry::LT)
660       return true;
661 
662     if (JammedDir & Dependence::DVEntry::GT)
663       return false;
664   }
665 
666   return true;
667 }
668 
669 static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
670                                         unsigned UnrollLevel, unsigned JamLevel,
671                                         bool Sequentialized, Dependence *D) {
672   // The loop at UnrollLevel might carry the dependency Dst --> Src.
673   for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
674        ++CurLoopDepth) {
675     auto JammedDir = D->getDirection(CurLoopDepth);
676     if (JammedDir == Dependence::DVEntry::GT)
677       return true;
678 
679     if (JammedDir & Dependence::DVEntry::LT)
680       return false;
681   }
682 
683   // Backward dependencies are only preserved if not interleaved.
684   return Sequentialized;
685 }
686 
687 // Check whether it is semantically safe to unroll-and-jam given Src and Dst,
688 // considering any potential dependency between them.
689 //
690 // @param UnrollLevel The level of the loop being unrolled
691 // @param JamLevel    The level of the loop being jammed; if Src and Dst are on
692 // different levels, the outermost common loop counts as jammed level
693 //
694 // @return true if it is safe and false if there is a dependency violation.
695 static bool checkDependency(Instruction *Src, Instruction *Dst,
696                             unsigned UnrollLevel, unsigned JamLevel,
697                             bool Sequentialized, DependenceInfo &DI) {
698   assert(UnrollLevel <= JamLevel &&
699          "Expecting JamLevel to be at least UnrollLevel");
700 
701   if (Src == Dst)
702     return true;
703   // Ignore Input dependencies.
704   if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
705     return true;
706 
707   // Check whether unroll-and-jam may violate a dependency.
708   // By construction, every dependency will be lexicographically non-negative
709   // (if it were negative, it would violate the current execution order), such as
710   //   (0,0,>,*,*)
711   // Unroll-and-jam may place two executions that were in a GT relation at the
712   // chosen unroll level into the same unrolled iteration, so a GT dependence
713   // becomes GE (or EQ, if we fully unrolled the loop) at the loop's position:
714   //   (0,0,>=,*,*)
715   // Now, the dependency is not necessarily non-negative anymore, i.e.
716   // unroll-and-jam may violate correctness.
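  //
  // For example (an illustrative i/j nest, not from any particular test): in
  //   for (i) for (j) A[i][j] = A[i-1][j+1];
  // the store-to-load flow dependence has distance (1,-1), i.e. direction
  // (<,>). Unroll-and-jamming the i loop relaxes the '<' entry towards '=',
  // and the resulting vector (=,>) is lexicographically negative, so the
  // checks below would reject this dependence.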
717   std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
718   if (!D)
719     return true;
720   assert(D->isOrdered() && "Expected an output, flow or anti dep.");
721 
722   if (D->isConfused()) {
723     LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
724                       << "  " << *Src << "\n"
725                       << "  " << *Dst << "\n");
726     return false;
727   }
728 
729   // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
730   // non-equal direction, then the locations accessed in the inner levels cannot
731   // overlap in memory. We assume the indices never overlap into neighboring
732   // dimensions.
733   for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
734     if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
735       return true;
736 
737   auto UnrollDirection = D->getDirection(UnrollLevel);
738 
739   // If the distance carried by the unrolled loop is 0, then after unrolling
740   // that distance will become non-zero resulting in non-overlapping accesses in
741   // the inner loops.
742   if (UnrollDirection == Dependence::DVEntry::EQ)
743     return true;
744 
745   if (UnrollDirection & Dependence::DVEntry::LT &&
746       !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
747                                   Sequentialized, D.get()))
748     return false;
749 
750   if (UnrollDirection & Dependence::DVEntry::GT &&
751       !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
752                                    Sequentialized, D.get()))
753     return false;
754 
755   return true;
756 }
757 
758 static bool
759 checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
760                   const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
761                   const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
762                   DependenceInfo &DI, LoopInfo &LI) {
763   SmallVector<BasicBlockSet, 8> AllBlocks;
764   for (Loop *L : Root.getLoopsInPreorder())
765     if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
766       AllBlocks.push_back(ForeBlocksMap.lookup(L));
767   AllBlocks.push_back(SubLoopBlocks);
768   for (Loop *L : Root.getLoopsInPreorder())
769     if (AftBlocksMap.find(L) != AftBlocksMap.end())
770       AllBlocks.push_back(AftBlocksMap.lookup(L));
771 
772   unsigned LoopDepth = Root.getLoopDepth();
773   SmallVector<Instruction *, 4> EarlierLoadsAndStores;
774   SmallVector<Instruction *, 4> CurrentLoadsAndStores;
775   for (BasicBlockSet &Blocks : AllBlocks) {
776     CurrentLoadsAndStores.clear();
777     if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
778       return false;
779 
780     Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
781     unsigned CurLoopDepth = CurLoop->getLoopDepth();
782 
783     for (auto *Earlier : EarlierLoadsAndStores) {
784       Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
785       unsigned EarlierDepth = EarlierLoop->getLoopDepth();
786       unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
787       for (auto *Later : CurrentLoadsAndStores) {
788         if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
789                              DI))
790           return false;
791       }
792     }
793 
794     size_t NumInsts = CurrentLoadsAndStores.size();
795     for (size_t I = 0; I < NumInsts; ++I) {
796       for (size_t J = I; J < NumInsts; ++J) {
797         if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
798                              LoopDepth, CurLoopDepth, true, DI))
799           return false;
800       }
801     }
802 
803     EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
804                                  CurrentLoadsAndStores.end());
805   }
806   return true;
807 }
808 
809 static bool isEligibleLoopForm(const Loop &Root) {
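  // For example (illustrative): a nest such as
  //   for (i) { fore(i); for (j) body(i, j); aft(i); }
  // is eligible when every loop is in simplify and rotated form with a single
  // exit and a single exiting block; a loop with two sibling inner loops is
  // not.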
810   // Root must have a child.
811   if (Root.getSubLoops().size() != 1)
812     return false;
813 
814   const Loop *L = &Root;
815   do {
816     // All loops in Root need to be in simplify and rotated form.
817     if (!L->isLoopSimplifyForm())
818       return false;
819 
820     if (!L->isRotatedForm())
821       return false;
822 
823     if (L->getHeader()->hasAddressTaken()) {
824       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
825       return false;
826     }
827 
828     unsigned SubLoopsSize = L->getSubLoops().size();
829     if (SubLoopsSize == 0)
830       return true;
831 
832     // Only one child is allowed.
833     if (SubLoopsSize != 1)
834       return false;
835 
836     // Only loops with a single exit block can be unrolled and jammed.
837     // The function getExitBlock() is used for this check, rather than
838     // getUniqueExitBlock() to ensure loops with multiple exit edges are
839     // disallowed.
840     if (!L->getExitBlock()) {
841       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
842                            "blocks can be unrolled and jammed.\n");
843       return false;
844     }
845 
846     // Only loops with a single exiting block can be unrolled and jammed.
847     if (!L->getExitingBlock()) {
848       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
849                            "exiting blocks can be unrolled and jammed.\n");
850       return false;
851     }
852 
853     L = L->getSubLoops()[0];
854   } while (L);
855 
856   return true;
857 }
858 
859 static Loop *getInnerMostLoop(Loop *L) {
860   while (!L->getSubLoops().empty())
861     L = L->getSubLoops()[0];
862   return L;
863 }
864 
865 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
866                                 DependenceInfo &DI, LoopInfo &LI) {
867   if (!isEligibleLoopForm(*L)) {
868     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
869     return false;
870   }
871 
872   /* We currently handle outer loops like this:
873         |
874     ForeFirst    <------\   }
875      Blocks             |   } ForeBlocks of L
876     ForeLast            |   }
877         |               |
878        ...              |
879         |               |
880     ForeFirst    <----\ |   }
881      Blocks           | |   } ForeBlocks of an inner loop of L
882     ForeLast          | |   }
883         |             | |
884     JamLoopFirst  <\  | |   }
885      Blocks        |  | |   } JamLoopBlocks of the innermost loop
886     JamLoopLast   -/  | |   }
887         |             | |
888     AftFirst          | |   }
889      Blocks           | |   } AftBlocks of an inner loop of L
890     AftLast     ------/ |   }
891         |               |
892        ...              |
893         |               |
894     AftFirst            |   }
895      Blocks             |   } AftBlocks of L
896     AftLast     --------/   }
897         |
898 
899     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
900     and AftBlocks, provided that there is one edge from Fores to SubLoops,
901     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
902     In practice we currently limit Aft blocks to a single block, and limit
903     things further in the profitability checks of the unroll and jam pass.
904 
905     Because of the way we rearrange basic blocks, we also require that
906     the Fore blocks of L on all unrolled iterations are safe to move before the
907     blocks of the direct child of L of all iterations. So we require that the
908     phi node looping operands of ForeHeader can be moved to at least the end of
909     ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
910     match up Phi's correctly.
911 
912     i.e. The old order of blocks used to be
913            (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
914          It needs to be safe to transform this to
915            (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
916 
917     There are then a number of checks along the lines of no calls, no
918     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
919     runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
920     UnrollAndJamLoop if the trip count cannot be easily calculated.
921   */
922 
923   // Split blocks into Fore/SubLoop/Aft based on dominators
924   Loop *JamLoop = getInnerMostLoop(L);
925   BasicBlockSet SubLoopBlocks;
926   DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
927   DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
928   if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
929                                 AftBlocksMap, DT)) {
930     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
931     return false;
932   }
933 
934   // Instructions may need to be moved from the aft blocks to the fore blocks,
935   // which becomes more difficult with multiple (potentially conditionally
936   // executed) blocks. For now we just exclude loops with multiple aft blocks.
937   if (AftBlocksMap[L].size() != 1) {
938     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
939                          "multiple blocks after the loop\n");
940     return false;
941   }
942 
943   // Check inner loop backedge count is consistent on all iterations of the
944   // outer loop
945   if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
946         return !hasIterationCountInvariantInParent(SubLoop, SE);
947       })) {
948     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
949                          "not consistent on each iteration\n");
950     return false;
951   }
952 
953   // Check the loop safety info for exceptions.
954   SimpleLoopSafetyInfo LSI;
955   LSI.computeLoopSafetyInfo(L);
956   if (LSI.anyBlockMayThrow()) {
957     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
958     return false;
959   }
960 
961   // We've ruled out the easy stuff and now need to check that there are no
962   // interdependencies which may prevent us from moving:
963   //  ForeBlocks before Subloop and AftBlocks.
964   //  Subloop before AftBlocks.
965   //  ForeBlock phi operands before the subloop.
966 
967   // Make sure we can move all instructions we need to before the subloop
968   BasicBlock *Header = L->getHeader();
969   BasicBlock *Latch = L->getLoopLatch();
970   BasicBlockSet AftBlocks = AftBlocksMap[L];
971   Loop *SubLoop = L->getSubLoops()[0];
972   if (!processHeaderPhiOperands(
973           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
974             if (SubLoop->contains(I->getParent()))
975               return false;
976             if (AftBlocks.count(I->getParent())) {
977               // If we hit a phi node in afts we know we are done (probably
978               // LCSSA)
979               if (isa<PHINode>(I))
980                 return false;
981               // Can't move instructions with side effects or memory
982               // reads/writes
983               if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
984                 return false;
985             }
986             // Keep going
987             return true;
988           })) {
989     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
990                          "instructions after subloop to before it\n");
991     return false;
992   }
993 
994   // Check for memory dependencies which prohibit the unrolling we are doing.
995   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
996   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
997   if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
998                          LI)) {
999     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
1000     return false;
1001   }
1002 
1003   return true;
1004 }
1005