xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- LoopPeel.cpp -------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Loop Peeling Utilities.
10 //===----------------------------------------------------------------------===//
11 
12 #include "llvm/Transforms/Utils/LoopPeel.h"
13 #include "llvm/ADT/DenseMap.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/Loads.h"
17 #include "llvm/Analysis/LoopInfo.h"
18 #include "llvm/Analysis/LoopIterator.h"
19 #include "llvm/Analysis/ScalarEvolution.h"
20 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
21 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/IR/BasicBlock.h"
24 #include "llvm/IR/Dominators.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/InstrTypes.h"
27 #include "llvm/IR/Instruction.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/IR/LLVMContext.h"
30 #include "llvm/IR/MDBuilder.h"
31 #include "llvm/IR/PatternMatch.h"
32 #include "llvm/IR/ProfDataUtils.h"
33 #include "llvm/Support/Casting.h"
34 #include "llvm/Support/CommandLine.h"
35 #include "llvm/Support/Debug.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 #include "llvm/Transforms/Utils/Cloning.h"
39 #include "llvm/Transforms/Utils/LoopSimplify.h"
40 #include "llvm/Transforms/Utils/LoopUtils.h"
41 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
42 #include "llvm/Transforms/Utils/ValueMapper.h"
43 #include <algorithm>
44 #include <cassert>
45 #include <cstdint>
46 #include <optional>
47 
48 using namespace llvm;
49 using namespace llvm::PatternMatch;
50 using namespace llvm::SCEVPatternMatch;
51 
52 #define DEBUG_TYPE "loop-peel"
53 
54 STATISTIC(NumPeeled, "Number of loops peeled");
55 STATISTIC(NumPeeledEnd, "Number of loops peeled from end");
56 
57 static cl::opt<unsigned> UnrollPeelCount(
58     "unroll-peel-count", cl::Hidden,
59     cl::desc("Set the unroll peeling count, for testing purposes"));
60 
61 static cl::opt<bool>
62     UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
63                        cl::desc("Allows loops to be peeled when the dynamic "
64                                 "trip count is known to be low."));
65 
66 static cl::opt<bool>
67     UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
68                                 cl::init(false), cl::Hidden,
69                                 cl::desc("Allows loop nests to be peeled."));
70 
71 static cl::opt<unsigned> UnrollPeelMaxCount(
72     "unroll-peel-max-count", cl::init(7), cl::Hidden,
73     cl::desc("Max average trip count which will cause loop peeling."));
74 
75 static cl::opt<unsigned> UnrollForcePeelCount(
76     "unroll-force-peel-count", cl::init(0), cl::Hidden,
77     cl::desc("Force a peel count regardless of profiling information."));
78 
79 static cl::opt<bool> DisableAdvancedPeeling(
80     "disable-advanced-peeling", cl::init(false), cl::Hidden,
81     cl::desc(
82         "Disable advance peeling. Issues for convergent targets (D134803)."));
83 
84 static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
85 
86 // Check whether we are capable of peeling this loop.
87 bool llvm::canPeel(const Loop *L) {
88   // Make sure the loop is in simplified form
89   if (!L->isLoopSimplifyForm())
90     return false;
91   if (!DisableAdvancedPeeling)
92     return true;
93 
94   SmallVector<BasicBlock *, 4> Exits;
95   L->getUniqueNonLatchExitBlocks(Exits);
96   // The latch must either be the only exiting block or all non-latch exit
97   // blocks have either a deopt or unreachable terminator or compose a chain of
98   // blocks where the last one is either deopt or unreachable terminated. Both
99   // deopt and unreachable terminators are a strong indication they are not
100   // taken. Note that this is a profitability check, not a legality check. Also
101   // note that LoopPeeling currently can only update the branch weights of latch
102   // blocks and branch weights to blocks with deopt or unreachable do not need
103   // updating.
104   return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable);
105 }
106 
107 namespace {
108 
109 // As a loop is peeled, it may be the case that Phi nodes become
110 // loop-invariant (ie, known because there is only one choice).
111 // For example, consider the following function:
112 //   void g(int);
113 //   void binary() {
114 //     int x = 0;
115 //     int y = 0;
116 //     int a = 0;
117 //     for(int i = 0; i <100000; ++i) {
118 //       g(x);
119 //       x = y;
120 //       g(a);
121 //       y = a + 1;
122 //       a = 5;
123 //     }
124 //   }
125 // Peeling 3 iterations is beneficial because the values for x, y and a
126 // become known.  The IR for this loop looks something like the following:
127 //
128 //   %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
129 //   %a = phi i32 [ 0, %entry ], [ 5, %if.end ]
130 //   %y = phi i32 [ 0, %entry ], [ %add, %if.end ]
131 //   %x = phi i32 [ 0, %entry ], [ %y, %if.end ]
132 //   ...
133 //   tail call void @_Z1gi(i32 signext %x)
134 //   tail call void @_Z1gi(i32 signext %a)
135 //   %add = add nuw nsw i32 %a, 1
136 //   %inc = add nuw nsw i32 %i, 1
137 //   %exitcond = icmp eq i32 %inc, 100000
138 //   br i1 %exitcond, label %for.cond.cleanup, label %for.body
139 //
140 // The arguments for the calls to g will become known after 3 iterations
141 // of the loop, because the phi nodes values become known after 3 iterations
142 // of the loop (ie, they are known on the 4th iteration, so peel 3 iterations).
143 // The first iteration has g(0), g(0); the second has g(0), g(5); the
144 // third has g(1), g(5) and the fourth (and all subsequent) have g(6), g(5).
145 // Now consider the phi nodes:
146 //   %a is a phi with constants so it is determined after iteration 1.
147 //   %y is a phi based on a constant and %a so it is determined on
148 //     the iteration after %a is determined, so iteration 2.
149 //   %x is a phi based on a constant and %y so it is determined on
150 //     the iteration after %y, so iteration 3.
151 //   %i is based on itself (and is an induction variable) so it is
152 //     never determined.
153 // This means that peeling off 3 iterations will result in being able to
154 // remove the phi nodes for %a, %y, and %x.  The arguments for the
155 // corresponding calls to g are determined and the code for computing
156 // x, y, and a can be removed.
157 //
158 // The PhiAnalyzer class calculates how many times a loop should be
159 // peeled based on the above analysis of the phi nodes in the loop while
160 // respecting the maximum specified.
161 class PhiAnalyzer {
162 public:
163   PhiAnalyzer(const Loop &L, unsigned MaxIterations);
164 
165   // Calculate the sufficient minimum number of iterations of the loop to peel
166   // such that phi instructions become determined (subject to allowable limits)
167   std::optional<unsigned> calculateIterationsToPeel();
168 
169 protected:
170   using PeelCounter = std::optional<unsigned>;
171   const PeelCounter Unknown = std::nullopt;
172 
173   // Add 1 respecting Unknown and return Unknown if result over MaxIterations
174   PeelCounter addOne(PeelCounter PC) const {
175     if (PC == Unknown)
176       return Unknown;
177     return (*PC + 1 <= MaxIterations) ? PeelCounter{*PC + 1} : Unknown;
178   }
179 
180   // Calculate the number of iterations after which the given value
181   // becomes an invariant.
182   PeelCounter calculate(const Value &);
183 
184   const Loop &L;
185   const unsigned MaxIterations;
186 
187   // Map of Values to number of iterations to invariance
188   SmallDenseMap<const Value *, PeelCounter> IterationsToInvariance;
189 };
190 
191 PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations)
192     : L(L), MaxIterations(MaxIterations) {
193   assert(canPeel(&L) && "loop is not suitable for peeling");
194   assert(MaxIterations > 0 && "no peeling is allowed?");
195 }
196 
197 // This function calculates the number of iterations after which the value
198 // becomes an invariant. The pre-calculated values are memorized in a map.
199 // N.B. This number will be Unknown or <= MaxIterations.
200 // The function is calculated according to the following definition:
201 // Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
202 //   F(%x) = G(%y) + 1 (N.B. [MaxIterations | Unknown] + 1 => Unknown)
203 //   G(%y) = 0 if %y is a loop invariant
204 //   G(%y) = G(%BackEdgeValue) if %y is a phi in the header block
205 //   G(%y) = TODO: if %y is an expression based on phis and loop invariants
206 //           The example looks like:
207 //           %x = phi(0, %a) <-- becomes invariant starting from 3rd iteration.
208 //           %y = phi(0, 5)
209 //           %a = %y + 1
210 //   G(%y) = Unknown otherwise (including phi not in header block)
211 PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
212   // If we already know the answer, take it from the map.
213   // Otherwise, place Unknown to map to avoid infinite recursion. Such
214   // cycles can never stop on an invariant.
215   auto [I, Inserted] = IterationsToInvariance.try_emplace(&V, Unknown);
216   if (!Inserted)
217     return I->second;
218 
219   if (L.isLoopInvariant(&V))
220     // Loop invariant so known at start.
221     return (IterationsToInvariance[&V] = 0);
222   if (const PHINode *Phi = dyn_cast<PHINode>(&V)) {
223     if (Phi->getParent() != L.getHeader()) {
224       // Phi is not in header block so Unknown.
225       assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
226       return Unknown;
227     }
228     // We need to analyze the input from the back edge and add 1.
229     Value *Input = Phi->getIncomingValueForBlock(L.getLoopLatch());
230     PeelCounter Iterations = calculate(*Input);
231     assert(IterationsToInvariance[Input] == Iterations &&
232            "unexpected value saved");
233     return (IterationsToInvariance[Phi] = addOne(Iterations));
234   }
235   if (const Instruction *I = dyn_cast<Instruction>(&V)) {
236     if (isa<CmpInst>(I) || I->isBinaryOp()) {
237       // Binary instructions get the max of the operands.
238       PeelCounter LHS = calculate(*I->getOperand(0));
239       if (LHS == Unknown)
240         return Unknown;
241       PeelCounter RHS = calculate(*I->getOperand(1));
242       if (RHS == Unknown)
243         return Unknown;
244       return (IterationsToInvariance[I] = {std::max(*LHS, *RHS)});
245     }
246     if (I->isCast())
247       // Cast instructions get the value of the operand.
248       return (IterationsToInvariance[I] = calculate(*I->getOperand(0)));
249   }
250   // TODO: handle more expressions
251 
252   // Everything else is Unknown.
253   assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
254   return Unknown;
255 }
256 
257 std::optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {
258   unsigned Iterations = 0;
259   for (auto &PHI : L.getHeader()->phis()) {
260     PeelCounter ToInvariance = calculate(PHI);
261     if (ToInvariance != Unknown) {
262       assert(*ToInvariance <= MaxIterations && "bad result in phi analysis");
263       Iterations = std::max(Iterations, *ToInvariance);
264       if (Iterations == MaxIterations)
265         break;
266     }
267   }
268   assert((Iterations <= MaxIterations) && "bad result in phi analysis");
269   return Iterations ? std::optional<unsigned>(Iterations) : std::nullopt;
270 }
271 
272 } // unnamed namespace
273 
274 // Try to find any invariant memory reads that will become dereferenceable in
275 // the remainder loop after peeling. The load must also be used (transitively)
276 // by an exit condition. Returns the number of iterations to peel off (at the
277 // moment either 0 or 1).
278 static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
279                                                       DominatorTree &DT,
280                                                       AssumptionCache *AC) {
281   // Skip loops with a single exiting block, because there should be no benefit
282   // for the heuristic below.
283   if (L.getExitingBlock())
284     return 0;
285 
286   // All non-latch exit blocks must have an UnreachableInst terminator.
287   // Otherwise the heuristic below may not be profitable.
288   SmallVector<BasicBlock *, 4> Exits;
289   L.getUniqueNonLatchExitBlocks(Exits);
290   if (any_of(Exits, [](const BasicBlock *BB) {
291         return !isa<UnreachableInst>(BB->getTerminator());
292       }))
293     return 0;
294 
295   // Now look for invariant loads that dominate the latch and are not known to
296   // be dereferenceable. If there are such loads and no writes, they will become
297   // dereferenceable in the loop if the first iteration is peeled off. Also
298   // collect the set of instructions controlled by such loads. Only peel if an
299   // exit condition uses (transitively) such a load.
300   BasicBlock *Header = L.getHeader();
301   BasicBlock *Latch = L.getLoopLatch();
302   SmallPtrSet<Value *, 8> LoadUsers;
303   const DataLayout &DL = L.getHeader()->getDataLayout();
304   for (BasicBlock *BB : L.blocks()) {
305     for (Instruction &I : *BB) {
306       if (I.mayWriteToMemory())
307         return 0;
308 
309       if (LoadUsers.contains(&I))
310         LoadUsers.insert_range(I.users());
311       // Do not look for reads in the header; they can already be hoisted
312       // without peeling.
313       if (BB == Header)
314         continue;
315       if (auto *LI = dyn_cast<LoadInst>(&I)) {
316         Value *Ptr = LI->getPointerOperand();
317         if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
318             !isDereferenceablePointer(Ptr, LI->getType(), DL, LI, AC, &DT))
319           LoadUsers.insert_range(I.users());
320       }
321     }
322   }
323   SmallVector<BasicBlock *> ExitingBlocks;
324   L.getExitingBlocks(ExitingBlocks);
325   if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {
326         return LoadUsers.contains(Exiting->getTerminator());
327       }))
328     return 1;
329   return 0;
330 }
331 
332 bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
333   const SCEV *BTC = SE.getBackedgeTakenCount(&L);
334   if (isa<SCEVCouldNotCompute>(BTC))
335     return false;
336 
337   // Check if the exit condition of the loop can be adjusted by the peeling
338   // codegen. For now, it must
339   // * exit via the latch,
340   // * the exit condition must be a NE/EQ compare of an induction with step
341   // of 1 and must only be used by the exiting branch.
342   BasicBlock *Latch = L.getLoopLatch();
343   Value *Inc;
344   Value *Bound;
345   CmpPredicate Pred;
346   BasicBlock *Succ1;
347   BasicBlock *Succ2;
348   return Latch && Latch == L.getExitingBlock() &&
349          match(Latch->getTerminator(),
350                m_Br(m_OneUse(m_ICmp(Pred, m_Value(Inc), m_Value(Bound))),
351                     m_BasicBlock(Succ1), m_BasicBlock(Succ2))) &&
352          ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
353           (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) &&
354          Bound->getType()->isIntegerTy() &&
355          SE.isLoopInvariant(SE.getSCEV(Bound), &L) &&
356          match(SE.getSCEV(Inc),
357                m_scev_AffineAddRec(m_SCEV(), m_scev_One(), m_SpecificLoop(&L)));
358 }
359 
360 /// Returns true if the last iteration can be peeled off and the condition (Pred
361 /// LeftAR, RightSCEV) is known at the last iteration and the inverse condition
362 /// is known at the second-to-last.
363 static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
364                                     const SCEVAddRecExpr *LeftAR,
365                                     const SCEV *RightSCEV, ScalarEvolution &SE,
366                                     const TargetTransformInfo &TTI) {
367   if (!canPeelLastIteration(L, SE))
368     return false;
369 
370   const SCEV *BTC = SE.getBackedgeTakenCount(&L);
371   SCEVExpander Expander(SE, L.getHeader()->getDataLayout(), "loop-peel");
372   if (!SE.isKnownNonZero(BTC) &&
373       Expander.isHighCostExpansion(BTC, &L, SCEVCheapExpansionBudget, &TTI,
374                                    L.getLoopPredecessor()->getTerminator()))
375     return false;
376 
377   auto Guards = ScalarEvolution::LoopGuards::collect(&L, SE);
378   BTC = SE.applyLoopGuards(BTC, Guards);
379   RightSCEV = SE.applyLoopGuards(RightSCEV, Guards);
380   const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(BTC, SE);
381   const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
382       SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE);
383 
384   return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), ValAtLastIter,
385                              RightSCEV) &&
386          SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV);
387 }
388 
389 // Return the number of iterations to peel off from the beginning and end of the
390 // loop respectively, that make conditions in the body true/false. For example,
391 // if we peel 2 iterations off the loop below, the condition i < 2 can be
392 // evaluated at compile time.
393 //
394 //  for (i = 0; i < n; i++)
395 //    if (i < 2)
396 //      ..
397 //    else
398 //      ..
399 //   }
400 static std::pair<unsigned, unsigned>
401 countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
402                          const TargetTransformInfo &TTI) {
403   assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
404   unsigned DesiredPeelCount = 0;
405   unsigned DesiredPeelCountLast = 0;
406 
407   // Do not peel the entire loop.
408   const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L);
409   if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(BE))
410     MaxPeelCount =
411         std::min((unsigned)SC->getAPInt().getLimitedValue() - 1, MaxPeelCount);
412 
413   // Increase PeelCount while (IterVal Pred BoundSCEV) condition is satisfied;
414   // return true if inversed condition become known before reaching the
415   // MaxPeelCount limit.
416   auto PeelWhilePredicateIsKnown =
417       [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
418           const SCEV *Step, ICmpInst::Predicate Pred) {
419         while (PeelCount < MaxPeelCount &&
420                SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
421           IterVal = SE.getAddExpr(IterVal, Step);
422           ++PeelCount;
423         }
424         return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
425                                    BoundSCEV);
426       };
427 
428   const unsigned MaxDepth = 4;
429   std::function<void(Value *, unsigned)> ComputePeelCount =
430       [&](Value *Condition, unsigned Depth) -> void {
431     if (!Condition->getType()->isIntegerTy() || Depth >= MaxDepth)
432       return;
433 
434     Value *LeftVal, *RightVal;
435     if (match(Condition, m_And(m_Value(LeftVal), m_Value(RightVal))) ||
436         match(Condition, m_Or(m_Value(LeftVal), m_Value(RightVal)))) {
437       ComputePeelCount(LeftVal, Depth + 1);
438       ComputePeelCount(RightVal, Depth + 1);
439       return;
440     }
441 
442     CmpPredicate Pred;
443     if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
444       return;
445 
446     const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
447     const SCEV *RightSCEV = SE.getSCEV(RightVal);
448 
449     // Do not consider predicates that are known to be true or false
450     // independently of the loop iteration.
451     if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV))
452       return;
453 
454     // Check if we have a condition with one AddRec and one non AddRec
455     // expression. Normalize LeftSCEV to be the AddRec.
456     if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
457       if (isa<SCEVAddRecExpr>(RightSCEV)) {
458         std::swap(LeftSCEV, RightSCEV);
459         Pred = ICmpInst::getSwappedPredicate(Pred);
460       } else
461         return;
462     }
463 
464     const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);
465 
466     // Avoid huge SCEV computations in the loop below, make sure we only
467     // consider AddRecs of the loop we are trying to peel.
468     if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
469       return;
470     if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
471         !SE.getMonotonicPredicateType(LeftAR, Pred))
472       return;
473 
474     // Check if extending the current DesiredPeelCount lets us evaluate Pred
475     // or !Pred in the loop body statically.
476     unsigned NewPeelCount = DesiredPeelCount;
477 
478     const SCEV *IterVal = LeftAR->evaluateAtIteration(
479         SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
480 
481     // If the original condition is not known, get the negated predicate
482     // (which holds on the else branch) and check if it is known. This allows
483     // us to peel of iterations that make the original condition false.
484     if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
485       Pred = ICmpInst::getInversePredicate(Pred);
486 
487     const SCEV *Step = LeftAR->getStepRecurrence(SE);
488     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
489                                    Pred)) {
490       if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE, TTI))
491         DesiredPeelCountLast = 1;
492       return;
493     }
494 
495     // However, for equality comparisons, that isn't always sufficient to
496     // eliminate the comparsion in loop body, we may need to peel one more
497     // iteration. See if that makes !Pred become unknown again.
498     const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
499     if (ICmpInst::isEquality(Pred) &&
500         !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
501                              RightSCEV) &&
502         !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
503         SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
504       if (NewPeelCount >= MaxPeelCount)
505         return; // Need to peel one more iteration, but can't. Give up.
506       ++NewPeelCount; // Great!
507     }
508 
509     DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
510     DesiredPeelCountLast = std::max(DesiredPeelCountLast, NewPeelCount);
511   };
512 
513   auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {
514     if (!MinMax->getType()->isIntegerTy())
515       return;
516     Value *LHS = MinMax->getLHS(), *RHS = MinMax->getRHS();
517     const SCEV *BoundSCEV, *IterSCEV;
518     if (L.isLoopInvariant(LHS)) {
519       BoundSCEV = SE.getSCEV(LHS);
520       IterSCEV = SE.getSCEV(RHS);
521     } else if (L.isLoopInvariant(RHS)) {
522       BoundSCEV = SE.getSCEV(RHS);
523       IterSCEV = SE.getSCEV(LHS);
524     } else
525       return;
526     const auto *AddRec = dyn_cast<SCEVAddRecExpr>(IterSCEV);
527     // For simplicity, we support only affine recurrences.
528     if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
529       return;
530     const SCEV *Step = AddRec->getStepRecurrence(SE);
531     bool IsSigned = MinMax->isSigned();
532     // To minimize number of peeled iterations, we use strict relational
533     // predicates here.
534     ICmpInst::Predicate Pred;
535     if (SE.isKnownPositive(Step))
536       Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
537     else if (SE.isKnownNegative(Step))
538       Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
539     else
540       return;
541     // Check that AddRec is not wrapping.
542     if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
543       return;
544     unsigned NewPeelCount = DesiredPeelCount;
545     const SCEV *IterVal = AddRec->evaluateAtIteration(
546         SE.getConstant(AddRec->getType(), NewPeelCount), SE);
547     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
548                                    Pred)) {
549       if (shouldPeelLastIteration(L, Pred, AddRec, BoundSCEV, SE, TTI))
550         DesiredPeelCountLast = 1;
551       return;
552     }
553     DesiredPeelCount = NewPeelCount;
554   };
555 
556   for (BasicBlock *BB : L.blocks()) {
557     for (Instruction &I : *BB) {
558       if (SelectInst *SI = dyn_cast<SelectInst>(&I))
559         ComputePeelCount(SI->getCondition(), 0);
560       if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(&I))
561         ComputePeelCountMinMax(MinMax);
562     }
563 
564     auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
565     if (!BI || BI->isUnconditional())
566       continue;
567 
568     // Ignore loop exit condition.
569     if (L.getLoopLatch() == BB)
570       continue;
571 
572     ComputePeelCount(BI->getCondition(), 0);
573   }
574 
575   return {DesiredPeelCount, DesiredPeelCountLast};
576 }
577 
578 /// This "heuristic" exactly matches implicit behavior which used to exist
579 /// inside getLoopEstimatedTripCount.  It was added here to keep an
580 /// improvement inside that API from causing peeling to become more aggressive.
581 /// This should probably be removed.
582 static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
583   BasicBlock *Latch = L->getLoopLatch();
584   if (!Latch)
585     return true;
586 
587   BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
588   if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
589     return true;
590 
591   assert((LatchBR->getSuccessor(0) == L->getHeader() ||
592           LatchBR->getSuccessor(1) == L->getHeader()) &&
593          "At least one edge out of the latch must go to the header");
594 
595   SmallVector<BasicBlock *, 4> ExitBlocks;
596   L->getUniqueNonLatchExitBlocks(ExitBlocks);
597   return any_of(ExitBlocks, [](const BasicBlock *EB) {
598       return !EB->getTerminatingDeoptimizeCall();
599     });
600 }
601 
602 
603 // Return the number of iterations we want to peel off.
604 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
605                             TargetTransformInfo::PeelingPreferences &PP,
606                             unsigned TripCount, DominatorTree &DT,
607                             ScalarEvolution &SE, const TargetTransformInfo &TTI,
608                             AssumptionCache *AC, unsigned Threshold) {
609   assert(LoopSize > 0 && "Zero loop size is not allowed!");
610   // Save the PP.PeelCount value set by the target in
611   // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
612   unsigned TargetPeelCount = PP.PeelCount;
613   PP.PeelCount = 0;
614   PP.PeelLast = false;
615   if (!canPeel(L))
616     return;
617 
618   // Only try to peel innermost loops by default.
619   // The constraint can be relaxed by the target in TTI.getPeelingPreferences
620   // or by the flag -unroll-allow-loop-nests-peeling.
621   if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
622     return;
623 
624   // If the user provided a peel count, use that.
625   bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
626   if (UserPeelCount) {
627     LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
628                       << " iterations.\n");
629     PP.PeelCount = UnrollForcePeelCount;
630     PP.PeelProfiledIterations = true;
631     return;
632   }
633 
634   // Skip peeling if it's disabled.
635   if (!PP.AllowPeeling)
636     return;
637 
638   // Check that we can peel at least one iteration.
639   if (2 * LoopSize > Threshold)
640     return;
641 
642   unsigned AlreadyPeeled = 0;
643   if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
644     AlreadyPeeled = *Peeled;
645   // Stop if we already peeled off the maximum number of iterations.
646   if (AlreadyPeeled >= UnrollPeelMaxCount)
647     return;
648 
649   // Pay respect to limitations implied by loop size and the max peel count.
650   unsigned MaxPeelCount = UnrollPeelMaxCount;
651   MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);
652 
653   // Start the max computation with the PP.PeelCount value set by the target
654   // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
655   unsigned DesiredPeelCount = TargetPeelCount;
656 
657   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
658   // iterations of the loop. For this we compute the number for iterations after
659   // which every Phi is guaranteed to become an invariant, and try to peel the
660   // maximum number of iterations among these values, thus turning all those
661   // Phis into invariants.
662   if (MaxPeelCount > DesiredPeelCount) {
663     // Check how many iterations are useful for resolving Phis
664     auto NumPeels = PhiAnalyzer(*L, MaxPeelCount).calculateIterationsToPeel();
665     if (NumPeels)
666       DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
667   }
668 
669   const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] =
670       countToEliminateCompares(*L, MaxPeelCount, SE, TTI);
671   DesiredPeelCount = std::max(DesiredPeelCount, CountToEliminateCmps);
672 
673   if (DesiredPeelCount == 0)
674     DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC);
675 
676   if (DesiredPeelCount > 0) {
677     DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
678     // Consider max peel count limitation.
679     assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
680     if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
681       LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
682                         << " iteration(s) to turn"
683                         << " some Phis into invariants.\n");
684       PP.PeelCount = DesiredPeelCount;
685       PP.PeelProfiledIterations = false;
686       PP.PeelLast = false;
687       return;
688     }
689   }
690 
691   if (CountToEliminateCmpsLast > 0) {
692     unsigned DesiredPeelCountLast =
693         std::min(CountToEliminateCmpsLast, MaxPeelCount);
694     // Consider max peel count limitation.
695     assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?");
696     if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) {
697       LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
698                         << " iteration(s) to turn"
699                         << " some Phis into invariants.\n");
700       PP.PeelCount = DesiredPeelCountLast;
701       PP.PeelProfiledIterations = false;
702       PP.PeelLast = true;
703       return;
704     }
705   }
706 
707   // Bail if we know the statically calculated trip count.
708   // In this case we rather prefer partial unrolling.
709   if (TripCount)
710     return;
711 
712   // Do not apply profile base peeling if it is disabled.
713   if (!PP.PeelProfiledIterations)
714     return;
715   // If we don't know the trip count, but have reason to believe the average
716   // trip count is low, peeling should be beneficial, since we will usually
717   // hit the peeled section.
718   // We only do this in the presence of profile information, since otherwise
719   // our estimates of the trip count are not reliable enough.
720   if (L->getHeader()->getParent()->hasProfileData()) {
721     if (violatesLegacyMultiExitLoopCheck(L))
722       return;
723     std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
724     if (!EstimatedTripCount)
725       return;
726 
727     LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
728                       << *EstimatedTripCount << "\n");
729 
730     if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
731       unsigned PeelCount = *EstimatedTripCount;
732       LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
733       PP.PeelCount = PeelCount;
734       return;
735     }
736     LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
737     LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
738     LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
739     LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
740     LLVM_DEBUG(dbgs() << "Max peel count by cost: "
741                       << (Threshold / LoopSize - 1) << "\n");
742   }
743 }
744 
745 struct WeightInfo {
746   // Weights for current iteration.
747   SmallVector<uint32_t> Weights;
748   // Weights to subtract after each iteration.
749   const SmallVector<uint32_t> SubWeights;
750 };
751 
752 /// Update the branch weights of an exiting block of a peeled-off loop
753 /// iteration.
754 /// Let F is a weight of the edge to continue (fallthrough) into the loop.
755 /// Let E is a weight of the edge to an exit.
756 /// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
757 /// go to exit.
758 /// Then, Estimated ExitCount = F / E.
759 /// For I-th (counting from 0) peeled off iteration we set the weights for
760 /// the peeled exit as (EC - I, 1). It gives us reasonable distribution,
761 /// The probability to go to exit 1/(EC-I) increases. At the same time
762 /// the estimated exit count in the remainder loop reduces by I.
763 /// To avoid dealing with division rounding we can just multiple both part
764 /// of weights to E and use weight as (F - I * E, E).
765 static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
766   setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
767   for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
768     if (SubWeight != 0)
769       // Don't set the probability of taking the edge from latch to loop header
770       // to less than 1:1 ratio (meaning Weight should not be lower than
771       // SubWeight), as this could significantly reduce the loop's hotness,
772       // which would be incorrect in the case of underestimating the trip count.
773       Info.Weights[Idx] =
774           Info.Weights[Idx] > SubWeight
775               ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
776               : SubWeight;
777 }
778 
779 /// Initialize the weights for all exiting blocks.
780 static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
781                               Loop *L) {
782   SmallVector<BasicBlock *> ExitingBlocks;
783   L->getExitingBlocks(ExitingBlocks);
784   for (BasicBlock *ExitingBlock : ExitingBlocks) {
785     Instruction *Term = ExitingBlock->getTerminator();
786     SmallVector<uint32_t> Weights;
787     if (!extractBranchWeights(*Term, Weights))
788       continue;
789 
790     // See the comment on updateBranchWeights() for an explanation of what we
791     // do here.
792     uint32_t FallThroughWeights = 0;
793     uint32_t ExitWeights = 0;
794     for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
795       if (L->contains(Succ))
796         FallThroughWeights += Weight;
797       else
798         ExitWeights += Weight;
799     }
800 
801     // Don't try to update weights for degenerate case.
802     if (FallThroughWeights == 0)
803       continue;
804 
805     SmallVector<uint32_t> SubWeights;
806     for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
807       if (!L->contains(Succ)) {
808         // Exit weights stay the same.
809         SubWeights.push_back(0);
810         continue;
811       }
812 
813       // Subtract exit weights on each iteration, distributed across all
814       // fallthrough edges.
815       double W = (double)Weight / (double)FallThroughWeights;
816       SubWeights.push_back((uint32_t)(ExitWeights * W));
817     }
818 
819     WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});
820   }
821 }
822 
823 /// Clones the body of the loop L, putting it between \p InsertTop and \p
824 /// InsertBot.
825 /// \param IterNumber The serial number of the iteration currently being
826 /// peeled off.
827 /// \param PeelLast Peel off the last iterations from \p L.
828 /// \param ExitEdges The exit edges of the original loop.
829 /// \param[out] NewBlocks A list of the blocks in the newly created clone
830 /// \param[out] VMap The value map between the loop and the new clone.
831 /// \param LoopBlocks A helper for DFS-traversal of the loop.
832 /// \param LVMap A value-map that maps instructions from the original loop to
833 /// instructions in the last peeled-off iteration.
834 static void cloneLoopBlocks(
835     Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop,
836     BasicBlock *InsertBot, BasicBlock *OrigPreHeader,
837     SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
838     SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
839     ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
840     LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes,
841     ScalarEvolution &SE) {
842   BasicBlock *Header = L->getHeader();
843   BasicBlock *Latch = L->getLoopLatch();
844   BasicBlock *PreHeader = L->getLoopPreheader();
845 
846   Function *F = Header->getParent();
847   LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
848   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
849   Loop *ParentLoop = L->getParentLoop();
850 
851   // For each block in the original loop, create a new copy,
852   // and update the value map with the newly created values.
853   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
854     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
855     NewBlocks.push_back(NewBB);
856 
857     // If an original block is an immediate child of the loop L, its copy
858     // is a child of a ParentLoop after peeling. If a block is a child of
859     // a nested loop, it is handled in the cloneLoop() call below.
860     if (ParentLoop && LI->getLoopFor(*BB) == L)
861       ParentLoop->addBasicBlockToLoop(NewBB, *LI);
862 
863     VMap[*BB] = NewBB;
864 
865     // If dominator tree is available, insert nodes to represent cloned blocks.
866     if (DT) {
867       if (Header == *BB)
868         DT->addNewBlock(NewBB, InsertTop);
869       else {
870         DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
871         // VMap must contain entry for IDom, as the iteration order is RPO.
872         DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
873       }
874     }
875   }
876 
877   {
878     // Identify what other metadata depends on the cloned version. After
879     // cloning, replace the metadata with the corrected version for both
880     // memory instructions and noalias intrinsics.
881     std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
882     cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
883                                Header->getContext(), Ext);
884   }
885 
886   // Recursively create the new Loop objects for nested loops, if any,
887   // to preserve LoopInfo.
888   for (Loop *ChildLoop : *L) {
889     cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);
890   }
891 
892   // Hook-up the control flow for the newly inserted blocks.
893   // The new header is hooked up directly to the "top", which is either
894   // the original loop preheader (for the first iteration) or the previous
895   // iteration's exiting block (for every other iteration)
896   InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));
897 
898   // Similarly, for the latch:
899   // The original exiting edge is still hooked up to the loop exit.
900   BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
901   if (PeelLast) {
902     // This is the last iteration and we definitely will go to the exit. Just
903     // set both successors to InsertBot and let the branch be simplified later.
904     assert(IterNumber == 0 && "Only peeling a single iteration implemented.");
905     auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator());
906     LatchTerm->setSuccessor(0, InsertBot);
907     LatchTerm->setSuccessor(1, InsertBot);
908   } else {
909     auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());
910     // The backedge now goes to the "bottom", which is either the loop's real
911     // header (for the last peeled iteration) or the copied header of the next
912     // iteration (for every other iteration)
913     for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) {
914       if (LatchTerm->getSuccessor(idx) == Header) {
915         LatchTerm->setSuccessor(idx, InsertBot);
916         break;
917       }
918     }
919   }
920   if (DT)
921     DT->changeImmediateDominator(InsertBot, NewLatch);
922 
923   // The new copy of the loop body starts with a bunch of PHI nodes
924   // that pick an incoming value from either the preheader, or the previous
925   // loop iteration. Since this copy is no longer part of the loop, we
926   // resolve this statically:
927   if (PeelLast) {
928     // For the last iteration, we introduce new phis for each header phi in
929     // InsertTop, using the incoming value from the preheader for the original
930     // preheader (when skipping the main loop) and the incoming value from the
931     // latch for the latch (when continuing from the main loop).
932     IRBuilder<> B(InsertTop, InsertTop->getFirstNonPHIIt());
933     for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
934       PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
935       PHINode *PN = B.CreatePHI(NewPHI->getType(), 2);
936       NewPHI->eraseFromParent();
937       if (OrigPreHeader)
938         PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(PreHeader),
939                         OrigPreHeader);
940 
941       PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(Latch),
942                       Latch);
943       VMap[&*I] = PN;
944     }
945   } else {
946     // For the first iteration, we use the value from the preheader directly.
947     // For any other iteration, we replace the phi with the value generated by
948     // the immediately preceding clone of the loop body (which represents
949     // the previous iteration).
950     for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
951       PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
952       if (IterNumber == 0) {
953         VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
954       } else {
955         Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
956         Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
957         if (LatchInst && L->contains(LatchInst))
958           VMap[&*I] = LVMap[LatchInst];
959         else
960           VMap[&*I] = LatchVal;
961       }
962       NewPHI->eraseFromParent();
963     }
964   }
965 
966   // Fix up the outgoing values - we need to add a value for the iteration
967   // we've just created. Note that this must happen *after* the incoming
968   // values are adjusted, since the value going out of the latch may also be
969   // a value coming into the header.
970   for (auto Edge : ExitEdges)
971     for (PHINode &PHI : Edge.second->phis()) {
972       Value *LatchVal = PHI.getIncomingValueForBlock(Edge.first);
973       Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
974       if (LatchInst && L->contains(LatchInst))
975         LatchVal = VMap[LatchVal];
976       PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first]));
977       SE.forgetLcssaPhiWithNewPredecessor(L, &PHI);
978     }
979 
980   // LastValueMap is updated with the values for the current loop
981   // which are used the next time this function is called.
982   for (auto KV : VMap)
983     LVMap[KV.first] = KV.second;
984 }
985 
986 TargetTransformInfo::PeelingPreferences
987 llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
988                                const TargetTransformInfo &TTI,
989                                std::optional<bool> UserAllowPeeling,
990                                std::optional<bool> UserAllowProfileBasedPeeling,
991                                bool UnrollingSpecficValues) {
992   TargetTransformInfo::PeelingPreferences PP;
993 
994   // Set the default values.
995   PP.PeelCount = 0;
996   PP.AllowPeeling = true;
997   PP.AllowLoopNestsPeeling = false;
998   PP.PeelLast = false;
999   PP.PeelProfiledIterations = true;
1000 
1001   // Get the target specifc values.
1002   TTI.getPeelingPreferences(L, SE, PP);
1003 
1004   // User specified values using cl::opt.
1005   if (UnrollingSpecficValues) {
1006     if (UnrollPeelCount.getNumOccurrences() > 0)
1007       PP.PeelCount = UnrollPeelCount;
1008     if (UnrollAllowPeeling.getNumOccurrences() > 0)
1009       PP.AllowPeeling = UnrollAllowPeeling;
1010     if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
1011       PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
1012   }
1013 
1014   // User specifed values provided by argument.
1015   if (UserAllowPeeling)
1016     PP.AllowPeeling = *UserAllowPeeling;
1017   if (UserAllowProfileBasedPeeling)
1018     PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
1019 
1020   return PP;
1021 }
1022 
1023 /// Peel off the first \p PeelCount iterations of loop \p L.
1024 ///
1025 /// Note that this does not peel them off as a single straight-line block.
1026 /// Rather, each iteration is peeled off separately, and needs to check the
1027 /// exit condition.
1028 /// For loops that dynamically execute \p PeelCount iterations or less
1029 /// this provides a benefit, since the peeled off iterations, which account
1030 /// for the bulk of dynamic execution, can be further simplified by scalar
1031 /// optimizations.
1032 bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
1033                     ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
1034                     bool PreserveLCSSA, ValueToValueMapTy &LVMap) {
1035   assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
1036   assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
1037   assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) &&
1038          "when peeling the last iteration, the loop must be supported and can "
1039          "only peel a single iteration");
1040 
1041   LoopBlocksDFS LoopBlocks(L);
1042   LoopBlocks.perform(LI);
1043 
1044   BasicBlock *Header = L->getHeader();
1045   BasicBlock *PreHeader = L->getLoopPreheader();
1046   BasicBlock *Latch = L->getLoopLatch();
1047   SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
1048   L->getExitEdges(ExitEdges);
1049 
1050   // Remember dominators of blocks we might reach through exits to change them
1051   // later. Immediate dominator of such block might change, because we add more
1052   // routes which can lead to the exit: we can reach it from the peeled
1053   // iterations too.
1054   DenseMap<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;
1055   for (auto *BB : L->blocks()) {
1056     auto *BBDomNode = DT.getNode(BB);
1057     SmallVector<BasicBlock *, 16> ChildrenToUpdate;
1058     for (auto *ChildDomNode : BBDomNode->children()) {
1059       auto *ChildBB = ChildDomNode->getBlock();
1060       if (!L->contains(ChildBB))
1061         ChildrenToUpdate.push_back(ChildBB);
1062     }
1063     // The new idom of the block will be the nearest common dominator
1064     // of all copies of the previous idom. This is equivalent to the
1065     // nearest common dominator of the previous idom and the first latch,
1066     // which dominates all copies of the previous idom.
1067     BasicBlock *NewIDom = DT.findNearestCommonDominator(BB, Latch);
1068     for (auto *ChildBB : ChildrenToUpdate)
1069       NonLoopBlocksIDom[ChildBB] = NewIDom;
1070   }
1071 
1072   Function *F = Header->getParent();
1073 
1074   // Set up all the necessary basic blocks.
1075   BasicBlock *InsertTop;
1076   BasicBlock *InsertBot;
1077   BasicBlock *NewPreHeader = nullptr;
1078   DenseMap<Instruction *, Value *> ExitValues;
1079   if (PeelLast) {
1080     // It is convenient to split the single exit block from the latch the
1081     // into 3 parts - two blocks to anchor the peeled copy of the loop body,
1082     // and a new final  exit block.
1083 
1084     // Peeling the last iteration transforms.
1085     //
1086     // PreHeader:
1087     // ...
1088     // Header:
1089     //   LoopBody
1090     //   If (cond) goto Header
1091     // Exit:
1092     //
1093     // into
1094     //
1095     // Header:
1096     //  LoopBody
1097     //  If (cond) goto Header
1098     // InsertTop:
1099     //   LoopBody
1100     //   If (!cond) goto InsertBot
1101     // InsertBot:
1102     // Exit:
1103     // ...
1104     BasicBlock *Exit = L->getExitBlock();
1105     for (PHINode &P : Exit->phis())
1106       ExitValues[&P] = P.getIncomingValueForBlock(Latch);
1107 
1108     const SCEV *BTC = SE->getBackedgeTakenCount(L);
1109 
1110     InsertTop = SplitEdge(Latch, Exit, &DT, LI);
1111     InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
1112 
1113     InsertTop->setName(Exit->getName() + ".peel.begin");
1114     InsertBot->setName(Exit->getName() + ".peel.next");
1115     NewPreHeader = nullptr;
1116 
1117     // If the original loop may only execute a single iteration we need to
1118     // insert a trip count check and skip the original loop with the last
1119     // iteration peeled off if necessary.
1120     if (!SE->isKnownNonZero(BTC)) {
1121       NewPreHeader = SplitEdge(PreHeader, Header, &DT, LI);
1122       SCEVExpander Expander(*SE, Latch->getDataLayout(), "loop-peel");
1123 
1124       BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
1125       Value *BTCValue =
1126           Expander.expandCodeFor(BTC, BTC->getType(), PreHeaderBR);
1127       IRBuilder<> B(PreHeaderBR);
1128       Value *Cond =
1129           B.CreateICmpNE(BTCValue, ConstantInt::get(BTCValue->getType(), 0));
1130       B.CreateCondBr(Cond, NewPreHeader, InsertTop);
1131       PreHeaderBR->eraseFromParent();
1132 
1133       // PreHeader now dominates InsertTop.
1134       DT.changeImmediateDominator(InsertTop, PreHeader);
1135     }
1136   } else {
1137     // It is convenient to split the preheader into 3 parts - two blocks to
1138     // anchor the peeled copy of the loop body, and a new preheader for the
1139     // "real" loop.
1140 
1141     // Peeling the first iteration transforms.
1142     //
1143     // PreHeader:
1144     // ...
1145     // Header:
1146     //   LoopBody
1147     //   If (cond) goto Header
1148     // Exit:
1149     //
1150     // into
1151     //
1152     // InsertTop:
1153     //   LoopBody
1154     //   If (!cond) goto Exit
1155     // InsertBot:
1156     // NewPreHeader:
1157     // ...
1158     // Header:
1159     //  LoopBody
1160     //  If (cond) goto Header
1161     // Exit:
1162     //
1163     // Each following iteration will split the current bottom anchor in two,
1164     // and put the new copy of the loop body between these two blocks. That
1165     // is, after peeling another iteration from the example above, we'll
1166     // split InsertBot, and get:
1167     //
1168     // InsertTop:
1169     //   LoopBody
1170     //   If (!cond) goto Exit
1171     // InsertBot:
1172     //   LoopBody
1173     //   If (!cond) goto Exit
1174     // InsertBot.next:
1175     // NewPreHeader:
1176     // ...
1177     // Header:
1178     //  LoopBody
1179     //  If (cond) goto Header
1180     // Exit:
1181     //
1182     InsertTop = SplitEdge(PreHeader, Header, &DT, LI);
1183     InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
1184     NewPreHeader = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
1185 
1186     InsertTop->setName(Header->getName() + ".peel.begin");
1187     InsertBot->setName(Header->getName() + ".peel.next");
1188     NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
1189   }
1190 
1191   Instruction *LatchTerm =
1192       cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());
1193 
1194   // If we have branch weight information, we'll want to update it for the
1195   // newly created branches.
1196   DenseMap<Instruction *, WeightInfo> Weights;
1197   initBranchWeights(Weights, L);
1198 
1199   // Identify what noalias metadata is inside the loop: if it is inside the
1200   // loop, the associated metadata must be cloned for each iteration.
1201   SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
1202   identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
1203 
1204   // For each peeled-off iteration, make a copy of the loop.
1205   ValueToValueMapTy VMap;
1206   for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
1207     SmallVector<BasicBlock *, 8> NewBlocks;
1208 
1209     cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot,
1210                     NewPreHeader ? PreHeader : nullptr, ExitEdges, NewBlocks,
1211                     LoopBlocks, VMap, LVMap, &DT, LI,
1212                     LoopLocalNoAliasDeclScopes, *SE);
1213 
1214     // Remap to use values from the current iteration instead of the
1215     // previous one.
1216     remapInstructionsInBlocks(NewBlocks, VMap);
1217 
1218     if (Iter == 0) {
1219       if (PeelLast) {
1220         // Adjust the exit condition so the loop exits one iteration early.
1221         // For now we simply subtract one form the second operand of the
1222         // exit condition. This relies on the peel count computation to
1223         // check that this is actually legal. In particular, it ensures that
1224         // the first operand of the compare is an AddRec with step 1 and we
1225         // execute more than one iteration.
1226         auto *Cmp =
1227             cast<ICmpInst>(L->getLoopLatch()->getTerminator()->getOperand(0));
1228         IRBuilder B(Cmp);
1229         Cmp->setOperand(
1230             1, B.CreateSub(Cmp->getOperand(1),
1231                            ConstantInt::get(Cmp->getOperand(1)->getType(), 1)));
1232       } else {
1233         // Update IDoms of the blocks reachable through exits.
1234         for (auto BBIDom : NonLoopBlocksIDom)
1235           DT.changeImmediateDominator(BBIDom.first,
1236                                       cast<BasicBlock>(LVMap[BBIDom.second]));
1237       }
1238     }
1239 
1240 #ifdef EXPENSIVE_CHECKS
1241     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1242 #endif
1243 
1244     for (auto &[Term, Info] : Weights) {
1245       auto *TermCopy = cast<Instruction>(VMap[Term]);
1246       updateBranchWeights(TermCopy, Info);
1247     }
1248 
1249     // Remove Loop metadata from the latch branch instruction
1250     // because it is not the Loop's latch branch anymore.
1251     auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
1252     LatchTermCopy->setMetadata(LLVMContext::MD_loop, nullptr);
1253 
1254     InsertTop = InsertBot;
1255     InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
1256     InsertBot->setName(Header->getName() + ".peel.next");
1257 
1258     F->splice(InsertTop->getIterator(), F, NewBlocks[0]->getIterator(),
1259               F->end());
1260   }
1261 
1262   if (PeelLast) {
1263     // Now adjust users of the original exit values by replacing them with the
1264     // exit value from the peeled iteration and remove them.
1265     for (const auto &[P, E] : ExitValues) {
1266       Instruction *ExitInst = dyn_cast<Instruction>(E);
1267       if (ExitInst && L->contains(ExitInst))
1268         P->replaceAllUsesWith(&*VMap[ExitInst]);
1269       else
1270         P->replaceAllUsesWith(E);
1271       P->eraseFromParent();
1272     }
1273     formLCSSA(*L, DT, LI, SE);
1274   } else {
1275     // Now adjust the phi nodes in the loop header to get their initial values
1276     // from the last peeled-off iteration instead of the preheader.
1277     for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
1278       PHINode *PHI = cast<PHINode>(I);
1279       Value *NewVal = PHI->getIncomingValueForBlock(Latch);
1280       Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
1281       if (LatchInst && L->contains(LatchInst))
1282         NewVal = LVMap[LatchInst];
1283 
1284       PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
1285     }
1286   }
1287 
1288   for (const auto &[Term, Info] : Weights) {
1289     setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
1290   }
1291 
1292   // Update Metadata for count of peeled off iterations.
1293   unsigned AlreadyPeeled = 0;
1294   if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
1295     AlreadyPeeled = *Peeled;
1296   addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);
1297 
1298   if (Loop *ParentLoop = L->getParentLoop())
1299     L = ParentLoop;
1300 
1301   // We modified the loop, update SE.
1302   SE->forgetTopmostLoop(L);
1303   SE->forgetBlockAndLoopDispositions();
1304 
1305 #ifdef EXPENSIVE_CHECKS
1306   // Finally DomtTree must be correct.
1307   assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1308 #endif
1309 
1310   // FIXME: Incrementally update loop-simplify
1311   simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA);
1312 
1313   NumPeeled++;
1314   NumPeeledEnd += PeelLast;
1315 
1316   return true;
1317 }
1318