xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp (revision 47e073941f4e7ca6e9bde3fa65abbfcfed6bfa2b)
1  //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  /// \file
9  /// Insert hardware loop intrinsics into loops which are deemed profitable by
10  /// the target, by querying TargetTransformInfo. A hardware loop comprises of
11  /// two intrinsics: one, outside the loop, to set the loop iteration count and
12  /// another, in the exit block, to decrement the counter. The decremented value
13  /// can either be carried through the loop via a phi or handled in some opaque
14  /// way by the target.
15  ///
16  //===----------------------------------------------------------------------===//
17  
18  #include "llvm/CodeGen/HardwareLoops.h"
19  #include "llvm/ADT/Statistic.h"
20  #include "llvm/Analysis/AssumptionCache.h"
21  #include "llvm/Analysis/BranchProbabilityInfo.h"
22  #include "llvm/Analysis/LoopInfo.h"
23  #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24  #include "llvm/Analysis/ScalarEvolution.h"
25  #include "llvm/Analysis/TargetLibraryInfo.h"
26  #include "llvm/Analysis/TargetTransformInfo.h"
27  #include "llvm/CodeGen/Passes.h"
28  #include "llvm/IR/BasicBlock.h"
29  #include "llvm/IR/Constants.h"
30  #include "llvm/IR/Dominators.h"
31  #include "llvm/IR/IRBuilder.h"
32  #include "llvm/IR/Instructions.h"
33  #include "llvm/IR/IntrinsicInst.h"
34  #include "llvm/IR/Value.h"
35  #include "llvm/InitializePasses.h"
36  #include "llvm/Pass.h"
37  #include "llvm/PassRegistry.h"
38  #include "llvm/Support/CommandLine.h"
39  #include "llvm/Support/Debug.h"
40  #include "llvm/Transforms/Utils.h"
41  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
42  #include "llvm/Transforms/Utils/Local.h"
43  #include "llvm/Transforms/Utils/LoopUtils.h"
44  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45  
46  #define DEBUG_TYPE "hardware-loops"
47  
48  #define HW_LOOPS_NAME "Hardware Loop Insertion"
49  
50  using namespace llvm;
51  
// Hidden command-line overrides for this pass. The legacy pass only forwards
// an option into HardwareLoopOptions when it occurred on the command line.

// Force the insertion of hardware loop intrinsics (default: off).
static cl::opt<bool>
ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
                   cl::desc("Force hardware loops intrinsics to be inserted"));

// Force the loop counter to be carried through a phi (default: off).
static cl::opt<bool>
ForceHardwareLoopPHI(
  "force-hardware-loop-phi", cl::Hidden, cl::init(false),
  cl::desc("Force hardware loop counter to be updated through a phi"));

// Force nested hardware loops to be allowed (default: off).
static cl::opt<bool>
ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
                cl::desc("Force allowance of nested hardware loops"));

// Value by which the loop counter is decremented (default: 1).
static cl::opt<unsigned>
LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
            cl::desc("Set the loop decrement value"));

// Bit width of the integer type used for the loop counter (default: 32).
static cl::opt<unsigned>
CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
                cl::desc("Set the loop counter bitwidth"));

// Force the generation of the loop guard intrinsic (default: off).
static cl::opt<bool>
ForceGuardLoopEntry(
  "force-hardware-loop-guard", cl::Hidden, cl::init(false),
  cl::desc("Force generation of loop guard intrinsic"));

// Incremented once for every loop handed to HardwareLoop::Create().
STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
79  
80  #ifndef NDEBUG
81  static void debugHWLoopFailure(const StringRef DebugMsg,
82      Instruction *I) {
83    dbgs() << "HWLoops: " << DebugMsg;
84    if (I)
85      dbgs() << ' ' << *I;
86    else
87      dbgs() << '.';
88    dbgs() << '\n';
89  }
90  #endif
91  
92  static OptimizationRemarkAnalysis
93  createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
94    Value *CodeRegion = L->getHeader();
95    DebugLoc DL = L->getStartLoc();
96  
97    if (I) {
98      CodeRegion = I->getParent();
99      // If there is no debug location attached to the instruction, revert back to
100      // using the loop's.
101      if (I->getDebugLoc())
102        DL = I->getDebugLoc();
103    }
104  
105    OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
106    R << "hardware-loop not created: ";
107    return R;
108  }
109  
110  namespace {
111  
112    void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
113        OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
114      LLVM_DEBUG(debugHWLoopFailure(Msg, I));
115      ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
116    }
117  
  // Shorthand alias for TargetTransformInfo.
  using TTI = TargetTransformInfo;
119  
  // Legacy pass manager entry point for hardware loop insertion.
  // runOnFunction() gathers the analyses and command-line options and hands
  // the actual work to HardwareLoopsImpl.
  class HardwareLoopsLegacy : public FunctionPass {
  public:
    // Pass identification; passed to the FunctionPass constructor below.
    static char ID;

    HardwareLoopsLegacy() : FunctionPass(ID) {
      initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry());
    }

    bool runOnFunction(Function &F) override;

    // Declare the analyses this pass requires and the ones it preserves.
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<LoopInfoWrapperPass>();
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<TargetTransformInfoWrapperPass>();
      AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
      // Branch probability info is only marked preserved, it is not required.
      AU.addPreserved<BranchProbabilityInfoWrapperPass>();
    }
  };
143  
  // Pass-manager independent driver. It holds the analyses and options it is
  // constructed with and walks the loops of a function, trying to convert
  // each of them into a hardware loop.
  class HardwareLoopsImpl {
  public:
    // Stores the given analyses and options; TLI and ORE are passed as
    // pointers, everything else by reference.
    HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA,
                      DominatorTree &DT, const DataLayout &DL,
                      const TargetTransformInfo &TTI, TargetLibraryInfo *TLI,
                      AssumptionCache &AC, OptimizationRemarkEmitter *ORE,
                      HardwareLoopOptions &Opts)
      : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI),
        TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }

    // Process the loops of F; returns MadeChange.
    bool run(Function &F);

  private:
    // Try to convert the given Loop into a hardware loop.
    bool TryConvertLoop(Loop *L, LLVMContext &Ctx);

    // Given that the target believes the loop to be profitable, try to
    // convert it.
    bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);

    ScalarEvolution &SE;
    LoopInfo &LI;
    // Forwarded to InsertPreheaderForLoop when a preheader has to be created.
    bool PreserveLCSSA;
    DominatorTree &DT;
    const DataLayout &DL;
    const TargetTransformInfo &TTI;
    TargetLibraryInfo *TLI = nullptr;
    AssumptionCache &AC;
    OptimizationRemarkEmitter *ORE;
    HardwareLoopOptions &Opts;
    // Set once any loop has been converted; the value returned by run().
    bool MadeChange = false;
  };
176  
  // Performs the IR rewrite for a single loop described by a
  // HardwareLoopInfo. Create() is the only public entry point; the private
  // helpers below each insert one piece of the hardware loop.
  class HardwareLoop {
    // Expand the trip count scev into a value that we can use.
    Value *InitLoopCount();

    // Insert the set_loop_iteration intrinsic.
    Value *InsertIterationSetup(Value *LoopCountInit);

    // Insert the loop_decrement intrinsic.
    void InsertLoopDec();

    // Insert the loop_decrement_reg intrinsic.
    Instruction *InsertLoopRegDec(Value *EltsRem);

    // If the target requires the counter value to be updated in the loop,
    // insert a phi to hold the value. The intended purpose is for use by
    // loop_decrement_reg.
    PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);

    // Create a new cmp, that checks the returned value of loop_decrement*,
    // and update the exit branch to use it.
    void UpdateBranch(Value *EltsRem);

  public:
    // Copies the loop, exit count, counter type, exit branch, decrement and
    // the counter-in-register / entry-test flags out of Info.
    HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
                 const DataLayout &DL,
                 OptimizationRemarkEmitter *ORE,
                 HardwareLoopOptions &Opts) :
      SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()),
      ExitCount(Info.ExitCount),
      CountType(Info.CountType),
      ExitBranch(Info.ExitBranch),
      LoopDecrement(Info.LoopDecrement),
      UsePHICounter(Info.CounterInReg),
      UseLoopGuard(Info.PerformEntryTest) { }

    // Rewrite the loop into a hardware loop.
    void Create();

  private:
    ScalarEvolution &SE;
    const DataLayout &DL;
    OptimizationRemarkEmitter *ORE = nullptr;
    HardwareLoopOptions &Opts;
    // L must stay declared before M: the constructor initialises M from
    // L->getHeader(), and members are initialised in declaration order.
    Loop *L                 = nullptr;
    Module *M               = nullptr;
    const SCEV *ExitCount   = nullptr;
    Type *CountType         = nullptr;
    BranchInst *ExitBranch  = nullptr;
    Value *LoopDecrement    = nullptr;
    bool UsePHICounter      = false;
    bool UseLoopGuard       = false;
    // Block whose terminator the iteration setup intrinsic is inserted
    // before; assigned by InitLoopCount().
    BasicBlock *BeginBB     = nullptr;
  };
229  }
230  
// Definition of the legacy pass identifier handed to FunctionPass(ID).
char HardwareLoopsLegacy::ID = 0;
232  
233  bool HardwareLoopsLegacy::runOnFunction(Function &F) {
234    if (skipFunction(F))
235      return false;
236  
237    LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
238  
239    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
240    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
241    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
242    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
243    auto &DL = F.getParent()->getDataLayout();
244    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
245    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
246    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
247    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
248    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
249  
250    HardwareLoopOptions Opts;
251    if (ForceHardwareLoops.getNumOccurrences())
252      Opts.setForce(ForceHardwareLoops);
253    if (ForceHardwareLoopPHI.getNumOccurrences())
254      Opts.setForcePhi(ForceHardwareLoopPHI);
255    if (ForceNestedLoop.getNumOccurrences())
256      Opts.setForceNested(ForceNestedLoop);
257    if (ForceGuardLoopEntry.getNumOccurrences())
258      Opts.setForceGuard(ForceGuardLoopEntry);
259    if (LoopDecrement.getNumOccurrences())
260      Opts.setDecrement(LoopDecrement);
261    if (CounterBitWidth.getNumOccurrences())
262      Opts.setCounterBitwidth(CounterBitWidth);
263  
264    HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE,
265                           Opts);
266    return Impl.run(F);
267  }
268  
269  PreservedAnalyses HardwareLoopsPass::run(Function &F,
270                                           FunctionAnalysisManager &AM) {
271    auto &LI = AM.getResult<LoopAnalysis>(F);
272    auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
273    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
274    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
275    auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
276    auto &AC = AM.getResult<AssumptionAnalysis>(F);
277    auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
278    auto &DL = F.getParent()->getDataLayout();
279  
280    HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts);
281    bool Changed = Impl.run(F);
282    if (!Changed)
283      return PreservedAnalyses::all();
284  
285    PreservedAnalyses PA;
286    PA.preserve<LoopAnalysis>();
287    PA.preserve<ScalarEvolutionAnalysis>();
288    PA.preserve<DominatorTreeAnalysis>();
289    PA.preserve<BranchProbabilityAnalysis>();
290    return PA;
291  }
292  
293  bool HardwareLoopsImpl::run(Function &F) {
294    LLVMContext &Ctx = F.getParent()->getContext();
295    for (Loop *L : LI)
296      if (L->isOutermost())
297        TryConvertLoop(L, Ctx);
298    return MadeChange;
299  }
300  
301  // Return true if the search should stop, which will be when an inner loop is
302  // converted and the parent loop doesn't support containing a hardware loop.
303  bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) {
304    // Process nested loops first.
305    bool AnyChanged = false;
306    for (Loop *SL : *L)
307      AnyChanged |= TryConvertLoop(SL, Ctx);
308    if (AnyChanged) {
309      reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
310                          ORE, L);
311      return true; // Stop search.
312    }
313  
314    LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
315  
316    HardwareLoopInfo HWLoopInfo(L);
317    if (!HWLoopInfo.canAnalyze(LI)) {
318      reportHWLoopFailure("cannot analyze loop, irreducible control flow",
319                          "HWLoopCannotAnalyze", ORE, L);
320      return false;
321    }
322  
323    if (!Opts.Force &&
324        !TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
325      reportHWLoopFailure("it's not profitable to create a hardware-loop",
326                          "HWLoopNotProfitable", ORE, L);
327      return false;
328    }
329  
330    // Allow overriding of the counter width and loop decrement value.
331    if (Opts.Bitwidth.has_value()) {
332      HWLoopInfo.CountType = IntegerType::get(Ctx, Opts.Bitwidth.value());
333    }
334  
335    if (Opts.Decrement.has_value())
336      HWLoopInfo.LoopDecrement =
337        ConstantInt::get(HWLoopInfo.CountType, Opts.Decrement.value());
338  
339    MadeChange |= TryConvertLoop(HWLoopInfo);
340    return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested);
341  }
342  
343  bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
344  
345    Loop *L = HWLoopInfo.L;
346    LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
347  
348    if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, Opts.getForceNested(),
349                                            Opts.getForcePhi())) {
350      // TODO: there can be many reasons a loop is not considered a
351      // candidate, so we should let isHardwareLoopCandidate fill in the
352      // reason and then report a better message here.
353      reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
354      return false;
355    }
356  
357    assert(
358        (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
359        "Hardware Loop must have set exit info.");
360  
361    BasicBlock *Preheader = L->getLoopPreheader();
362  
363    // If we don't have a preheader, then insert one.
364    if (!Preheader)
365      Preheader = InsertPreheaderForLoop(L, &DT, &LI, nullptr, PreserveLCSSA);
366    if (!Preheader)
367      return false;
368  
369    HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts);
370    HWLoop.Create();
371    ++NumHWLoops;
372    return true;
373  }
374  
375  void HardwareLoop::Create() {
376    LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
377  
378    Value *LoopCountInit = InitLoopCount();
379    if (!LoopCountInit) {
380      reportHWLoopFailure("could not safely create a loop count expression",
381                          "HWLoopNotSafe", ORE, L);
382      return;
383    }
384  
385    Value *Setup = InsertIterationSetup(LoopCountInit);
386  
387    if (UsePHICounter || Opts.ForcePhi) {
388      Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
389      Value *EltsRem = InsertPHICounter(Setup, LoopDec);
390      LoopDec->setOperand(0, EltsRem);
391      UpdateBranch(LoopDec);
392    } else
393      InsertLoopDec();
394  
395    // Run through the basic blocks of the loop and see if any of them have dead
396    // PHIs that can be removed.
397    for (auto *I : L->blocks())
398      DeleteDeadPHIs(I);
399  }
400  
401  static bool CanGenerateTest(Loop *L, Value *Count) {
402    BasicBlock *Preheader = L->getLoopPreheader();
403    if (!Preheader->getSinglePredecessor())
404      return false;
405  
406    BasicBlock *Pred = Preheader->getSinglePredecessor();
407    if (!isa<BranchInst>(Pred->getTerminator()))
408      return false;
409  
410    auto *BI = cast<BranchInst>(Pred->getTerminator());
411    if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
412      return false;
413  
414    // Check that the icmp is checking for equality of Count and zero and that
415    // a non-zero value results in entering the loop.
416    auto ICmp = cast<ICmpInst>(BI->getCondition());
417    LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
418    if (!ICmp->isEquality())
419      return false;
420  
421    auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
422      if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
423        return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
424      return false;
425    };
426  
427    // Check if Count is a zext.
428    Value *CountBefZext =
429        isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
430  
431    if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
432        !IsCompareZero(ICmp, CountBefZext, 0) &&
433        !IsCompareZero(ICmp, CountBefZext, 1))
434      return false;
435  
436    unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
437    if (BI->getSuccessor(SuccIdx) != Preheader)
438      return false;
439  
440    return true;
441  }
442  
443  Value *HardwareLoop::InitLoopCount() {
444    LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
445    // Can we replace a conditional branch with an intrinsic that sets the
446    // loop counter and tests that is not zero?
447  
448    SCEVExpander SCEVE(SE, DL, "loopcnt");
449    if (!ExitCount->getType()->isPointerTy() &&
450        ExitCount->getType() != CountType)
451      ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
452  
453    ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
454  
455    // If we're trying to use the 'test and set' form of the intrinsic, we need
456    // to replace a conditional branch that is controlling entry to the loop. It
457    // is likely (guaranteed?) that the preheader has an unconditional branch to
458    // the loop header, so also check if it has a single predecessor.
459    if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
460                                    SE.getZero(ExitCount->getType()))) {
461      LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
462      if (Opts.ForceGuard)
463        UseLoopGuard = true;
464    } else
465      UseLoopGuard = false;
466  
467    BasicBlock *BB = L->getLoopPreheader();
468    if (UseLoopGuard && BB->getSinglePredecessor() &&
469        cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
470      BasicBlock *Predecessor = BB->getSinglePredecessor();
471      // If it's not safe to create a while loop then don't force it and create a
472      // do-while loop instead
473      if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
474          UseLoopGuard = false;
475      else
476          BB = Predecessor;
477    }
478  
479    if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
480      LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
481                 << *ExitCount << "\n");
482      return nullptr;
483    }
484  
485    Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
486                                       BB->getTerminator());
487  
488    // FIXME: We've expanded Count where we hope to insert the counter setting
489    // intrinsic. But, in the case of the 'test and set' form, we may fallback to
490    // the just 'set' form and in which case the insertion block is most likely
491    // different. It means there will be instruction(s) in a block that possibly
492    // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue,
493    // but it's doesn't appear to work in all cases.
494  
495    UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
496    BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
497    LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
498                      << " - Expanded Count in " << BB->getName() << "\n"
499                      << " - Will insert set counter intrinsic into: "
500                      << BeginBB->getName() << "\n");
501    return Count;
502  }
503  
504  Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
505    IRBuilder<> Builder(BeginBB->getTerminator());
506    Type *Ty = LoopCountInit->getType();
507    bool UsePhi = UsePHICounter || Opts.ForcePhi;
508    Intrinsic::ID ID = UseLoopGuard
509                           ? (UsePhi ? Intrinsic::test_start_loop_iterations
510                                     : Intrinsic::test_set_loop_iterations)
511                           : (UsePhi ? Intrinsic::start_loop_iterations
512                                     : Intrinsic::set_loop_iterations);
513    Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
514    Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
515  
516    // Use the return value of the intrinsic to control the entry of the loop.
517    if (UseLoopGuard) {
518      assert((isa<BranchInst>(BeginBB->getTerminator()) &&
519              cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
520             "Expected conditional branch");
521  
522      Value *SetCount =
523          UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
524      auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
525      LoopGuard->setCondition(SetCount);
526      if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
527        LoopGuard->swapSuccessors();
528    }
529    LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
530                      << "\n");
531    if (UsePhi && UseLoopGuard)
532      LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
533    return !UsePhi ? LoopCountInit : LoopSetup;
534  }
535  
536  void HardwareLoop::InsertLoopDec() {
537    IRBuilder<> CondBuilder(ExitBranch);
538  
539    Function *DecFunc =
540      Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
541                                LoopDecrement->getType());
542    Value *Ops[] = { LoopDecrement };
543    Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
544    Value *OldCond = ExitBranch->getCondition();
545    ExitBranch->setCondition(NewCond);
546  
547    // The false branch must exit the loop.
548    if (!L->contains(ExitBranch->getSuccessor(0)))
549      ExitBranch->swapSuccessors();
550  
551    // The old condition may be dead now, and may have even created a dead PHI
552    // (the original induction variable).
553    RecursivelyDeleteTriviallyDeadInstructions(OldCond);
554  
555    LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
556  }
557  
558  Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
559    IRBuilder<> CondBuilder(ExitBranch);
560  
561    Function *DecFunc =
562        Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
563                                  { EltsRem->getType() });
564    Value *Ops[] = { EltsRem, LoopDecrement };
565    Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
566  
567    LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
568    return cast<Instruction>(Call);
569  }
570  
571  PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
572    BasicBlock *Preheader = L->getLoopPreheader();
573    BasicBlock *Header = L->getHeader();
574    BasicBlock *Latch = ExitBranch->getParent();
575    IRBuilder<> Builder(Header->getFirstNonPHI());
576    PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
577    Index->addIncoming(NumElts, Preheader);
578    Index->addIncoming(EltsRem, Latch);
579    LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
580    return Index;
581  }
582  
583  void HardwareLoop::UpdateBranch(Value *EltsRem) {
584    IRBuilder<> CondBuilder(ExitBranch);
585    Value *NewCond =
586      CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
587    Value *OldCond = ExitBranch->getCondition();
588    ExitBranch->setCondition(NewCond);
589  
590    // The false branch must exit the loop.
591    if (!L->contains(ExitBranch->getSuccessor(0)))
592      ExitBranch->swapSuccessors();
593  
594    // The old condition may be dead now, and may have even created a dead PHI
595    // (the original induction variable).
596    RecursivelyDeleteTriviallyDeadInstructions(OldCond);
597  }
598  
599  INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
600  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
601  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
602  INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
603  INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
604  INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
605  
606  FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); }
607