xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp (revision 1719886f6d08408b834d270c59ffcfd821c8f63a)
1 //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Insert hardware loop intrinsics into loops which are deemed profitable by
/// the target, by querying TargetTransformInfo. A hardware loop comprises
11 /// two intrinsics: one, outside the loop, to set the loop iteration count and
12 /// another, in the exit block, to decrement the counter. The decremented value
13 /// can either be carried through the loop via a phi or handled in some opaque
14 /// way by the target.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/CodeGen/HardwareLoops.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BranchProbabilityInfo.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/CodeGen/Passes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IRBuilder.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/Value.h"
35 #include "llvm/InitializePasses.h"
36 #include "llvm/Pass.h"
37 #include "llvm/PassRegistry.h"
38 #include "llvm/Support/CommandLine.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Transforms/Utils.h"
41 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
42 #include "llvm/Transforms/Utils/Local.h"
43 #include "llvm/Transforms/Utils/LoopUtils.h"
44 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45 
46 #define DEBUG_TYPE "hardware-loops"
47 
48 #define HW_LOOPS_NAME "Hardware Loop Insertion"
49 
50 using namespace llvm;
51 
52 static cl::opt<bool>
53 ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
54                    cl::desc("Force hardware loops intrinsics to be inserted"));
55 
56 static cl::opt<bool>
57 ForceHardwareLoopPHI(
58   "force-hardware-loop-phi", cl::Hidden, cl::init(false),
59   cl::desc("Force hardware loop counter to be updated through a phi"));
60 
61 static cl::opt<bool>
62 ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
63                 cl::desc("Force allowance of nested hardware loops"));
64 
65 static cl::opt<unsigned>
66 LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
67             cl::desc("Set the loop decrement value"));
68 
69 static cl::opt<unsigned>
70 CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
71                 cl::desc("Set the loop counter bitwidth"));
72 
73 static cl::opt<bool>
74 ForceGuardLoopEntry(
75   "force-hardware-loop-guard", cl::Hidden, cl::init(false),
76   cl::desc("Force generation of loop guard intrinsic"));
77 
78 STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
79 
80 #ifndef NDEBUG
81 static void debugHWLoopFailure(const StringRef DebugMsg,
82     Instruction *I) {
83   dbgs() << "HWLoops: " << DebugMsg;
84   if (I)
85     dbgs() << ' ' << *I;
86   else
87     dbgs() << '.';
88   dbgs() << '\n';
89 }
90 #endif
91 
92 static OptimizationRemarkAnalysis
93 createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
94   Value *CodeRegion = L->getHeader();
95   DebugLoc DL = L->getStartLoc();
96 
97   if (I) {
98     CodeRegion = I->getParent();
99     // If there is no debug location attached to the instruction, revert back to
100     // using the loop's.
101     if (I->getDebugLoc())
102       DL = I->getDebugLoc();
103   }
104 
105   OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
106   R << "hardware-loop not created: ";
107   return R;
108 }
109 
110 namespace {
111 
112   void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
113       OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
114     LLVM_DEBUG(debugHWLoopFailure(Msg, I));
115     ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
116   }
117 
118   using TTI = TargetTransformInfo;
119 
120   class HardwareLoopsLegacy : public FunctionPass {
121   public:
122     static char ID;
123 
124     HardwareLoopsLegacy() : FunctionPass(ID) {
125       initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry());
126     }
127 
128     bool runOnFunction(Function &F) override;
129 
130     void getAnalysisUsage(AnalysisUsage &AU) const override {
131       AU.addRequired<LoopInfoWrapperPass>();
132       AU.addPreserved<LoopInfoWrapperPass>();
133       AU.addRequired<DominatorTreeWrapperPass>();
134       AU.addPreserved<DominatorTreeWrapperPass>();
135       AU.addRequired<ScalarEvolutionWrapperPass>();
136       AU.addPreserved<ScalarEvolutionWrapperPass>();
137       AU.addRequired<AssumptionCacheTracker>();
138       AU.addRequired<TargetTransformInfoWrapperPass>();
139       AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
140       AU.addPreserved<BranchProbabilityInfoWrapperPass>();
141     }
142   };
143 
144   class HardwareLoopsImpl {
145   public:
146     HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA,
147                       DominatorTree &DT, const DataLayout &DL,
148                       const TargetTransformInfo &TTI, TargetLibraryInfo *TLI,
149                       AssumptionCache &AC, OptimizationRemarkEmitter *ORE,
150                       HardwareLoopOptions &Opts)
151       : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI),
152         TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }
153 
154     bool run(Function &F);
155 
156   private:
157     // Try to convert the given Loop into a hardware loop.
158     bool TryConvertLoop(Loop *L, LLVMContext &Ctx);
159 
160     // Given that the target believes the loop to be profitable, try to
161     // convert it.
162     bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);
163 
164     ScalarEvolution &SE;
165     LoopInfo &LI;
166     bool PreserveLCSSA;
167     DominatorTree &DT;
168     const DataLayout &DL;
169     const TargetTransformInfo &TTI;
170     TargetLibraryInfo *TLI = nullptr;
171     AssumptionCache &AC;
172     OptimizationRemarkEmitter *ORE;
173     HardwareLoopOptions &Opts;
174     bool MadeChange = false;
175   };
176 
177   class HardwareLoop {
178     // Expand the trip count scev into a value that we can use.
179     Value *InitLoopCount();
180 
181     // Insert the set_loop_iteration intrinsic.
182     Value *InsertIterationSetup(Value *LoopCountInit);
183 
184     // Insert the loop_decrement intrinsic.
185     void InsertLoopDec();
186 
187     // Insert the loop_decrement_reg intrinsic.
188     Instruction *InsertLoopRegDec(Value *EltsRem);
189 
190     // If the target requires the counter value to be updated in the loop,
191     // insert a phi to hold the value. The intended purpose is for use by
192     // loop_decrement_reg.
193     PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
194 
195     // Create a new cmp, that checks the returned value of loop_decrement*,
196     // and update the exit branch to use it.
197     void UpdateBranch(Value *EltsRem);
198 
199   public:
200     HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
201                  const DataLayout &DL,
202                  OptimizationRemarkEmitter *ORE,
203                  HardwareLoopOptions &Opts) :
204       SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()),
205       ExitCount(Info.ExitCount),
206       CountType(Info.CountType),
207       ExitBranch(Info.ExitBranch),
208       LoopDecrement(Info.LoopDecrement),
209       UsePHICounter(Info.CounterInReg),
210       UseLoopGuard(Info.PerformEntryTest) { }
211 
212     void Create();
213 
214   private:
215     ScalarEvolution &SE;
216     const DataLayout &DL;
217     OptimizationRemarkEmitter *ORE = nullptr;
218     HardwareLoopOptions &Opts;
219     Loop *L                 = nullptr;
220     Module *M               = nullptr;
221     const SCEV *ExitCount   = nullptr;
222     Type *CountType         = nullptr;
223     BranchInst *ExitBranch  = nullptr;
224     Value *LoopDecrement    = nullptr;
225     bool UsePHICounter      = false;
226     bool UseLoopGuard       = false;
227     BasicBlock *BeginBB     = nullptr;
228   };
229 }
230 
231 char HardwareLoopsLegacy::ID = 0;
232 
233 bool HardwareLoopsLegacy::runOnFunction(Function &F) {
234   if (skipFunction(F))
235     return false;
236 
237   LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
238 
239   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
240   auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
241   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
242   auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
243   auto &DL = F.getParent()->getDataLayout();
244   auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
245   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
246   auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
247   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
248   bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
249 
250   HardwareLoopOptions Opts;
251   if (ForceHardwareLoops.getNumOccurrences())
252     Opts.setForce(ForceHardwareLoops);
253   if (ForceHardwareLoopPHI.getNumOccurrences())
254     Opts.setForcePhi(ForceHardwareLoopPHI);
255   if (ForceNestedLoop.getNumOccurrences())
256     Opts.setForceNested(ForceNestedLoop);
257   if (ForceGuardLoopEntry.getNumOccurrences())
258     Opts.setForceGuard(ForceGuardLoopEntry);
259   if (LoopDecrement.getNumOccurrences())
260     Opts.setDecrement(LoopDecrement);
261   if (CounterBitWidth.getNumOccurrences())
262     Opts.setCounterBitwidth(CounterBitWidth);
263 
264   HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE,
265                          Opts);
266   return Impl.run(F);
267 }
268 
269 PreservedAnalyses HardwareLoopsPass::run(Function &F,
270                                          FunctionAnalysisManager &AM) {
271   auto &LI = AM.getResult<LoopAnalysis>(F);
272   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
273   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
274   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
275   auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
276   auto &AC = AM.getResult<AssumptionAnalysis>(F);
277   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
278   auto &DL = F.getParent()->getDataLayout();
279 
280   HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts);
281   bool Changed = Impl.run(F);
282   if (!Changed)
283     return PreservedAnalyses::all();
284 
285   PreservedAnalyses PA;
286   PA.preserve<LoopAnalysis>();
287   PA.preserve<ScalarEvolutionAnalysis>();
288   PA.preserve<DominatorTreeAnalysis>();
289   PA.preserve<BranchProbabilityAnalysis>();
290   return PA;
291 }
292 
293 bool HardwareLoopsImpl::run(Function &F) {
294   LLVMContext &Ctx = F.getParent()->getContext();
295   for (Loop *L : LI)
296     if (L->isOutermost())
297       TryConvertLoop(L, Ctx);
298   return MadeChange;
299 }
300 
301 // Return true if the search should stop, which will be when an inner loop is
302 // converted and the parent loop doesn't support containing a hardware loop.
303 bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) {
304   // Process nested loops first.
305   bool AnyChanged = false;
306   for (Loop *SL : *L)
307     AnyChanged |= TryConvertLoop(SL, Ctx);
308   if (AnyChanged) {
309     reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
310                         ORE, L);
311     return true; // Stop search.
312   }
313 
314   LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
315 
316   HardwareLoopInfo HWLoopInfo(L);
317   if (!HWLoopInfo.canAnalyze(LI)) {
318     reportHWLoopFailure("cannot analyze loop, irreducible control flow",
319                         "HWLoopCannotAnalyze", ORE, L);
320     return false;
321   }
322 
323   if (!Opts.Force &&
324       !TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
325     reportHWLoopFailure("it's not profitable to create a hardware-loop",
326                         "HWLoopNotProfitable", ORE, L);
327     return false;
328   }
329 
330   // Allow overriding of the counter width and loop decrement value.
331   if (Opts.Bitwidth.has_value()) {
332     HWLoopInfo.CountType = IntegerType::get(Ctx, Opts.Bitwidth.value());
333   }
334 
335   if (Opts.Decrement.has_value())
336     HWLoopInfo.LoopDecrement =
337       ConstantInt::get(HWLoopInfo.CountType, Opts.Decrement.value());
338 
339   MadeChange |= TryConvertLoop(HWLoopInfo);
340   return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested);
341 }
342 
343 bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
344 
345   Loop *L = HWLoopInfo.L;
346   LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
347 
348   if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, Opts.getForceNested(),
349                                           Opts.getForcePhi())) {
350     // TODO: there can be many reasons a loop is not considered a
351     // candidate, so we should let isHardwareLoopCandidate fill in the
352     // reason and then report a better message here.
353     reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
354     return false;
355   }
356 
357   assert(
358       (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
359       "Hardware Loop must have set exit info.");
360 
361   BasicBlock *Preheader = L->getLoopPreheader();
362 
363   // If we don't have a preheader, then insert one.
364   if (!Preheader)
365     Preheader = InsertPreheaderForLoop(L, &DT, &LI, nullptr, PreserveLCSSA);
366   if (!Preheader)
367     return false;
368 
369   HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts);
370   HWLoop.Create();
371   ++NumHWLoops;
372   return true;
373 }
374 
375 void HardwareLoop::Create() {
376   LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
377 
378   Value *LoopCountInit = InitLoopCount();
379   if (!LoopCountInit) {
380     reportHWLoopFailure("could not safely create a loop count expression",
381                         "HWLoopNotSafe", ORE, L);
382     return;
383   }
384 
385   Value *Setup = InsertIterationSetup(LoopCountInit);
386 
387   if (UsePHICounter || Opts.ForcePhi) {
388     Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
389     Value *EltsRem = InsertPHICounter(Setup, LoopDec);
390     LoopDec->setOperand(0, EltsRem);
391     UpdateBranch(LoopDec);
392   } else
393     InsertLoopDec();
394 
395   // Run through the basic blocks of the loop and see if any of them have dead
396   // PHIs that can be removed.
397   for (auto *I : L->blocks())
398     DeleteDeadPHIs(I);
399 }
400 
401 static bool CanGenerateTest(Loop *L, Value *Count) {
402   BasicBlock *Preheader = L->getLoopPreheader();
403   if (!Preheader->getSinglePredecessor())
404     return false;
405 
406   BasicBlock *Pred = Preheader->getSinglePredecessor();
407   if (!isa<BranchInst>(Pred->getTerminator()))
408     return false;
409 
410   auto *BI = cast<BranchInst>(Pred->getTerminator());
411   if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
412     return false;
413 
414   // Check that the icmp is checking for equality of Count and zero and that
415   // a non-zero value results in entering the loop.
416   auto ICmp = cast<ICmpInst>(BI->getCondition());
417   LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
418   if (!ICmp->isEquality())
419     return false;
420 
421   auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
422     if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
423       return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
424     return false;
425   };
426 
427   // Check if Count is a zext.
428   Value *CountBefZext =
429       isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
430 
431   if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
432       !IsCompareZero(ICmp, CountBefZext, 0) &&
433       !IsCompareZero(ICmp, CountBefZext, 1))
434     return false;
435 
436   unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
437   if (BI->getSuccessor(SuccIdx) != Preheader)
438     return false;
439 
440   return true;
441 }
442 
443 Value *HardwareLoop::InitLoopCount() {
444   LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
445   // Can we replace a conditional branch with an intrinsic that sets the
446   // loop counter and tests that is not zero?
447 
448   SCEVExpander SCEVE(SE, DL, "loopcnt");
449   if (!ExitCount->getType()->isPointerTy() &&
450       ExitCount->getType() != CountType)
451     ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
452 
453   ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
454 
455   // If we're trying to use the 'test and set' form of the intrinsic, we need
456   // to replace a conditional branch that is controlling entry to the loop. It
457   // is likely (guaranteed?) that the preheader has an unconditional branch to
458   // the loop header, so also check if it has a single predecessor.
459   if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
460                                   SE.getZero(ExitCount->getType()))) {
461     LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
462     if (Opts.ForceGuard)
463       UseLoopGuard = true;
464   } else
465     UseLoopGuard = false;
466 
467   BasicBlock *BB = L->getLoopPreheader();
468   if (UseLoopGuard && BB->getSinglePredecessor() &&
469       cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
470     BasicBlock *Predecessor = BB->getSinglePredecessor();
471     // If it's not safe to create a while loop then don't force it and create a
472     // do-while loop instead
473     if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
474         UseLoopGuard = false;
475     else
476         BB = Predecessor;
477   }
478 
479   if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
480     LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
481                << *ExitCount << "\n");
482     return nullptr;
483   }
484 
485   Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
486                                      BB->getTerminator());
487 
488   // FIXME: We've expanded Count where we hope to insert the counter setting
489   // intrinsic. But, in the case of the 'test and set' form, we may fallback to
490   // the just 'set' form and in which case the insertion block is most likely
491   // different. It means there will be instruction(s) in a block that possibly
492   // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue,
493   // but it's doesn't appear to work in all cases.
494 
495   UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
496   BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
497   LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
498                     << " - Expanded Count in " << BB->getName() << "\n"
499                     << " - Will insert set counter intrinsic into: "
500                     << BeginBB->getName() << "\n");
501   return Count;
502 }
503 
504 Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
505   IRBuilder<> Builder(BeginBB->getTerminator());
506   Type *Ty = LoopCountInit->getType();
507   bool UsePhi = UsePHICounter || Opts.ForcePhi;
508   Intrinsic::ID ID = UseLoopGuard
509                          ? (UsePhi ? Intrinsic::test_start_loop_iterations
510                                    : Intrinsic::test_set_loop_iterations)
511                          : (UsePhi ? Intrinsic::start_loop_iterations
512                                    : Intrinsic::set_loop_iterations);
513   Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
514   Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
515 
516   // Use the return value of the intrinsic to control the entry of the loop.
517   if (UseLoopGuard) {
518     assert((isa<BranchInst>(BeginBB->getTerminator()) &&
519             cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
520            "Expected conditional branch");
521 
522     Value *SetCount =
523         UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
524     auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
525     LoopGuard->setCondition(SetCount);
526     if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
527       LoopGuard->swapSuccessors();
528   }
529   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
530                     << "\n");
531   if (UsePhi && UseLoopGuard)
532     LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
533   return !UsePhi ? LoopCountInit : LoopSetup;
534 }
535 
536 void HardwareLoop::InsertLoopDec() {
537   IRBuilder<> CondBuilder(ExitBranch);
538 
539   Function *DecFunc =
540     Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
541                               LoopDecrement->getType());
542   Value *Ops[] = { LoopDecrement };
543   Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
544   Value *OldCond = ExitBranch->getCondition();
545   ExitBranch->setCondition(NewCond);
546 
547   // The false branch must exit the loop.
548   if (!L->contains(ExitBranch->getSuccessor(0)))
549     ExitBranch->swapSuccessors();
550 
551   // The old condition may be dead now, and may have even created a dead PHI
552   // (the original induction variable).
553   RecursivelyDeleteTriviallyDeadInstructions(OldCond);
554 
555   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
556 }
557 
558 Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
559   IRBuilder<> CondBuilder(ExitBranch);
560 
561   Function *DecFunc =
562       Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
563                                 { EltsRem->getType() });
564   Value *Ops[] = { EltsRem, LoopDecrement };
565   Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
566 
567   LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
568   return cast<Instruction>(Call);
569 }
570 
571 PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
572   BasicBlock *Preheader = L->getLoopPreheader();
573   BasicBlock *Header = L->getHeader();
574   BasicBlock *Latch = ExitBranch->getParent();
575   IRBuilder<> Builder(Header->getFirstNonPHI());
576   PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
577   Index->addIncoming(NumElts, Preheader);
578   Index->addIncoming(EltsRem, Latch);
579   LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
580   return Index;
581 }
582 
583 void HardwareLoop::UpdateBranch(Value *EltsRem) {
584   IRBuilder<> CondBuilder(ExitBranch);
585   Value *NewCond =
586     CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
587   Value *OldCond = ExitBranch->getCondition();
588   ExitBranch->setCondition(NewCond);
589 
590   // The false branch must exit the loop.
591   if (!L->contains(ExitBranch->getSuccessor(0)))
592     ExitBranch->swapSuccessors();
593 
594   // The old condition may be dead now, and may have even created a dead PHI
595   // (the original induction variable).
596   RecursivelyDeleteTriviallyDeadInstructions(OldCond);
597 }
598 
599 INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
600 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
601 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
602 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
603 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
604 INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
605 
606 FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); }
607