1 //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Insert hardware loop intrinsics into loops which are deemed profitable by
10 /// the target, by querying TargetTransformInfo. A hardware loop comprises of
11 /// two intrinsics: one, outside the loop, to set the loop iteration count and
12 /// another, in the exit block, to decrement the counter. The decremented value
13 /// can either be carried through the loop via a phi or handled in some opaque
14 /// way by the target.
15 ///
16 //===----------------------------------------------------------------------===//
17
18 #include "llvm/CodeGen/HardwareLoops.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BranchProbabilityInfo.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/CodeGen/Passes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IRBuilder.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/Value.h"
35 #include "llvm/InitializePasses.h"
36 #include "llvm/Pass.h"
37 #include "llvm/PassRegistry.h"
38 #include "llvm/Support/CommandLine.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Transforms/Utils.h"
41 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
42 #include "llvm/Transforms/Utils/Local.h"
43 #include "llvm/Transforms/Utils/LoopUtils.h"
44 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
45
46 #define DEBUG_TYPE "hardware-loops"
47
48 #define HW_LOOPS_NAME "Hardware Loop Insertion"
49
50 using namespace llvm;
51
52 static cl::opt<bool>
53 ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
54 cl::desc("Force hardware loops intrinsics to be inserted"));
55
56 static cl::opt<bool>
57 ForceHardwareLoopPHI(
58 "force-hardware-loop-phi", cl::Hidden, cl::init(false),
59 cl::desc("Force hardware loop counter to be updated through a phi"));
60
61 static cl::opt<bool>
62 ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
63 cl::desc("Force allowance of nested hardware loops"));
64
65 static cl::opt<unsigned>
66 LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
67 cl::desc("Set the loop decrement value"));
68
69 static cl::opt<unsigned>
70 CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
71 cl::desc("Set the loop counter bitwidth"));
72
73 static cl::opt<bool>
74 ForceGuardLoopEntry(
75 "force-hardware-loop-guard", cl::Hidden, cl::init(false),
76 cl::desc("Force generation of loop guard intrinsic"));
77
78 STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
79
80 #ifndef NDEBUG
debugHWLoopFailure(const StringRef DebugMsg,Instruction * I)81 static void debugHWLoopFailure(const StringRef DebugMsg,
82 Instruction *I) {
83 dbgs() << "HWLoops: " << DebugMsg;
84 if (I)
85 dbgs() << ' ' << *I;
86 else
87 dbgs() << '.';
88 dbgs() << '\n';
89 }
90 #endif
91
92 static OptimizationRemarkAnalysis
createHWLoopAnalysis(StringRef RemarkName,Loop * L,Instruction * I)93 createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
94 Value *CodeRegion = L->getHeader();
95 DebugLoc DL = L->getStartLoc();
96
97 if (I) {
98 CodeRegion = I->getParent();
99 // If there is no debug location attached to the instruction, revert back to
100 // using the loop's.
101 if (I->getDebugLoc())
102 DL = I->getDebugLoc();
103 }
104
105 OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
106 R << "hardware-loop not created: ";
107 return R;
108 }
109
110 namespace {
111
reportHWLoopFailure(const StringRef Msg,const StringRef ORETag,OptimizationRemarkEmitter * ORE,Loop * TheLoop,Instruction * I=nullptr)112 void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
113 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
114 LLVM_DEBUG(debugHWLoopFailure(Msg, I));
115 ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
116 }
117
118 using TTI = TargetTransformInfo;
119
120 class HardwareLoopsLegacy : public FunctionPass {
121 public:
122 static char ID;
123
HardwareLoopsLegacy()124 HardwareLoopsLegacy() : FunctionPass(ID) {
125 initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry());
126 }
127
128 bool runOnFunction(Function &F) override;
129
getAnalysisUsage(AnalysisUsage & AU) const130 void getAnalysisUsage(AnalysisUsage &AU) const override {
131 AU.addRequired<LoopInfoWrapperPass>();
132 AU.addPreserved<LoopInfoWrapperPass>();
133 AU.addRequired<DominatorTreeWrapperPass>();
134 AU.addPreserved<DominatorTreeWrapperPass>();
135 AU.addRequired<ScalarEvolutionWrapperPass>();
136 AU.addPreserved<ScalarEvolutionWrapperPass>();
137 AU.addRequired<AssumptionCacheTracker>();
138 AU.addRequired<TargetTransformInfoWrapperPass>();
139 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
140 AU.addPreserved<BranchProbabilityInfoWrapperPass>();
141 }
142 };
143
144 class HardwareLoopsImpl {
145 public:
HardwareLoopsImpl(ScalarEvolution & SE,LoopInfo & LI,bool PreserveLCSSA,DominatorTree & DT,const DataLayout & DL,const TargetTransformInfo & TTI,TargetLibraryInfo * TLI,AssumptionCache & AC,OptimizationRemarkEmitter * ORE,HardwareLoopOptions & Opts)146 HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA,
147 DominatorTree &DT, const DataLayout &DL,
148 const TargetTransformInfo &TTI, TargetLibraryInfo *TLI,
149 AssumptionCache &AC, OptimizationRemarkEmitter *ORE,
150 HardwareLoopOptions &Opts)
151 : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI),
152 TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }
153
154 bool run(Function &F);
155
156 private:
157 // Try to convert the given Loop into a hardware loop.
158 bool TryConvertLoop(Loop *L, LLVMContext &Ctx);
159
160 // Given that the target believes the loop to be profitable, try to
161 // convert it.
162 bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);
163
164 ScalarEvolution &SE;
165 LoopInfo &LI;
166 bool PreserveLCSSA;
167 DominatorTree &DT;
168 const DataLayout &DL;
169 const TargetTransformInfo &TTI;
170 TargetLibraryInfo *TLI = nullptr;
171 AssumptionCache &AC;
172 OptimizationRemarkEmitter *ORE;
173 HardwareLoopOptions &Opts;
174 bool MadeChange = false;
175 };
176
177 class HardwareLoop {
178 // Expand the trip count scev into a value that we can use.
179 Value *InitLoopCount();
180
181 // Insert the set_loop_iteration intrinsic.
182 Value *InsertIterationSetup(Value *LoopCountInit);
183
184 // Insert the loop_decrement intrinsic.
185 void InsertLoopDec();
186
187 // Insert the loop_decrement_reg intrinsic.
188 Instruction *InsertLoopRegDec(Value *EltsRem);
189
190 // If the target requires the counter value to be updated in the loop,
191 // insert a phi to hold the value. The intended purpose is for use by
192 // loop_decrement_reg.
193 PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
194
195 // Create a new cmp, that checks the returned value of loop_decrement*,
196 // and update the exit branch to use it.
197 void UpdateBranch(Value *EltsRem);
198
199 public:
HardwareLoop(HardwareLoopInfo & Info,ScalarEvolution & SE,const DataLayout & DL,OptimizationRemarkEmitter * ORE,HardwareLoopOptions & Opts)200 HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
201 const DataLayout &DL,
202 OptimizationRemarkEmitter *ORE,
203 HardwareLoopOptions &Opts) :
204 SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()),
205 ExitCount(Info.ExitCount),
206 CountType(Info.CountType),
207 ExitBranch(Info.ExitBranch),
208 LoopDecrement(Info.LoopDecrement),
209 UsePHICounter(Info.CounterInReg),
210 UseLoopGuard(Info.PerformEntryTest) { }
211
212 void Create();
213
214 private:
215 ScalarEvolution &SE;
216 const DataLayout &DL;
217 OptimizationRemarkEmitter *ORE = nullptr;
218 HardwareLoopOptions &Opts;
219 Loop *L = nullptr;
220 Module *M = nullptr;
221 const SCEV *ExitCount = nullptr;
222 Type *CountType = nullptr;
223 BranchInst *ExitBranch = nullptr;
224 Value *LoopDecrement = nullptr;
225 bool UsePHICounter = false;
226 bool UseLoopGuard = false;
227 BasicBlock *BeginBB = nullptr;
228 };
229 }
230
231 char HardwareLoopsLegacy::ID = 0;
232
runOnFunction(Function & F)233 bool HardwareLoopsLegacy::runOnFunction(Function &F) {
234 if (skipFunction(F))
235 return false;
236
237 LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
238
239 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
240 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
241 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
242 auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
243 auto &DL = F.getDataLayout();
244 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
245 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
246 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
247 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
248 bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
249
250 HardwareLoopOptions Opts;
251 if (ForceHardwareLoops.getNumOccurrences())
252 Opts.setForce(ForceHardwareLoops);
253 if (ForceHardwareLoopPHI.getNumOccurrences())
254 Opts.setForcePhi(ForceHardwareLoopPHI);
255 if (ForceNestedLoop.getNumOccurrences())
256 Opts.setForceNested(ForceNestedLoop);
257 if (ForceGuardLoopEntry.getNumOccurrences())
258 Opts.setForceGuard(ForceGuardLoopEntry);
259 if (LoopDecrement.getNumOccurrences())
260 Opts.setDecrement(LoopDecrement);
261 if (CounterBitWidth.getNumOccurrences())
262 Opts.setCounterBitwidth(CounterBitWidth);
263
264 HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE,
265 Opts);
266 return Impl.run(F);
267 }
268
run(Function & F,FunctionAnalysisManager & AM)269 PreservedAnalyses HardwareLoopsPass::run(Function &F,
270 FunctionAnalysisManager &AM) {
271 auto &LI = AM.getResult<LoopAnalysis>(F);
272 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
273 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
274 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
275 auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
276 auto &AC = AM.getResult<AssumptionAnalysis>(F);
277 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
278 auto &DL = F.getDataLayout();
279
280 HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts);
281 bool Changed = Impl.run(F);
282 if (!Changed)
283 return PreservedAnalyses::all();
284
285 PreservedAnalyses PA;
286 PA.preserve<LoopAnalysis>();
287 PA.preserve<ScalarEvolutionAnalysis>();
288 PA.preserve<DominatorTreeAnalysis>();
289 PA.preserve<BranchProbabilityAnalysis>();
290 return PA;
291 }
292
run(Function & F)293 bool HardwareLoopsImpl::run(Function &F) {
294 LLVMContext &Ctx = F.getContext();
295 for (Loop *L : LI)
296 if (L->isOutermost())
297 TryConvertLoop(L, Ctx);
298 return MadeChange;
299 }
300
301 // Return true if the search should stop, which will be when an inner loop is
302 // converted and the parent loop doesn't support containing a hardware loop.
TryConvertLoop(Loop * L,LLVMContext & Ctx)303 bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) {
304 // Process nested loops first.
305 bool AnyChanged = false;
306 for (Loop *SL : *L)
307 AnyChanged |= TryConvertLoop(SL, Ctx);
308 if (AnyChanged) {
309 reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
310 ORE, L);
311 return true; // Stop search.
312 }
313
314 LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
315
316 HardwareLoopInfo HWLoopInfo(L);
317 if (!HWLoopInfo.canAnalyze(LI)) {
318 reportHWLoopFailure("cannot analyze loop, irreducible control flow",
319 "HWLoopCannotAnalyze", ORE, L);
320 return false;
321 }
322
323 if (!Opts.Force &&
324 !TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
325 reportHWLoopFailure("it's not profitable to create a hardware-loop",
326 "HWLoopNotProfitable", ORE, L);
327 return false;
328 }
329
330 // Allow overriding of the counter width and loop decrement value.
331 if (Opts.Bitwidth.has_value()) {
332 HWLoopInfo.CountType = IntegerType::get(Ctx, Opts.Bitwidth.value());
333 }
334
335 if (Opts.Decrement.has_value())
336 HWLoopInfo.LoopDecrement =
337 ConstantInt::get(HWLoopInfo.CountType, Opts.Decrement.value());
338
339 MadeChange |= TryConvertLoop(HWLoopInfo);
340 return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested);
341 }
342
TryConvertLoop(HardwareLoopInfo & HWLoopInfo)343 bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
344
345 Loop *L = HWLoopInfo.L;
346 LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
347
348 if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, Opts.getForceNested(),
349 Opts.getForcePhi())) {
350 // TODO: there can be many reasons a loop is not considered a
351 // candidate, so we should let isHardwareLoopCandidate fill in the
352 // reason and then report a better message here.
353 reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
354 return false;
355 }
356
357 assert(
358 (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
359 "Hardware Loop must have set exit info.");
360
361 BasicBlock *Preheader = L->getLoopPreheader();
362
363 // If we don't have a preheader, then insert one.
364 if (!Preheader)
365 Preheader = InsertPreheaderForLoop(L, &DT, &LI, nullptr, PreserveLCSSA);
366 if (!Preheader)
367 return false;
368
369 HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts);
370 HWLoop.Create();
371 ++NumHWLoops;
372 return true;
373 }
374
Create()375 void HardwareLoop::Create() {
376 LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
377
378 Value *LoopCountInit = InitLoopCount();
379 if (!LoopCountInit) {
380 reportHWLoopFailure("could not safely create a loop count expression",
381 "HWLoopNotSafe", ORE, L);
382 return;
383 }
384
385 Value *Setup = InsertIterationSetup(LoopCountInit);
386
387 if (UsePHICounter || Opts.ForcePhi) {
388 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
389 Value *EltsRem = InsertPHICounter(Setup, LoopDec);
390 LoopDec->setOperand(0, EltsRem);
391 UpdateBranch(LoopDec);
392 } else
393 InsertLoopDec();
394
395 // Run through the basic blocks of the loop and see if any of them have dead
396 // PHIs that can be removed.
397 for (auto *I : L->blocks())
398 DeleteDeadPHIs(I);
399 }
400
CanGenerateTest(Loop * L,Value * Count)401 static bool CanGenerateTest(Loop *L, Value *Count) {
402 BasicBlock *Preheader = L->getLoopPreheader();
403 if (!Preheader->getSinglePredecessor())
404 return false;
405
406 BasicBlock *Pred = Preheader->getSinglePredecessor();
407 if (!isa<BranchInst>(Pred->getTerminator()))
408 return false;
409
410 auto *BI = cast<BranchInst>(Pred->getTerminator());
411 if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
412 return false;
413
414 // Check that the icmp is checking for equality of Count and zero and that
415 // a non-zero value results in entering the loop.
416 auto ICmp = cast<ICmpInst>(BI->getCondition());
417 LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
418 if (!ICmp->isEquality())
419 return false;
420
421 auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
422 if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
423 return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
424 return false;
425 };
426
427 // Check if Count is a zext.
428 Value *CountBefZext =
429 isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
430
431 if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
432 !IsCompareZero(ICmp, CountBefZext, 0) &&
433 !IsCompareZero(ICmp, CountBefZext, 1))
434 return false;
435
436 unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
437 if (BI->getSuccessor(SuccIdx) != Preheader)
438 return false;
439
440 return true;
441 }
442
InitLoopCount()443 Value *HardwareLoop::InitLoopCount() {
444 LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
445 // Can we replace a conditional branch with an intrinsic that sets the
446 // loop counter and tests that is not zero?
447
448 SCEVExpander SCEVE(SE, DL, "loopcnt");
449 if (!ExitCount->getType()->isPointerTy() &&
450 ExitCount->getType() != CountType)
451 ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
452
453 ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
454
455 // If we're trying to use the 'test and set' form of the intrinsic, we need
456 // to replace a conditional branch that is controlling entry to the loop. It
457 // is likely (guaranteed?) that the preheader has an unconditional branch to
458 // the loop header, so also check if it has a single predecessor.
459 if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
460 SE.getZero(ExitCount->getType()))) {
461 LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
462 if (Opts.ForceGuard)
463 UseLoopGuard = true;
464 } else
465 UseLoopGuard = false;
466
467 BasicBlock *BB = L->getLoopPreheader();
468 if (UseLoopGuard && BB->getSinglePredecessor() &&
469 cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
470 BasicBlock *Predecessor = BB->getSinglePredecessor();
471 // If it's not safe to create a while loop then don't force it and create a
472 // do-while loop instead
473 if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
474 UseLoopGuard = false;
475 else
476 BB = Predecessor;
477 }
478
479 if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
480 LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
481 << *ExitCount << "\n");
482 return nullptr;
483 }
484
485 Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
486 BB->getTerminator());
487
488 // FIXME: We've expanded Count where we hope to insert the counter setting
489 // intrinsic. But, in the case of the 'test and set' form, we may fallback to
490 // the just 'set' form and in which case the insertion block is most likely
491 // different. It means there will be instruction(s) in a block that possibly
492 // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue,
493 // but it's doesn't appear to work in all cases.
494
495 UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
496 BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
497 LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
498 << " - Expanded Count in " << BB->getName() << "\n"
499 << " - Will insert set counter intrinsic into: "
500 << BeginBB->getName() << "\n");
501 return Count;
502 }
503
InsertIterationSetup(Value * LoopCountInit)504 Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
505 IRBuilder<> Builder(BeginBB->getTerminator());
506 if (BeginBB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))
507 Builder.setIsFPConstrained(true);
508 Type *Ty = LoopCountInit->getType();
509 bool UsePhi = UsePHICounter || Opts.ForcePhi;
510 Intrinsic::ID ID = UseLoopGuard
511 ? (UsePhi ? Intrinsic::test_start_loop_iterations
512 : Intrinsic::test_set_loop_iterations)
513 : (UsePhi ? Intrinsic::start_loop_iterations
514 : Intrinsic::set_loop_iterations);
515 Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
516 Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
517
518 // Use the return value of the intrinsic to control the entry of the loop.
519 if (UseLoopGuard) {
520 assert((isa<BranchInst>(BeginBB->getTerminator()) &&
521 cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
522 "Expected conditional branch");
523
524 Value *SetCount =
525 UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
526 auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
527 LoopGuard->setCondition(SetCount);
528 if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
529 LoopGuard->swapSuccessors();
530 }
531 LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
532 << "\n");
533 if (UsePhi && UseLoopGuard)
534 LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
535 return !UsePhi ? LoopCountInit : LoopSetup;
536 }
537
InsertLoopDec()538 void HardwareLoop::InsertLoopDec() {
539 IRBuilder<> CondBuilder(ExitBranch);
540 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
541 Attribute::StrictFP))
542 CondBuilder.setIsFPConstrained(true);
543
544 Function *DecFunc =
545 Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
546 LoopDecrement->getType());
547 Value *Ops[] = { LoopDecrement };
548 Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
549 Value *OldCond = ExitBranch->getCondition();
550 ExitBranch->setCondition(NewCond);
551
552 // The false branch must exit the loop.
553 if (!L->contains(ExitBranch->getSuccessor(0)))
554 ExitBranch->swapSuccessors();
555
556 // The old condition may be dead now, and may have even created a dead PHI
557 // (the original induction variable).
558 RecursivelyDeleteTriviallyDeadInstructions(OldCond);
559
560 LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
561 }
562
InsertLoopRegDec(Value * EltsRem)563 Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
564 IRBuilder<> CondBuilder(ExitBranch);
565 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
566 Attribute::StrictFP))
567 CondBuilder.setIsFPConstrained(true);
568
569 Function *DecFunc =
570 Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
571 { EltsRem->getType() });
572 Value *Ops[] = { EltsRem, LoopDecrement };
573 Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
574
575 LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
576 return cast<Instruction>(Call);
577 }
578
InsertPHICounter(Value * NumElts,Value * EltsRem)579 PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
580 BasicBlock *Preheader = L->getLoopPreheader();
581 BasicBlock *Header = L->getHeader();
582 BasicBlock *Latch = ExitBranch->getParent();
583 IRBuilder<> Builder(Header, Header->getFirstNonPHIIt());
584 PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
585 Index->addIncoming(NumElts, Preheader);
586 Index->addIncoming(EltsRem, Latch);
587 LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
588 return Index;
589 }
590
UpdateBranch(Value * EltsRem)591 void HardwareLoop::UpdateBranch(Value *EltsRem) {
592 IRBuilder<> CondBuilder(ExitBranch);
593 Value *NewCond =
594 CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
595 Value *OldCond = ExitBranch->getCondition();
596 ExitBranch->setCondition(NewCond);
597
598 // The false branch must exit the loop.
599 if (!L->contains(ExitBranch->getSuccessor(0)))
600 ExitBranch->swapSuccessors();
601
602 // The old condition may be dead now, and may have even created a dead PHI
603 // (the original induction variable).
604 RecursivelyDeleteTriviallyDeadInstructions(OldCond);
605 }
606
INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy,DEBUG_TYPE,HW_LOOPS_NAME,false,false)607 INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
608 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
609 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
610 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
611 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
612 INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
613
614 FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); }
615