1 //===-- VPlanConstruction.cpp - Transforms for initial VPlan construction -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements transforms for initial VPlan construction.
11 ///
12 //===----------------------------------------------------------------------===//
13
14 #include "LoopVectorizationPlanner.h"
15 #include "VPlan.h"
16 #include "VPlanCFG.h"
17 #include "VPlanDominatorTree.h"
18 #include "VPlanPatternMatch.h"
19 #include "VPlanTransforms.h"
20 #include "llvm/Analysis/LoopInfo.h"
21 #include "llvm/Analysis/LoopIterator.h"
22 #include "llvm/Analysis/ScalarEvolution.h"
23 #include "llvm/IR/MDBuilder.h"
24
25 #define DEBUG_TYPE "vplan"
26
27 using namespace llvm;
28 using namespace VPlanPatternMatch;
29
namespace {
/// Helper that builds the initial "plain CFG" VPlan for an incoming IR loop
/// nest: a 1-to-1 translation of BasicBlocks/Instructions into
/// VPBasicBlocks/recipes, without any region nodes yet.
class PlainCFGBuilder {
  // The outermost loop of the input loop nest considered for vectorization.
  Loop *TheLoop;

  // Loop Info analysis.
  LoopInfo *LI;

  // Vectorization plan that we are working on.
  std::unique_ptr<VPlan> Plan;

  // Builder of the VPlan instruction-level representation.
  VPBuilder VPIRBuilder;

  // NOTE: The following maps are intentionally destroyed after the plain CFG
  // construction because subsequent VPlan-to-VPlan transformation may
  // invalidate them.
  // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
  DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
  // Map incoming Value definitions to their newly-created VPValues.
  DenseMap<Value *, VPValue *> IRDef2VPValue;

  // Hold phi node's that need to be fixed once the plain CFG has been built.
  SmallVector<PHINode *, 8> PhisToFix;

  // Utility functions.
  // Copy BB's predecessor order onto its VPBasicBlock counterpart.
  void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
  // Add operands to the recipes created for header phis (see PhisToFix).
  void fixHeaderPhis();
  // Translate BB into a VPBasicBlock, creating it on first use.
  VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
#ifndef NDEBUG
  bool isExternalDef(Value *Val);
#endif
  // Translate an instruction operand into a VPValue (live-in or prior def).
  VPValue *getOrCreateVPOperand(Value *IRVal);
  // Translate all instructions of BB into recipes appended to VPBB.
  void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);

public:
  PlainCFGBuilder(Loop *Lp, LoopInfo *LI)
      : TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)) {}

  /// Build plain CFG for TheLoop and connect it to Plan's entry.
  std::unique_ptr<VPlan> buildPlainCFG();
};
} // anonymous namespace
74
75 // Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
76 // must have no predecessors.
setVPBBPredsFromBB(VPBasicBlock * VPBB,BasicBlock * BB)77 void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
78 // Collect VPBB predecessors.
79 SmallVector<VPBlockBase *, 2> VPBBPreds;
80 for (BasicBlock *Pred : predecessors(BB))
81 VPBBPreds.push_back(getOrCreateVPBB(Pred));
82 VPBB->setPredecessors(VPBBPreds);
83 }
84
isHeaderBB(BasicBlock * BB,Loop * L)85 static bool isHeaderBB(BasicBlock *BB, Loop *L) {
86 return L && BB == L->getHeader();
87 }
88
89 // Add operands to VPInstructions representing phi nodes from the input IR.
fixHeaderPhis()90 void PlainCFGBuilder::fixHeaderPhis() {
91 for (auto *Phi : PhisToFix) {
92 assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
93 VPValue *VPVal = IRDef2VPValue[Phi];
94 assert(isa<VPWidenPHIRecipe>(VPVal) &&
95 "Expected WidenPHIRecipe for phi node.");
96 auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal);
97 assert(VPPhi->getNumOperands() == 0 &&
98 "Expected VPInstruction with no operands.");
99 assert(isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent())) &&
100 "Expected Phi in header block.");
101 assert(Phi->getNumOperands() == 2 &&
102 "header phi must have exactly 2 operands");
103 for (BasicBlock *Pred : predecessors(Phi->getParent()))
104 VPPhi->addOperand(
105 getOrCreateVPOperand(Phi->getIncomingValueForBlock(Pred)));
106 }
107 }
108
109 // Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
110 // existing one if it was already created.
getOrCreateVPBB(BasicBlock * BB)111 VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
112 if (auto *VPBB = BB2VPBB.lookup(BB)) {
113 // Retrieve existing VPBB.
114 return VPBB;
115 }
116
117 // Create new VPBB.
118 StringRef Name = BB->getName();
119 LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
120 VPBasicBlock *VPBB = Plan->createVPBasicBlock(Name);
121 BB2VPBB[BB] = VPBB;
122 return VPBB;
123 }
124
125 #ifndef NDEBUG
126 // Return true if \p Val is considered an external definition. An external
127 // definition is either:
128 // 1. A Value that is not an Instruction. This will be refined in the future.
129 // 2. An Instruction that is outside of the IR region represented in VPlan,
130 // i.e., is not part of the loop nest.
isExternalDef(Value * Val)131 bool PlainCFGBuilder::isExternalDef(Value *Val) {
132 // All the Values that are not Instructions are considered external
133 // definitions for now.
134 Instruction *Inst = dyn_cast<Instruction>(Val);
135 if (!Inst)
136 return true;
137
138 // Check whether Instruction definition is in loop body.
139 return !TheLoop->contains(Inst);
140 }
141 #endif
142
143 // Create a new VPValue or retrieve an existing one for the Instruction's
144 // operand \p IRVal. This function must only be used to create/retrieve VPValues
145 // for *Instruction's operands* and not to create regular VPInstruction's. For
146 // the latter, please, look at 'createVPInstructionsForVPBB'.
getOrCreateVPOperand(Value * IRVal)147 VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
148 auto VPValIt = IRDef2VPValue.find(IRVal);
149 if (VPValIt != IRDef2VPValue.end())
150 // Operand has an associated VPInstruction or VPValue that was previously
151 // created.
152 return VPValIt->second;
153
154 // Operand doesn't have a previously created VPInstruction/VPValue. This
155 // means that operand is:
156 // A) a definition external to VPlan,
157 // B) any other Value without specific representation in VPlan.
158 // For now, we use VPValue to represent A and B and classify both as external
159 // definitions. We may introduce specific VPValue subclasses for them in the
160 // future.
161 assert(isExternalDef(IRVal) && "Expected external definition as operand.");
162
163 // A and B: Create VPValue and add it to the pool of external definitions and
164 // to the Value->VPValue map.
165 VPValue *NewVPVal = Plan->getOrAddLiveIn(IRVal);
166 IRDef2VPValue[IRVal] = NewVPVal;
167 return NewVPVal;
168 }
169
// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
// counterpart. This function must be invoked in RPO so that the operands of a
// VPInstruction in \p BB have been visited before (except for Phi nodes).
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
                                                  BasicBlock *BB) {
  VPIRBuilder.setInsertPoint(VPBB);
  // TODO: Model and preserve debug intrinsics in VPlan.
  for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
    Instruction *Inst = &InstRef;

    // There shouldn't be any VPValue for Inst at this point. Otherwise, we
    // visited Inst when we shouldn't, breaking the RPO traversal order.
    assert(!IRDef2VPValue.count(Inst) &&
           "Instruction shouldn't have been visited.");

    if (auto *Br = dyn_cast<BranchInst>(Inst)) {
      // Conditional branch instruction are represented using BranchOnCond
      // recipes. Unconditional branches get no recipe: the control flow is
      // encoded purely by block successors set in buildPlainCFG.
      if (Br->isConditional()) {
        VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst);
      }

      // Skip the rest of the Instruction processing for Branch instructions.
      continue;
    }

    if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
      // Switches are modeled with the condition as first operand followed by
      // one operand per case value; buildPlainCFG sets the matching successor
      // order (default destination first, then the case successors).
      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
      for (auto Case : SI->cases())
        Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
      continue;
    }

    VPSingleDefRecipe *NewR;
    if (auto *Phi = dyn_cast<PHINode>(Inst)) {
      // Phi node's operands may have not been visited at this point. We create
      // an empty VPInstruction that we will fix once the whole plain CFG has
      // been built.
      NewR = new VPWidenPHIRecipe(Phi, nullptr, Phi->getDebugLoc(), "vec.phi");
      VPBB->appendRecipe(NewR);
      if (isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent()))) {
        // Header phis need to be fixed after the VPBB for the latch has been
        // created.
        PhisToFix.push_back(Phi);
      } else {
        // Add operands for VPPhi in the order matching its predecessors in
        // VPlan. The IR phi's incoming order may differ, so map each VPlan
        // predecessor block to its incoming value first.
        DenseMap<const VPBasicBlock *, VPValue *> VPPredToIncomingValue;
        for (unsigned I = 0; I != Phi->getNumOperands(); ++I) {
          VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(I)]] =
              getOrCreateVPOperand(Phi->getIncomingValue(I));
        }
        for (VPBlockBase *Pred : VPBB->getPredecessors())
          NewR->addOperand(
              VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock()));
      }
    } else {
      // Translate LLVM-IR operands into VPValue operands and set them in the
      // new VPInstruction.
      SmallVector<VPValue *, 4> VPOperands;
      for (Value *Op : Inst->operands())
        VPOperands.push_back(getOrCreateVPOperand(Op));

      // Build VPInstruction for any arbitrary Instruction without specific
      // representation in VPlan.
      NewR = cast<VPInstruction>(
          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
    }

    // Record the mapping so later uses of Inst resolve to the new recipe.
    IRDef2VPValue[Inst] = NewR;
  }
}
244
245 // Main interface to build the plain CFG.
buildPlainCFG()246 std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
247 VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Plan->getEntry());
248 BB2VPBB[Entry->getIRBasicBlock()] = Entry;
249 for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
250 BB2VPBB[ExitVPBB->getIRBasicBlock()] = ExitVPBB;
251
252 // 1. Scan the body of the loop in a topological order to visit each basic
253 // block after having visited its predecessor basic blocks. Create a VPBB for
254 // each BB and link it to its successor and predecessor VPBBs. Note that
255 // predecessors must be set in the same order as they are in the incomming IR.
256 // Otherwise, there might be problems with existing phi nodes and algorithm
257 // based on predecessors traversal.
258
259 // Loop PH needs to be explicitly visited since it's not taken into account by
260 // LoopBlocksDFS.
261 BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
262 assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
263 "Unexpected loop preheader");
264 for (auto &I : *ThePreheaderBB) {
265 if (I.getType()->isVoidTy())
266 continue;
267 IRDef2VPValue[&I] = Plan->getOrAddLiveIn(&I);
268 }
269
270 LoopBlocksRPO RPO(TheLoop);
271 RPO.perform(LI);
272
273 for (BasicBlock *BB : RPO) {
274 // Create or retrieve the VPBasicBlock for this BB.
275 VPBasicBlock *VPBB = getOrCreateVPBB(BB);
276 // Set VPBB predecessors in the same order as they are in the incoming BB.
277 setVPBBPredsFromBB(VPBB, BB);
278
279 // Create VPInstructions for BB.
280 createVPInstructionsForVPBB(VPBB, BB);
281
282 // Set VPBB successors. We create empty VPBBs for successors if they don't
283 // exist already. Recipes will be created when the successor is visited
284 // during the RPO traversal.
285 if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
286 SmallVector<VPBlockBase *> Succs = {
287 getOrCreateVPBB(SI->getDefaultDest())};
288 for (auto Case : SI->cases())
289 Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
290 VPBB->setSuccessors(Succs);
291 continue;
292 }
293 auto *BI = cast<BranchInst>(BB->getTerminator());
294 unsigned NumSuccs = succ_size(BB);
295 if (NumSuccs == 1) {
296 VPBB->setOneSuccessor(getOrCreateVPBB(BB->getSingleSuccessor()));
297 continue;
298 }
299 assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() &&
300 "block must have conditional branch with 2 successors");
301
302 BasicBlock *IRSucc0 = BI->getSuccessor(0);
303 BasicBlock *IRSucc1 = BI->getSuccessor(1);
304 VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
305 VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1);
306 VPBB->setTwoSuccessors(Successor0, Successor1);
307 }
308
309 for (auto *EB : Plan->getExitBlocks())
310 setVPBBPredsFromBB(EB, EB->getIRBasicBlock());
311
312 // 2. The whole CFG has been built at this point so all the input Values must
313 // have a VPlan counterpart. Fix VPlan header phi by adding their
314 // corresponding VPlan operands.
315 fixHeaderPhis();
316
317 Plan->getEntry()->setOneSuccessor(getOrCreateVPBB(TheLoop->getHeader()));
318 Plan->getEntry()->setPlan(&*Plan);
319
320 // Fix VPlan loop-closed-ssa exit phi's by adding incoming operands to the
321 // VPIRInstructions wrapping them.
322 // // Note that the operand order corresponds to IR predecessor order, and may
323 // need adjusting when VPlan predecessors are added, if an exit block has
324 // multiple predecessor.
325 for (auto *EB : Plan->getExitBlocks()) {
326 for (VPRecipeBase &R : EB->phis()) {
327 auto *PhiR = cast<VPIRPhi>(&R);
328 PHINode &Phi = PhiR->getIRPhi();
329 assert(PhiR->getNumOperands() == 0 &&
330 "no phi operands should be added yet");
331 for (BasicBlock *Pred : predecessors(EB->getIRBasicBlock()))
332 PhiR->addOperand(
333 getOrCreateVPOperand(Phi.getIncomingValueForBlock(Pred)));
334 }
335 }
336
337 LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
338 return std::move(Plan);
339 }
340
buildPlainCFG(Loop * TheLoop,LoopInfo & LI)341 std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
342 LoopInfo &LI) {
343 PlainCFGBuilder Builder(TheLoop, &LI);
344 return Builder.buildPlainCFG();
345 }
346
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
/// has exactly 2 predecessors (preheader and latch), where the block
/// dominates the latch and the preheader dominates the block. If it is a
/// header block return true and canonicalize the predecessors of the header
/// (making sure the preheader appears first and the latch second) and the
/// successors of the latch (making sure the loop exit comes first). Otherwise
/// return false.
static bool canonicalHeaderAndLatch(VPBlockBase *HeaderVPB,
                                    const VPDominatorTree &VPDT) {
  ArrayRef<VPBlockBase *> Preds = HeaderVPB->getPredecessors();
  // A header in the plain CFG has exactly two predecessors.
  if (Preds.size() != 2)
    return false;

  // Try the current predecessor order as (preheader, latch); if the dominance
  // conditions fail, retry with the roles swapped.
  auto *PreheaderVPBB = Preds[0];
  auto *LatchVPBB = Preds[1];
  if (!VPDT.dominates(PreheaderVPBB, HeaderVPB) ||
      !VPDT.dominates(HeaderVPB, LatchVPBB)) {
    std::swap(PreheaderVPBB, LatchVPBB);

    if (!VPDT.dominates(PreheaderVPBB, HeaderVPB) ||
        !VPDT.dominates(HeaderVPB, LatchVPBB))
      return false;

    // Canonicalize predecessors of header so that preheader is first and
    // latch second. Phi operands are swapped in lock-step to stay consistent
    // with the new predecessor order.
    HeaderVPB->swapPredecessors();
    for (VPRecipeBase &R : cast<VPBasicBlock>(HeaderVPB)->phis())
      R.swapOperands();
  }

  // The two successors of conditional branch match the condition, with the
  // first successor corresponding to true and the second to false. We
  // canonicalize the successors of the latch when introducing the region, such
  // that the latch exits the region when its condition is true; invert the
  // original condition if the original CFG branches to the header on true.
  // Note that the exit edge is not yet connected for top-level loops.
  if (LatchVPBB->getSingleSuccessor() ||
      LatchVPBB->getSuccessors()[0] != HeaderVPB)
    return true;

  assert(LatchVPBB->getNumSuccessors() == 2 && "Must have 2 successors");
  auto *Term = cast<VPBasicBlock>(LatchVPBB)->getTerminator();
  assert(cast<VPInstruction>(Term)->getOpcode() ==
             VPInstruction::BranchOnCond &&
         "terminator must be a BranchOnCond");
  // Negate the branch condition and swap the successors so that "true" leaves
  // the loop and "false" continues to the header.
  auto *Not = new VPInstruction(VPInstruction::Not, {Term->getOperand(0)});
  Not->insertBefore(Term);
  Term->setOperand(0, Not);
  LatchVPBB->swapSuccessors();

  return true;
}
399
/// Create a new VPRegionBlock for the loop starting at \p HeaderVPB.
/// Expects the canonical predecessor order established by
/// canonicalHeaderAndLatch: preheader first, latch second.
static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
  auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0];
  auto *LatchVPBB = HeaderVPB->getPredecessors()[1];

  // Detach the loop-entry and backedge edges into the header; the region node
  // will take over these connections.
  VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPB);
  VPBlockUtils::disconnectBlocks(LatchVPBB, HeaderVPB);
  VPBlockBase *LatchExitVPB = LatchVPBB->getSingleSuccessor();
  assert(LatchExitVPB && "Latch expected to be left with a single successor");

  // Create an empty region first and insert it between PreheaderVPBB and
  // LatchExitVPB, taking care to preserve the original predecessor & successor
  // order of blocks. Set region entry and exiting after both HeaderVPB and
  // LatchVPBB have been disconnected from their predecessors/successors.
  auto *R = Plan.createVPRegionBlock("", false /*isReplicator*/);
  VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
  VPBlockUtils::disconnectBlocks(LatchVPBB, R);
  VPBlockUtils::connectBlocks(PreheaderVPBB, R);
  R->setEntry(HeaderVPB);
  R->setExiting(LatchVPBB);

  // All VPBB's reachable shallowly from HeaderVPB belong to the current region.
  for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPB))
    VPBB->setParent(R);
}
425
// Add the necessary canonical IV and branch recipes required to control the
// loop: a VPCanonicalIVPHIRecipe of type \p IdxTy starting at 0 in
// \p HeaderVPBB, its increment by VF*UF in \p LatchVPBB, and a BranchOnCount
// terminator comparing the incremented IV against the vector trip count.
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                  VPBasicBlock *LatchVPBB, Type *IdxTy,
                                  DebugLoc DL) {
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  auto *StartV = Plan.getOrAddLiveIn(StartIdx);

  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  HeaderVPBB->insert(CanonicalIVPHI, HeaderVPBB->begin());

  // We are about to replace the branch to exit the region. Remove the original
  // BranchOnCond, if there is any.
  if (!LatchVPBB->empty() &&
      match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue())))
    LatchVPBB->getTerminator()->eraseFromParent();

  VPBuilder Builder(LatchVPBB);
  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
  // Initially the induction increment is guaranteed to not wrap, but that may
  // change later, e.g. when tail-folding, when the flags need to be dropped.
  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
      Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {true, false}, DL,
      "index.next");
  // Close the IV cycle: the increment becomes the phi's backedge value.
  CanonicalIVPHI->addOperand(CanonicalIVIncrement);

  // Add the BranchOnCount VPInstruction to the latch.
  Builder.createNaryOp(VPInstruction::BranchOnCount,
                       {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}
457
/// Prepare \p Plan for vectorization: canonicalize the top-level loop header
/// and latch, introduce the vector preheader, middle block and scalar
/// preheader, add the canonical IV recipes, disconnect or fuse early exits,
/// materialize the trip count, and (when needed) add the middle-block runtime
/// check that selects between the exit block and the scalar loop.
void VPlanTransforms::prepareForVectorization(
    VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE,
    bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop,
    DebugLoc IVDL, bool HasUncountableEarlyExit, VFRange &Range) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  // The entry's single successor is the top-level loop header; canonicalize
  // it so its predecessors are (preheader, latch) in that order.
  VPBlockBase *HeaderVPB = Plan.getEntry()->getSingleSuccessor();
  canonicalHeaderAndLatch(HeaderVPB, VPDT);
  VPBlockBase *LatchVPB = HeaderVPB->getPredecessors()[1];

  VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph");
  VPBlockUtils::insertBlockAfter(VecPreheader, Plan.getEntry());

  VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block");
  // The canonical LatchVPB has the header block as last successor. If it has
  // another successor, this successor is an exit block - insert middle block on
  // its edge. Otherwise, add middle block as another successor retaining header
  // as last.
  if (LatchVPB->getNumSuccessors() == 2) {
    VPBlockBase *LatchExitVPB = LatchVPB->getSuccessors()[0];
    VPBlockUtils::insertOnEdge(LatchVPB, LatchExitVPB, MiddleVPBB);
  } else {
    VPBlockUtils::connectBlocks(LatchVPB, MiddleVPBB);
    LatchVPB->swapSuccessors();
  }

  addCanonicalIVRecipes(Plan, cast<VPBasicBlock>(HeaderVPB),
                        cast<VPBasicBlock>(LatchVPB), InductionTy, IVDL);

  [[maybe_unused]] bool HandledUncountableEarlyExit = false;
  // Disconnect all early exits from the loop leaving it with a single exit from
  // the latch. Early exits that are countable are left for a scalar epilog. The
  // condition of uncountable early exits (currently at most one is supported)
  // is fused into the latch exit, and used to branch from middle block to the
  // early exit destination.
  for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
    for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
      if (Pred == MiddleVPBB)
        continue;
      if (HasUncountableEarlyExit) {
        assert(!HandledUncountableEarlyExit &&
               "can handle exactly one uncountable early exit");
        handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
                                   cast<VPBasicBlock>(HeaderVPB),
                                   cast<VPBasicBlock>(LatchVPB), Range);
        HandledUncountableEarlyExit = true;
      } else {
        // Countable early exit: drop the exit phi's incoming value for this
        // edge; the scalar epilogue will take this exit instead.
        for (VPRecipeBase &R : EB->phis())
          cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
      }
      // Remove the early-exiting branch and its edge to the exit block.
      cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
      VPBlockUtils::disconnectBlocks(Pred, EB);
    }
  }

  assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) &&
         "missed an uncountable exit that must be handled");

  // Create SCEV and VPValue for the trip count.
  // We use the symbolic max backedge-taken-count, which works also when
  // vectorizing loops with uncountable early exits.
  const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
         "Invalid loop count");
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
                                                       InductionTy, TheLoop);
  Plan.setTripCount(
      vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE));

  VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph");
  VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());

  // The connection order corresponds to the operands of the conditional branch,
  // with the middle block already connected to the exit block.
  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
  // Also connect the entry block to the scalar preheader.
  // TODO: Also introduce a branch recipe together with the minimum trip count
  // check.
  VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
  Plan.getEntry()->swapSuccessors();

  // If MiddleVPBB has a single successor then the original loop does not exit
  // via the latch and the single successor must be the scalar preheader.
  // There's no need to add a runtime check to MiddleVPBB.
  if (MiddleVPBB->getNumSuccessors() == 1) {
    assert(MiddleVPBB->getSingleSuccessor() == ScalarPH &&
           "must have ScalarPH as single successor");
    return;
  }

  assert(MiddleVPBB->getNumSuccessors() == 2 && "must have 2 successors");

  // Add a check in the middle block to see if we have completed all of the
  // iterations in the first vector loop.
  //
  // Three cases:
  // 1) If we require a scalar epilogue, the scalar ph must execute. Set the
  //    condition to false.
  // 2) If (N - N%VF) == N, then we *don't* need to run the
  //    remainder. Thus if tail is to be folded, we know we don't need to run
  //    the remainder and we can set the condition to true.
  // 3) Otherwise, construct a runtime check.

  // We use the same DebugLoc as the scalar loop latch terminator instead of
  // the corresponding compare because they may have ended up with different
  // line numbers and we want to avoid awkward line stepping while debugging.
  // E.g., if the compare has got a line number inside the loop.
  DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc();
  VPBuilder Builder(MiddleVPBB);
  VPValue *Cmp;
  if (!RequiresScalarEpilogueCheck)
    Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse(
        IntegerType::getInt1Ty(TripCount->getType()->getContext())));
  else if (TailFolded)
    Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue(
        IntegerType::getInt1Ty(TripCount->getType()->getContext())));
  else
    Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
                             &Plan.getVectorTripCount(), LatchDL, "cmp.n");
  Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp}, LatchDL);
}
581
createLoopRegions(VPlan & Plan)582 void VPlanTransforms::createLoopRegions(VPlan &Plan) {
583 VPDominatorTree VPDT;
584 VPDT.recalculate(Plan);
585 for (VPBlockBase *HeaderVPB : vp_post_order_shallow(Plan.getEntry()))
586 if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
587 createLoopRegion(Plan, HeaderVPB);
588
589 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
590 TopRegion->setName("vector loop");
591 TopRegion->getEntryBasicBlock()->setName("vector.body");
592 }
593
// Likelihood of bypassing the vectorized loop due to a runtime check block,
// including memory overlap checks block and wrapping/unit-stride checks block.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};
597
attachCheckBlock(VPlan & Plan,Value * Cond,BasicBlock * CheckBlock,bool AddBranchWeights)598 void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
599 BasicBlock *CheckBlock,
600 bool AddBranchWeights) {
601 VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
602 VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
603 VPBlockBase *VectorPH = Plan.getVectorPreheader();
604 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
605 VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
606 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckBlockVPBB);
607 VPBlockUtils::connectBlocks(CheckBlockVPBB, ScalarPH);
608 CheckBlockVPBB->swapSuccessors();
609
610 // We just connected a new block to the scalar preheader. Update all
611 // VPPhis by adding an incoming value for it, replicating the last value.
612 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
613 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
614 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
615 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
616 "must have incoming values for all operands");
617 R.addOperand(R.getOperand(NumPredecessors - 2));
618 }
619
620 VPIRMetadata VPBranchWeights;
621 auto *Term = VPBuilder(CheckBlockVPBB)
622 .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
623 Plan.getCanonicalIV()->getDebugLoc());
624 if (AddBranchWeights) {
625 MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
626 MDNode *BranchWeights =
627 MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
628 Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
629 }
630 }
631
handleMaxMinNumReductions(VPlan & Plan)632 bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
633 auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
634 auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
635 RedPhiR->getBackedgeValue()->getDefiningRecipe());
636 if (!MinMaxR)
637 return nullptr;
638
639 auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
640 if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
641 !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
642 return nullptr;
643
644 #ifndef NDEBUG
645 Intrinsic::ID RdxIntrinsicId =
646 RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
647 : Intrinsic::minnum;
648 assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
649 cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
650 RdxIntrinsicId) ||
651 (RepR &&
652 cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() ==
653 RdxIntrinsicId) &&
654 "Intrinsic did not match recurrence kind");
655 #endif
656
657 if (MinMaxR->getOperand(0) == RedPhiR)
658 return MinMaxR->getOperand(1);
659
660 assert(MinMaxR->getOperand(1) == RedPhiR &&
661 "Reduction phi operand expected");
662 return MinMaxR->getOperand(0);
663 };
664
665 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
666 VPReductionPHIRecipe *RedPhiR = nullptr;
667 bool HasUnsupportedPhi = false;
668 for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
669 if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
670 continue;
671 auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
672 if (!Cur) {
673 // TODO: Also support fixed-order recurrence phis.
674 HasUnsupportedPhi = true;
675 continue;
676 }
677 // For now, only a single reduction is supported.
678 // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
679 if (RedPhiR)
680 return false;
681 if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
682 Cur->getRecurrenceKind() != RecurKind::FMinNum) {
683 HasUnsupportedPhi = true;
684 continue;
685 }
686 RedPhiR = Cur;
687 }
688
689 if (!RedPhiR)
690 return true;
691
692 // We won't be able to resume execution in the scalar tail, if there are
693 // unsupported header phis or there is no scalar tail at all, due to
694 // tail-folding.
695 if (HasUnsupportedPhi || !Plan.hasScalarTail())
696 return false;
697
698 VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
699 if (!MinMaxOp)
700 return false;
701
702 RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
703 assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
704 "unsupported reduction");
705
706 /// Check if the vector loop of \p Plan can early exit and restart
707 /// execution of last vector iteration in the scalar loop. This requires all
708 /// recipes up to early exit point be side-effect free as they are
709 /// re-executed. Currently we check that the loop is free of any recipe that
710 /// may write to memory. Expected to operate on an early VPlan w/o nested
711 /// regions.
712 for (VPBlockBase *VPB : vp_depth_first_shallow(
713 Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
714 auto *VPBB = cast<VPBasicBlock>(VPB);
715 for (auto &R : *VPBB) {
716 if (R.mayWriteToMemory() &&
717 !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
718 return false;
719 }
720 }
721
722 VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
723 VPBuilder Builder(LatchVPBB->getTerminator());
724 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
725 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
726 "Unexpected terminator");
727 auto *IsLatchExitTaken =
728 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
729 LatchExitingBranch->getOperand(1));
730
731 VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
732 VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
733 auto *AnyExitTaken =
734 Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
735 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
736 LatchExitingBranch->eraseFromParent();
737
738 // If we exit early due to NaNs, compute the final reduction result based on
739 // the reduction phi at the beginning of the last vector iteration.
740 auto *RdxResult = find_singleton<VPSingleDefRecipe>(
741 RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
742 auto *VPI = dyn_cast<VPInstruction>(U);
743 if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
744 return VPI;
745 return nullptr;
746 });
747
748 auto *MiddleVPBB = Plan.getMiddleBlock();
749 Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
750 auto *NewSel =
751 Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
752 RdxResult->setOperand(1, NewSel);
753
754 auto *ScalarPH = Plan.getScalarPreheader();
755 // Update resume phis for inductions in the scalar preheader. If AnyNaN is
756 // true, the resume from the start of the last vector iteration via the
757 // canonical IV, otherwise from the original value.
758 for (auto &R : ScalarPH->phis()) {
759 auto *ResumeR = cast<VPPhi>(&R);
760 VPValue *VecV = ResumeR->getOperand(0);
761 if (VecV == RdxResult)
762 continue;
763 if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
764 if (DerivedIV->getNumUsers() == 1 &&
765 DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
766 auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
767 &Plan.getVectorTripCount());
768 DerivedIV->moveAfter(&*Builder.getInsertPoint());
769 DerivedIV->setOperand(1, NewSel);
770 continue;
771 }
772 }
773 // Bail out and abandon the current, partially modified, VPlan if we
774 // encounter resume phi that cannot be updated yet.
775 if (VecV != &Plan.getVectorTripCount()) {
776 LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
777 "FMaxNum/FMinNum reduction.\n");
778 return false;
779 }
780 auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
781 ResumeR->setOperand(0, NewSel);
782 }
783
784 auto *MiddleTerm = MiddleVPBB->getTerminator();
785 Builder.setInsertPoint(MiddleTerm);
786 VPValue *MiddleCond = MiddleTerm->getOperand(0);
787 VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
788 MiddleTerm->setOperand(0, NewCond);
789 return true;
790 }
791