//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
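  // Conservatively assume that all other recipes may write to memory.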
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
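  // By default, use the last lane of the final unrolled part; for exit values
  // that are uniform after vectorization, the first lane suffices.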
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

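    // llvm.get.active.lane.mask(base, n) computes a <VF x i1> mask whose lane
    // i is true when base + i < n (unsigned comparison).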
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
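    // Compute TC > Step ? TC - Step : 0, where Step = VF * UF.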
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
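  // Generate one value per unroll part; opcodes without a result are executed
  // only for their side effects.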
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(CI.getDebugLoc());

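  // A valid VectorIntrinsicID selects a vector intrinsic; otherwise the call
  // is widened using the vector library function Variant.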
  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers.
      Value *Arg;
      if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
          (UseIntrinsic &&
           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
        Arg = State.get(I.value(), VPIteration(0, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
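/// For example, for VF = 4, Val = <a, a, a, a>, StartIdx = 0 and Step = s, the
/// result is <a, a + s, a + 2 * s, a + 3 * s>; lane i receives
/// Val[i] + (StartIdx + i) * Step.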
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps; Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

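    // Each lane receives BaseIV + (Part * VF + Lane) * Step, with AddOp/MulOp
    // selecting integer or FP arithmetic as determined above.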
    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getModule()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // and which are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
1368   for (unsigned In = 0; In < NumIncoming; ++In) {
1369     for (unsigned Part = 0; Part < State.UF; ++Part) {
1370       // We might have single-edge PHIs (blocks with a single predecessor) -
1371       // use an identity 'select' for the first PHI operand.
1372       Value *In0 = State.get(getIncomingValue(In), Part);
1373       if (In == 0)
1374         Entry[Part] = In0; // Initialize with the first incoming value.
1375       else {
1376         // Select between the current value and the previous incoming edge
1377         // based on the incoming mask.
1378         Value *Cond = State.get(getMask(In), Part);
1379         Entry[Part] =
1380             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1381       }
1382     }
1383   }
1384   for (unsigned Part = 0; Part < State.UF; ++Part)
1385     State.set(this, Entry[Part], Part);
1386 }
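
// For example (a sketch with three incoming values, per unroll part):
//   %sel     = select <mask1>, <in1>, <in0>
//   %predphi = select <mask2>, <in2>, %sel
// i.e. later incoming values take priority, and In0 supplies the fallback
// lanes.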
1387 
1388 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1389 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1390                           VPSlotTracker &SlotTracker) const {
1391   O << Indent << "BLEND ";
1392   printAsOperand(O, SlotTracker);
1393   O << " =";
1394   if (getNumIncomingValues() == 1) {
1395     // Not a User of any mask: not really blending, this is a
1396     // single-predecessor phi.
1397     O << " ";
1398     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1399   } else {
1400     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1401       O << " ";
1402       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1403       O << "/";
1404       getMask(I)->printAsOperand(O, SlotTracker);
1405     }
1406   }
1407 }
1408 
1409 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1410                               VPSlotTracker &SlotTracker) const {
1411   O << Indent << "REDUCE ";
1412   printAsOperand(O, SlotTracker);
1413   O << " = ";
1414   getChainOp()->printAsOperand(O, SlotTracker);
1415   O << " +";
1416   if (isa<FPMathOperator>(getUnderlyingInstr()))
1417     O << getUnderlyingInstr()->getFastMathFlags();
1418   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1419   getVecOp()->printAsOperand(O, SlotTracker);
1420   if (getCondOp()) {
1421     O << ", ";
1422     getCondOp()->printAsOperand(O, SlotTracker);
1423   }
1424   O << ")";
1425   if (RdxDesc.IntermediateStore)
1426     O << " (with final reduction value stored in invariant address sank "
1427          "outside of loop)";
1428 }
1429 #endif
1430 
1431 bool VPReplicateRecipe::shouldPack() const {
1432   // Find if the recipe is used by a widened recipe via an intervening
1433   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1434   return any_of(users(), [](const VPUser *U) {
1435     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1436       return any_of(PredR->users(), [PredR](const VPUser *U) {
1437         return !U->usesScalars(PredR);
1438       });
1439     return false;
1440   });
1441 }
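
// E.g. (a sketch in VPlan terms): if REPLICATE %r feeds
// PHI-PREDICATED-INSTRUCTION %p, and %p is in turn consumed by a WIDEN recipe
// that uses %p as a vector, the per-lane values of %r must be packed into a
// vector.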
1442 
1443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1444 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1445                               VPSlotTracker &SlotTracker) const {
1446   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1447 
1448   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1449     printAsOperand(O, SlotTracker);
1450     O << " = ";
1451   }
1452   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1453     O << "call";
1454     printFlags(O);
1455     O << "@" << CB->getCalledFunction()->getName() << "(";
1456     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1457                     O, [&O, &SlotTracker](VPValue *Op) {
1458                       Op->printAsOperand(O, SlotTracker);
1459                     });
1460     O << ")";
1461   } else {
1462     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1463     printFlags(O);
1464     printOperands(O, SlotTracker);
1465   }
1466 
1467   if (shouldPack())
1468     O << " (S->V)";
1469 }
1470 #endif
1471 
1472 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1473   assert(State.Instance && "Branch on Mask works only on a single instance.");
1474 
1475   unsigned Part = State.Instance->Part;
1476   unsigned Lane = State.Instance->Lane.getKnownLane();
1477 
1478   Value *ConditionBit = nullptr;
1479   VPValue *BlockInMask = getMask();
1480   if (BlockInMask) {
1481     ConditionBit = State.get(BlockInMask, Part);
1482     if (ConditionBit->getType()->isVectorTy())
1483       ConditionBit = State.Builder.CreateExtractElement(
1484           ConditionBit, State.Builder.getInt32(Lane));
1485   } else // Block in mask is all-one.
1486     ConditionBit = State.Builder.getTrue();
1487 
1488   // Replace the temporary unreachable terminator with a new conditional branch,
1489   // whose two destinations will be set later when they are created.
1490   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1491   assert(isa<UnreachableInst>(CurrentTerminator) &&
1492          "Expected to replace unreachable terminator with conditional branch.");
1493   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1494   CondBr->setSuccessor(0, nullptr);
1495   ReplaceInstWithInst(CurrentTerminator, CondBr);
1496 }
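
// The emitted IR is roughly (a sketch, for a vector mask and lane L):
//   %cond = extractelement <VF x i1> %mask, i32 L
//   br i1 %cond, label <null>, label <null>
// where both successors start out null and are wired up once the target
// blocks have been created.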
1497 
1498 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1499   assert(State.Instance && "Predicated instruction PHI works per instance.");
1500   Instruction *ScalarPredInst =
1501       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1502   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1503   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1504   assert(PredicatingBB && "Predicated block has no single predecessor.");
1505   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1506          "operand must be VPReplicateRecipe");
1507 
1508   // By current pack/unpack logic we need to generate only a single phi node: if
1509   // a vector value for the predicated instruction exists at this point it means
1510   // the instruction has vector users only, and a phi for the vector value is
1511   // needed. In this case the recipe of the predicated instruction is marked to
1512   // also do that packing, thereby "hoisting" the insert-element sequence.
1513   // Otherwise, a phi node for the scalar value is needed.
1514   unsigned Part = State.Instance->Part;
1515   if (State.hasVectorValue(getOperand(0), Part)) {
1516     Value *VectorValue = State.get(getOperand(0), Part);
1517     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1518     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1519     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1520     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1521     if (State.hasVectorValue(this, Part))
1522       State.reset(this, VPhi, Part);
1523     else
1524       State.set(this, VPhi, Part);
1525     // NOTE: Currently we need to update the value of the operand, so the next
1526     // predicated iteration inserts its generated value in the correct vector.
1527     State.reset(getOperand(0), VPhi, Part);
1528   } else {
1529     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1530     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1531     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1532                      PredicatingBB);
1533     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1534     if (State.hasScalarValue(this, *State.Instance))
1535       State.reset(this, Phi, *State.Instance);
1536     else
1537       State.set(this, Phi, *State.Instance);
1538     // NOTE: Currently we need to update the value of the operand, so the next
1539     // predicated iteration inserts its generated value in the correct vector.
1540     State.reset(getOperand(0), Phi, *State.Instance);
1541   }
1542 }
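
// In the vector-value case the generated IR is roughly (a sketch):
//   %vphi = phi <VF x T> [ <vector before insertelement>, %predicating.bb ],
//                        [ %insertelement, %predicated.bb ]
// so lanes whose predicate was false keep the previous vector contents, while
// the active lane picks up the newly inserted element.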
1543 
1544 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1545 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1546                                 VPSlotTracker &SlotTracker) const {
1547   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1548   printAsOperand(O, SlotTracker);
1549   O << " = ";
1550   printOperands(O, SlotTracker);
1551 }
1552 
1553 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
1554                                            VPSlotTracker &SlotTracker) const {
1555   O << Indent << "WIDEN ";
1556 
1557   if (!isStore()) {
1558     getVPSingleValue()->printAsOperand(O, SlotTracker);
1559     O << " = ";
1560   }
1561   O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
1562 
1563   printOperands(O, SlotTracker);
1564 }
1565 #endif
1566 
1567 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1568   Value *Start = getStartValue()->getLiveInIRValue();
1569   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1570   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1571 
1572   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1573   EntryPart->addIncoming(Start, VectorPH);
1574   EntryPart->setDebugLoc(getDebugLoc());
1575   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1576     State.set(this, EntryPart, Part);
1577 }
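
// This creates only the header phi (a sketch, assuming an i64 trip count):
//   %index = phi i64 [ %start, %vector.ph ]
// The backedge value is added later, once the canonical IV increment has been
// generated.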
1578 
1579 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1580 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1581                                    VPSlotTracker &SlotTracker) const {
1582   O << Indent << "EMIT ";
1583   printAsOperand(O, SlotTracker);
1584   O << " = CANONICAL-INDUCTION ";
1585   printOperands(O, SlotTracker);
1586 }
1587 #endif
1588 
1589 bool VPCanonicalIVPHIRecipe::isCanonical(
1590     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
1591     Type *Ty) const {
1592   // The types must match and it must be an integer induction.
1593   if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
1594     return false;
1595   // Start must match the start value of this canonical induction.
1596   if (Start != getStartValue())
1597     return false;
1598 
1599   // If the step is defined by a recipe, it is not a ConstantInt.
1600   if (Step->getDefiningRecipe())
1601     return false;
1602 
1603   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1604   return StepC && StepC->isOne();
1605 }
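
// E.g., an integer induction with the same type and start value whose step is
// a live-in constant 1 is canonical; a step of 2, a step defined by another
// recipe, or a narrower/wider type is not.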
1606 
1607 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1608   return IsScalarAfterVectorization &&
1609          (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1610 }
1611 
1612 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1613 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1614                                           VPSlotTracker &SlotTracker) const {
1615   O << Indent << "EMIT ";
1616   printAsOperand(O, SlotTracker);
1617   O << " = WIDEN-POINTER-INDUCTION ";
1618   getStartValue()->printAsOperand(O, SlotTracker);
1619   O << ", " << *IndDesc.getStep();
1620 }
1621 #endif
1622 
1623 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1624   assert(!State.Instance && "cannot be used in per-lane");
1625   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1626   SCEVExpander Exp(SE, DL, "induction");
1627 
1628   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1629                                  &*State.Builder.GetInsertPoint());
1630   assert(!State.ExpandedSCEVs.contains(Expr) &&
1631          "Same SCEV expanded multiple times");
1632   State.ExpandedSCEVs[Expr] = Res;
1633   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1634     State.set(this, Res, {Part, 0});
1635 }
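
// For instance (a sketch), expanding a SCEV such as (4 * (%n /u 4)) emits the
// corresponding scalar computation at the insert point; the single expanded
// value is recorded once and then reused for all unroll parts.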
1636 
1637 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1638 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1639                                VPSlotTracker &SlotTracker) const {
1640   O << Indent << "EMIT ";
1641   getVPSingleValue()->printAsOperand(O, SlotTracker);
1642   O << " = EXPAND SCEV " << *Expr;
1643 }
1644 #endif
1645 
1646 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1647   Value *CanonicalIV = State.get(getOperand(0), 0);
1648   Type *STy = CanonicalIV->getType();
1649   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1650   ElementCount VF = State.VF;
1651   Value *VStart = VF.isScalar()
1652                       ? CanonicalIV
1653                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1654   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1655     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1656     if (VF.isVector()) {
1657       VStep = Builder.CreateVectorSplat(VF, VStep);
1658       VStep =
1659           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1660     }
1661     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1662     State.set(this, CanonicalVectorIV, Part);
1663   }
1664 }
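
// Worked example (fixed VF=4, UF=2): part 0 computes
//   %vec.iv = broadcast(%iv) + <i64 0, 1, 2, 3>
// and part 1 computes broadcast(%iv) + <i64 4, 5, 6, 7>, i.e. each part holds
// the canonical IV values of its own VF-wide chunk.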
1665 
1666 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1667 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1668                                      VPSlotTracker &SlotTracker) const {
1669   O << Indent << "EMIT ";
1670   printAsOperand(O, SlotTracker);
1671   O << " = WIDEN-CANONICAL-INDUCTION ";
1672   printOperands(O, SlotTracker);
1673 }
1674 #endif
1675 
1676 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1677   auto &Builder = State.Builder;
1678   // Create a vector from the initial value.
1679   auto *VectorInit = getStartValue()->getLiveInIRValue();
1680 
1681   Type *VecTy = State.VF.isScalar()
1682                     ? VectorInit->getType()
1683                     : VectorType::get(VectorInit->getType(), State.VF);
1684 
1685   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1686   if (State.VF.isVector()) {
1687     auto *IdxTy = Builder.getInt32Ty();
1688     auto *One = ConstantInt::get(IdxTy, 1);
1689     IRBuilder<>::InsertPointGuard Guard(Builder);
1690     Builder.SetInsertPoint(VectorPH->getTerminator());
1691     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1692     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1693     VectorInit = Builder.CreateInsertElement(
1694         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1695   }
1696 
1697   // Create a phi node for the new recurrence.
1698   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
1699   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1700   EntryPart->addIncoming(VectorInit, VectorPH);
1701   State.set(this, EntryPart, 0);
1702 }
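
// E.g., for a fixed VF=4 this emits (a sketch, with T the recurrence type):
//   %vector.recur.init = insertelement <4 x T> poison, T %init, i32 3
//   %vector.recur = phi <4 x T> [ %vector.recur.init, %vector.ph ], ...
// placing the scalar initial value in the last lane of the first vector.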
1703 
1704 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1705 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1706                                             VPSlotTracker &SlotTracker) const {
1707   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1708   printAsOperand(O, SlotTracker);
1709   O << " = phi ";
1710   printOperands(O, SlotTracker);
1711 }
1712 #endif
1713 
1714 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1715   PHINode *PN = cast<PHINode>(getUnderlyingValue());
1716   auto &Builder = State.Builder;
1717 
1718   // In order to support recurrences we need to be able to vectorize Phi nodes.
1719   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1720   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1721   // this value when we vectorize all of the instructions that use the PHI.
1722   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1723   Type *VecTy =
1724       ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
1725 
1726   BasicBlock *HeaderBB = State.CFG.PrevBB;
1727   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1728          "recipe must be in the vector loop header");
1729   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1730   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1731     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
1732     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
1733     State.set(this, EntryPart, Part);
1734   }
1735 
1736   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1737 
1738   // Reductions do not have to start at zero. They can start with
1739   // any loop invariant values.
1740   VPValue *StartVPV = getStartValue();
1741   Value *StartV = StartVPV->getLiveInIRValue();
1742 
1743   Value *Iden = nullptr;
1744   RecurKind RK = RdxDesc.getRecurrenceKind();
1745   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1746       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
1747     // MinMax and AnyOf reductions have the start value as their identity.
1748     if (ScalarPHI) {
1749       Iden = StartV;
1750     } else {
1751       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1752       Builder.SetInsertPoint(VectorPH->getTerminator());
1753       StartV = Iden =
1754           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1755     }
1756   } else {
1757     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1758                                          RdxDesc.getFastMathFlags());
1759 
1760     if (!ScalarPHI) {
1761       Iden = Builder.CreateVectorSplat(State.VF, Iden);
1762       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1763       Builder.SetInsertPoint(VectorPH->getTerminator());
1764       Constant *Zero = Builder.getInt32(0);
1765       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1766     }
1767   }
1768 
1769   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1770     Value *EntryPart = State.get(this, Part);
1771     // Make sure to add the reduction start value only to the
1772     // first unroll part.
1773     Value *StartVal = (Part == 0) ? StartV : Iden;
1774     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1775   }
1776 }
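
// Worked example (integer add reduction, fixed VF=4, UF=2, not in-loop): the
// identity is zero, so part 0's vec.phi starts at <%start, 0, 0, 0> (start
// value inserted into lane 0 of the identity splat) while part 1's starts at
// the plain identity <0, 0, 0, 0>.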
1777 
1778 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1779 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1780                                  VPSlotTracker &SlotTracker) const {
1781   O << Indent << "WIDEN-REDUCTION-PHI ";
1782 
1783   printAsOperand(O, SlotTracker);
1784   O << " = phi ";
1785   printOperands(O, SlotTracker);
1786 }
1787 #endif
1788 
1789 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1790   assert(EnableVPlanNativePath &&
1791          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1792 
1793   Value *Op0 = State.get(getOperand(0), 0);
1794   Type *VecTy = Op0->getType();
1795   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1796   State.set(this, VecPhi, 0);
1797 }
1798 
1799 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1800 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1801                              VPSlotTracker &SlotTracker) const {
1802   O << Indent << "WIDEN-PHI ";
1803 
1804   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1805   // Unless all incoming values are modeled in VPlan, print the original PHI
1806   // directly.
1807   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1808   // values as VPValues.
1809   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1810     O << VPlanIngredient(OriginalPhi);
1811     return;
1812   }
1813 
1814   printAsOperand(O, SlotTracker);
1815   O << " = phi ";
1816   printOperands(O, SlotTracker);
1817 }
1818 #endif
1819 
1820 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1821 // remove VPActiveLaneMaskPHIRecipe.
1822 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1823   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1824   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1825     Value *StartMask = State.get(getOperand(0), Part);
1826     PHINode *EntryPart =
1827         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1828     EntryPart->addIncoming(StartMask, VectorPH);
1829     EntryPart->setDebugLoc(getDebugLoc());
1830     State.set(this, EntryPart, Part);
1831   }
1832 }
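
// Per part this emits only the header phi (a sketch, for a scalable VF=4):
//   %active.lane.mask = phi <vscale x 4 x i1> [ %entry.mask, %vector.ph ]
// with the backedge value supplied later by the active-lane-mask update.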
1833 
1834 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1835 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1836                                       VPSlotTracker &SlotTracker) const {
1837   O << Indent << "ACTIVE-LANE-MASK-PHI ";
1838 
1839   printAsOperand(O, SlotTracker);
1840   O << " = phi ";
1841   printOperands(O, SlotTracker);
1842 }
1843 #endif
1844