xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Analysis/IVDescriptors.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/IRBuilder.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/IR/Value.h"
25 #include "llvm/Support/Casting.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
30 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
31 #include <cassert>
32 
33 using namespace llvm;
34 
35 using VectorParts = SmallVector<Value *, 2>;
36 
37 namespace llvm {
38 extern cl::opt<bool> EnableVPlanNativePath;
39 }
40 
41 #define LV_NAME "loop-vectorize"
42 #define DEBUG_TYPE LV_NAME
43 
// Returns true if this recipe may write to memory. Classification is done per
// recipe kind; unknown kinds (the default case) conservatively return true.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    // Widened loads/stores: only stores write to memory.
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // Replicated instructions and widened calls defer to the underlying IR
    // instruction's own memory behavior.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    // Pure control-flow / scalar-computation recipes never write.
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These kinds never write to memory. In asserts builds, cross-check
    // against the underlying IR instruction, if there is one.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    // Conservatively assume any other recipe may write to memory.
    return true;
  }
}
77 
// Returns true if this recipe may read from memory. Mirrors
// mayWriteToMemory(): per-kind classification, conservative default.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    // Widened loads/stores: only loads (i.e. non-stores) read from memory.
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    // Defer to the underlying IR instruction's own memory behavior.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    // Pure control-flow / scalar-computation recipes never read.
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These kinds never read from memory. In asserts builds, cross-check
    // against the underlying IR instruction, if there is one.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // Conservatively assume any other recipe may read from memory.
    return true;
  }
}
111 
// Returns true if this recipe may have side effects (beyond producing its
// result values). Used e.g. to decide whether a recipe can be dropped or
// sunk. Unknown kinds conservatively return true.
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPWidenCallSC:
    // Calls defer to the underlying IR call's own side-effect information.
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    // These kinds are side-effect free. In asserts builds, cross-check
    // against the underlying IR instruction, if there is one.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPWidenMemoryInstructionSC:
    // For memory recipes, having side effects is equivalent to writing to
    // memory; assert the ingredient agrees with that equivalence.
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    // Replicated recipes defer to their underlying scalar instruction.
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    // Conservatively assume any other recipe has side effects.
    return true;
  }
}
153 
154 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
155   auto Lane = VPLane::getLastLaneForVF(State.VF);
156   VPValue *ExitValue = getOperand(0);
157   if (vputils::isUniformAfterVectorization(ExitValue))
158     Lane = VPLane::getFirstLane();
159   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
160                    State.Builder.GetInsertBlock());
161 }
162 
163 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this live-out as "Live-out <phi> = <operand>\n" for VPlan dumps.
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
171 #endif
172 
// Insert this (currently unlinked) recipe immediately before InsertPos,
// adopting InsertPos's parent block.
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}
180 
// Insert this (currently unlinked) recipe into BB at iterator position I
// (which may be BB.end() to append).
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}
188 
// Insert this (currently unlinked) recipe immediately after InsertPos,
// adopting InsertPos's parent block.
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}
196 
// Unlink this recipe from its parent block without deleting it; the recipe
// can be re-inserted elsewhere afterwards.
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
202 
// Unlink this recipe from its parent block and delete it; returns an
// iterator to the next recipe in the block.
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
207 
// Move this recipe so it is placed immediately after InsertPos (possibly in
// a different block).
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
212 
// Move this recipe into BB at iterator position I (possibly a different
// block than the current parent).
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
218 
// Emit the IR for this VPInstruction for a single unroll part and return the
// generated value (or the branch for the branch opcodes, or nullptr for
// parts that generate nothing). Native LLVM binary opcodes are widened
// directly; VPlan-specific opcodes are handled case by case below.
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(DL);

  // All IR binary opcodes are handled uniformly: widen both operands.
  if (Instruction::isBinaryOp(getOpcode())) {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    // Bitwise/logical negation of the (widened) operand.
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case VPInstruction::ICmpULE: {
    // Unsigned less-or-equal compare, e.g. IV <= trip-count for masks.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    return Builder.CreateICmpULE(IV, TC, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // Emit llvm.get.active.lane.mask(VIVElem0, ScalarTC) producing a <VF x i1>
    // predicate for this part.
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    // Scalar values (VF == 1 uniform case) need no splice.
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    // Compute max(TC - VF * UF, 0) without underflow: select 0 when
    // TC <= VF * UF.
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrement:
  case VPInstruction::CanonicalIVIncrementNUW: {
    // The increment is only materialized once (for Part 0); the remaining
    // parts reuse that value.
    if (Part == 0) {
      bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
      auto *Phi = State.get(getOperand(0), 0);
      // The loop step is equal to the vectorization factor (num of SIMD
      // elements) times the unroll factor (num of SIMD instructions).
      Value *Step =
          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
      return Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
    }
    return State.get(this, 0);
  }

  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::CanonicalIVIncrementForPartNUW: {
    bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    // Part 0 uses the IV unmodified; part P adds VF * P.
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, IsNUW, false);
  }
  case VPInstruction::BranchOnCond: {
    // Terminators are only emitted once, for Part 0.
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // Terminators are only emitted once, for Part 0.
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
363 
// Generate the IR for this VPInstruction: one value per unroll part, stored
// into State so dependent recipes can retrieve them.
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  // Temporarily install this instruction's fast-math flags on the builder;
  // the guard restores the previous flags on scope exit.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(FMF);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    // Opcodes without a result (e.g. branches) are emitted for their side
    // effects only.
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}
376 
377 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug helper: print this VPInstruction to dbgs() with fresh slot numbers.
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}
382 
// Print this VPInstruction in VPlan-dump form: "EMIT [<res> =] <opcode>
// <operands> [, !dbg <loc>]". VPlan-specific opcodes get symbolic names;
// everything else falls back to the IR opcode name.
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ICmpULE:
    O << "icmp ule";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::CanonicalIVIncrement:
    O << "VF * UF + ";
    break;
  case VPInstruction::CanonicalIVIncrementNUW:
    O << "VF * UF +(nuw) ";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part + ";
    break;
  case VPInstruction::CanonicalIVIncrementForPartNUW:
    O << "VF * Part +(nuw) ";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count ";
    break;
  default:
    // Plain IR opcode: print its standard name.
    O << Instruction::getOpcodeName(getOpcode());
  }

  O << FMF;

  for (const VPValue *Operand : operands()) {
    O << " ";
    Operand->printAsOperand(O, SlotTracker);
  }

  // Append the debug location, if any.
  if (DL) {
    O << ", !dbg ";
    DL.print(O);
  }
}
448 #endif
449 
// Set the fast-math flags used when this VPInstruction is lowered to IR.
// Only valid for floating-point opcodes, enforced by the assert below.
void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
  // Make sure the VPInstruction is a floating-point operation.
  assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
          Opcode == Instruction::FCmp) &&
         "this op can't take fast-math flags");
  FMF = FMFNew;
}
459 
460 void VPWidenCallRecipe::execute(VPTransformState &State) {
461   assert(State.VF.isVector() && "not widening");
462   auto &CI = *cast<CallInst>(getUnderlyingInstr());
463   assert(!isa<DbgInfoIntrinsic>(CI) &&
464          "DbgInfoIntrinsic should have been dropped during VPlan construction");
465   State.setDebugLocFromInst(&CI);
466 
467   for (unsigned Part = 0; Part < State.UF; ++Part) {
468     SmallVector<Type *, 2> TysForDecl;
469     // Add return type if intrinsic is overloaded on it.
470     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) {
471       TysForDecl.push_back(
472           VectorType::get(CI.getType()->getScalarType(), State.VF));
473     }
474     SmallVector<Value *, 4> Args;
475     for (const auto &I : enumerate(operands())) {
476       // Some intrinsics have a scalar argument - don't replace it with a
477       // vector.
478       Value *Arg;
479       if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
480           !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
481         Arg = State.get(I.value(), Part);
482       else
483         Arg = State.get(I.value(), VPIteration(0, 0));
484       if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
485         TysForDecl.push_back(Arg->getType());
486       Args.push_back(Arg);
487     }
488 
489     Function *VectorF;
490     if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
491       // Use vector version of the intrinsic.
492       Module *M = State.Builder.GetInsertBlock()->getModule();
493       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
494       assert(VectorF && "Can't retrieve vector intrinsic.");
495     } else {
496 #ifndef NDEBUG
497       assert(Variant != nullptr && "Can't create vector function.");
498 #endif
499       VectorF = Variant;
500     }
501 
502     SmallVector<OperandBundleDef, 1> OpBundles;
503     CI.getOperandBundlesAsDefs(OpBundles);
504     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
505 
506     if (isa<FPMathOperator>(V))
507       V->copyFastMathFlags(&CI);
508 
509     State.set(this, V, Part);
510     State.addMetadata(V, &CI);
511   }
512 }
513 
514 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
515 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
516                               VPSlotTracker &SlotTracker) const {
517   O << Indent << "WIDEN-CALL ";
518 
519   auto *CI = cast<CallInst>(getUnderlyingInstr());
520   if (CI->getType()->isVoidTy())
521     O << "void ";
522   else {
523     printAsOperand(O, SlotTracker);
524     O << " = ";
525   }
526 
527   O << "call @" << CI->getCalledFunction()->getName() << "(";
528   printOperands(O, SlotTracker);
529   O << ")";
530 
531   if (VectorIntrinsicID)
532     O << " (using vector intrinsic)";
533   else {
534     O << " (using library function";
535     if (Variant->hasName())
536       O << ": " << Variant->getName();
537     O << ")";
538   }
539 }
540 
// Print this recipe as "WIDEN-SELECT <res> = select <cond>, <true>, <false>"
// with a trailing note when the condition is loop invariant.
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
553 #endif
554 
555 void VPWidenSelectRecipe::execute(VPTransformState &State) {
556   auto &I = *cast<SelectInst>(getUnderlyingInstr());
557   State.setDebugLocFromInst(&I);
558 
559   // The condition can be loop invariant but still defined inside the
560   // loop. This means that we can't just use the original 'cond' value.
561   // We have to take the 'vectorized' value and pick the first lane.
562   // Instcombine will make this a no-op.
563   auto *InvarCond =
564       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
565 
566   for (unsigned Part = 0; Part < State.UF; ++Part) {
567     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
568     Value *Op0 = State.get(getOperand(1), Part);
569     Value *Op1 = State.get(getOperand(2), Part);
570     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
571     State.set(this, Sel, Part);
572     State.addMetadata(Sel, &I);
573   }
574 }
575 
576 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the IR flags carried by this recipe (exact/nuw/nsw/fast-math/
// inbounds) in IR-like textual form, followed by a trailing space.
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::Other:
    break;
  }
  // Trailing separator before the operand list that follows the flags.
  O << " ";
}
601 #endif
602 
// Widen the underlying instruction: emit one vector operation per unroll
// part. Handles unary/binary arithmetic, freeze, and compares; opcodes with
// dedicated recipes (call, GEP, select, ...) must never reach this function.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // Transfer nuw/nsw/exact/FMF flags when an actual instruction was
      // created (constant folding may yield a non-instruction).
      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Freeze: {
    State.setDebugLocFromInst(&I);

    // Widen freeze by freezing each part's vector operand.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, &I);
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
693 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "WIDEN <res> = <opcode><flags> [<pred> ]<operands>".
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  const Instruction *UI = getUnderlyingInstr();
  O << " = " << UI->getOpcodeName();
  printFlags(O);
  // Compares additionally print their predicate.
  if (auto *Cmp = dyn_cast<CmpInst>(UI))
    O << Cmp->getPredicate() << " ";
  printOperands(O, SlotTracker);
}
705 #endif
706 
707 void VPWidenCastRecipe::execute(VPTransformState &State) {
708   auto *I = cast_or_null<Instruction>(getUnderlyingValue());
709   if (I)
710     State.setDebugLocFromInst(I);
711   auto &Builder = State.Builder;
712   /// Vectorize casts.
713   assert(State.VF.isVector() && "Not vectorizing?");
714   Type *DestTy = VectorType::get(getResultType(), State.VF);
715 
716   for (unsigned Part = 0; Part < State.UF; ++Part) {
717     Value *A = State.get(getOperand(0), Part);
718     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
719     State.set(this, Cast, Part);
720     State.addMetadata(Cast, I);
721   }
722 }
723 
724 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "WIDEN-CAST <res> = <opcode> <operands> to <type>".
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
733 
// Print this recipe as "WIDEN-INDUCTION ..." — the escaped "\l" sequences
// produce line breaks when the dump is rendered as a DOT graph label.
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    // Truncated induction: show the original IV and the truncated result.
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
748 #endif
749 
750 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
751   // The step may be defined by a recipe in the preheader (e.g. if it requires
752   // SCEV expansion), but for the canonical induction the step is required to be
753   // 1, which is represented as live-in.
754   if (getStepValue()->getDefiningRecipe())
755     return false;
756   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
757   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
758   return StartC && StartC->isZero() && StepC && StepC->isOne();
759 }
760 
761 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "<res> = DERIVED-IV <start> + <canonical-iv> * <step>"
// with an optional truncation note.
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (IndDesc.getStep()->getType() != ResultTy)
    O << " (truncated to " << *ResultTy << ")";
}
776 #endif
777 
778 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print this recipe as "<res> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
786 #endif
787 
// Widen the underlying GEP into a vector-of-pointers GEP, one per unroll
// part. Loop-invariant operands stay scalar; a fully invariant GEP is
// computed once as a scalar and splatted.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    // Build the scalar GEP once, then splat it for every part.
    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
859 
860 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
861 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
862                              VPSlotTracker &SlotTracker) const {
863   O << Indent << "WIDEN-GEP ";
864   O << (isPointerLoopInvariant() ? "Inv" : "Var");
865   for (size_t I = 0; I < getNumOperands() - 1; ++I)
866     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
867 
868   O << " ";
869   printAsOperand(O, SlotTracker);
870   O << " = getelementptr";
871   printFlags(O);
872   printOperands(O, SlotTracker);
873 }
874 #endif
875 
876 void VPBlendRecipe::execute(VPTransformState &State) {
877   State.setDebugLocFromInst(Phi);
878   // We know that all PHIs in non-header blocks are converted into
879   // selects, so we don't have to worry about the insertion order and we
880   // can just use the builder.
881   // At this point we generate the predication tree. There may be
882   // duplications since this is a simple recursive scan, but future
883   // optimizations will clean it up.
884 
885   unsigned NumIncoming = getNumIncomingValues();
886 
887   // Generate a sequence of selects of the form:
888   // SELECT(Mask3, In3,
889   //        SELECT(Mask2, In2,
890   //               SELECT(Mask1, In1,
891   //                      In0)))
892   // Note that Mask0 is never used: lanes for which no path reaches this phi and
893   // are essentially undef are taken from In0.
894  VectorParts Entry(State.UF);
895   for (unsigned In = 0; In < NumIncoming; ++In) {
896     for (unsigned Part = 0; Part < State.UF; ++Part) {
897       // We might have single edge PHIs (blocks) - use an identity
898       // 'select' for the first PHI operand.
899       Value *In0 = State.get(getIncomingValue(In), Part);
900       if (In == 0)
901         Entry[Part] = In0; // Initialize with the first incoming value.
902       else {
903         // Select between the current value and the previous incoming edge
904         // based on the incoming mask.
905         Value *Cond = State.get(getMask(In), Part);
906         Entry[Part] =
907             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
908       }
909     }
910   }
911   for (unsigned Part = 0; Part < State.UF; ++Part)
912     State.set(this, Entry[Part], Part);
913 }
914 
915 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
916 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
917                           VPSlotTracker &SlotTracker) const {
918   O << Indent << "BLEND ";
919   Phi->printAsOperand(O, false);
920   O << " =";
921   if (getNumIncomingValues() == 1) {
922     // Not a User of any mask: not really blending, this is a
923     // single-predecessor phi.
924     O << " ";
925     getIncomingValue(0)->printAsOperand(O, SlotTracker);
926   } else {
927     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
928       O << " ";
929       getIncomingValue(I)->printAsOperand(O, SlotTracker);
930       O << "/";
931       getMask(I)->printAsOperand(O, SlotTracker);
932     }
933   }
934 }
935 
936 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
937                               VPSlotTracker &SlotTracker) const {
938   O << Indent << "REDUCE ";
939   printAsOperand(O, SlotTracker);
940   O << " = ";
941   getChainOp()->printAsOperand(O, SlotTracker);
942   O << " +";
943   if (isa<FPMathOperator>(getUnderlyingInstr()))
944     O << getUnderlyingInstr()->getFastMathFlags();
945   O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
946   getVecOp()->printAsOperand(O, SlotTracker);
947   if (getCondOp()) {
948     O << ", ";
949     getCondOp()->printAsOperand(O, SlotTracker);
950   }
951   O << ")";
952   if (RdxDesc->IntermediateStore)
953     O << " (with final reduction value stored in invariant address sank "
954          "outside of loop)";
955 }
956 #endif
957 
958 bool VPReplicateRecipe::shouldPack() const {
959   // Find if the recipe is used by a widened recipe via an intervening
960   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
961   return any_of(users(), [](const VPUser *U) {
962     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
963       return any_of(PredR->users(), [PredR](const VPUser *U) {
964         return !U->usesScalars(PredR);
965       });
966     return false;
967   });
968 }
969 
970 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
971 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
972                               VPSlotTracker &SlotTracker) const {
973   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
974 
975   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
976     printAsOperand(O, SlotTracker);
977     O << " = ";
978   }
979   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
980     O << "call";
981     printFlags(O);
982     O << "@" << CB->getCalledFunction()->getName() << "(";
983     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
984                     O, [&O, &SlotTracker](VPValue *Op) {
985                       Op->printAsOperand(O, SlotTracker);
986                     });
987     O << ")";
988   } else {
989     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
990     printFlags(O);
991     printOperands(O, SlotTracker);
992   }
993 
994   if (shouldPack())
995     O << " (S->V)";
996 }
997 #endif
998 
999 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1000   assert(State.Instance && "Branch on Mask works only on single instance.");
1001 
1002   unsigned Part = State.Instance->Part;
1003   unsigned Lane = State.Instance->Lane.getKnownLane();
1004 
1005   Value *ConditionBit = nullptr;
1006   VPValue *BlockInMask = getMask();
1007   if (BlockInMask) {
1008     ConditionBit = State.get(BlockInMask, Part);
1009     if (ConditionBit->getType()->isVectorTy())
1010       ConditionBit = State.Builder.CreateExtractElement(
1011           ConditionBit, State.Builder.getInt32(Lane));
1012   } else // Block in mask is all-one.
1013     ConditionBit = State.Builder.getTrue();
1014 
1015   // Replace the temporary unreachable terminator with a new conditional branch,
1016   // whose two destinations will be set later when they are created.
1017   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1018   assert(isa<UnreachableInst>(CurrentTerminator) &&
1019          "Expected to replace unreachable terminator with conditional branch.");
1020   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1021   CondBr->setSuccessor(0, nullptr);
1022   ReplaceInstWithInst(CurrentTerminator, CondBr);
1023 }
1024 
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    // Vector case: phi between the unmodified vector (edge from the
    // predicating block) and the vector with the newly inserted element (edge
    // from the predicated block).
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    // Record the phi, replacing any value already produced for this part.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    // Scalar case: lanes that did not execute the predicated block get poison.
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    // Record the phi, replacing any scalar value already produced for this
    // instance.
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
1070 
1071 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  // Print as "PHI-PREDICATED-INSTRUCTION <result> = <operands>".
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
1079 
void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  // Only non-stores print a result operand; stores print just the opcode and
  // operands.
  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
1092 #endif
1093 
1094 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1095   Value *Start = getStartValue()->getLiveInIRValue();
1096   PHINode *EntryPart = PHINode::Create(
1097       Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
1098 
1099   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1100   EntryPart->addIncoming(Start, VectorPH);
1101   EntryPart->setDebugLoc(DL);
1102   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1103     State.set(this, EntryPart, Part);
1104 }
1105 
1106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Print as "EMIT <result> = CANONICAL-INDUCTION"; no operands are printed.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION";
}
1113 #endif
1114 
1115 bool VPCanonicalIVPHIRecipe::isCanonical(
1116     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
1117     Type *Ty) const {
1118   // The types must match and it must be an integer induction.
1119   if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
1120     return false;
1121   // Start must match the start value of this canonical induction.
1122   if (Start != getStartValue())
1123     return false;
1124 
1125   // If the step is defined by a recipe, it is not a ConstantInt.
1126   if (Step->getDefiningRecipe())
1127     return false;
1128 
1129   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1130   return StepC && StepC->isOne();
1131 }
1132 
1133 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1134   return IsScalarAfterVectorization &&
1135          (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1136 }
1137 
1138 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  // Print the start value (a VPValue) and the step taken from the induction
  // descriptor.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
1147 #endif
1148 
1149 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1150   assert(!State.Instance && "cannot be used in per-lane");
1151   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1152   SCEVExpander Exp(SE, DL, "induction");
1153 
1154   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1155                                  &*State.Builder.GetInsertPoint());
1156   assert(!State.ExpandedSCEVs.contains(Expr) &&
1157          "Same SCEV expanded multiple times");
1158   State.ExpandedSCEVs[Expr] = Res;
1159   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1160     State.set(this, Res, {Part, 0});
1161 }
1162 
1163 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  // Print as "EMIT <result> = EXPAND SCEV <expression>".
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
1170 #endif
1171 
1172 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1173   Value *CanonicalIV = State.get(getOperand(0), 0);
1174   Type *STy = CanonicalIV->getType();
1175   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1176   ElementCount VF = State.VF;
1177   Value *VStart = VF.isScalar()
1178                       ? CanonicalIV
1179                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1180   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1181     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1182     if (VF.isVector()) {
1183       VStep = Builder.CreateVectorSplat(VF, VStep);
1184       VStep =
1185           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1186     }
1187     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1188     State.set(this, CanonicalVectorIV, Part);
1189   }
1190 }
1191 
1192 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Print as "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
1200 #endif
1201 
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    // Insert the start value into the last lane of a poison vector. The
    // insert-element is emitted in the preheader, before the vector loop; the
    // last lane index (RuntimeVF - 1) is computed at runtime so this also
    // works for scalable vectors.
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(
      VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}
1229 
1230 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  // Print as "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
1238 #endif
1239 
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  // Ordered reductions use only a single phi; otherwise one phi is created
  // per unroll part.
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart =
        PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // Compute the identity element (Iden) of the recurrence and, when widening,
  // a vectorized start value; any splat/insert code is emitted in the
  // preheader.
  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
    // MinMax reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      // The vector start value is the identity splat with the scalar start
      // value inserted into lane 0.
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}
1303 
1304 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  // Print as "WIDEN-REDUCTION-PHI <result> = phi <operands>".
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
1313 #endif
1314 
1315 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1316   assert(EnableVPlanNativePath &&
1317          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1318 
1319   // Currently we enter here in the VPlan-native path for non-induction
1320   // PHIs where all control flow is uniform. We simply widen these PHIs.
1321   // Create a vector phi with no operands - the vector phi operands will be
1322   // set at the end of vector code generation.
1323   VPBasicBlock *Parent = getParent();
1324   VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
1325   unsigned StartIdx = 0;
1326   // For phis in header blocks of loop regions, use the index of the value
1327   // coming from the preheader.
1328   if (LoopRegion->getEntryBasicBlock() == Parent) {
1329     for (unsigned I = 0; I < getNumOperands(); ++I) {
1330       if (getIncomingBlock(I) ==
1331           LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
1332         StartIdx = I;
1333     }
1334   }
1335   Value *Op0 = State.get(getOperand(StartIdx), 0);
1336   Type *VecTy = Op0->getType();
1337   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1338   State.set(this, VecPhi, 0);
1339 }
1340 
1341 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
1360 #endif
1361 
1362 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1363 // remove VPActiveLaneMaskPHIRecipe.
1364 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1365   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1366   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1367     Value *StartMask = State.get(getOperand(0), Part);
1368     PHINode *EntryPart =
1369         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1370     EntryPart->addIncoming(StartMask, VectorPH);
1371     EntryPart->setDebugLoc(DL);
1372     State.set(this, EntryPart, Part);
1373   }
1374 }
1375 
1376 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Print as "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
1385 #endif
1386