1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/IVDescriptors.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
31 #include "llvm/Transforms/Utils/LoopUtils.h"
32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33 #include <cassert>
34 
35 using namespace llvm;
36 
37 using VectorParts = SmallVector<Value *, 2>;
38 
39 namespace llvm {
40 extern cl::opt<bool> EnableVPlanNativePath;
41 }
42 extern cl::opt<unsigned> ForceTargetInstructionCost;
43 
44 #define LV_NAME "loop-vectorize"
45 #define DEBUG_TYPE LV_NAME
46 
47 bool VPRecipeBase::mayWriteToMemory() const {
48   switch (getVPDefID()) {
49   case VPInterleaveSC:
50     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
51   case VPWidenStoreEVLSC:
52   case VPWidenStoreSC:
53     return true;
54   case VPReplicateSC:
55     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
56         ->mayWriteToMemory();
57   case VPWidenCallSC:
58     return !cast<VPWidenCallRecipe>(this)
59                 ->getCalledScalarFunction()
60                 ->onlyReadsMemory();
61   case VPBranchOnMaskSC:
62   case VPScalarIVStepsSC:
63   case VPPredInstPHISC:
64     return false;
65   case VPBlendSC:
66   case VPReductionEVLSC:
67   case VPReductionSC:
68   case VPWidenCanonicalIVSC:
69   case VPWidenCastSC:
70   case VPWidenGEPSC:
71   case VPWidenIntOrFpInductionSC:
72   case VPWidenLoadEVLSC:
73   case VPWidenLoadSC:
74   case VPWidenPHISC:
75   case VPWidenSC:
76   case VPWidenSelectSC: {
77     const Instruction *I =
78         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
79     (void)I;
80     assert((!I || !I->mayWriteToMemory()) &&
81            "underlying instruction may write to memory");
82     return false;
83   }
84   default:
85     return true;
86   }
87 }
88 
89 bool VPRecipeBase::mayReadFromMemory() const {
90   switch (getVPDefID()) {
91   case VPWidenLoadEVLSC:
92   case VPWidenLoadSC:
93     return true;
94   case VPReplicateSC:
95     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
96         ->mayReadFromMemory();
97   case VPWidenCallSC:
98     return !cast<VPWidenCallRecipe>(this)
99                 ->getCalledScalarFunction()
100                 ->onlyWritesMemory();
101   case VPBranchOnMaskSC:
102   case VPPredInstPHISC:
103   case VPScalarIVStepsSC:
104   case VPWidenStoreEVLSC:
105   case VPWidenStoreSC:
106     return false;
107   case VPBlendSC:
108   case VPReductionEVLSC:
109   case VPReductionSC:
110   case VPWidenCanonicalIVSC:
111   case VPWidenCastSC:
112   case VPWidenGEPSC:
113   case VPWidenIntOrFpInductionSC:
114   case VPWidenPHISC:
115   case VPWidenSC:
116   case VPWidenSelectSC: {
117     const Instruction *I =
118         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
119     (void)I;
120     assert((!I || !I->mayReadFromMemory()) &&
121            "underlying instruction may read from memory");
122     return false;
123   }
124   default:
125     return true;
126   }
127 }
128 
129 bool VPRecipeBase::mayHaveSideEffects() const {
130   switch (getVPDefID()) {
131   case VPDerivedIVSC:
132   case VPPredInstPHISC:
133   case VPScalarCastSC:
134     return false;
135   case VPInstructionSC:
136     switch (cast<VPInstruction>(this)->getOpcode()) {
137     case Instruction::Or:
138     case Instruction::ICmp:
139     case Instruction::Select:
140     case VPInstruction::Not:
141     case VPInstruction::CalculateTripCountMinusVF:
142     case VPInstruction::CanonicalIVIncrementForPart:
143     case VPInstruction::ExtractFromEnd:
144     case VPInstruction::FirstOrderRecurrenceSplice:
145     case VPInstruction::LogicalAnd:
146     case VPInstruction::PtrAdd:
147       return false;
148     default:
149       return true;
150     }
151   case VPWidenCallSC: {
152     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
153     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
154   }
155   case VPBlendSC:
156   case VPReductionEVLSC:
157   case VPReductionSC:
158   case VPScalarIVStepsSC:
159   case VPWidenCanonicalIVSC:
160   case VPWidenCastSC:
161   case VPWidenGEPSC:
162   case VPWidenIntOrFpInductionSC:
163   case VPWidenPHISC:
164   case VPWidenPointerInductionSC:
165   case VPWidenSC:
166   case VPWidenSelectSC: {
167     const Instruction *I =
168         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
169     (void)I;
170     assert((!I || !I->mayHaveSideEffects()) &&
171            "underlying instruction has side-effects");
172     return false;
173   }
174   case VPInterleaveSC:
175     return mayWriteToMemory();
176   case VPWidenLoadEVLSC:
177   case VPWidenLoadSC:
178   case VPWidenStoreEVLSC:
179   case VPWidenStoreSC:
180     assert(
181         cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
182             mayWriteToMemory() &&
183         "mayHaveSideffects result for ingredient differs from this "
184         "implementation");
185     return mayWriteToMemory();
186   case VPReplicateSC: {
187     auto *R = cast<VPReplicateRecipe>(this);
188     return R->getUnderlyingInstr()->mayHaveSideEffects();
189   }
190   default:
191     return true;
192   }
193 }
194 
195 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
196   VPValue *ExitValue = getOperand(0);
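  // A value that is uniform after vectorization can be read from the first
  // lane; otherwise the value leaving the loop lives in the last lane (of the
  // last unrolled part, selected below via State.UF - 1).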
197   auto Lane = vputils::isUniformAfterVectorization(ExitValue)
198                   ? VPLane::getFirstLane()
199                   : VPLane::getLastLaneForVF(State.VF);
200   VPBasicBlock *MiddleVPBB =
201       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
202   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
203   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
204   // Values leaving the vector loop reach live-out phis in the exit block
205   // via the middle block.
206   auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
207                        ? MiddleVPBB
208                        : ExitingVPBB;
209   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
210   // Set insertion point in PredBB in case an extract needs to be generated.
211   // TODO: Model extracts explicitly.
212   State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
213   Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
214   if (Phi->getBasicBlockIndex(PredBB) != -1)
215     Phi->setIncomingValueForBlock(PredBB, V);
216   else
217     Phi->addIncoming(V, PredBB);
218 }
219 
220 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
221 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
222   O << "Live-out ";
223   getPhi()->printAsOperand(O);
224   O << " = ";
225   getOperand(0)->printAsOperand(O, SlotTracker);
226   O << "\n";
227 }
228 #endif
229 
230 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
231   assert(!Parent && "Recipe already in some VPBasicBlock");
232   assert(InsertPos->getParent() &&
233          "Insertion position not in any VPBasicBlock");
234   InsertPos->getParent()->insert(this, InsertPos->getIterator());
235 }
236 
237 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
238                                 iplist<VPRecipeBase>::iterator I) {
239   assert(!Parent && "Recipe already in some VPBasicBlock");
240   assert(I == BB.end() || I->getParent() == &BB);
241   BB.insert(this, I);
242 }
243 
244 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
245   assert(!Parent && "Recipe already in some VPBasicBlock");
246   assert(InsertPos->getParent() &&
247          "Insertion position not in any VPBasicBlock");
248   InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
249 }
250 
251 void VPRecipeBase::removeFromParent() {
252   assert(getParent() && "Recipe not in any VPBasicBlock");
253   getParent()->getRecipeList().remove(getIterator());
254   Parent = nullptr;
255 }
256 
257 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
258   assert(getParent() && "Recipe not in any VPBasicBlock");
259   return getParent()->getRecipeList().erase(getIterator());
260 }
261 
262 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
263   removeFromParent();
264   insertAfter(InsertPos);
265 }
266 
267 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
268                               iplist<VPRecipeBase>::iterator I) {
269   removeFromParent();
270   insertBefore(BB, I);
271 }
272 
273 /// Return the underlying instruction to be used for computing \p R's cost via
274 /// the legacy cost model. Return nullptr if there's no suitable instruction.
275 static Instruction *getInstructionForCost(const VPRecipeBase *R) {
276   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
277     return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
278   if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
279     return IG->getInsertPos();
280   if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
281     return &WidenMem->getIngredient();
282   return nullptr;
283 }
284 
285 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
286   if (auto *UI = getInstructionForCost(this))
287     if (Ctx.skipCostComputation(UI, VF.isVector()))
288       return 0;
289 
290   InstructionCost RecipeCost = computeCost(VF, Ctx);
291   if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292       RecipeCost.isValid())
293     RecipeCost = InstructionCost(ForceTargetInstructionCost);
294 
295   LLVM_DEBUG({
296     dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
297     dump();
298   });
299   return RecipeCost;
300 }
301 
302 InstructionCost VPRecipeBase::computeCost(ElementCount VF,
303                                           VPCostContext &Ctx) const {
304   // Compute the cost of this recipe, falling back to the legacy cost model
305   // using the underlying instruction. If there is no underlying instruction,
306   // return 0.
307   Instruction *UI = getInstructionForCost(this);
308   if (UI && isa<VPReplicateRecipe>(this)) {
309     // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
310     // transform; avoid computing its cost multiple times for now.
311     Ctx.SkipCostComputation.insert(UI);
312   }
313   return UI ? Ctx.getLegacyCost(UI, VF) : 0;
314 }
315 
316 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
317   assert(OpType == OperationType::FPMathOp &&
318          "recipe doesn't have fast math flags");
319   FastMathFlags Res;
320   Res.setAllowReassoc(FMFs.AllowReassoc);
321   Res.setNoNaNs(FMFs.NoNaNs);
322   Res.setNoInfs(FMFs.NoInfs);
323   Res.setNoSignedZeros(FMFs.NoSignedZeros);
324   Res.setAllowReciprocal(FMFs.AllowReciprocal);
325   Res.setAllowContract(FMFs.AllowContract);
326   Res.setApproxFunc(FMFs.ApproxFunc);
327   return Res;
328 }
329 
330 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
331                              VPValue *A, VPValue *B, DebugLoc DL,
332                              const Twine &Name)
333     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
334                           Pred, DL),
335       Opcode(Opcode), Name(Name.str()) {
336   assert(Opcode == Instruction::ICmp &&
337          "only ICmp predicates supported at the moment");
338 }
339 
340 VPInstruction::VPInstruction(unsigned Opcode,
341                              std::initializer_list<VPValue *> Operands,
342                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
343     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
344       Opcode(Opcode), Name(Name.str()) {
345   // Make sure the VPInstruction is a floating-point operation.
346   assert(isFPMathOp() && "this op can't take fast-math flags");
347 }
348 
349 bool VPInstruction::doesGeneratePerAllLanes() const {
350   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
351 }
352 
353 bool VPInstruction::canGenerateScalarForFirstLane() const {
354   if (Instruction::isBinaryOp(getOpcode()))
355     return true;
356   if (isSingleScalar() || isVectorToScalar())
357     return true;
358   switch (Opcode) {
359   case Instruction::ICmp:
360   case VPInstruction::BranchOnCond:
361   case VPInstruction::BranchOnCount:
362   case VPInstruction::CalculateTripCountMinusVF:
363   case VPInstruction::CanonicalIVIncrementForPart:
364   case VPInstruction::PtrAdd:
365   case VPInstruction::ExplicitVectorLength:
366     return true;
367   default:
368     return false;
369   }
370 }
371 
372 Value *VPInstruction::generatePerLane(VPTransformState &State,
373                                       const VPIteration &Lane) {
374   IRBuilderBase &Builder = State.Builder;
375 
376   assert(getOpcode() == VPInstruction::PtrAdd &&
377          "only PtrAdd opcodes are supported for now");
378   return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
379                               State.get(getOperand(1), Lane), Name);
380 }
381 
382 Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
383   IRBuilderBase &Builder = State.Builder;
384 
385   if (Instruction::isBinaryOp(getOpcode())) {
386     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
387     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
388     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
389     auto *Res =
390         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
391     if (auto *I = dyn_cast<Instruction>(Res))
392       setFlags(I);
393     return Res;
394   }
395 
396   switch (getOpcode()) {
397   case VPInstruction::Not: {
398     Value *A = State.get(getOperand(0), Part);
399     return Builder.CreateNot(A, Name);
400   }
401   case Instruction::ICmp: {
402     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
403     Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
404     Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
405     return Builder.CreateCmp(getPredicate(), A, B, Name);
406   }
407   case Instruction::Select: {
408     Value *Cond = State.get(getOperand(0), Part);
409     Value *Op1 = State.get(getOperand(1), Part);
410     Value *Op2 = State.get(getOperand(2), Part);
411     return Builder.CreateSelect(Cond, Op1, Op2, Name);
412   }
413   case VPInstruction::ActiveLaneMask: {
414     // Get first lane of vector induction variable.
415     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
416     // Get the original loop tripcount.
417     Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
418 
419     // If this part of the active lane mask is scalar, generate the CMP directly
420     // to avoid unnecessary extracts.
421     if (State.VF.isScalar())
422       return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
423                                Name);
424 
425     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
426     auto *PredTy = VectorType::get(Int1Ty, State.VF);
427     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
428                                    {PredTy, ScalarTC->getType()},
429                                    {VIVElem0, ScalarTC}, nullptr, Name);
430   }
431   case VPInstruction::FirstOrderRecurrenceSplice: {
432     // Generate code to combine the previous and current values in vector v3.
433     //
434     //   vector.ph:
435     //     v_init = vector(..., ..., ..., a[-1])
436     //     br vector.body
437     //
438     //   vector.body
439     //     i = phi [0, vector.ph], [i+4, vector.body]
440     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
441     //     v2 = a[i, i+1, i+2, i+3];
442     //     v3 = vector(v1(3), v2(0, 1, 2))
443 
444     // For the first part, use the recurrence phi (v1), otherwise v2.
445     auto *V1 = State.get(getOperand(0), 0);
446     Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
447     if (!PartMinus1->getType()->isVectorTy())
448       return PartMinus1;
449     Value *V2 = State.get(getOperand(1), Part);
450     return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
451   }
452   case VPInstruction::CalculateTripCountMinusVF: {
453     if (Part != 0)
454       return State.get(this, 0, /*IsScalar*/ true);
455 
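    // Compute max(TC - VF * UF, 0): subtract the full step from the trip count
    // and select zero whenever the trip count does not exceed the step.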
456     Value *ScalarTC = State.get(getOperand(0), {0, 0});
457     Value *Step =
458         createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
459     Value *Sub = Builder.CreateSub(ScalarTC, Step);
460     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
461     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
462     return Builder.CreateSelect(Cmp, Sub, Zero);
463   }
464   case VPInstruction::ExplicitVectorLength: {
465     // Compute EVL
466     auto GetEVL = [=](VPTransformState &State, Value *AVL) {
467       assert(AVL->getType()->isIntegerTy() &&
468              "Requested vector length should be an integer.");
469 
470       // TODO: Add support for MaxSafeDist for correct loop emission.
471       assert(State.VF.isScalable() && "Expected scalable vector factor.");
472       Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
473 
474       Value *EVL = State.Builder.CreateIntrinsic(
475           State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
476           {AVL, VFArg, State.Builder.getTrue()});
477       return EVL;
478     };
479     // TODO: Restructure this code with an explicit remainder loop, so that
480     // vsetvli can be outside of the main loop.
481     assert(Part == 0 && "No unrolling expected for predicated vectorization.");
482     // Compute VTC - IV as the AVL (requested vector length).
483     Value *Index = State.get(getOperand(0), VPIteration(0, 0));
484     Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
485     Value *AVL = State.Builder.CreateSub(TripCount, Index);
486     Value *EVL = GetEVL(State, AVL);
487     return EVL;
488   }
489   case VPInstruction::CanonicalIVIncrementForPart: {
490     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
491     if (Part == 0)
492       return IV;
493 
494     // The canonical IV is incremented by the vectorization factor (number of
495     // SIMD elements) times the unroll part.
496     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
497     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
498                              hasNoSignedWrap());
499   }
500   case VPInstruction::BranchOnCond: {
501     if (Part != 0)
502       return nullptr;
503 
504     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
505     // Replace the temporary unreachable terminator with a new conditional
506     // branch, hooking it up to backward destination for exiting blocks now and
507     // to forward destination(s) later when they are created.
508     BranchInst *CondBr =
509         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
510     CondBr->setSuccessor(0, nullptr);
511     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
512 
513     if (!getParent()->isExiting())
514       return CondBr;
515 
516     VPRegionBlock *ParentRegion = getParent()->getParent();
517     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
518     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
519     return CondBr;
520   }
521   case VPInstruction::BranchOnCount: {
522     if (Part != 0)
523       return nullptr;
524     // First create the compare.
525     Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
526     Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
527     Value *Cond = Builder.CreateICmpEQ(IV, TC);
528 
529     // Now create the branch.
530     auto *Plan = getParent()->getPlan();
531     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
532     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
533 
534     // Replace the temporary unreachable terminator with a new conditional
535     // branch, hooking it up to backward destination (the header) now and to the
536     // forward destination (the exit/middle block) later when it is created.
537     // Note that CreateCondBr expects a valid BB as first argument, so we need
538     // to set it to nullptr later.
539     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
540                                               State.CFG.VPBB2IRBB[Header]);
541     CondBr->setSuccessor(0, nullptr);
542     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
543     return CondBr;
544   }
545   case VPInstruction::ComputeReductionResult: {
546     if (Part != 0)
547       return State.get(this, 0, /*IsScalar*/ true);
548 
549     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
550     // and will be removed by breaking up the recipe further.
551     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
552     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
553     // Get its reduction variable descriptor.
554     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
555 
556     RecurKind RK = RdxDesc.getRecurrenceKind();
557 
558     VPValue *LoopExitingDef = getOperand(1);
559     Type *PhiTy = OrigPhi->getType();
560     VectorParts RdxParts(State.UF);
561     for (unsigned Part = 0; Part < State.UF; ++Part)
562       RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
563 
564     // If the vector reduction can be performed in a smaller type, we truncate
565     // then extend the loop exit value to enable InstCombine to evaluate the
566     // entire expression in the smaller type.
567     // TODO: Handle this in truncateToMinBW.
568     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
569       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
570       for (unsigned Part = 0; Part < State.UF; ++Part)
571         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
572     }
573     // Reduce all of the unrolled parts into a single vector.
574     Value *ReducedPartRdx = RdxParts[0];
575     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
576     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
577       Op = Instruction::Or;
578 
579     if (PhiR->isOrdered()) {
580       ReducedPartRdx = RdxParts[State.UF - 1];
581     } else {
582       // Floating-point operations should have some FMF to enable the reduction.
583       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
584       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
585       for (unsigned Part = 1; Part < State.UF; ++Part) {
586         Value *RdxPart = RdxParts[Part];
587         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
588           ReducedPartRdx = Builder.CreateBinOp(
589               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
590         else
591           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
592       }
593     }
594 
595     // Create the reduction after the loop. Note that in-loop reductions create
596     // the target reduction in the loop using a Reduction recipe.
597     if ((State.VF.isVector() ||
598          RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
599         !PhiR->isInLoop()) {
600       ReducedPartRdx =
601           createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
602       // If the reduction can be performed in a smaller type, we need to extend
603       // the reduction to the wider type before we branch to the original loop.
604       if (PhiTy != RdxDesc.getRecurrenceType())
605         ReducedPartRdx = RdxDesc.isSigned()
606                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
607                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
608     }
609 
610     // If there were stores of the reduction value to a uniform memory address
611     // inside the loop, create the final store here.
612     if (StoreInst *SI = RdxDesc.IntermediateStore) {
613       auto *NewSI = Builder.CreateAlignedStore(
614           ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
615       propagateMetadata(NewSI, SI);
616     }
617 
618     return ReducedPartRdx;
619   }
620   case VPInstruction::ExtractFromEnd: {
621     if (Part != 0)
622       return State.get(this, 0, /*IsScalar*/ true);
623 
624     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
625     unsigned Offset = CI->getZExtValue();
626     assert(Offset > 0 && "Offset from end must be positive");
627     Value *Res;
628     if (State.VF.isVector()) {
629       assert(Offset <= State.VF.getKnownMinValue() &&
630              "invalid offset to extract from");
631       // Extract lane VF - Offset from the operand.
632       Res = State.get(
633           getOperand(0),
634           VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
635     } else {
636       assert(Offset <= State.UF && "invalid offset to extract from");
637       // When the loop is unrolled without vectorizing, retrieve part UF - Offset.
638       Res = State.get(getOperand(0), State.UF - Offset);
639     }
640     if (isa<ExtractElementInst>(Res))
641       Res->setName(Name);
642     return Res;
643   }
644   case VPInstruction::LogicalAnd: {
645     Value *A = State.get(getOperand(0), Part);
646     Value *B = State.get(getOperand(1), Part);
647     return Builder.CreateLogicalAnd(A, B, Name);
648   }
649   case VPInstruction::PtrAdd: {
650     assert(vputils::onlyFirstLaneUsed(this) &&
651            "can only generate first lane for PtrAdd");
652     Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
653     Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
654     return Builder.CreatePtrAdd(Ptr, Addend, Name);
655   }
656   case VPInstruction::ResumePhi: {
657     if (Part != 0)
658       return State.get(this, 0, /*IsScalar*/ true);
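    // Merge the value coming from the single VPlan-modelled predecessor with
    // the value reaching this block from predecessors not yet modelled in the
    // VPlan; both are wired into the new phi below.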
659     Value *IncomingFromVPlanPred =
660         State.get(getOperand(0), Part, /* IsScalar */ true);
661     Value *IncomingFromOtherPreds =
662         State.get(getOperand(1), Part, /* IsScalar */ true);
663     auto *NewPhi =
664         Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
665     BasicBlock *VPlanPred =
666         State.CFG
667             .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
668     NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
669     for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
670       assert(OtherPred != VPlanPred &&
671              "VPlan predecessors should not be connected yet");
672       NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
673     }
674     return NewPhi;
675   }
676 
677   default:
678     llvm_unreachable("Unsupported opcode for instruction");
679   }
680 }
681 
682 bool VPInstruction::isVectorToScalar() const {
683   return getOpcode() == VPInstruction::ExtractFromEnd ||
684          getOpcode() == VPInstruction::ComputeReductionResult;
685 }
686 
687 bool VPInstruction::isSingleScalar() const {
688   return getOpcode() == VPInstruction::ResumePhi;
689 }
690 
691 #if !defined(NDEBUG)
692 bool VPInstruction::isFPMathOp() const {
693   // Inspired by FPMathOperator::classof. Notable differences are that we don't
694   // support Call, PHI and Select opcodes here yet.
695   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
696          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
697          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
698          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
699 }
700 #endif
701 
702 void VPInstruction::execute(VPTransformState &State) {
703   assert(!State.Instance && "VPInstruction executing an Instance");
704   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
705   assert((hasFastMathFlags() == isFPMathOp() ||
706           getOpcode() == Instruction::Select) &&
707          "Recipe not a FPMathOp but has fast-math flags?");
708   if (hasFastMathFlags())
709     State.Builder.setFastMathFlags(getFastMathFlags());
710   State.setDebugLocFrom(getDebugLoc());
711   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
712                                    (vputils::onlyFirstLaneUsed(this) ||
713                                     isVectorToScalar() || isSingleScalar());
714   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
715   bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
716   for (unsigned Part = 0; Part < State.UF; ++Part) {
717     if (GeneratesPerAllLanes) {
718       for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
719            Lane != NumLanes; ++Lane) {
720         Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
721         assert(GeneratedValue && "generatePerLane must produce a value");
722         State.set(this, GeneratedValue, VPIteration(Part, Lane));
723       }
724       continue;
725     }
726 
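    // If only the first part of this value is used, reuse the part-0 value for
    // the remaining parts instead of generating it again.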
727     if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
728       Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
729       State.set(this, Part0, Part,
730                 /*IsScalar*/ GeneratesPerFirstLaneOnly);
731       continue;
732     }
733 
734     Value *GeneratedValue = generatePerPart(State, Part);
735     if (!hasResult())
736       continue;
737     assert(GeneratedValue && "generatePerPart must produce a value");
738     assert((GeneratedValue->getType()->isVectorTy() ==
739                 !GeneratesPerFirstLaneOnly ||
740             State.VF.isScalar()) &&
741            "scalar value but not only first lane defined");
742     State.set(this, GeneratedValue, Part,
743               /*IsScalar*/ GeneratesPerFirstLaneOnly);
744   }
745 }
746 
747 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
748   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
749   if (Instruction::isBinaryOp(getOpcode()))
750     return vputils::onlyFirstLaneUsed(this);
751 
752   switch (getOpcode()) {
753   default:
754     return false;
755   case Instruction::ICmp:
756   case VPInstruction::PtrAdd:
757     // TODO: Cover additional opcodes.
758     return vputils::onlyFirstLaneUsed(this);
759   case VPInstruction::ActiveLaneMask:
760   case VPInstruction::ExplicitVectorLength:
761   case VPInstruction::CalculateTripCountMinusVF:
762   case VPInstruction::CanonicalIVIncrementForPart:
763   case VPInstruction::BranchOnCount:
764   case VPInstruction::BranchOnCond:
765   case VPInstruction::ResumePhi:
766     return true;
767   };
768   llvm_unreachable("switch should return");
769 }
770 
771 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
772   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
773   if (Instruction::isBinaryOp(getOpcode()))
774     return vputils::onlyFirstPartUsed(this);
775 
776   switch (getOpcode()) {
777   default:
778     return false;
779   case Instruction::ICmp:
780   case Instruction::Select:
781     return vputils::onlyFirstPartUsed(this);
782   case VPInstruction::BranchOnCount:
783   case VPInstruction::BranchOnCond:
784   case VPInstruction::CanonicalIVIncrementForPart:
785     return true;
786   };
787   llvm_unreachable("switch should return");
788 }
789 
790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
791 void VPInstruction::dump() const {
792   VPSlotTracker SlotTracker(getParent()->getPlan());
793   print(dbgs(), "", SlotTracker);
794 }
795 
796 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
797                           VPSlotTracker &SlotTracker) const {
798   O << Indent << "EMIT ";
799 
800   if (hasResult()) {
801     printAsOperand(O, SlotTracker);
802     O << " = ";
803   }
804 
805   switch (getOpcode()) {
806   case VPInstruction::Not:
807     O << "not";
808     break;
809   case VPInstruction::SLPLoad:
810     O << "combined load";
811     break;
812   case VPInstruction::SLPStore:
813     O << "combined store";
814     break;
815   case VPInstruction::ActiveLaneMask:
816     O << "active lane mask";
817     break;
818   case VPInstruction::ResumePhi:
819     O << "resume-phi";
820     break;
821   case VPInstruction::ExplicitVectorLength:
822     O << "EXPLICIT-VECTOR-LENGTH";
823     break;
824   case VPInstruction::FirstOrderRecurrenceSplice:
825     O << "first-order splice";
826     break;
827   case VPInstruction::BranchOnCond:
828     O << "branch-on-cond";
829     break;
830   case VPInstruction::CalculateTripCountMinusVF:
831     O << "TC > VF ? TC - VF : 0";
832     break;
833   case VPInstruction::CanonicalIVIncrementForPart:
834     O << "VF * Part +";
835     break;
836   case VPInstruction::BranchOnCount:
837     O << "branch-on-count";
838     break;
839   case VPInstruction::ExtractFromEnd:
840     O << "extract-from-end";
841     break;
842   case VPInstruction::ComputeReductionResult:
843     O << "compute-reduction-result";
844     break;
845   case VPInstruction::LogicalAnd:
846     O << "logical-and";
847     break;
848   case VPInstruction::PtrAdd:
849     O << "ptradd";
850     break;
851   default:
852     O << Instruction::getOpcodeName(getOpcode());
853   }
854 
855   printFlags(O);
856   printOperands(O, SlotTracker);
857 
858   if (auto DL = getDebugLoc()) {
859     O << ", !dbg ";
860     DL.print(O);
861   }
862 }
863 #endif
864 
865 void VPWidenCallRecipe::execute(VPTransformState &State) {
866   assert(State.VF.isVector() && "not widening");
867   Function *CalledScalarFn = getCalledScalarFunction();
868   assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
869          "DbgInfoIntrinsic should have been dropped during VPlan construction");
870   State.setDebugLocFrom(getDebugLoc());
871 
872   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
873   FunctionType *VFTy = nullptr;
874   if (Variant)
875     VFTy = Variant->getFunctionType();
876   for (unsigned Part = 0; Part < State.UF; ++Part) {
877     SmallVector<Type *, 2> TysForDecl;
878     // Add return type if intrinsic is overloaded on it.
879     if (UseIntrinsic &&
880         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
881       TysForDecl.push_back(VectorType::get(
882           CalledScalarFn->getReturnType()->getScalarType(), State.VF));
883     SmallVector<Value *, 4> Args;
884     for (const auto &I : enumerate(arg_operands())) {
885       // Some intrinsics have a scalar argument - don't replace it with a
886       // vector.
887       Value *Arg;
888       if (UseIntrinsic &&
889           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
890         Arg = State.get(I.value(), VPIteration(0, 0));
891       // Some vectorized function variants may also take a scalar argument,
892       // e.g. linear parameters for pointers. This needs to be the scalar value
893       // from the start of the respective part when interleaving.
894       else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
895         Arg = State.get(I.value(), VPIteration(Part, 0));
896       else
897         Arg = State.get(I.value(), Part);
898       if (UseIntrinsic &&
899           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
900         TysForDecl.push_back(Arg->getType());
901       Args.push_back(Arg);
902     }
903 
904     Function *VectorF;
905     if (UseIntrinsic) {
906       // Use vector version of the intrinsic.
907       Module *M = State.Builder.GetInsertBlock()->getModule();
908       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
909       assert(VectorF && "Can't retrieve vector intrinsic.");
910     } else {
911 #ifndef NDEBUG
912       assert(Variant != nullptr && "Can't create vector function.");
913 #endif
914       VectorF = Variant;
915     }
916 
917     auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
918     SmallVector<OperandBundleDef, 1> OpBundles;
919     if (CI)
920       CI->getOperandBundlesAsDefs(OpBundles);
921 
922     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
923 
924     if (isa<FPMathOperator>(V))
925       V->copyFastMathFlags(CI);
926 
927     if (!V->getType()->isVoidTy())
928       State.set(this, V, Part);
929     State.addMetadata(V, CI);
930   }
931 }
932 
933 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
934 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
935                               VPSlotTracker &SlotTracker) const {
936   O << Indent << "WIDEN-CALL ";
937 
938   Function *CalledFn = getCalledScalarFunction();
939   if (CalledFn->getReturnType()->isVoidTy())
940     O << "void ";
941   else {
942     printAsOperand(O, SlotTracker);
943     O << " = ";
944   }
945 
946   O << "call @" << CalledFn->getName() << "(";
947   interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
948     Op->printAsOperand(O, SlotTracker);
949   });
950   O << ")";
951 
952   if (VectorIntrinsicID)
953     O << " (using vector intrinsic)";
954   else {
955     O << " (using library function";
956     if (Variant->hasName())
957       O << ": " << Variant->getName();
958     O << ")";
959   }
960 }
961 
962 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
963                                 VPSlotTracker &SlotTracker) const {
964   O << Indent << "WIDEN-SELECT ";
965   printAsOperand(O, SlotTracker);
966   O << " = select ";
967   getOperand(0)->printAsOperand(O, SlotTracker);
968   O << ", ";
969   getOperand(1)->printAsOperand(O, SlotTracker);
970   O << ", ";
971   getOperand(2)->printAsOperand(O, SlotTracker);
972   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
973 }
974 #endif
975 
976 void VPWidenSelectRecipe::execute(VPTransformState &State) {
977   State.setDebugLocFrom(getDebugLoc());
978 
979   // The condition can be loop invariant but still defined inside the
980   // loop. This means that we can't just use the original 'cond' value.
981   // We have to take the 'vectorized' value and pick the first lane.
982   // Instcombine will make this a no-op.
983   auto *InvarCond =
984       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
985 
986   for (unsigned Part = 0; Part < State.UF; ++Part) {
987     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
988     Value *Op0 = State.get(getOperand(1), Part);
989     Value *Op1 = State.get(getOperand(2), Part);
990     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
991     State.set(this, Sel, Part);
992     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
993   }
994 }
995 
996 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
997     const FastMathFlags &FMF) {
998   AllowReassoc = FMF.allowReassoc();
999   NoNaNs = FMF.noNaNs();
1000   NoInfs = FMF.noInfs();
1001   NoSignedZeros = FMF.noSignedZeros();
1002   AllowReciprocal = FMF.allowReciprocal();
1003   AllowContract = FMF.allowContract();
1004   ApproxFunc = FMF.approxFunc();
1005 }
1006 
1007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1008 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1009   switch (OpType) {
1010   case OperationType::Cmp:
1011     O << " " << CmpInst::getPredicateName(getPredicate());
1012     break;
1013   case OperationType::DisjointOp:
1014     if (DisjointFlags.IsDisjoint)
1015       O << " disjoint";
1016     break;
1017   case OperationType::PossiblyExactOp:
1018     if (ExactFlags.IsExact)
1019       O << " exact";
1020     break;
1021   case OperationType::OverflowingBinOp:
1022     if (WrapFlags.HasNUW)
1023       O << " nuw";
1024     if (WrapFlags.HasNSW)
1025       O << " nsw";
1026     break;
1027   case OperationType::FPMathOp:
1028     getFastMathFlags().print(O);
1029     break;
1030   case OperationType::GEPOp:
1031     if (GEPFlags.IsInBounds)
1032       O << " inbounds";
1033     break;
1034   case OperationType::NonNegOp:
1035     if (NonNegFlags.NonNeg)
1036       O << " nneg";
1037     break;
1038   case OperationType::Other:
1039     break;
1040   }
1041   if (getNumOperands() > 0)
1042     O << " ";
1043 }
1044 #endif
1045 
1046 void VPWidenRecipe::execute(VPTransformState &State) {
1047   State.setDebugLocFrom(getDebugLoc());
1048   auto &Builder = State.Builder;
1049   switch (Opcode) {
1050   case Instruction::Call:
1051   case Instruction::Br:
1052   case Instruction::PHI:
1053   case Instruction::GetElementPtr:
1054   case Instruction::Select:
1055     llvm_unreachable("This instruction is handled by a different recipe.");
1056   case Instruction::UDiv:
1057   case Instruction::SDiv:
1058   case Instruction::SRem:
1059   case Instruction::URem:
1060   case Instruction::Add:
1061   case Instruction::FAdd:
1062   case Instruction::Sub:
1063   case Instruction::FSub:
1064   case Instruction::FNeg:
1065   case Instruction::Mul:
1066   case Instruction::FMul:
1067   case Instruction::FDiv:
1068   case Instruction::FRem:
1069   case Instruction::Shl:
1070   case Instruction::LShr:
1071   case Instruction::AShr:
1072   case Instruction::And:
1073   case Instruction::Or:
1074   case Instruction::Xor: {
1075     // Just widen unops and binops.
1076     for (unsigned Part = 0; Part < State.UF; ++Part) {
1077       SmallVector<Value *, 2> Ops;
1078       for (VPValue *VPOp : operands())
1079         Ops.push_back(State.get(VPOp, Part));
1080 
1081       Value *V = Builder.CreateNAryOp(Opcode, Ops);
1082 
1083       if (auto *VecOp = dyn_cast<Instruction>(V))
1084         setFlags(VecOp);
1085 
1086       // Use this vector value for all users of the original instruction.
1087       State.set(this, V, Part);
1088       State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1089     }
1090 
1091     break;
1092   }
1093   case Instruction::Freeze: {
1094     for (unsigned Part = 0; Part < State.UF; ++Part) {
1095       Value *Op = State.get(getOperand(0), Part);
1096 
1097       Value *Freeze = Builder.CreateFreeze(Op);
1098       State.set(this, Freeze, Part);
1099     }
1100     break;
1101   }
1102   case Instruction::ICmp:
1103   case Instruction::FCmp: {
1104     // Widen compares. Generate vector compares.
1105     bool FCmp = Opcode == Instruction::FCmp;
1106     for (unsigned Part = 0; Part < State.UF; ++Part) {
1107       Value *A = State.get(getOperand(0), Part);
1108       Value *B = State.get(getOperand(1), Part);
1109       Value *C = nullptr;
1110       if (FCmp) {
1111         // Propagate fast math flags.
1112         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1113         if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1114           Builder.setFastMathFlags(I->getFastMathFlags());
1115         C = Builder.CreateFCmp(getPredicate(), A, B);
1116       } else {
1117         C = Builder.CreateICmp(getPredicate(), A, B);
1118       }
1119       State.set(this, C, Part);
1120       State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1121     }
1122 
1123     break;
1124   }
1125   default:
1126     // This instruction is not vectorized by simple widening.
1127     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1128                       << Instruction::getOpcodeName(Opcode));
1129     llvm_unreachable("Unhandled instruction!");
1130   } // end of switch.
1131 
1132 #if !defined(NDEBUG)
1133   // Verify that VPlan type inference results agree with the type of the
1134   // generated values.
1135   for (unsigned Part = 0; Part < State.UF; ++Part) {
1136     assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
1137                            State.VF) == State.get(this, Part)->getType() &&
1138            "inferred type and type from generated instructions do not match");
1139   }
1140 #endif
1141 }
1142 
1143 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1144 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1145                           VPSlotTracker &SlotTracker) const {
1146   O << Indent << "WIDEN ";
1147   printAsOperand(O, SlotTracker);
1148   O << " = " << Instruction::getOpcodeName(Opcode);
1149   printFlags(O);
1150   printOperands(O, SlotTracker);
1151 }
1152 #endif
1153 
1154 void VPWidenCastRecipe::execute(VPTransformState &State) {
1155   State.setDebugLocFrom(getDebugLoc());
1156   auto &Builder = State.Builder;
1157   /// Vectorize casts.
1158   assert(State.VF.isVector() && "Not vectorizing?");
1159   Type *DestTy = VectorType::get(getResultType(), State.VF);
1160   VPValue *Op = getOperand(0);
1161   for (unsigned Part = 0; Part < State.UF; ++Part) {
1162     if (Part > 0 && Op->isLiveIn()) {
1163       // FIXME: Remove once explicit unrolling is implemented using VPlan.
1164       State.set(this, State.get(this, 0), Part);
1165       continue;
1166     }
1167     Value *A = State.get(Op, Part);
1168     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1169     State.set(this, Cast, Part);
1170     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1171   }
1172 }
1173 
1174 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1175 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1176                               VPSlotTracker &SlotTracker) const {
1177   O << Indent << "WIDEN-CAST ";
1178   printAsOperand(O, SlotTracker);
1179   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1180   printFlags(O);
1181   printOperands(O, SlotTracker);
1182   O << " to " << *getResultType();
1183 }
1184 #endif
1185 
1186 /// This function adds
1187 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
1188 /// to each vector element of Val. The sequence starts at StartIdx.
1189 /// \p BinOp is only relevant for floating-point induction variables.
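/// For example, for an integer induction with VF = 4, Val = <a, a, a, a>,
/// StartIdx = 0 and Step = s, the result is <a, a + s, a + 2*s, a + 3*s>.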
1190 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1191                             Instruction::BinaryOps BinOp, ElementCount VF,
1192                             IRBuilderBase &Builder) {
1193   assert(VF.isVector() && "only vector VFs are supported");
1194 
1195   // Create and check the types.
1196   auto *ValVTy = cast<VectorType>(Val->getType());
1197   ElementCount VLen = ValVTy->getElementCount();
1198 
1199   Type *STy = Val->getType()->getScalarType();
1200   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1201          "Induction Step must be an integer or FP");
1202   assert(Step->getType() == STy && "Step has wrong type");
1203 
1204   SmallVector<Constant *, 8> Indices;
1205 
1206   // Create a vector of consecutive numbers from zero to VF.
1207   VectorType *InitVecValVTy = ValVTy;
1208   if (STy->isFloatingPointTy()) {
1209     Type *InitVecValSTy =
1210         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1211     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1212   }
1213   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1214 
1215   // Splat the StartIdx
1216   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1217 
1218   if (STy->isIntegerTy()) {
1219     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1220     Step = Builder.CreateVectorSplat(VLen, Step);
1221     assert(Step->getType() == Val->getType() && "Invalid step vec");
1222     // FIXME: The newly created binary instructions should contain nsw/nuw
1223     // flags, which can be found from the original scalar operations.
1224     Step = Builder.CreateMul(InitVec, Step);
1225     return Builder.CreateAdd(Val, Step, "induction");
1226   }
1227 
1228   // Floating point induction.
1229   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1230          "Binary Opcode should be specified for FP induction");
1231   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1232   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1233 
1234   Step = Builder.CreateVectorSplat(VLen, Step);
1235   Value *MulOp = Builder.CreateFMul(InitVec, Step);
1236   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1237 }
1238 
1239 /// A helper function that returns an integer or floating-point constant with
1240 /// value C.
1241 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1242   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1243                            : ConstantFP::get(Ty, C);
1244 }
1245 
1246 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1247                                   ElementCount VF) {
1248   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1249   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1250   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1251   return B.CreateUIToFP(RuntimeVF, FTy);
1252 }
1253 
1254 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1255   assert(!State.Instance && "Int or FP induction being replicated.");
1256 
1257   Value *Start = getStartValue()->getLiveInIRValue();
1258   const InductionDescriptor &ID = getInductionDescriptor();
1259   TruncInst *Trunc = getTruncInst();
1260   IRBuilderBase &Builder = State.Builder;
1261   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1262   assert(State.VF.isVector() && "must have vector VF");
1263 
1264   // The value from the original loop to which we are mapping the new induction
1265   // variable.
1266   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1267 
1268   // Fast-math-flags propagate from the original induction instruction.
1269   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1270   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1271     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1272 
1273   // Now do the actual transformations, and start with fetching the step value.
1274   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1275 
1276   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1277          "Expected either an induction phi-node or a truncate of it!");
1278 
1279   // Construct the initial value of the vector IV in the vector loop preheader
1280   auto CurrIP = Builder.saveIP();
1281   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1282   Builder.SetInsertPoint(VectorPH->getTerminator());
1283   if (isa<TruncInst>(EntryVal)) {
1284     assert(Start->getType()->isIntegerTy() &&
1285            "Truncation requires an integer type");
1286     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1287     Step = Builder.CreateTrunc(Step, TruncType);
1288     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1289   }
1290 
1291   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1292   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1293   Value *SteppedStart = getStepVector(
1294       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1295 
1296   // We create vector phi nodes for both integer and floating-point induction
1297   // variables. Here, we determine the kind of arithmetic we will perform.
1298   Instruction::BinaryOps AddOp;
1299   Instruction::BinaryOps MulOp;
1300   if (Step->getType()->isIntegerTy()) {
1301     AddOp = Instruction::Add;
1302     MulOp = Instruction::Mul;
1303   } else {
1304     AddOp = ID.getInductionOpcode();
1305     MulOp = Instruction::FMul;
1306   }
1307 
1308   // Multiply the vectorization factor by the step using integer or
1309   // floating-point arithmetic as appropriate.
1310   Type *StepType = Step->getType();
1311   Value *RuntimeVF;
1312   if (Step->getType()->isFloatingPointTy())
1313     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
1314   else
1315     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
1316   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1317 
1318   // Create a vector splat to use in the induction update.
1319   //
1320   // FIXME: If the step is non-constant, we create the vector splat with
1321   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1322   //        handle a constant vector splat.
1323   Value *SplatVF = isa<Constant>(Mul)
1324                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1325                        : Builder.CreateVectorSplat(State.VF, Mul);
1326   Builder.restoreIP(CurrIP);
1327 
1328   // We may need to add the step a number of times, depending on the unroll
1329   // factor. The last of those goes into the PHI.
1330   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1331   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1332   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1333   Instruction *LastInduction = VecInd;
1334   for (unsigned Part = 0; Part < State.UF; ++Part) {
1335     State.set(this, LastInduction, Part);
1336 
1337     if (isa<TruncInst>(EntryVal))
1338       State.addMetadata(LastInduction, EntryVal);
1339 
1340     LastInduction = cast<Instruction>(
1341         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1342     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1343   }
1344 
1345   LastInduction->setName("vec.ind.next");
1346   VecInd->addIncoming(SteppedStart, VectorPH);
1347   // Add induction update using an incorrect block temporarily. The phi node
1348   // will be fixed after VPlan execution. Note that at this point the latch
1349   // block cannot be used, as it does not exist yet.
1350   // TODO: Model increment value in VPlan, by turning the recipe into a
1351   // multi-def and a subclass of VPHeaderPHIRecipe.
1352   VecInd->addIncoming(LastInduction, VectorPH);
1353 }
1354 
1355 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1356 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1357                                           VPSlotTracker &SlotTracker) const {
1358   O << Indent << "WIDEN-INDUCTION";
1359   if (getTruncInst()) {
1360     O << "\\l\"";
1361     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1362     O << " +\n" << Indent << "\"  ";
1363     getVPValue(0)->printAsOperand(O, SlotTracker);
1364   } else
1365     O << " " << VPlanIngredient(IV);
1366 
1367   O << ", ";
1368   getStepValue()->printAsOperand(O, SlotTracker);
1369 }
1370 #endif
1371 
1372 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1373   // The step may be defined by a recipe in the preheader (e.g. if it requires
1374   // SCEV expansion), but for the canonical induction the step is required to be
1375   // 1, which is represented as live-in.
1376   if (getStepValue()->getDefiningRecipe())
1377     return false;
1378   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1379   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1380   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1381   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1382          getScalarType() == CanIV->getScalarType();
1383 }
1384 
1385 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1386 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1387                               VPSlotTracker &SlotTracker) const {
1388   O << Indent;
1389   printAsOperand(O, SlotTracker);
1390   O << Indent << "= DERIVED-IV ";
1391   getStartValue()->printAsOperand(O, SlotTracker);
1392   O << " + ";
1393   getOperand(1)->printAsOperand(O, SlotTracker);
1394   O << " * ";
1395   getStepValue()->printAsOperand(O, SlotTracker);
1396 }
1397 #endif
1398 
1399 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1400   // Fast-math-flags propagate from the original induction instruction.
1401   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1402   if (hasFastMathFlags())
1403     State.Builder.setFastMathFlags(getFastMathFlags());
1404 
1405   // Compute scalar induction steps. BaseIV is the scalar induction variable on
1406   // which to base the steps; Step is the size of the step.
1407 
1408   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1409   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1410   IRBuilderBase &Builder = State.Builder;
1411 
1412   // Ensure step has the same type as that of scalar IV.
1413   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1414   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1415 
1416   // We build scalar steps for both integer and floating-point induction
1417   // variables. Here, we determine the kind of arithmetic we will perform.
1418   Instruction::BinaryOps AddOp;
1419   Instruction::BinaryOps MulOp;
1420   if (BaseIVTy->isIntegerTy()) {
1421     AddOp = Instruction::Add;
1422     MulOp = Instruction::Mul;
1423   } else {
1424     AddOp = InductionOpcode;
1425     MulOp = Instruction::FMul;
1426   }
1427 
1428   // Determine the number of scalars we need to generate for each unroll
1429   // iteration.
1430   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1431   // Compute the scalar steps and save the results in State.
1432   Type *IntStepTy =
1433       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1434   Type *VecIVTy = nullptr;
1435   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1436   if (!FirstLaneOnly && State.VF.isScalable()) {
1437     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1438     UnitStepVec =
1439         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1440     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1441     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1442   }
1443 
1444   unsigned StartPart = 0;
1445   unsigned EndPart = State.UF;
1446   unsigned StartLane = 0;
1447   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1448   if (State.Instance) {
1449     StartPart = State.Instance->Part;
1450     EndPart = StartPart + 1;
1451     StartLane = State.Instance->Lane.getKnownLane();
1452     EndLane = StartLane + 1;
1453   }
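  // Illustrative sketch: with a fixed VF of 4 and no State.Instance, the loop
  // below produces, for unroll part P and lane L, the scalar value
  //   BaseIV + (P * 4 + L) * Step
  // (using the FP opcodes selected above for floating-point inductions). When
  // State.Instance is set, only the single requested part/lane is emitted.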
1454   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1455     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1456 
1457     if (!FirstLaneOnly && State.VF.isScalable()) {
1458       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1459       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1460       if (BaseIVTy->isFloatingPointTy())
1461         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1462       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1463       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1464       State.set(this, Add, Part);
1465       // It's useful to record the lane values too for the known minimum number
1466       // of elements so we do those below. This improves the code quality when
1467       // trying to extract the first element, for example.
1468     }
1469 
1470     if (BaseIVTy->isFloatingPointTy())
1471       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1472 
1473     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1474       Value *StartIdx = Builder.CreateBinOp(
1475           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1476       // The step returned by `createStepForVF` is a runtime-evaluated value
1477       // when VF is scalable. Otherwise, it should be folded into a Constant.
1478       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1479              "Expected StartIdx to be folded to a constant when VF is not "
1480              "scalable");
1481       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1482       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1483       State.set(this, Add, VPIteration(Part, Lane));
1484     }
1485   }
1486 }
1487 
1488 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1489 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1490                                   VPSlotTracker &SlotTracker) const {
1491   O << Indent;
1492   printAsOperand(O, SlotTracker);
1493   O << " = SCALAR-STEPS ";
1494   printOperands(O, SlotTracker);
1495 }
1496 #endif
1497 
1498 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1499   assert(State.VF.isVector() && "not widening");
1500   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1501   // Construct a vector GEP by widening the operands of the scalar GEP as
1502   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1503   // results in a vector of pointers when at least one operand of the GEP
1504   // is vector-typed. Thus, to keep the representation compact, we only use
1505   // vector-typed operands for loop-varying values.
1506 
1507   if (areAllOperandsInvariant()) {
1508     // If we are vectorizing, but the GEP has only loop-invariant operands,
1509     // the GEP we build (by only using vector-typed operands for
1510     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1511     // produce a vector of pointers, we need to either arbitrarily pick an
1512     // operand to broadcast, or broadcast a clone of the original GEP.
1513     // Here, we broadcast a clone of the original.
1514     //
1515     // TODO: If at some point we decide to scalarize instructions having
1516     //       loop-invariant operands, this special case will no longer be
1517     //       required. We would add the scalarization decision to
1518     //       collectLoopScalars() and teach getVectorValue() to broadcast
1519     //       the lane-zero scalar value.
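    // Sketch of the generated code (assuming VF=4): a single scalar GEP is
    // built from the lane-0 operand values and then broadcast per part, e.g.
    //   %gep   = getelementptr inbounds <ty>, ptr %base, i64 %idx
    //   %splat = <4 x ptr> broadcast of %gep (insertelement + shufflevector)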
1520     SmallVector<Value *> Ops;
1521     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1522       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1523 
1524     auto *NewGEP =
1525         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1526                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1527     for (unsigned Part = 0; Part < State.UF; ++Part) {
1528       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1529       State.set(this, EntryPart, Part);
1530       State.addMetadata(EntryPart, GEP);
1531     }
1532   } else {
1533     // If the GEP has at least one loop-varying operand, we are sure to
1534     // produce a vector of pointers. But if we are only unrolling, we want
1535     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1536     // produce with the code below will be scalar (if VF == 1) or vector
1537     // (otherwise). Note that for the unroll-only case, we still maintain
1538     // values in the vector mapping with initVector, as we do for other
1539     // instructions.
1540     for (unsigned Part = 0; Part < State.UF; ++Part) {
1541       // The pointer operand of the new GEP. If it's loop-invariant, we
1542       // won't broadcast it.
1543       auto *Ptr = isPointerLoopInvariant()
1544                       ? State.get(getOperand(0), VPIteration(0, 0))
1545                       : State.get(getOperand(0), Part);
1546 
1547       // Collect all the indices for the new GEP. If any index is
1548       // loop-invariant, we won't broadcast it.
1549       SmallVector<Value *, 4> Indices;
1550       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1551         VPValue *Operand = getOperand(I);
1552         if (isIndexLoopInvariant(I - 1))
1553           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1554         else
1555           Indices.push_back(State.get(Operand, Part));
1556       }
1557 
1558       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1559       // but it should be a vector, otherwise.
1560       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1561                                              Indices, "", isInBounds());
1562       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1563              "NewGEP is not a pointer vector");
1564       State.set(this, NewGEP, Part);
1565       State.addMetadata(NewGEP, GEP);
1566     }
1567   }
1568 }
1569 
1570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1572                              VPSlotTracker &SlotTracker) const {
1573   O << Indent << "WIDEN-GEP ";
1574   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1575   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1576     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1577 
1578   O << " ";
1579   printAsOperand(O, SlotTracker);
1580   O << " = getelementptr";
1581   printFlags(O);
1582   printOperands(O, SlotTracker);
1583 }
1584 #endif
1585 
1586 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
1587   auto &Builder = State.Builder;
1588   State.setDebugLocFrom(getDebugLoc());
1589   for (unsigned Part = 0; Part < State.UF; ++Part) {
1590     // Calculate the pointer for the specific unroll-part.
1591     Value *PartPtr = nullptr;
1592     // Use i32 for the gep index type when the value is constant,
1593     // or query DataLayout for a more suitable index type otherwise.
1594     const DataLayout &DL =
1595         Builder.GetInsertBlock()->getDataLayout();
1596     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1597                         ? DL.getIndexType(IndexedTy->getPointerTo())
1598                         : Builder.getInt32Ty();
1599     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1600     bool InBounds = isInBounds();
1601     if (IsReverse) {
1602       // If the address is consecutive but reversed, then the
1603       // wide store needs to start at the last vector element.
1604       // RunTimeVF = VScale * VF.getKnownMinValue()
1605       // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
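      // Worked example (a sketch, fixed-width VF=4): RunTimeVF is 4, so for
      // unroll part P the GEPs below offset the pointer by -P * 4 and then by
      // 1 - 4 = -3 elements, i.e. to the lowest-addressed element of part P.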
1606       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1607       // NumElt = -Part * RunTimeVF
1608       Value *NumElt = Builder.CreateMul(
1609           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1610       // LastLane = 1 - RunTimeVF
1611       Value *LastLane =
1612           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1613       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1614       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1615     } else {
1616       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1617       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1618     }
1619 
1620     State.set(this, PartPtr, Part, /*IsScalar*/ true);
1621   }
1622 }
1623 
1624 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1625 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1626                                   VPSlotTracker &SlotTracker) const {
1627   O << Indent;
1628   printAsOperand(O, SlotTracker);
1629   O << " = vector-pointer ";
1630   if (IsReverse)
1631     O << "(reverse) ";
1632 
1633   printOperands(O, SlotTracker);
1634 }
1635 #endif
1636 
1637 void VPBlendRecipe::execute(VPTransformState &State) {
1638   State.setDebugLocFrom(getDebugLoc());
1639   // We know that all PHIs in non-header blocks are converted into
1640   // selects, so we don't have to worry about the insertion order and we
1641   // can just use the builder.
1642   // At this point we generate the predication tree. There may be
1643   // duplications since this is a simple recursive scan, but future
1644   // optimizations will clean it up.
1645 
1646   unsigned NumIncoming = getNumIncomingValues();
1647 
1648   // Generate a sequence of selects of the form:
1649   // SELECT(Mask3, In3,
1650   //        SELECT(Mask2, In2,
1651   //               SELECT(Mask1, In1,
1652   //                      In0)))
1653   // Note that Mask0 is never used: lanes for which no path reaches this phi
1654   // (and which are therefore essentially undef) are taken from In0.
1655   VectorParts Entry(State.UF);
1656   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1657   for (unsigned In = 0; In < NumIncoming; ++In) {
1658     for (unsigned Part = 0; Part < State.UF; ++Part) {
1659       // We might have single-edge PHIs (blocks) - use an identity
1660       // 'select' for the first PHI operand.
1661       Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1662       if (In == 0)
1663         Entry[Part] = In0; // Initialize with the first incoming value.
1664       else {
1665         // Select between the current value and the previous incoming edge
1666         // based on the incoming mask.
1667         Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1668         Entry[Part] =
1669             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1670       }
1671     }
1672   }
1673   for (unsigned Part = 0; Part < State.UF; ++Part)
1674     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1675 }
1676 
1677 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1678 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1679                           VPSlotTracker &SlotTracker) const {
1680   O << Indent << "BLEND ";
1681   printAsOperand(O, SlotTracker);
1682   O << " =";
1683   if (getNumIncomingValues() == 1) {
1684     // Not a User of any mask: not really blending, this is a
1685     // single-predecessor phi.
1686     O << " ";
1687     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1688   } else {
1689     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1690       O << " ";
1691       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1692       if (I == 0)
1693         continue;
1694       O << "/";
1695       getMask(I)->printAsOperand(O, SlotTracker);
1696     }
1697   }
1698 }
1699 #endif
1700 
1701 void VPReductionRecipe::execute(VPTransformState &State) {
1702   assert(!State.Instance && "Reduction being replicated.");
1703   Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1704   RecurKind Kind = RdxDesc.getRecurrenceKind();
1705   // Propagate the fast-math flags carried by the underlying instruction.
1706   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1707   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1708   for (unsigned Part = 0; Part < State.UF; ++Part) {
1709     Value *NewVecOp = State.get(getVecOp(), Part);
1710     if (VPValue *Cond = getCondOp()) {
1711       Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1712       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1713       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1714       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
1715                                                   RdxDesc.getFastMathFlags());
1716       if (State.VF.isVector()) {
1717         Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
1718       }
1719 
1720       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
1721       NewVecOp = Select;
1722     }
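    // For example (a sketch): for a conditional integer add reduction with
    // VF=4, masked-off lanes are replaced with the identity 0,
    //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
    // so they do not affect the reduced value.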
1723     Value *NewRed;
1724     Value *NextInChain;
1725     if (IsOrdered) {
1726       if (State.VF.isVector())
1727         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1728                                         PrevInChain);
1729       else
1730         NewRed = State.Builder.CreateBinOp(
1731             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1732             NewVecOp);
1733       PrevInChain = NewRed;
1734     } else {
1735       PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1736       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
1737     }
1738     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1739       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1740                                    NewRed, PrevInChain);
1741     } else if (IsOrdered)
1742       NextInChain = NewRed;
1743     else
1744       NextInChain = State.Builder.CreateBinOp(
1745           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1746     State.set(this, NextInChain, Part, /*IsScalar*/ true);
1747   }
1748 }
1749 
1750 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1751   assert(!State.Instance && "Reduction being replicated.");
1752   assert(State.UF == 1 &&
1753          "Expected only UF == 1 when vectorizing with explicit vector length.");
1754 
1755   auto &Builder = State.Builder;
1756   // Propagate the fast-math flags carried by the underlying instruction.
1757   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1758   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1759   Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1760 
1761   RecurKind Kind = RdxDesc.getRecurrenceKind();
1762   Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1763   Value *VecOp = State.get(getVecOp(), 0);
1764   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1765 
1766   VectorBuilder VBuilder(Builder);
1767   VBuilder.setEVL(EVL);
1768   Value *Mask;
1769   // TODO: move the all-true mask generation into VectorBuilder.
1770   if (VPValue *CondOp = getCondOp())
1771     Mask = State.get(CondOp, 0);
1772   else
1773     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1774   VBuilder.setMask(Mask);
1775 
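  // The reduction helpers below emit VP reduction intrinsics; as a sketch (the
  // exact intrinsic and types depend on the recurrence kind and VF), an
  // unordered integer add reduction becomes something like
  //   %r = call i32 @llvm.vp.reduce.add(i32 0, <vscale x 4 x i32> %vec,
  //                                     <vscale x 4 x i1> %mask, i32 %evl)
  // with %r then combined with the chain value.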
1776   Value *NewRed;
1777   if (isOrdered()) {
1778     NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1779   } else {
1780     NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
1781     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1782       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1783     else
1784       NewRed = Builder.CreateBinOp(
1785           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1786   }
1787   State.set(this, NewRed, 0, /*IsScalar*/ true);
1788 }
1789 
1790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1791 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1792                               VPSlotTracker &SlotTracker) const {
1793   O << Indent << "REDUCE ";
1794   printAsOperand(O, SlotTracker);
1795   O << " = ";
1796   getChainOp()->printAsOperand(O, SlotTracker);
1797   O << " +";
1798   if (isa<FPMathOperator>(getUnderlyingInstr()))
1799     O << getUnderlyingInstr()->getFastMathFlags();
1800   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1801   getVecOp()->printAsOperand(O, SlotTracker);
1802   if (isConditional()) {
1803     O << ", ";
1804     getCondOp()->printAsOperand(O, SlotTracker);
1805   }
1806   O << ")";
1807   if (RdxDesc.IntermediateStore)
1808     O << " (with final reduction value stored in invariant address sunk "
1809          "outside of loop)";
1810 }
1811 
1812 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1813                                  VPSlotTracker &SlotTracker) const {
1814   const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1815   O << Indent << "REDUCE ";
1816   printAsOperand(O, SlotTracker);
1817   O << " = ";
1818   getChainOp()->printAsOperand(O, SlotTracker);
1819   O << " +";
1820   if (isa<FPMathOperator>(getUnderlyingInstr()))
1821     O << getUnderlyingInstr()->getFastMathFlags();
1822   O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1823   getVecOp()->printAsOperand(O, SlotTracker);
1824   O << ", ";
1825   getEVL()->printAsOperand(O, SlotTracker);
1826   if (isConditional()) {
1827     O << ", ";
1828     getCondOp()->printAsOperand(O, SlotTracker);
1829   }
1830   O << ")";
1831   if (RdxDesc.IntermediateStore)
1832     O << " (with final reduction value stored in invariant address sunk "
1833          "outside of loop)";
1834 }
1835 #endif
1836 
1837 bool VPReplicateRecipe::shouldPack() const {
1838   // Find if the recipe is used by a widened recipe via an intervening
1839   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1840   return any_of(users(), [](const VPUser *U) {
1841     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1842       return any_of(PredR->users(), [PredR](const VPUser *U) {
1843         return !U->usesScalars(PredR);
1844       });
1845     return false;
1846   });
1847 }
1848 
1849 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1850 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1851                               VPSlotTracker &SlotTracker) const {
1852   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1853 
1854   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1855     printAsOperand(O, SlotTracker);
1856     O << " = ";
1857   }
1858   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1859     O << "call";
1860     printFlags(O);
1861     O << "@" << CB->getCalledFunction()->getName() << "(";
1862     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1863                     O, [&O, &SlotTracker](VPValue *Op) {
1864                       Op->printAsOperand(O, SlotTracker);
1865                     });
1866     O << ")";
1867   } else {
1868     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1869     printFlags(O);
1870     printOperands(O, SlotTracker);
1871   }
1872 
1873   if (shouldPack())
1874     O << " (S->V)";
1875 }
1876 #endif
1877 
1878 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1879 /// if it is either defined outside the vector region or its operand is known to
1880 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1881 /// TODO: Uniformity should be associated with a VPValue and there should be a
1882 /// generic way to check.
1883 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1884   return C->isDefinedOutsideVectorRegions() ||
1885          isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1886          isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1887 }
1888 
1889 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
1890   assert(vputils::onlyFirstLaneUsed(this) &&
1891          "Codegen only implemented for first lane.");
1892   switch (Opcode) {
1893   case Instruction::SExt:
1894   case Instruction::ZExt:
1895   case Instruction::Trunc: {
1896     // Note: SExt/ZExt not used yet.
1897     Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1898     return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1899   }
1900   default:
1901     llvm_unreachable("opcode not implemented yet");
1902   }
1903 }
1904 
1905 void VPScalarCastRecipe ::execute(VPTransformState &State) {
1906   bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1907   for (unsigned Part = 0; Part != State.UF; ++Part) {
1908     Value *Res;
1909     // Only generate a single instance if the recipe is uniform across UFs and
1910     // VFs.
1911     if (Part > 0 && IsUniformAcrossVFsAndUFs)
1912       Res = State.get(this, VPIteration(0, 0));
1913     else
1914       Res = generate(State, Part);
1915     State.set(this, Res, VPIteration(Part, 0));
1916   }
1917 }
1918 
1919 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1920 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
1921                                 VPSlotTracker &SlotTracker) const {
1922   O << Indent << "SCALAR-CAST ";
1923   printAsOperand(O, SlotTracker);
1924   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1925   printOperands(O, SlotTracker);
1926   O << " to " << *ResultTy;
1927 }
1928 #endif
1929 
1930 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1931   assert(State.Instance && "Branch on Mask works only on single instance.");
1932 
1933   unsigned Part = State.Instance->Part;
1934   unsigned Lane = State.Instance->Lane.getKnownLane();
1935 
1936   Value *ConditionBit = nullptr;
1937   VPValue *BlockInMask = getMask();
1938   if (BlockInMask) {
1939     ConditionBit = State.get(BlockInMask, Part);
1940     if (ConditionBit->getType()->isVectorTy())
1941       ConditionBit = State.Builder.CreateExtractElement(
1942           ConditionBit, State.Builder.getInt32(Lane));
1943   } else // Block in mask is all-one.
1944     ConditionBit = State.Builder.getTrue();
1945 
1946   // Replace the temporary unreachable terminator with a new conditional branch,
1947   // whose two destinations will be set later when they are created.
1948   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1949   assert(isa<UnreachableInst>(CurrentTerminator) &&
1950          "Expected to replace unreachable terminator with conditional branch.");
1951   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1952   CondBr->setSuccessor(0, nullptr);
1953   ReplaceInstWithInst(CurrentTerminator, CondBr);
1954 }
1955 
1956 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1957   assert(State.Instance && "Predicated instruction PHI works per instance.");
1958   Instruction *ScalarPredInst =
1959       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1960   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1961   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1962   assert(PredicatingBB && "Predicated block has no single predecessor.");
1963   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1964          "operand must be VPReplicateRecipe");
1965 
1966   // By current pack/unpack logic we need to generate only a single phi node: if
1967   // a vector value for the predicated instruction exists at this point it means
1968   // the instruction has vector users only, and a phi for the vector value is
1969   // needed. In this case the recipe of the predicated instruction is marked to
1970   // also do that packing, thereby "hoisting" the insert-element sequence.
1971   // Otherwise, a phi node for the scalar value is needed.
1972   unsigned Part = State.Instance->Part;
1973   if (State.hasVectorValue(getOperand(0), Part)) {
1974     Value *VectorValue = State.get(getOperand(0), Part);
1975     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1976     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1977     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1978     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1979     if (State.hasVectorValue(this, Part))
1980       State.reset(this, VPhi, Part);
1981     else
1982       State.set(this, VPhi, Part);
1983     // NOTE: Currently we need to update the value of the operand, so the next
1984     // predicated iteration inserts its generated value in the correct vector.
1985     State.reset(getOperand(0), VPhi, Part);
1986   } else {
1987     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1988     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1989     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1990                      PredicatingBB);
1991     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1992     if (State.hasScalarValue(this, *State.Instance))
1993       State.reset(this, Phi, *State.Instance);
1994     else
1995       State.set(this, Phi, *State.Instance);
1996     // NOTE: Currently we need to update the value of the operand, so the next
1997     // predicated iteration inserts its generated value in the correct vector.
1998     State.reset(getOperand(0), Phi, *State.Instance);
1999   }
2000 }
2001 
2002 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2003 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2004                                 VPSlotTracker &SlotTracker) const {
2005   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2006   printAsOperand(O, SlotTracker);
2007   O << " = ";
2008   printOperands(O, SlotTracker);
2009 }
2010 
2011 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2012                               VPSlotTracker &SlotTracker) const {
2013   O << Indent << "WIDEN ";
2014   printAsOperand(O, SlotTracker);
2015   O << " = load ";
2016   printOperands(O, SlotTracker);
2017 }
2018 
2019 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2020                                  VPSlotTracker &SlotTracker) const {
2021   O << Indent << "WIDEN ";
2022   printAsOperand(O, SlotTracker);
2023   O << " = vp.load ";
2024   printOperands(O, SlotTracker);
2025 }
2026 
2027 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2028                                VPSlotTracker &SlotTracker) const {
2029   O << Indent << "WIDEN store ";
2030   printOperands(O, SlotTracker);
2031 }
2032 
2033 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2034                                   VPSlotTracker &SlotTracker) const {
2035   O << Indent << "WIDEN vp.store ";
2036   printOperands(O, SlotTracker);
2037 }
2038 #endif
2039 
2040 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2041                                      VectorType *DstVTy, const DataLayout &DL) {
2042   // Verify that V is a vector type with same number of elements as DstVTy.
2043   auto VF = DstVTy->getElementCount();
2044   auto *SrcVecTy = cast<VectorType>(V->getType());
2045   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2046   Type *SrcElemTy = SrcVecTy->getElementType();
2047   Type *DstElemTy = DstVTy->getElementType();
2048   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2049          "Vector elements must have same size");
2050 
2051   // Do a direct cast if element types are castable.
2052   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2053     return Builder.CreateBitOrPointerCast(V, DstVTy);
2054   }
2055   // V cannot be directly cast to the desired vector type.
2056   // This may happen when V is a floating-point vector but DstVTy is a vector
2057   // of pointers, or vice-versa. Handle this using a two-step bitcast with an
2058   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
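  // For instance (a sketch; element sizes must match, as asserted above):
  // <4 x float> -> <4 x i32> (bitcast) -> <4 x ptr> (inttoptr), and the
  // reverse direction mirrors it.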
2059   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2060          "Only one type should be a pointer type");
2061   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2062          "Only one type should be a floating point type");
2063   Type *IntTy =
2064       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2065   auto *VecIntTy = VectorType::get(IntTy, VF);
2066   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2067   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2068 }
2069 
2070 /// Return a vector containing interleaved elements from multiple
2071 /// smaller input vectors.
2072 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2073                                 const Twine &Name) {
2074   unsigned Factor = Vals.size();
2075   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2076 
2077   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2078 #ifndef NDEBUG
2079   for (Value *Val : Vals)
2080     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2081 #endif
2082 
2083   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
2084   // must use intrinsics to interleave.
2085   if (VecTy->isScalableTy()) {
2086     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2087     return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2088                                    Vals,
2089                                    /*FMFSource=*/nullptr, Name);
2090   }
2091 
2092   // Fixed length. Start by concatenating all vectors into a wide vector.
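  // E.g. (a sketch) interleaving two <4 x i32> values concatenates them into an
  // <8 x i32> and shuffles with the mask <0, 4, 1, 5, 2, 6, 3, 7>.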
2093   Value *WideVec = concatenateVectors(Builder, Vals);
2094 
2095   // Interleave the elements into the wide vector.
2096   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2097   return Builder.CreateShuffleVector(
2098       WideVec, createInterleaveMask(NumElts, Factor), Name);
2099 }
2100 
2101 // Try to vectorize the interleave group that \p Instr belongs to.
2102 //
2103 // E.g. Translate the following interleaved load group (factor = 3):
2104 //   for (i = 0; i < N; i+=3) {
2105 //     R = Pic[i];             // Member of index 0
2106 //     G = Pic[i+1];           // Member of index 1
2107 //     B = Pic[i+2];           // Member of index 2
2108 //     ... // do something to R, G, B
2109 //   }
2110 // To:
2111 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2112 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2113 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2114 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2115 //
2116 // Or translate the following interleaved store group (factor = 3):
2117 //   for (i = 0; i < N; i+=3) {
2118 //     ... do something to R, G, B
2119 //     Pic[i]   = R;           // Member of index 0
2120 //     Pic[i+1] = G;           // Member of index 1
2121 //     Pic[i+2] = B;           // Member of index 2
2122 //   }
2123 // To:
2124 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2125 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2126 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2127 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2128 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2129 void VPInterleaveRecipe::execute(VPTransformState &State) {
2130   assert(!State.Instance && "Interleave group being replicated.");
2131   const InterleaveGroup<Instruction> *Group = IG;
2132   Instruction *Instr = Group->getInsertPos();
2133 
2134   // Prepare for the vector type of the interleaved load/store.
2135   Type *ScalarTy = getLoadStoreType(Instr);
2136   unsigned InterleaveFactor = Group->getFactor();
2137   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2138 
2139   // Prepare for the new pointers.
2140   SmallVector<Value *, 2> AddrParts;
2141   unsigned Index = Group->getIndex(Instr);
2142 
2143   // TODO: extend the masked interleaved-group support to reversed access.
2144   VPValue *BlockInMask = getMask();
2145   assert((!BlockInMask || !Group->isReverse()) &&
2146          "Reversed masked interleave-group not supported.");
2147 
2148   Value *Idx;
2149   // If the group is reverse, adjust the index to refer to the last vector lane
2150   // instead of the first. We adjust the index from the first vector lane,
2151   // rather than directly getting the pointer for lane VF - 1, because the
2152   // pointer operand of the interleaved access is supposed to be uniform. For
2153   // uniform instructions, we're only required to generate a value for the
2154   // first vector lane in each unroll iteration.
2155   if (Group->isReverse()) {
2156     Value *RuntimeVF =
2157         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2158     Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2159     Idx = State.Builder.CreateMul(Idx,
2160                                   State.Builder.getInt32(Group->getFactor()));
2161     Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2162     Idx = State.Builder.CreateNeg(Idx);
2163   } else
2164     Idx = State.Builder.getInt32(-Index);
2165 
2166   VPValue *Addr = getAddr();
2167   for (unsigned Part = 0; Part < State.UF; Part++) {
2168     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2169     if (auto *I = dyn_cast<Instruction>(AddrPart))
2170       State.setDebugLocFrom(I->getDebugLoc());
2171 
2172     // Note that the current instruction may be a member of any index. We need
2173     // to adjust the address to point to the member of index 0.
2174     //
2175     // E.g.  a = A[i+1];     // Member of index 1 (current instruction)
2176     //       b = A[i];       // Member of index 0
2177     // The current pointer points to A[i+1]; adjust it to A[i].
2178     //
2179     // E.g.  A[i+1] = a;     // Member of index 1
2180     //       A[i]   = b;     // Member of index 0
2181     //       A[i+2] = c;     // Member of index 2 (current instruction)
2182     // The current pointer points to A[i+2]; adjust it to A[i].
2183 
2184     bool InBounds = false;
2185     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2186       InBounds = gep->isInBounds();
2187     AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2188     AddrParts.push_back(AddrPart);
2189   }
2190 
2191   State.setDebugLocFrom(Instr->getDebugLoc());
2192   Value *PoisonVec = PoisonValue::get(VecTy);
2193 
2194   auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2195                              unsigned Part, Value *MaskForGaps) -> Value * {
2196     if (State.VF.isScalable()) {
2197       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2198       assert(InterleaveFactor == 2 &&
2199              "Unsupported deinterleave factor for scalable vectors");
2200       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2201       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2202       auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2203                                      State.VF.getKnownMinValue() * 2, true);
2204       return State.Builder.CreateIntrinsic(
2205           MaskTy, Intrinsic::vector_interleave2, Ops,
2206           /*FMFSource=*/nullptr, "interleaved.mask");
2207     }
2208 
2209     if (!BlockInMask)
2210       return MaskForGaps;
2211 
2212     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2213     Value *ShuffledMask = State.Builder.CreateShuffleVector(
2214         BlockInMaskPart,
2215         createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2216         "interleaved.mask");
2217     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2218                                                    ShuffledMask, MaskForGaps)
2219                        : ShuffledMask;
2220   };
2221 
2222   const DataLayout &DL = Instr->getDataLayout();
2223   // Vectorize the interleaved load group.
2224   if (isa<LoadInst>(Instr)) {
2225     Value *MaskForGaps = nullptr;
2226     if (NeedsMaskForGaps) {
2227       MaskForGaps = createBitMaskForGaps(State.Builder,
2228                                          State.VF.getKnownMinValue(), *Group);
2229       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2230     }
2231 
2232     // For each unroll part, create a wide load for the group.
2233     SmallVector<Value *, 2> NewLoads;
2234     for (unsigned Part = 0; Part < State.UF; Part++) {
2235       Instruction *NewLoad;
2236       if (BlockInMask || MaskForGaps) {
2237         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2238         NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
2239                                                  Group->getAlign(), GroupMask,
2240                                                  PoisonVec, "wide.masked.vec");
2241       } else
2242         NewLoad = State.Builder.CreateAlignedLoad(
2243             VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
2244       Group->addMetadata(NewLoad);
2245       NewLoads.push_back(NewLoad);
2246     }
2247 
2248     ArrayRef<VPValue *> VPDefs = definedValues();
2249     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2250     if (VecTy->isScalableTy()) {
2251       assert(InterleaveFactor == 2 &&
2252              "Unsupported deinterleave factor for scalable vectors");
2253 
2254       for (unsigned Part = 0; Part < State.UF; ++Part) {
2255         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2256         // so we must use intrinsics to deinterleave.
2257         Value *DI = State.Builder.CreateIntrinsic(
2258             Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2259             /*FMFSource=*/nullptr, "strided.vec");
2260         unsigned J = 0;
2261         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2262           Instruction *Member = Group->getMember(I);
2263 
2264           if (!Member)
2265             continue;
2266 
2267           Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2268           // If this member has a different type, cast the result type.
2269           if (Member->getType() != ScalarTy) {
2270             VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2271             StridedVec =
2272                 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2273           }
2274 
2275           if (Group->isReverse())
2276             StridedVec =
2277                 State.Builder.CreateVectorReverse(StridedVec, "reverse");
2278 
2279           State.set(VPDefs[J], StridedVec, Part);
2280           ++J;
2281         }
2282       }
2283 
2284       return;
2285     }
2286 
2287     // For each member in the group, shuffle out the appropriate data from the
2288     // wide loads.
2289     unsigned J = 0;
2290     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2291       Instruction *Member = Group->getMember(I);
2292 
2293       // Skip the gaps in the group.
2294       if (!Member)
2295         continue;
2296 
2297       auto StrideMask =
2298           createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2299       for (unsigned Part = 0; Part < State.UF; Part++) {
2300         Value *StridedVec = State.Builder.CreateShuffleVector(
2301             NewLoads[Part], StrideMask, "strided.vec");
2302 
2303         // If this member has a different type, cast the result type.
2304         if (Member->getType() != ScalarTy) {
2305           assert(!State.VF.isScalable() && "VF is assumed to be non-scalable.");
2306           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2307           StridedVec =
2308               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2309         }
2310 
2311         if (Group->isReverse())
2312           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2313 
2314         State.set(VPDefs[J], StridedVec, Part);
2315       }
2316       ++J;
2317     }
2318     return;
2319   }
2320 
2321   // The sub vector type for the current instruction.
2322   auto *SubVT = VectorType::get(ScalarTy, State.VF);
2323 
2324   // Vectorize the interleaved store group.
2325   Value *MaskForGaps =
2326       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2327   assert((!MaskForGaps || !State.VF.isScalable()) &&
2328          "masking gaps for scalable vectors is not yet supported.");
2329   ArrayRef<VPValue *> StoredValues = getStoredValues();
2330   for (unsigned Part = 0; Part < State.UF; Part++) {
2331     // Collect the stored vector from each member.
2332     SmallVector<Value *, 4> StoredVecs;
2333     unsigned StoredIdx = 0;
2334     for (unsigned i = 0; i < InterleaveFactor; i++) {
2335       assert((Group->getMember(i) || MaskForGaps) &&
2336              "Fail to get a member from an interleaved store group");
2337       Instruction *Member = Group->getMember(i);
2338 
2339       // Skip the gaps in the group.
2340       if (!Member) {
2341         Value *Undef = PoisonValue::get(SubVT);
2342         StoredVecs.push_back(Undef);
2343         continue;
2344       }
2345 
2346       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2347       ++StoredIdx;
2348 
2349       if (Group->isReverse())
2350         StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2351 
2352       // If this member has a different type, cast it to a unified type.
2353 
2354       if (StoredVec->getType() != SubVT)
2355         StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2356 
2357       StoredVecs.push_back(StoredVec);
2358     }
2359 
2360     // Interleave all the smaller vectors into one wider vector.
2361     Value *IVec =
2362         interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
2363     Instruction *NewStoreInstr;
2364     if (BlockInMask || MaskForGaps) {
2365       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2366       NewStoreInstr = State.Builder.CreateMaskedStore(
2367           IVec, AddrParts[Part], Group->getAlign(), GroupMask);
2368     } else
2369       NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
2370                                                        Group->getAlign());
2371 
2372     Group->addMetadata(NewStoreInstr);
2373   }
2374 }
2375 
2376 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2377 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2378                                VPSlotTracker &SlotTracker) const {
2379   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2380   IG->getInsertPos()->printAsOperand(O, false);
2381   O << ", ";
2382   getAddr()->printAsOperand(O, SlotTracker);
2383   VPValue *Mask = getMask();
2384   if (Mask) {
2385     O << ", ";
2386     Mask->printAsOperand(O, SlotTracker);
2387   }
2388 
2389   unsigned OpIdx = 0;
2390   for (unsigned i = 0; i < IG->getFactor(); ++i) {
2391     if (!IG->getMember(i))
2392       continue;
2393     if (getNumStoreOperands() > 0) {
2394       O << "\n" << Indent << "  store ";
2395       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2396       O << " to index " << i;
2397     } else {
2398       O << "\n" << Indent << "  ";
2399       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2400       O << " = load from index " << i;
2401     }
2402     ++OpIdx;
2403   }
2404 }
2405 #endif
2406 
2407 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2408   Value *Start = getStartValue()->getLiveInIRValue();
2409   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
2410   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2411 
2412   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2413   EntryPart->addIncoming(Start, VectorPH);
2414   EntryPart->setDebugLoc(getDebugLoc());
2415   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2416     State.set(this, EntryPart, Part, /*IsScalar*/ true);
2417 }
2418 
2419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2420 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2421                                    VPSlotTracker &SlotTracker) const {
2422   O << Indent << "EMIT ";
2423   printAsOperand(O, SlotTracker);
2424   O << " = CANONICAL-INDUCTION ";
2425   printOperands(O, SlotTracker);
2426 }
2427 #endif
2428 
2429 bool VPCanonicalIVPHIRecipe::isCanonical(
2430     InductionDescriptor::InductionKind Kind, VPValue *Start,
2431     VPValue *Step) const {
2432   // Must be an integer induction.
2433   if (Kind != InductionDescriptor::IK_IntInduction)
2434     return false;
2435   // Start must match the start value of this canonical induction.
2436   if (Start != getStartValue())
2437     return false;
2438 
2439   // If the step is defined by a recipe, it is not a ConstantInt.
2440   if (Step->getDefiningRecipe())
2441     return false;
2442 
2443   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2444   return StepC && StepC->isOne();
2445 }
2446 
2447 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2448   return IsScalarAfterVectorization &&
2449          (!IsScalable || vputils::onlyFirstLaneUsed(this));
2450 }
2451 
2452 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2453 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2454                                           VPSlotTracker &SlotTracker) const {
2455   O << Indent << "EMIT ";
2456   printAsOperand(O, SlotTracker);
2457   O << " = WIDEN-POINTER-INDUCTION ";
2458   getStartValue()->printAsOperand(O, SlotTracker);
2459   O << ", " << *IndDesc.getStep();
2460 }
2461 #endif
2462 
2463 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2464   assert(!State.Instance && "cannot be used in per-lane");
2465   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2466   SCEVExpander Exp(SE, DL, "induction");
2467 
2468   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2469                                  &*State.Builder.GetInsertPoint());
2470   assert(!State.ExpandedSCEVs.contains(Expr) &&
2471          "Same SCEV expanded multiple times");
2472   State.ExpandedSCEVs[Expr] = Res;
2473   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2474     State.set(this, Res, {Part, 0});
2475 }
2476 
2477 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2478 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2479                                VPSlotTracker &SlotTracker) const {
2480   O << Indent << "EMIT ";
2481   getVPSingleValue()->printAsOperand(O, SlotTracker);
2482   O << " = EXPAND SCEV " << *Expr;
2483 }
2484 #endif
2485 
2486 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2487   Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2488   Type *STy = CanonicalIV->getType();
2489   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2490   ElementCount VF = State.VF;
2491   Value *VStart = VF.isScalar()
2492                       ? CanonicalIV
2493                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
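  // Sketch of the result: with VF=4 and UF=2 the loop below produces
  //   part 0: splat(%index) + <0, 1, 2, 3>
  //   part 1: splat(%index) + <4, 5, 6, 7>
  // i.e. the widened canonical IV for each unroll part.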
2494   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2495     Value *VStep = createStepForVF(Builder, STy, VF, Part);
2496     if (VF.isVector()) {
2497       VStep = Builder.CreateVectorSplat(VF, VStep);
2498       VStep =
2499           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2500     }
2501     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2502     State.set(this, CanonicalVectorIV, Part);
2503   }
2504 }
2505 
2506 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2508                                      VPSlotTracker &SlotTracker) const {
2509   O << Indent << "EMIT ";
2510   printAsOperand(O, SlotTracker);
2511   O << " = WIDEN-CANONICAL-INDUCTION ";
2512   printOperands(O, SlotTracker);
2513 }
2514 #endif
2515 
2516 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2517   auto &Builder = State.Builder;
2518   // Create a vector from the initial value.
2519   auto *VectorInit = getStartValue()->getLiveInIRValue();
2520 
2521   Type *VecTy = State.VF.isScalar()
2522                     ? VectorInit->getType()
2523                     : VectorType::get(VectorInit->getType(), State.VF);
2524 
2525   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2526   if (State.VF.isVector()) {
2527     auto *IdxTy = Builder.getInt32Ty();
2528     auto *One = ConstantInt::get(IdxTy, 1);
2529     IRBuilder<>::InsertPointGuard Guard(Builder);
2530     Builder.SetInsertPoint(VectorPH->getTerminator());
2531     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2532     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
2533     VectorInit = Builder.CreateInsertElement(
2534         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2535   }
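  // E.g. (a sketch) with VF=4 the initial recurrence value becomes
  //   <poison, poison, poison, %init>
  // i.e. only lane VF-1 carries the scalar value entering the vector loop.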
2536 
2537   // Create a phi node for the new recurrence.
2538   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
2539   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2540   EntryPart->addIncoming(VectorInit, VectorPH);
2541   State.set(this, EntryPart, 0);
2542 }
2543 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

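  // Select the values that feed the phis from the preheader: min/max and
  // any-of reductions use the start value itself as the identity; all other
  // reduction kinds use the neutral element and, for vector phis, insert the
  // scalar start value into lane 0 of the first unroll part.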
  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

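// Creates an empty vector phi whose type is taken from the first operand;
// the incoming values are expected to be filled in later, once all
// predecessor blocks of the VPlan-native region have been generated.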
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
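// Each per-part phi is seeded with the entry mask computed in the preheader;
// the value coming in from the loop latch (the mask for the next iteration)
// is wired up separately, after the rest of the vector loop body has been
// generated.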
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

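// Creates the scalar phi for the explicit-vector-length based IV. With EVL
// tail folding the unroll factor is always 1, so only a single part is
// created; the per-iteration increment by the computed EVL is added by a
// separate recipe.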
void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif