//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPCanonicalIVPHISC:
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to:
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid())
      RecipeCost = InstructionCost(ForceTargetInstructionCost);
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
         (isa<VPInstruction>(this) &&
          cast<VPInstruction>(this)->getOpcode() == Instruction::PHI) ||
         isa<VPIRPhi>(this);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const {
  std::optional<unsigned> Opcode;
  VPValue *Op = getOperand(0);
  VPRecipeBase *OpR = Op->getDefiningRecipe();

  // If the partial reduction is predicated, operand 1 will be a select.
  using namespace llvm::VPlanPatternMatch;
  if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
    OpR = Op->getDefiningRecipe();
  }

  Type *InputTypeA = nullptr, *InputTypeB = nullptr;
  TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
                                  ExtBType = TTI::PR_None;

  auto GetExtendKind = [](VPRecipeBase *R) {
    if (!R)
      return TTI::PR_None;
    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
    if (!WidenCastR)
      return TTI::PR_None;
    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
      return TTI::PR_ZeroExtend;
    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
      return TTI::PR_SignExtend;
    return TTI::PR_None;
  };

  // Pick out the opcode and type/extend information from a widen recipe,
  // looking through a negation (sub 0, x) to its operand when present.
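  // For instance, a zero-extending "dot product" chain feeding this recipe
  // has the following shape (a VPlan-level sketch; names are illustrative
  // only):
  //   WIDEN-CAST vext = zext a ; WIDEN-CAST wext = zext b
  //   WIDEN mul = mul vext, wext
  //   PARTIAL-REDUCE rdx = add phi, mul
  // in which case HandleWiden reports Opcode = Mul, the scalar input types of
  // a and b, and PR_ZeroExtend for both extend kinds.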
  auto HandleWiden = [&](VPWidenRecipe *Widen) {
    if (match(Widen,
              m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Op)))) {
      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
    }
    Opcode = Widen->getOpcode();
    VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
    VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
    InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
                                                 : Widen->getOperand(0));
    InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
                                                 : Widen->getOperand(1));
    ExtAType = GetExtendKind(ExtAR);
    ExtBType = GetExtendKind(ExtBR);
  };

  if (isa<VPWidenCastRecipe>(OpR)) {
    InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
    ExtAType = GetExtendKind(OpR);
  } else if (isa<VPReductionPHIRecipe>(OpR)) {
    auto *RedPhiOp1R = getOperand(1)->getDefiningRecipe();
    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
      InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
      ExtAType = GetExtendKind(RedPhiOp1R);
    } else if (auto *Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
      HandleWiden(Widen);
  } else if (auto *Widen = dyn_cast<VPWidenRecipe>(OpR)) {
    HandleWiden(Widen);
  } else if (auto *Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
    return Reduction->computeCost(VF, Ctx);
  }
  auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
                                         PhiType, VF, ExtAType, ExtBType,
                                         Opcode, Ctx.CostKind);
}

void VPPartialReductionRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  assert(getOpcode() == Instruction::Add &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(1));
  Value *PhiVal = State.get(getOperand(0));
  assert(PhiVal && BinOpVal && "Phi and BinOp must be set");

  Type *RetTy = PhiVal->getType();

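  // A sketch of the call emitted below, assuming a VF of 16 accumulated into
  // 4 result lanes (types and value names are illustrative only):
  //   %partial.reduce = call <4 x i32>
  //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
  //           <4 x i32> %phi, <16 x i32> %bin.op)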
  CallInst *V = Builder.CreateIntrinsic(
      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
      {PhiVal, BinOpVal}, nullptr, "partial.reduce");

  State.set(this, V);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "PARTIAL-REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
}
#endif

FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm

VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert((getNumOperandsForOpcode(Opcode) == -1u ||
          getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
         "number of operands does not match opcode");
}

#ifndef NDEBUG
unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::Not:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Store:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::BranchOnCount:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
    return 2;
  case Instruction::Select:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ReductionStartVector:
    return 3;
  case VPInstruction::ComputeFindIVResult:
    return 4;
  case Instruction::Call:
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
#endif

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

/// Create a conditional branch using \p Cond branching to the successors of
/// \p VPBB. Note that the first successor is always forward (i.e. not created
/// yet) while the second successor may already have been created (if it is a
/// header block and VPBB is a latch).
static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
                                    VPTransformState &State) {
  // Replace the temporary unreachable terminator with a new conditional
  // branch, hooking it up to backward destination (header) for latch blocks
  // now, and to forward destination(s) later when they are created.
  // Second successor may be backwards - iff it is already in VPBB2IRBB.
  VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
  BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
  BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
  BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
  // First successor is always forward, reset it to nullptr.
  CondBr->setSuccessor(0, nullptr);
  IRBB->getTerminator()->eraseFromParent();
  return CondBr;
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      applyFlags(*I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    if (getOperand(1)->isLiveIn()) {
      unsigned IdxToExtract =
          cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
      return State.get(getOperand(0), VPLane(IdxToExtract));
    }
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    llvm_unreachable("should be handled by VPPhi::execute");
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

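    // Otherwise emit the mask intrinsic, e.g. for VF = 4 (illustrative IR;
    // value names are invented):
    //   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(
    //              i64 %viv.elem0, i64 %scalar.tc)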
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
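    // Emits the guarded subtraction printed as "TC > VF ? TC - VF : 0". A
    // sketch of the generated sequence, with Step = VF * UF:
    //   %sub = sub %tc, %step
    //   %cmp = icmp ugt %tc, %step
    //   %res = select %cmp, %sub, 0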
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
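    // A sketch of the emitted call, assuming an i64 AVL and a known minimum
    // VF of 4 (value names are illustrative only):
    //   %evl = call i32 @llvm.experimental.get.vector.length.i64(
    //              i64 %avl, i32 4, i1 true)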
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL.
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    auto *Br = createCondBranch(Cond, getParent(), State);
    applyMetadata(*Br);
    return Br;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);
    return createCondBranch(Cond, getParent(), State);
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::BuildStructVector: {
    // For struct types, we need to build a new 'wide' struct type, where each
    // element is widened, i.e., we create a struct of vectors.
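    // E.g. four scalar operands of type { float, i32 } are combined into one
    // value of type { <4 x float>, <4 x i32> } (an illustrative shape; the
    // lane count equals the number of operands).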
    auto *StructTy =
        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
    for (const auto &[LaneIndex, Op] : enumerate(operands())) {
      for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
           FieldIndex++) {
        Value *ScalarValue =
            Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
        Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
        VectorValue =
            Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
        Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
      }
    }
    return Res;
  }
  case VPInstruction::BuildVector: {
    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    auto NumOfElements = ElementCount::getFixed(getNumOperands());
    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
    for (const auto &[Idx, Op] : enumerate(operands()))
      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
                                              State.Builder.getInt32(Idx));
    return Res;
  }
  case VPInstruction::ReductionStartVector: {
    if (State.VF.isScalar())
      return State.get(getOperand(0), true);
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());
    // If this start vector is scaled then it should produce a vector with
    // fewer elements than the VF.
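    // The result holds the start value in lane 0 and the identity value in
    // all other lanes, e.g. for an integer add reduction with 4 result lanes
    // (a sketch): <i32 %start, i32 0, i32 0, i32 0>.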
    ElementCount VF = State.VF.divideCoefficientBy(
        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
    auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
    Constant *Zero = Builder.getInt32(0);
    return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
                                       Zero);
  }
  case VPInstruction::ComputeAnyOfResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    Value *ReducedPartRdx = State.get(getOperand(2));
    for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
      ReducedPartRdx = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(
              RecurKind::AnyOf),
          State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
    return createAnyOfReduction(Builder, ReducedPartRdx,
                                State.get(getOperand(1), VPLane(0)), OrigPhi);
  }
  case VPInstruction::ComputeFindIVResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.
    RecurKind RK = PhiR->getRecurrenceKind();
    assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "Unexpected reduction kind");
    assert(!PhiR->isInLoop() &&
           "In-loop FindLastIV reduction is not supported yet");

    // The recipe's operands are the reduction phi, the start value, the
    // sentinel value, followed by one operand for each part of the reduction.
    unsigned UF = getNumOperands() - 3;
    Value *ReducedPartRdx = State.get(getOperand(3));
    RecurKind MinMaxKind;
    bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
    if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
      MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
    else
      MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
    for (unsigned Part = 1; Part < UF; ++Part)
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      State.get(getOperand(3 + Part)));

    Value *Start = State.get(getOperand(1), true);
    Value *Sentinel = getOperand(2)->getLiveInIRValue();
    return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
                                     Sentinel);
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.

    RecurKind RK = PhiR->getRecurrenceKind();
    assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "should be handled by ComputeFindIVResult");

    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    if (hasFastMathFlags())
      Builder.setFastMathFlags(getFastMathFlags());

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
        else
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
              RdxPart, ReducedPartRdx, "bin.rdx");
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement: {
    unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::AnyOf: {
    Value *Res = State.get(getOperand(0));
    for (VPValue *Op : drop_begin(operands()))
      Res = Builder.CreateOr(Res, State.get(Op));
    return Builder.CreateOrReduce(Res);
  }
  case VPInstruction::FirstActiveLane: {
    if (getNumOperands() == 1) {
      Value *Mask = State.get(getOperand(0));
      return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
                                                  true, Name);
    }
    // If there are multiple operands, create a chain of selects to pick the
    // first operand with an active lane and add the number of lanes of the
    // preceding operands.
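    // E.g. for two mask operands m0 and m1 this computes (a sketch, where
    // ctz(m) counts trailing zero, i.e. inactive, elements):
    //   res = (ctz(m0) != VF) ? ctz(m0) : VF + ctz(m1)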
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
    unsigned LastOpIdx = getNumOperands() - 1;
    Value *Res = nullptr;
    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
      Value *Current = Builder.CreateAdd(
          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
      if (Res) {
        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
        Res = Builder.CreateSelect(Cmp, Current, Res);
      } else {
        Res = Current;
      }
    }

    return Res;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    Type *ResTy = Ctx.Types.inferScalarType(this);
    if (!vputils::onlyFirstLaneUsed(this))
      ResTy = toVectorTy(ResTy, VF);

    if (!getUnderlyingValue()) {
      switch (getOpcode()) {
      case Instruction::FMul:
        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
      default:
        // TODO: Compute cost for VPInstructions without underlying values once
        // the legacy cost model has been retired.
        return 0;
      }
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
  }

  switch (getOpcode()) {
  case Instruction::ExtractElement: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
                                  Type::getInt64Ty(Ctx.LLVMCtx),
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    SmallVector<int> Mask(VF.getKnownMinValue());
    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                  cast<VectorType>(VectorTy),
                                  cast<VectorType>(VectorTy), Mask,
                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExtractPenultimateElement:
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractLastElement ||
         getOpcode() == VPInstruction::ExtractPenultimateElement ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::ComputeAnyOfResult ||
         getOpcode() == VPInstruction::ComputeFindIVResult ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == Instruction::PHI || isScalarCast();
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
    return false;
  default:
    return true;
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    return true;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeFindIVResult:
    return Op == getOperand(1);
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExtractLastElement:
    O << "extract-last-element";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeFindIVResult:
    O << "compute-find-iv-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
    State.set(this, StepVector);
    break;
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
#endif

void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      State.TypeAnalysis.inferScalarType(this), 2, getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining
    // them are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
    NewPhi->addIncoming(IncV, PredBB);
  }
  State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::print(raw_ostream &O, const Twine &Indent,
                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}

void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's scope,
  // hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}

void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
  assert(isa<PHINode>(getInstruction()) &&
         "can only update exiting operands to phi nodes");
  assert(getNumOperands() > 0 && "must have at least one operand");
  VPValue *Exiting = getOperand(0);
  if (Exiting->isLiveIn())
    return;

  Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
  setOperand(0, Exiting);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif

void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isSingleScalar(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, Lane);
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}

void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
         "Number of phi operands must match number of predecessors");
  unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
  R->removeOperand(Position);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
                    VPSlotTracker &SlotTracker) const {
  VPIRInstruction::print(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(
        enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
          Op.value()->printAsOperand(O, SlotTracker);
          O << " from ";
          getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
        });
    O << ")";
  }
}
#endif

VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer)
    : VPIRMetadata(I) {
  if (!LVer || !isa<LoadInst, StoreInst>(&I))
    return;
  const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
  if (AliasScopeMD)
    Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
  if (NoAliasMD)
    Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
}

void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(Kind, Node);
}

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

1413   FunctionType *VFTy = Variant->getFunctionType();
1414   // Collect call arguments, kept scalar or widened as the variant requires.
1415   SmallVector<Value *, 4> Args;
1416   for (const auto &I : enumerate(args())) {
1417     Value *Arg;
1418     // Some vectorized function variants may also take a scalar argument,
1419     // e.g. linear parameters for pointers. This needs to be the scalar value
1420     // from the start of the respective part when interleaving.
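    // Illustrative sketch (names hypothetical): for a scalar callee
    //   float @foo(ptr %p, float %x)
    // with a vector variant
    //   <4 x float> @vecfoo(ptr %p, <4 x float> %x)
    // the pointer argument stays scalar and is read from lane 0, while the
    // float argument is passed as a full vector.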
1421     if (!VFTy->getParamType(I.index())->isVectorTy())
1422       Arg = State.get(I.value(), VPLane(0));
1423     else
1424       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1425     Args.push_back(Arg);
1426   }
1427 
1428   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1429   SmallVector<OperandBundleDef, 1> OpBundles;
1430   if (CI)
1431     CI->getOperandBundlesAsDefs(OpBundles);
1432 
1433   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1434   applyFlags(*V);
1435   applyMetadata(*V);
1436   V->setCallingConv(Variant->getCallingConv());
1437 
1438   if (!V->getType()->isVoidTy())
1439     State.set(this, V);
1440 }
1441 
1442 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1443                                                VPCostContext &Ctx) const {
1444   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1445                                   Variant->getFunctionType()->params(),
1446                                   Ctx.CostKind);
1447 }
1448 
1449 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1450 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1451                               VPSlotTracker &SlotTracker) const {
1452   O << Indent << "WIDEN-CALL ";
1453 
1454   Function *CalledFn = getCalledScalarFunction();
1455   if (CalledFn->getReturnType()->isVoidTy())
1456     O << "void ";
1457   else {
1458     printAsOperand(O, SlotTracker);
1459     O << " = ";
1460   }
1461 
1462   O << "call";
1463   printFlags(O);
1464   O << " @" << CalledFn->getName() << "(";
1465   interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1466     Op->printAsOperand(O, SlotTracker);
1467   });
1468   O << ")";
1469 
1470   O << " (using library function";
1471   if (Variant->hasName())
1472     O << ": " << Variant->getName();
1473   O << ")";
1474 }
1475 #endif
1476 
1477 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1478   assert(State.VF.isVector() && "not widening");
1479 
1480   SmallVector<Type *, 2> TysForDecl;
1481   // Add return type if intrinsic is overloaded on it.
1482   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1483     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1484   SmallVector<Value *, 4> Args;
1485   for (const auto &I : enumerate(operands())) {
1486     // Some intrinsics have a scalar argument - don't replace it with a
1487     // vector.
1488     Value *Arg;
1489     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1490                                            State.TTI))
1491       Arg = State.get(I.value(), VPLane(0));
1492     else
1493       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1494     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1495                                                State.TTI))
1496       TysForDecl.push_back(Arg->getType());
1497     Args.push_back(Arg);
1498   }
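  // E.g., widening llvm.powi.f32.i32 at VF=4 keeps the scalar exponent and
  // adds both the vector return type and the exponent type to the mangled
  // declaration, yielding a call along the lines of:
  //   %r = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %b, i32 %e)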
1499 
1500   // Use vector version of the intrinsic.
1501   Module *M = State.Builder.GetInsertBlock()->getModule();
1502   Function *VectorF =
1503       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1504   assert(VectorF &&
1505          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1506 
1507   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1508   SmallVector<OperandBundleDef, 1> OpBundles;
1509   if (CI)
1510     CI->getOperandBundlesAsDefs(OpBundles);
1511 
1512   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1513 
1514   applyFlags(*V);
1515   applyMetadata(*V);
1516 
1517   if (!V->getType()->isVoidTy())
1518     State.set(this, V);
1519 }
1520 
1521 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1522                                                     VPCostContext &Ctx) const {
1523   // Some backends analyze intrinsic arguments to determine cost. Use the
1524   // underlying value for the operand if it has one. Otherwise try to use the
1525   // operand of the underlying call instruction, if there is one. Otherwise
1526   // clear Arguments.
1527   // TODO: Rework TTI interface to be independent of concrete IR values.
1528   SmallVector<const Value *> Arguments;
1529   for (const auto &[Idx, Op] : enumerate(operands())) {
1530     auto *V = Op->getUnderlyingValue();
1531     if (!V) {
1532       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1533         Arguments.push_back(UI->getArgOperand(Idx));
1534         continue;
1535       }
1536       Arguments.clear();
1537       break;
1538     }
1539     Arguments.push_back(V);
1540   }
1541 
1542   Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
1543   SmallVector<Type *> ParamTys;
1544   for (unsigned I = 0; I != getNumOperands(); ++I)
1545     ParamTys.push_back(
1546         toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1547 
1548   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1549   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1550   IntrinsicCostAttributes CostAttrs(
1551       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1552       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
1553       InstructionCost::getInvalid(), &Ctx.TLI);
1554   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1555 }
1556 
1557 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1558   return Intrinsic::getBaseName(VectorIntrinsicID);
1559 }
1560 
1561 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1562   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1563   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1564     auto [Idx, V] = X;
1565     return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1566                                                          Idx, nullptr);
1567   });
1568 }
1569 
1570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1572                                    VPSlotTracker &SlotTracker) const {
1573   O << Indent << "WIDEN-INTRINSIC ";
1574   if (ResultTy->isVoidTy()) {
1575     O << "void ";
1576   } else {
1577     printAsOperand(O, SlotTracker);
1578     O << " = ";
1579   }
1580 
1581   O << "call";
1582   printFlags(O);
1583   O << getIntrinsicName() << "(";
1584 
1585   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1586     Op->printAsOperand(O, SlotTracker);
1587   });
1588   O << ")";
1589 }
1590 #endif
1591 
1592 void VPHistogramRecipe::execute(VPTransformState &State) {
1593   IRBuilderBase &Builder = State.Builder;
1594 
1595   Value *Address = State.get(getOperand(0));
1596   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1597   VectorType *VTy = cast<VectorType>(Address->getType());
1598 
1599   // The histogram intrinsic requires a mask even if the recipe doesn't;
1600   // if the mask operand was omitted then all lanes should be executed and
1601   // we just need to synthesize an all-true mask.
1602   Value *Mask = nullptr;
1603   if (VPValue *VPMask = getMask())
1604     Mask = State.get(VPMask);
1605   else
1606     Mask =
1607         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1608 
1609   // If this is a subtract, we want to invert the increment amount. We may
1610   // add a separate intrinsic in the future, but for now we'll try this.
1611   if (Opcode == Instruction::Sub)
1612     IncAmt = Builder.CreateNeg(IncAmt);
1613   else
1614     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1615 
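  // E.g., for VF=4 and an i32 increment this emits a call along the lines of:
  //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
  //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)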
1616   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1617                                 {VTy, IncAmt->getType()},
1618                                 {Address, IncAmt, Mask});
1619 }
1620 
1621 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1622                                                VPCostContext &Ctx) const {
1623   // FIXME: Take the gather and scatter into account as well. For now we're
1624   //        generating the same cost as the fallback path, but we'll likely
1625   //        need to create a new TTI method for determining the cost, including
1626   //        whether we can use base + vec-of-smaller-indices or just
1627   //        vec-of-pointers.
1628   assert(VF.isVector() && "Invalid VF for histogram cost");
1629   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1630   VPValue *IncAmt = getOperand(1);
1631   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1632   VectorType *VTy = VectorType::get(IncTy, VF);
1633 
1634   // Assume that a non-constant update value (or a constant != 1) requires
1635   // a multiply, and add that into the cost.
1636   InstructionCost MulCost =
1637       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1638   if (IncAmt->isLiveIn()) {
1639     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1640 
1641     if (CI && CI->getZExtValue() == 1)
1642       MulCost = TTI::TCC_Free;
1643   }
1644 
1645   // Find the cost of the histogram operation itself.
1646   Type *PtrTy = VectorType::get(AddressTy, VF);
1647   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1648   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1649                               Type::getVoidTy(Ctx.LLVMCtx),
1650                               {PtrTy, IncTy, MaskTy});
1651 
1652   // Add the costs together with the add/sub operation.
1653   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1654          Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1655 }
1656 
1657 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1658 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1659                               VPSlotTracker &SlotTracker) const {
1660   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1661   getOperand(0)->printAsOperand(O, SlotTracker);
1662 
1663   if (Opcode == Instruction::Sub)
1664     O << ", dec: ";
1665   else {
1666     assert(Opcode == Instruction::Add);
1667     O << ", inc: ";
1668   }
1669   getOperand(1)->printAsOperand(O, SlotTracker);
1670 
1671   if (VPValue *Mask = getMask()) {
1672     O << ", mask: ";
1673     Mask->printAsOperand(O, SlotTracker);
1674   }
1675 }
1676 
1677 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1678                                 VPSlotTracker &SlotTracker) const {
1679   O << Indent << "WIDEN-SELECT ";
1680   printAsOperand(O, SlotTracker);
1681   O << " = select ";
1682   printFlags(O);
1683   getOperand(0)->printAsOperand(O, SlotTracker);
1684   O << ", ";
1685   getOperand(1)->printAsOperand(O, SlotTracker);
1686   O << ", ";
1687   getOperand(2)->printAsOperand(O, SlotTracker);
1688   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1689 }
1690 #endif
1691 
1692 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1693   // The condition can be loop invariant but still defined inside the
1694   // loop. This means that we can't just use the original 'cond' value.
1695   // We have to take the 'vectorized' value and pick the first lane.
1696   // Instcombine will make this a no-op.
1697   auto *InvarCond =
1698       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1699 
1700   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1701   Value *Op0 = State.get(getOperand(1));
1702   Value *Op1 = State.get(getOperand(2));
1703   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1704   State.set(this, Sel);
1705   if (auto *I = dyn_cast<Instruction>(Sel)) {
1706     if (isa<FPMathOperator>(I))
1707       applyFlags(*I);
1708     applyMetadata(*I);
1709   }
1710 }
1711 
1712 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1713                                                  VPCostContext &Ctx) const {
1714   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1715   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1716   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1717   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1718 
1719   VPValue *Op0, *Op1;
1720   using namespace llvm::VPlanPatternMatch;
1721   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1722       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1723        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1724     // select x, y, false --> x & y
1725     // select x, true, y --> x | y
1726     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1727     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1728 
1729     SmallVector<const Value *, 2> Operands;
1730     if (all_of(operands(),
1731                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1732       Operands.append(SI->op_begin(), SI->op_end());
1733     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1734     return Ctx.TTI.getArithmeticInstrCost(
1735         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
1736         Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1737   }
1738 
1739   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1740   if (!ScalarCond)
1741     CondTy = VectorType::get(CondTy, VF);
1742 
1743   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1744   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1745     Pred = Cmp->getPredicate();
1746   return Ctx.TTI.getCmpSelInstrCost(
1747       Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1748       {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1749 }
1750 
1751 VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
1752   AllowReassoc = FMF.allowReassoc();
1753   NoNaNs = FMF.noNaNs();
1754   NoInfs = FMF.noInfs();
1755   NoSignedZeros = FMF.noSignedZeros();
1756   AllowReciprocal = FMF.allowReciprocal();
1757   AllowContract = FMF.allowContract();
1758   ApproxFunc = FMF.approxFunc();
1759 }
1760 
1761 #if !defined(NDEBUG)
1762 bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
1763   switch (OpType) {
1764   case OperationType::OverflowingBinOp:
1765     return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
1766            Opcode == Instruction::Mul ||
1767            Opcode == VPInstruction::CanonicalIVIncrementForPart;
1768   case OperationType::Trunc:
1769     return Opcode == Instruction::Trunc;
1770   case OperationType::DisjointOp:
1771     return Opcode == Instruction::Or;
1772   case OperationType::PossiblyExactOp:
1773     return Opcode == Instruction::AShr;
1774   case OperationType::GEPOp:
1775     return Opcode == Instruction::GetElementPtr ||
1776            Opcode == VPInstruction::PtrAdd;
1777   case OperationType::FPMathOp:
1778     return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1779            Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
1780            Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
1781            Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
1782            Opcode == VPInstruction::WideIVStep ||
1783            Opcode == VPInstruction::ReductionStartVector ||
1784            Opcode == VPInstruction::ComputeReductionResult;
1785   case OperationType::NonNegOp:
1786     return Opcode == Instruction::ZExt;
1788   case OperationType::Cmp:
1789     return Opcode == Instruction::ICmp;
1790   case OperationType::Other:
1791     return true;
1792   }
1793   llvm_unreachable("Unknown OperationType enum");
1794 }
1795 #endif
1796 
1797 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1798 void VPIRFlags::printFlags(raw_ostream &O) const {
1799   switch (OpType) {
1800   case OperationType::Cmp:
1801     O << " " << CmpInst::getPredicateName(getPredicate());
1802     break;
1803   case OperationType::DisjointOp:
1804     if (DisjointFlags.IsDisjoint)
1805       O << " disjoint";
1806     break;
1807   case OperationType::PossiblyExactOp:
1808     if (ExactFlags.IsExact)
1809       O << " exact";
1810     break;
1811   case OperationType::OverflowingBinOp:
1812     if (WrapFlags.HasNUW)
1813       O << " nuw";
1814     if (WrapFlags.HasNSW)
1815       O << " nsw";
1816     break;
1817   case OperationType::Trunc:
1818     if (TruncFlags.HasNUW)
1819       O << " nuw";
1820     if (TruncFlags.HasNSW)
1821       O << " nsw";
1822     break;
1823   case OperationType::FPMathOp:
1824     getFastMathFlags().print(O);
1825     break;
1826   case OperationType::GEPOp:
1827     if (GEPFlags.isInBounds())
1828       O << " inbounds";
1829     else if (GEPFlags.hasNoUnsignedSignedWrap())
1830       O << " nusw";
1831     if (GEPFlags.hasNoUnsignedWrap())
1832       O << " nuw";
1833     break;
1834   case OperationType::NonNegOp:
1835     if (NonNegFlags.NonNeg)
1836       O << " nneg";
1837     break;
1838   case OperationType::Other:
1839     break;
1840   }
1841   O << " ";
1842 }
1843 #endif
1844 
1845 void VPWidenRecipe::execute(VPTransformState &State) {
1846   auto &Builder = State.Builder;
1847   switch (Opcode) {
1848   case Instruction::Call:
1849   case Instruction::Br:
1850   case Instruction::PHI:
1851   case Instruction::GetElementPtr:
1852   case Instruction::Select:
1853     llvm_unreachable("This instruction is handled by a different recipe.");
1854   case Instruction::UDiv:
1855   case Instruction::SDiv:
1856   case Instruction::SRem:
1857   case Instruction::URem:
1858   case Instruction::Add:
1859   case Instruction::FAdd:
1860   case Instruction::Sub:
1861   case Instruction::FSub:
1862   case Instruction::FNeg:
1863   case Instruction::Mul:
1864   case Instruction::FMul:
1865   case Instruction::FDiv:
1866   case Instruction::FRem:
1867   case Instruction::Shl:
1868   case Instruction::LShr:
1869   case Instruction::AShr:
1870   case Instruction::And:
1871   case Instruction::Or:
1872   case Instruction::Xor: {
1873     // Just widen unops and binops.
1874     SmallVector<Value *, 2> Ops;
1875     for (VPValue *VPOp : operands())
1876       Ops.push_back(State.get(VPOp));
1877 
1878     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1879 
1880     if (auto *VecOp = dyn_cast<Instruction>(V)) {
1881       applyFlags(*VecOp);
1882       applyMetadata(*VecOp);
1883     }
1884 
1885     // Use this vector value for all users of the original instruction.
1886     State.set(this, V);
1887     break;
1888   }
1889   case Instruction::ExtractValue: {
1890     assert(getNumOperands() == 2 && "expected single-level extractvalue");
1891     Value *Op = State.get(getOperand(0));
1892     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
1893     Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
1894     State.set(this, Extract);
1895     break;
1896   }
1897   case Instruction::Freeze: {
1898     Value *Op = State.get(getOperand(0));
1899     Value *Freeze = Builder.CreateFreeze(Op);
1900     State.set(this, Freeze);
1901     break;
1902   }
1903   case Instruction::ICmp:
1904   case Instruction::FCmp: {
1905     // Widen compares. Generate vector compares.
1906     bool FCmp = Opcode == Instruction::FCmp;
1907     Value *A = State.get(getOperand(0));
1908     Value *B = State.get(getOperand(1));
1909     Value *C = nullptr;
1910     if (FCmp) {
1911       // Propagate fast math flags.
1912       C = Builder.CreateFCmpFMF(
1913           getPredicate(), A, B,
1914           dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1915     } else {
1916       C = Builder.CreateICmp(getPredicate(), A, B);
1917     }
1918     if (auto *I = dyn_cast<Instruction>(C))
1919       applyMetadata(*I);
1920     State.set(this, C);
1921     break;
1922   }
1923   default:
1924     // This instruction is not vectorized by simple widening.
1925     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
1926                       << Instruction::getOpcodeName(Opcode) << "\n");
1927     llvm_unreachable("Unhandled instruction!");
1928   } // end of switch.
1929 
1930 #if !defined(NDEBUG)
1931   // Verify that VPlan type inference results agree with the type of the
1932   // generated values.
1933   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1934              State.get(this)->getType() &&
1935          "inferred type and type from generated instructions do not match");
1936 #endif
1937 }
1938 
1939 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1940                                            VPCostContext &Ctx) const {
1941   switch (Opcode) {
1942   case Instruction::FNeg: {
1943     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1944     return Ctx.TTI.getArithmeticInstrCost(
1945         Opcode, VectorTy, Ctx.CostKind,
1946         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1947         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1948   }
1949 
1950   case Instruction::UDiv:
1951   case Instruction::SDiv:
1952   case Instruction::SRem:
1953   case Instruction::URem:
1954     // More complex computation, let the legacy cost-model handle this for now.
1955     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1956   case Instruction::Add:
1957   case Instruction::FAdd:
1958   case Instruction::Sub:
1959   case Instruction::FSub:
1960   case Instruction::Mul:
1961   case Instruction::FMul:
1962   case Instruction::FDiv:
1963   case Instruction::FRem:
1964   case Instruction::Shl:
1965   case Instruction::LShr:
1966   case Instruction::AShr:
1967   case Instruction::And:
1968   case Instruction::Or:
1969   case Instruction::Xor: {
1970     VPValue *RHS = getOperand(1);
1971     // Certain instructions can be cheaper to vectorize if they have a constant
1972     // second vector operand; shifts on x86 are one example.
1973     TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1974 
1975     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1976         getOperand(1)->isDefinedOutsideLoopRegions())
1977       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1978     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1979     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1980 
1981     SmallVector<const Value *, 4> Operands;
1982     if (CtxI)
1983       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1984     return Ctx.TTI.getArithmeticInstrCost(
1985         Opcode, VectorTy, Ctx.CostKind,
1986         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1987         RHSInfo, Operands, CtxI, &Ctx.TLI);
1988   }
1989   case Instruction::Freeze: {
1990     // The cost of freeze is unknown; conservatively assume it matches 'mul'.
1991     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1992     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
1993                                           Ctx.CostKind);
1994   }
1995   case Instruction::ExtractValue: {
1996     return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1997                                              Ctx.CostKind);
1998   }
1999   case Instruction::ICmp:
2000   case Instruction::FCmp: {
2001     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
2002     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
2003     return Ctx.TTI.getCmpSelInstrCost(
2004         Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
2005         Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2006         {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
2007   }
2008   default:
2009     llvm_unreachable("Unsupported opcode for instruction");
2010   }
2011 }
2012 
2013 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2014 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
2015                           VPSlotTracker &SlotTracker) const {
2016   O << Indent << "WIDEN ";
2017   printAsOperand(O, SlotTracker);
2018   O << " = " << Instruction::getOpcodeName(Opcode);
2019   printFlags(O);
2020   printOperands(O, SlotTracker);
2021 }
2022 #endif
2023 
2024 void VPWidenCastRecipe::execute(VPTransformState &State) {
2025   auto &Builder = State.Builder;
2026   // Vectorize casts.
2027   assert(State.VF.isVector() && "Not vectorizing?");
2028   Type *DestTy = VectorType::get(getResultType(), State.VF);
2029   VPValue *Op = getOperand(0);
2030   Value *A = State.get(Op);
2031   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2032   State.set(this, Cast);
2033   if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2034     applyFlags(*CastOp);
2035     applyMetadata(*CastOp);
2036   }
2037 }
2038 
2039 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2040                                                VPCostContext &Ctx) const {
2041   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2042   // the legacy cost model, including truncates/extends when evaluating a
2043   // reduction in a smaller type.
2044   if (!getUnderlyingValue())
2045     return 0;
2046   // Computes the CastContextHint from a recipe that may access memory.
2047   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2048     if (VF.isScalar())
2049       return TTI::CastContextHint::Normal;
2050     if (isa<VPInterleaveRecipe>(R))
2051       return TTI::CastContextHint::Interleave;
2052     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2053       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2054                                              : TTI::CastContextHint::Normal;
2055     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2056     if (WidenMemoryRecipe == nullptr)
2057       return TTI::CastContextHint::None;
2058     if (!WidenMemoryRecipe->isConsecutive())
2059       return TTI::CastContextHint::GatherScatter;
2060     if (WidenMemoryRecipe->isReverse())
2061       return TTI::CastContextHint::Reversed;
2062     if (WidenMemoryRecipe->isMasked())
2063       return TTI::CastContextHint::Masked;
2064     return TTI::CastContextHint::Normal;
2065   };
2066 
2067   VPValue *Operand = getOperand(0);
2068   TTI::CastContextHint CCH = TTI::CastContextHint::None;
2069   // For Trunc/FPTrunc, get the context from the only user.
2070   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2071       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
2072     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2073       CCH = ComputeCCH(StoreRecipe);
2074   }
2075   // For ZExt/SExt/FPExt, get the context from the operand.
2076   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2077            Opcode == Instruction::FPExt) {
2078     if (Operand->isLiveIn())
2079       CCH = TTI::CastContextHint::Normal;
2080     else if (Operand->getDefiningRecipe())
2081       CCH = ComputeCCH(Operand->getDefiningRecipe());
2082   }
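  // E.g., a trunc whose single user is a consecutive masked store is costed
  // with CastContextHint::Masked, while one feeding a gather/scatter uses
  // CastContextHint::GatherScatter; some targets can fold such narrowing into
  // the memory operation.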
2083 
2084   auto *SrcTy =
2085       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2086   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2087   // Arm TTI will use the underlying instruction to determine the cost.
2088   return Ctx.TTI.getCastInstrCost(
2089       Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2090       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
2091 }
2092 
2093 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2094 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
2095                               VPSlotTracker &SlotTracker) const {
2096   O << Indent << "WIDEN-CAST ";
2097   printAsOperand(O, SlotTracker);
2098   O << " = " << Instruction::getOpcodeName(Opcode);
2099   printFlags(O);
2100   printOperands(O, SlotTracker);
2101   O << " to " << *getResultType();
2102 }
2103 #endif
2104 
2105 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2106                                                VPCostContext &Ctx) const {
2107   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2108 }
2109 
2110 /// A helper function that returns an integer or floating-point constant with
2111 /// value C.
2112 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
2113   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2114                            : ConstantFP::get(Ty, C);
2115 }
2116 
2117 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2118 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2119                                           VPSlotTracker &SlotTracker) const {
2120   O << Indent;
2121   printAsOperand(O, SlotTracker);
2122   O << " = WIDEN-INDUCTION  ";
2123   printOperands(O, SlotTracker);
2124 
2125   if (auto *TI = getTruncInst())
2126     O << " (truncated to " << *TI->getType() << ")";
2127 }
2128 #endif
2129 
2130 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2131   // The step may be defined by a recipe in the preheader (e.g. if it requires
2132   // SCEV expansion), but for the canonical induction the step is required to be
2133   // 1, which is represented as live-in.
2134   if (getStepValue()->getDefiningRecipe())
2135     return false;
2136   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
2137   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
2138   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
2139   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2140          getScalarType() == CanIV->getScalarType();
2141 }
2142 
2143 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2144 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
2145                               VPSlotTracker &SlotTracker) const {
2146   O << Indent;
2147   printAsOperand(O, SlotTracker);
2148   O << " = DERIVED-IV ";
2149   getStartValue()->printAsOperand(O, SlotTracker);
2150   O << " + ";
2151   getOperand(1)->printAsOperand(O, SlotTracker);
2152   O << " * ";
2153   getStepValue()->printAsOperand(O, SlotTracker);
2154 }
2155 #endif
2156 
2157 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
2158   // Fast-math-flags propagate from the original induction instruction.
2159   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2160   if (hasFastMathFlags())
2161     State.Builder.setFastMathFlags(getFastMathFlags());
2162 
2163   // Compute scalar induction steps: BaseIV is the scalar induction variable
2164   // on which to base the steps, and Step is the size of each step.
2165 
2166   Value *BaseIV = State.get(getOperand(0), VPLane(0));
2167   Value *Step = State.get(getStepValue(), VPLane(0));
2168   IRBuilderBase &Builder = State.Builder;
2169 
2170   // Ensure step has the same type as that of scalar IV.
2171   Type *BaseIVTy = BaseIV->getType()->getScalarType();
2172   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2173 
2174   // We build scalar steps for both integer and floating-point induction
2175   // variables. Here, we determine the kind of arithmetic we will perform.
2176   Instruction::BinaryOps AddOp;
2177   Instruction::BinaryOps MulOp;
2178   if (BaseIVTy->isIntegerTy()) {
2179     AddOp = Instruction::Add;
2180     MulOp = Instruction::Mul;
2181   } else {
2182     AddOp = InductionOpcode;
2183     MulOp = Instruction::FMul;
2184   }
2185 
2186   // Determine the number of scalars we need to generate for each unroll
2187   // iteration.
2188   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2189   // Compute the scalar steps and save the results in State.
2190   Type *IntStepTy =
2191       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2192   Type *VecIVTy = nullptr;
2193   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2194   if (!FirstLaneOnly && State.VF.isScalable()) {
2195     VecIVTy = VectorType::get(BaseIVTy, State.VF);
2196     UnitStepVec =
2197         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2198     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2199     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2200   }
2201 
2202   unsigned StartLane = 0;
2203   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2204   if (State.Lane) {
2205     StartLane = State.Lane->getKnownLane();
2206     EndLane = StartLane + 1;
2207   }
2208   Value *StartIdx0;
2209   if (getUnrollPart(*this) == 0)
2210     StartIdx0 = ConstantInt::get(IntStepTy, 0);
2211   else {
2212     StartIdx0 = State.get(getOperand(2), /*IsScalar=*/true);
2213     if (getUnrollPart(*this) != 1) {
2214       StartIdx0 =
2215           Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2216                                                         getUnrollPart(*this)));
2217     }
2218     StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2219   }
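  // E.g., with a fixed VF of 4 and unroll part 2, the extra operand added by
  // unrolling holds the runtime VF, so StartIdx0 becomes 8 and the code below
  // produces BaseIV + (8 + Lane) * Step for each lane.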
2220 
2221   if (!FirstLaneOnly && State.VF.isScalable()) {
2222     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2223     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2224     if (BaseIVTy->isFloatingPointTy())
2225       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2226     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2227     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2228     State.set(this, Add);
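    // E.g., for VF=vscale x 4 this computes, per lane i, the value
    // BaseIV + (StartIdx0 + i) * Step in a single wide vector.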
2229     // It's also useful to record the per-lane values for the known minimum
2230     // number of elements, so we do that below. This improves code quality
2231     // when extracting the first element, for example.
2232   }
2233 
2234   if (BaseIVTy->isFloatingPointTy())
2235     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2236 
2237   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2238     Value *StartIdx = Builder.CreateBinOp(
2239         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2240     // The step returned by `createStepForVF` is a runtime-evaluated value
2241     // when VF is scalable. Otherwise, it should be folded into a Constant.
2242     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2243            "Expected StartIdx to be folded to a constant when VF is not "
2244            "scalable");
2245     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2246     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2247     State.set(this, Add, VPLane(Lane));
2248   }
2249 }
2250 
2251 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2252 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
2253                                   VPSlotTracker &SlotTracker) const {
2254   O << Indent;
2255   printAsOperand(O, SlotTracker);
2256   O << " = SCALAR-STEPS ";
2257   printOperands(O, SlotTracker);
2258 }
2259 #endif
2260 
2261 void VPWidenGEPRecipe::execute(VPTransformState &State) {
2262   assert(State.VF.isVector() && "not widening");
2263   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
2264   // Construct a vector GEP by widening the operands of the scalar GEP as
2265   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2266   // results in a vector of pointers when at least one operand of the GEP
2267   // is vector-typed. Thus, to keep the representation compact, we only use
2268   // vector-typed operands for loop-varying values.
2269 
2270   if (areAllOperandsInvariant()) {
2271     // If we are vectorizing, but the GEP has only loop-invariant operands,
2272     // the GEP we build (by only using vector-typed operands for
2273     // loop-varying values) would be a scalar pointer. Thus, to ensure we
2274     // produce a vector of pointers, we need to either arbitrarily pick an
2275     // operand to broadcast, or broadcast a clone of the original GEP.
2276     // Here, we broadcast a clone of the original.
2277     //
2278     // TODO: If at some point we decide to scalarize instructions having
2279     //       loop-invariant operands, this special case will no longer be
2280     //       required. We would add the scalarization decision to
2281     //       collectLoopScalars() and teach getVectorValue() to broadcast
2282     //       the lane-zero scalar value.
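    // Illustrative sketch: a GEP whose operands are all loop-invariant, e.g.
    //   %g = getelementptr i32, ptr %base, i64 %inv
    // is emitted once as a scalar GEP and then splatted, e.g. to <4 x ptr>
    // for VF=4.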
2283     SmallVector<Value *> Ops;
2284     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2285       Ops.push_back(State.get(getOperand(I), VPLane(0)));
2286 
2287     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
2288                                            ArrayRef(Ops).drop_front(), "",
2289                                            getGEPNoWrapFlags());
2290     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2291     State.set(this, Splat);
2292   } else {
2293     // If the GEP has at least one loop-varying operand, we are sure to
2294     // produce a vector of pointers unless VF is scalar.
2295     // The pointer operand of the new GEP. If it's loop-invariant, we
2296     // won't broadcast it.
2297     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
2298                                          : State.get(getOperand(0));
2299 
2300     // Collect all the indices for the new GEP. If any index is
2301     // loop-invariant, we won't broadcast it.
2302     SmallVector<Value *, 4> Indices;
2303     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2304       VPValue *Operand = getOperand(I);
2305       if (isIndexLoopInvariant(I - 1))
2306         Indices.push_back(State.get(Operand, VPLane(0)));
2307       else
2308         Indices.push_back(State.get(Operand));
2309     }
2310 
2311     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2312     // but otherwise it should be a vector.
2313     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
2314                                            Indices, "", getGEPNoWrapFlags());
2315     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2316            "NewGEP is not a pointer vector");
2317     State.set(this, NewGEP);
2318   }
2319 }
2320 
2321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2322 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2323                              VPSlotTracker &SlotTracker) const {
2324   O << Indent << "WIDEN-GEP ";
2325   O << (isPointerLoopInvariant() ? "Inv" : "Var");
2326   for (size_t I = 0; I < getNumOperands() - 1; ++I)
2327     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2328 
2329   O << " ";
2330   printAsOperand(O, SlotTracker);
2331   O << " = getelementptr";
2332   printFlags(O);
2333   printOperands(O, SlotTracker);
2334 }
2335 #endif
2336 
2337 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2338                            unsigned CurrentPart, IRBuilderBase &Builder) {
2339   // Use i32 for the gep index type when the value is constant,
2340   // or query DataLayout for a more suitable index type otherwise.
2341   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2342   return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2343              ? DL.getIndexType(Builder.getPtrTy(0))
2344              : Builder.getInt32Ty();
2345 }
2346 
2347 void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
2348   auto &Builder = State.Builder;
2349   unsigned CurrentPart = getUnrollPart(*this);
2350   bool IsUnitStride = Stride == 1 || Stride == -1;
2351   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2352                                 IsUnitStride, CurrentPart, Builder);
2353 
2354   // The wide store needs to start at the last vector element.
2355   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2356   if (IndexTy != RunTimeVF->getType())
2357     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2358   // NumElt = Stride * CurrentPart * RunTimeVF
2359   Value *NumElt = Builder.CreateMul(
2360       ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2361   // LastLane = Stride * (RunTimeVF - 1)
2362   Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2363   if (Stride != 1)
2364     LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
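  // E.g., for VF=4, Stride=-1 and part 0, NumElt is 0 and LastLane is -3, so
  // the result points at Ptr[-3], the lowest address touched by the reversed
  // wide access.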
2365   Value *Ptr = State.get(getOperand(0), VPLane(0));
2366   Value *ResultPtr =
2367       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2368   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2369                                 getGEPNoWrapFlags());
2370 
2371   State.set(this, ResultPtr, /*IsScalar*/ true);
2372 }
2373 
2374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2375 void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2376                                      VPSlotTracker &SlotTracker) const {
2377   O << Indent;
2378   printAsOperand(O, SlotTracker);
2379   O << " = vector-end-pointer";
2380   printFlags(O);
2381   printOperands(O, SlotTracker);
2382 }
2383 #endif
2384 
2385 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2386   auto &Builder = State.Builder;
2387   unsigned CurrentPart = getUnrollPart(*this);
2388   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2389                                 /*IsUnitStride*/ true, CurrentPart, Builder);
2390   Value *Ptr = State.get(getOperand(0), VPLane(0));
2391 
2392   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
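  // E.g., part 1 at VF=vscale x 4 yields an increment of vscale * 4, i.e. a
  // pointer to the second vector-width chunk of the access.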
2393   Value *ResultPtr =
2394       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2395 
2396   State.set(this, ResultPtr, /*IsScalar*/ true);
2397 }
2398 
2399 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2400 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2401                                   VPSlotTracker &SlotTracker) const {
2402   O << Indent;
2403   printAsOperand(O, SlotTracker);
2404   O << " = vector-pointer ";
2405 
2406   printOperands(O, SlotTracker);
2407 }
2408 #endif
2409 
2410 void VPBlendRecipe::execute(VPTransformState &State) {
2411   assert(isNormalized() && "Expected blend to be normalized!");
2412   // We know that all PHIs in non-header blocks are converted into
2413   // selects, so we don't have to worry about the insertion order and we
2414   // can just use the builder.
2415   // At this point we generate the predication tree. There may be
2416   // duplications since this is a simple recursive scan, but future
2417   // optimizations will clean it up.
2418 
2419   unsigned NumIncoming = getNumIncomingValues();
2420 
2421   // Generate a sequence of selects of the form:
2422   // SELECT(Mask3, In3,
2423   //        SELECT(Mask2, In2,
2424   //               SELECT(Mask1, In1,
2425   //                      In0)))
2426   // Note that Mask0 is never used: lanes for which no path reaches this phi
2427   // (and which are therefore essentially undef) take their value from In0.
2428   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2429   Value *Result = nullptr;
2430   for (unsigned In = 0; In < NumIncoming; ++In) {
2431     // We might have single-edge PHIs (from single-predecessor blocks); use
2432     // an identity 'select' for the first PHI operand.
2433     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2434     if (In == 0)
2435       Result = In0; // Initialize with the first incoming value.
2436     else {
2437       // Select between the current value and the previous incoming edge
2438       // based on the incoming mask.
2439       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2440       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2441     }
2442   }
2443   State.set(this, Result, OnlyFirstLaneUsed);
2444 }
2445 
2446 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2447                                            VPCostContext &Ctx) const {
2448   // Handle cases where only the first lane is used in the same way as the
2449   // legacy cost model.
2450   if (vputils::onlyFirstLaneUsed(this))
2451     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2452 
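  // A normalized blend of N incoming values is generated as N - 1 vector
  // selects, so, e.g., a blend with 3 incoming values is costed as 2 selects
  // on a <VF x i1> condition.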
2453   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2454   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2455   return (getNumIncomingValues() - 1) *
2456          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2457                                     CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2458 }
2459 
2460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2461 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2462                           VPSlotTracker &SlotTracker) const {
2463   O << Indent << "BLEND ";
2464   printAsOperand(O, SlotTracker);
2465   O << " =";
2466   if (getNumIncomingValues() == 1) {
2467     // Not a User of any mask: not really blending, this is a
2468     // single-predecessor phi.
2469     O << " ";
2470     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2471   } else {
2472     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2473       O << " ";
2474       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2475       if (I == 0)
2476         continue;
2477       O << "/";
2478       getMask(I)->printAsOperand(O, SlotTracker);
2479     }
2480   }
2481 }
2482 #endif
2483 
2484 void VPReductionRecipe::execute(VPTransformState &State) {
2485   assert(!State.Lane && "Reduction being replicated.");
2486   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2487   RecurKind Kind = getRecurrenceKind();
2488   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2489          "In-loop AnyOf reductions aren't currently supported");
2490   // Propagate the fast-math flags carried by the underlying instruction.
2491   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2492   State.Builder.setFastMathFlags(getFastMathFlags());
2493   Value *NewVecOp = State.get(getVecOp());
2494   if (VPValue *Cond = getCondOp()) {
2495     Value *NewCond = State.get(Cond, State.VF.isScalar());
2496     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2497     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2498 
2499     Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2500     if (State.VF.isVector())
2501       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2502 
2503     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2504     NewVecOp = Select;
2505   }
2506   Value *NewRed;
2507   Value *NextInChain;
2508   if (IsOrdered) {
2509     if (State.VF.isVector())
2510       NewRed =
2511           createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2512     else
2513       NewRed = State.Builder.CreateBinOp(
2514           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2515           PrevInChain, NewVecOp);
2516     PrevInChain = NewRed;
2517     NextInChain = NewRed;
2518   } else {
2519     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2520     NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2521     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2522       NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2523     else
2524       NextInChain = State.Builder.CreateBinOp(
2525           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
2526           PrevInChain);
2527   }
2528   State.set(this, NextInChain, /*IsScalar*/ true);
2529 }
2530 
2531 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2532   assert(!State.Lane && "Reduction being replicated.");
2533 
2534   auto &Builder = State.Builder;
2535   // Propagate the fast-math flags carried by the underlying instruction.
2536   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2537   Builder.setFastMathFlags(getFastMathFlags());
2538 
2539   RecurKind Kind = getRecurrenceKind();
2540   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2541   Value *VecOp = State.get(getVecOp());
2542   Value *EVL = State.get(getEVL(), VPLane(0));
2543 
2544   Value *Mask;
2545   if (VPValue *CondOp = getCondOp())
2546     Mask = State.get(CondOp);
2547   else
2548     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2549 
2550   Value *NewRed;
2551   if (isOrdered()) {
2552     NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2553   } else {
2554     NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2555     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2556       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2557     else
2558       NewRed = Builder.CreateBinOp(
2559           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
2560           Prev);
2561   }
2562   State.set(this, NewRed, /*IsScalar*/ true);
2563 }
2564 
2565 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2566                                                VPCostContext &Ctx) const {
2567   RecurKind RdxKind = getRecurrenceKind();
2568   Type *ElementTy = Ctx.Types.inferScalarType(this);
2569   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2570   unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2571   FastMathFlags FMFs = getFastMathFlags();
2572   std::optional<FastMathFlags> OptionalFMF =
2573       ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2574 
2575   // TODO: Support any-of reductions.
2576   assert(
2577       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2578        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2579       "Any-of reduction not implemented in VPlan-based cost model currently.");
2580 
2581   // Note that TTI should model the cost of moving the result to the scalar
2582   // register and the BinOp cost in getMinMaxReductionCost().
2583   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2584     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2585     return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2586   }
2587 
2588   // Note that TTI should model the cost of moving the result to the scalar
2589   // register and the BinOp cost in getArithmeticReductionCost().
2590   return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2591                                             Ctx.CostKind);
2592 }
2593 
2594 VPExpressionRecipe::VPExpressionRecipe(
2595     ExpressionTypes ExpressionType,
2596     ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2597     : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2598       ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
2599                             ExpressionRecipes.begin(), ExpressionRecipes.end())
2600                             .takeVector()),
2601       ExpressionType(ExpressionType) {
2602   assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2603   assert(
2604       none_of(ExpressionRecipes,
2605               [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2606       "expression cannot contain recipes with side-effects");
2607 
2608   // Maintain a copy of the expression recipes as a set of users.
2609   SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2610   for (auto *R : ExpressionRecipes)
2611     ExpressionRecipesAsSetOfUsers.insert(R);
2612 
2613   // Recipes in the expression, except the last one, must only be used by
2614   // (other) recipes inside the expression. If there are other users, external
2615   // to the expression, use a clone of the recipe for external users.
2616   for (VPSingleDefRecipe *R : ExpressionRecipes) {
2617     if (R != ExpressionRecipes.back() &&
2618         any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2619           return !ExpressionRecipesAsSetOfUsers.contains(U);
2620         })) {
2621       // There are users outside of the expression. Clone the recipe and use the
2622       // clone for those external users.
2623       VPSingleDefRecipe *CopyForExtUsers = R->clone();
2624       R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2625                                                 VPUser &U, unsigned) {
2626         return !ExpressionRecipesAsSetOfUsers.contains(&U);
2627       });
2628       CopyForExtUsers->insertBefore(R);
2629     }
2630     if (R->getParent())
2631       R->removeFromParent();
2632   }
2633 
2634   // Internalize all external operands to the expression recipes. To do so,
2635   // create new temporary VPValues for all operands defined by a recipe outside
2636   // the expression. The original operands are added as operands of the
2637   // VPExpressionRecipe itself.
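  // E.g., an extended-multiply-accumulate expression bundles (ext, ext, mul,
  // reduce) recipes; after this loop their out-of-expression operands are
  // routed through placeholder VPValues, with the originals held as operands
  // of this recipe.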
2638   for (auto *R : ExpressionRecipes) {
2639     for (const auto &[Idx, Op] : enumerate(R->operands())) {
2640       auto *Def = Op->getDefiningRecipe();
2641       if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2642         continue;
2643       addOperand(Op);
2644       LiveInPlaceholders.push_back(new VPValue());
2645       R->setOperand(Idx, LiveInPlaceholders.back());
2646     }
2647   }
2648 }
2649 
2650 void VPExpressionRecipe::decompose() {
2651   for (auto *R : ExpressionRecipes)
2652     R->insertBefore(this);
2653 
2654   for (const auto &[Idx, Op] : enumerate(operands()))
2655     LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2656 
2657   replaceAllUsesWith(ExpressionRecipes.back());
2658   ExpressionRecipes.clear();
2659 }
2660 
2661 InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
2662                                                 VPCostContext &Ctx) const {
2663   Type *RedTy = Ctx.Types.inferScalarType(this);
2664   auto *SrcVecTy = cast<VectorType>(
2665       toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2666   assert(RedTy->isIntegerTy() &&
2667          "VPExpressionRecipe only supports integer types currently.");
2668   switch (ExpressionType) {
2669   case ExpressionTypes::ExtendedReduction: {
2670     unsigned Opcode = RecurrenceDescriptor::getOpcode(
2671         cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2672     return Ctx.TTI.getExtendedReductionCost(
2673         Opcode,
2674         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2675             Instruction::ZExt,
2676         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
2677   }
2678   case ExpressionTypes::MulAccReduction:
2679     return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
2680 
2681   case ExpressionTypes::ExtMulAccReduction:
2682     return Ctx.TTI.getMulAccReductionCost(
2683         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2684             Instruction::ZExt,
2685         RedTy, SrcVecTy, Ctx.CostKind);
2686   }
2687   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2688 }
2689 
2690 bool VPExpressionRecipe::mayReadOrWriteMemory() const {
2691   return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2692     return R->mayReadFromMemory() || R->mayWriteToMemory();
2693   });
2694 }
2695 
2696 bool VPExpressionRecipe::mayHaveSideEffects() const {
2697   assert(
2698       none_of(ExpressionRecipes,
2699               [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2700       "expression cannot contain recipes with side-effects");
2701   return false;
2702 }
2703 
2704 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2705 
2706 void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
2707                                VPSlotTracker &SlotTracker) const {
2708   O << Indent << "EXPRESSION ";
2709   printAsOperand(O, SlotTracker);
2710   O << " = ";
2711   auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2712   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2713 
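  // E.g. an extended add-reduction prints roughly as:
  //   EXPRESSION vp<%5> = vp<%acc> + reduce.add (vp<%x> zext to i32)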
2714   switch (ExpressionType) {
2715   case ExpressionTypes::ExtendedReduction: {
2716     getOperand(1)->printAsOperand(O, SlotTracker);
2717     O << " +";
2718     O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
2719     getOperand(0)->printAsOperand(O, SlotTracker);
2720     Red->printFlags(O);
2721 
2722     auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2723     O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2724       << *Ext0->getResultType();
2725     if (Red->isConditional()) {
2726       O << ", ";
2727       Red->getCondOp()->printAsOperand(O, SlotTracker);
2728     }
2729     O << ")";
2730     break;
2731   }
2732   case ExpressionTypes::MulAccReduction:
2733   case ExpressionTypes::ExtMulAccReduction: {
2734     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
2735     O << " + ";
2736     O << "reduce."
2737       << Instruction::getOpcodeName(
2738              RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2739       << " (";
2740     O << "mul";
2741     bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
2742     auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
2743                                                : ExpressionRecipes[0]);
2744     Mul->printFlags(O);
2745     if (IsExtended)
2746       O << "(";
2747     getOperand(0)->printAsOperand(O, SlotTracker);
2748     if (IsExtended) {
2749       auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2750       O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2751         << *Ext0->getResultType() << "), (";
2752     } else {
2753       O << ", ";
2754     }
2755     getOperand(1)->printAsOperand(O, SlotTracker);
2756     if (IsExtended) {
2757       auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2758       O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2759         << *Ext1->getResultType() << ")";
2760     }
2761     if (Red->isConditional()) {
2762       O << ", ";
2763       Red->getCondOp()->printAsOperand(O, SlotTracker);
2764     }
2765     O << ")";
2766     break;
2767   }
2768   }
2769 }
2770 
2771 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2772                               VPSlotTracker &SlotTracker) const {
2773   O << Indent << "REDUCE ";
2774   printAsOperand(O, SlotTracker);
2775   O << " = ";
2776   getChainOp()->printAsOperand(O, SlotTracker);
2777   O << " +";
2778   printFlags(O);
2779   O << " reduce."
2780     << Instruction::getOpcodeName(
2781            RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2782     << " (";
2783   getVecOp()->printAsOperand(O, SlotTracker);
2784   if (isConditional()) {
2785     O << ", ";
2786     getCondOp()->printAsOperand(O, SlotTracker);
2787   }
2788   O << ")";
2789 }
2790 
2791 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2792                                  VPSlotTracker &SlotTracker) const {
2793   O << Indent << "REDUCE ";
2794   printAsOperand(O, SlotTracker);
2795   O << " = ";
2796   getChainOp()->printAsOperand(O, SlotTracker);
2797   O << " +";
2798   printFlags(O);
2799   O << " vp.reduce."
2800     << Instruction::getOpcodeName(
2801            RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2802     << " (";
2803   getVecOp()->printAsOperand(O, SlotTracker);
2804   O << ", ";
2805   getEVL()->printAsOperand(O, SlotTracker);
2806   if (isConditional()) {
2807     O << ", ";
2808     getCondOp()->printAsOperand(O, SlotTracker);
2809   }
2810   O << ")";
2811 }
2812 
2813 #endif
2814 
2815 /// A helper function to scalarize a single Instruction in the innermost loop.
2816 /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
2817 /// operands from \p RepRecipe instead of \p Instr's operands.
2818 static void scalarizeInstruction(const Instruction *Instr,
2819                                  VPReplicateRecipe *RepRecipe,
2820                                  const VPLane &Lane, VPTransformState &State) {
2821   assert((!Instr->getType()->isAggregateType() ||
2822           canVectorizeTy(Instr->getType())) &&
2823          "Expected vectorizable or non-aggregate type.");
2824 
2825   // Does this instruction return a value?
2826   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2827 
2828   Instruction *Cloned = Instr->clone();
2829   if (!IsVoidRetTy) {
2830     Cloned->setName(Instr->getName() + ".cloned");
2831 #if !defined(NDEBUG)
2832     // Verify that VPlan type inference results agree with the type of the
2833     // generated values.
2834     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2835            "inferred type and type from generated instructions do not match");
2836 #endif
2837   }
2838 
2839   RepRecipe->applyFlags(*Cloned);
2840   RepRecipe->applyMetadata(*Cloned);
2841 
2842   if (auto DL = RepRecipe->getDebugLoc())
2843     State.setDebugLocFrom(DL);
2844 
2845   // Replace the operands of the cloned instruction with their scalar
2846   // equivalents in the new loop.
2847   for (const auto &I : enumerate(RepRecipe->operands())) {
2848     auto InputLane = Lane;
2849     VPValue *Operand = I.value();
2850     if (vputils::isSingleScalar(Operand))
2851       InputLane = VPLane::getFirstLane();
2852     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2853   }
2854 
2855   // Place the cloned scalar in the new loop.
2856   State.Builder.Insert(Cloned);
2857 
2858   State.set(RepRecipe, Cloned, Lane);
2859 
2860   // If we just cloned a new assumption, add it to the assumption cache.
2861   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2862     State.AC->registerAssumption(II);
2863 
2864   assert(
2865       (RepRecipe->getParent()->getParent() ||
2866        !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2867        all_of(RepRecipe->operands(),
2868               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2869       "Expected a recipe is either within a region or all of its operands "
2870       "are defined outside the vectorized region.");
2871 }
2872 
2873 void VPReplicateRecipe::execute(VPTransformState &State) {
2874   Instruction *UI = getUnderlyingInstr();
2875 
2876   if (!State.Lane) {
2877     assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
2878                              "must have already been unrolled");
2879     scalarizeInstruction(UI, this, VPLane(0), State);
2880     return;
2881   }
2882 
2883   assert((State.VF.isScalar() || !isSingleScalar()) &&
2884          "uniform recipe shouldn't be predicated");
2885   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
2886   scalarizeInstruction(UI, this, *State.Lane, State);
2887   // Insert the scalar instance, packing it into a vector.
2888   if (State.VF.isVector() && shouldPack()) {
2889     Value *WideValue =
2890         State.Lane->isFirstLane()
2891             ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
2892             : State.get(this);
2893     State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
2894                                                         *State.Lane));
2895   }
2896 }
2897 
2898 bool VPReplicateRecipe::shouldPack() const {
2899   // Find if the recipe is used by a widened recipe via an intervening
2900   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
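  // E.g. a replicated, predicated load whose result reaches a widened user
  // through a VPPredInstPHIRecipe must pack its per-lane scalars into a
  // vector.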
2901   return any_of(users(), [](const VPUser *U) {
2902     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2903       return any_of(PredR->users(), [PredR](const VPUser *U) {
2904         return !U->usesScalars(PredR);
2905       });
2906     return false;
2907   });
2908 }
2909 
2910 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2911                                                VPCostContext &Ctx) const {
2912   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2913   // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
2914   // transform; avoid computing its cost multiple times for now.
2915   Ctx.SkipCostComputation.insert(UI);
2916 
2917   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2918   Type *ResultTy = Ctx.Types.inferScalarType(this);
2919   switch (UI->getOpcode()) {
2920   case Instruction::GetElementPtr:
2921     // We mark this instruction as zero-cost because the cost of GEPs in
2922     // vectorized code depends on whether the corresponding memory instruction
2923     // is scalarized or not. Therefore, we handle GEPs with the memory
2924     // instruction cost.
2925     return 0;
2926   case Instruction::Add:
2927   case Instruction::Sub:
2928   case Instruction::FAdd:
2929   case Instruction::FSub:
2930   case Instruction::Mul:
2931   case Instruction::FMul:
2932   case Instruction::FDiv:
2933   case Instruction::FRem:
2934   case Instruction::Shl:
2935   case Instruction::LShr:
2936   case Instruction::AShr:
2937   case Instruction::And:
2938   case Instruction::Or:
2939   case Instruction::Xor: {
2940     auto Op2Info = Ctx.getOperandInfo(getOperand(1));
2941     SmallVector<const Value *, 4> Operands(UI->operand_values());
2942     return Ctx.TTI.getArithmeticInstrCost(
2943                UI->getOpcode(), ResultTy, CostKind,
2944                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2945                Op2Info, Operands, UI, &Ctx.TLI) *
2946            (isSingleScalar() ? 1 : VF.getFixedValue());
2947   }
2948   }
2949 
2950   return Ctx.getLegacyCost(UI, VF);
2951 }
2952 
2953 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2954 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2955                               VPSlotTracker &SlotTracker) const {
2956   O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
2957 
2958   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2959     printAsOperand(O, SlotTracker);
2960     O << " = ";
2961   }
2962   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2963     O << "call";
2964     printFlags(O);
2965     O << "@" << CB->getCalledFunction()->getName() << "(";
2966     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2967                     O, [&O, &SlotTracker](VPValue *Op) {
2968                       Op->printAsOperand(O, SlotTracker);
2969                     });
2970     O << ")";
2971   } else {
2972     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2973     printFlags(O);
2974     printOperands(O, SlotTracker);
2975   }
2976 
2977   if (shouldPack())
2978     O << " (S->V)";
2979 }
2980 #endif
2981 
2982 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2983   assert(State.Lane && "Branch on Mask works only on single instance.");
2984 
2985   VPValue *BlockInMask = getOperand(0);
2986   Value *ConditionBit = State.get(BlockInMask, *State.Lane);
2987 
2988   // Replace the temporary unreachable terminator with a new conditional branch,
2989   // whose two destinations will be set later when they are created.
2990   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2991   assert(isa<UnreachableInst>(CurrentTerminator) &&
2992          "Expected to replace unreachable terminator with conditional branch.");
2993   auto *CondBr =
2994       State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
2995   CondBr->setSuccessor(0, nullptr);
2996   CurrentTerminator->eraseFromParent();
2997 }
2998 
2999 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3000                                                   VPCostContext &Ctx) const {
3001   // The legacy cost model doesn't assign costs to branches for individual
3002   // replicate regions. Match the current behavior in the VPlan cost model for
3003   // now.
3004   return 0;
3005 }
3006 
3007 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
3008   assert(State.Lane && "Predicated instruction PHI works per instance.");
3009   Instruction *ScalarPredInst =
3010       cast<Instruction>(State.get(getOperand(0), *State.Lane));
3011   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3012   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3013   assert(PredicatingBB && "Predicated block has no single predecessor.");
3014   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
3015          "operand must be VPReplicateRecipe");
3016 
3017   // Under the current pack/unpack logic we need to generate only a single phi
3018   // node: if a vector value for the predicated instruction exists here, the
3019   // instruction has vector users only, and a phi for the vector value is
3020   // needed. In this case the recipe of the predicated instruction is marked to
3021   // also do that packing, thereby "hoisting" the insert-element sequence.
3022   // Otherwise, a phi node for the scalar value is needed.
3023   if (State.hasVectorValue(getOperand(0))) {
3024     Value *VectorValue = State.get(getOperand(0));
3025     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
3026     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
3027     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
3028     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
3029     if (State.hasVectorValue(this))
3030       State.reset(this, VPhi);
3031     else
3032       State.set(this, VPhi);
3033     // NOTE: Currently we need to update the value of the operand, so the next
3034     // predicated iteration inserts its generated value in the correct vector.
3035     State.reset(getOperand(0), VPhi);
3036   } else {
3037     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3038       return;
3039 
3040     Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3041     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3042     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3043                      PredicatingBB);
3044     Phi->addIncoming(ScalarPredInst, PredicatedBB);
3045     if (State.hasScalarValue(this, *State.Lane))
3046       State.reset(this, Phi, *State.Lane);
3047     else
3048       State.set(this, Phi, *State.Lane);
3049     // NOTE: Currently we need to update the value of the operand, so the next
3050     // predicated iteration inserts its generated value in the correct vector.
3051     State.reset(getOperand(0), Phi, *State.Lane);
3052   }
3053 }
3054 
3055 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3056 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3057                                 VPSlotTracker &SlotTracker) const {
3058   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3059   printAsOperand(O, SlotTracker);
3060   O << " = ";
3061   printOperands(O, SlotTracker);
3062 }
3063 #endif
3064 
3065 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
3066                                                  VPCostContext &Ctx) const {
3067   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3068   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3069   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3070                     ->getAddressSpace();
3071   unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3072                         ? Instruction::Load
3073                         : Instruction::Store;
3074 
3075   if (!Consecutive) {
3076     // TODO: Using the original IR may not be accurate.
3077     // Currently, ARM will use the underlying IR to calculate gather/scatter
3078     // instruction cost.
3079     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
3080     assert(!Reverse &&
3081            "Inconsecutive memory access should not have the order.");
3082     return Ctx.TTI.getAddressComputationCost(Ty) +
3083            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3084                                           Ctx.CostKind, &Ingredient);
3085   }
3086 
3087   InstructionCost Cost = 0;
3088   if (IsMasked) {
3089     Cost +=
3090         Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
3091   } else {
3092     TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3093         isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
3094                                                            : getOperand(1));
3095     Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3096                                     OpInfo, &Ingredient);
3097   }
3098   if (!Reverse)
3099     return Cost;
3100 
3101   return Cost + Ctx.TTI.getShuffleCost(
3102              TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3103              cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3104 }
3105 
3106 void VPWidenLoadRecipe::execute(VPTransformState &State) {
3107   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3108   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3109   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3110   bool CreateGather = !isConsecutive();
3111 
3112   auto &Builder = State.Builder;
3113   Value *Mask = nullptr;
3114   if (auto *VPMask = getMask()) {
3115     // Mask reversal is only needed for non-all-one masks; the reverse of a
3116     // null (all-one) mask is still a null (all-one) mask.
3117     Mask = State.get(VPMask);
3118     if (isReverse())
3119       Mask = Builder.CreateVectorReverse(Mask, "reverse");
3120   }
3121 
3122   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3123   Value *NewLI;
3124   if (CreateGather) {
3125     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3126                                        "wide.masked.gather");
3127   } else if (Mask) {
3128     NewLI =
3129         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3130                                  PoisonValue::get(DataTy), "wide.masked.load");
3131   } else {
3132     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3133   }
3134   applyMetadata(*cast<Instruction>(NewLI));
3135   if (Reverse)
3136     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3137   State.set(this, NewLI);
3138 }
3139 
3140 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3141 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
3142                               VPSlotTracker &SlotTracker) const {
3143   O << Indent << "WIDEN ";
3144   printAsOperand(O, SlotTracker);
3145   O << " = load ";
3146   printOperands(O, SlotTracker);
3147 }
3148 #endif
3149 
3150 /// Use an all-true mask for the reverse rather than the actual mask, as this
3151 /// avoids a dependence without affecting the result.
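/// For a fixed VF of 4 and an i32 operand, the emitted call is roughly:
///   %rev = call <4 x i32> @llvm.experimental.vp.reverse.v4i32(
///              <4 x i32> %v, <4 x i1> splat (i1 true), i32 %evl)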
3152 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3153                                      Value *EVL, const Twine &Name) {
3154   VectorType *ValTy = cast<VectorType>(Operand->getType());
3155   Value *AllTrueMask =
3156       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3157   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3158                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
3159 }
3160 
3161 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3162   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3163   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3164   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3165   bool CreateGather = !isConsecutive();
3166 
3167   auto &Builder = State.Builder;
3168   CallInst *NewLI;
3169   Value *EVL = State.get(getEVL(), VPLane(0));
3170   Value *Addr = State.get(getAddr(), !CreateGather);
3171   Value *Mask = nullptr;
3172   if (VPValue *VPMask = getMask()) {
3173     Mask = State.get(VPMask);
3174     if (isReverse())
3175       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3176   } else {
3177     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3178   }
3179 
3180   if (CreateGather) {
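  // Emit a vp.gather for non-consecutive accesses and a plain vp.load
  // otherwise; e.g. the consecutive case produces roughly:
  //   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  //                     ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)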
3181     NewLI =
3182         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3183                                 nullptr, "wide.masked.gather");
3184   } else {
3185     NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3186                                     {Addr, Mask, EVL}, nullptr, "vp.op.load");
3187   }
3188   NewLI->addParamAttr(
3189       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
3190   applyMetadata(*NewLI);
3191   Instruction *Res = NewLI;
3192   if (isReverse())
3193     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3194   State.set(this, Res);
3195 }
3196 
3197 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
3198                                                   VPCostContext &Ctx) const {
3199   if (!Consecutive || IsMasked)
3200     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3201 
3202   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
3203   // here because the EVL recipes use EVL to replace the tail mask, while the
3204   // legacy model always accounts for the cost of the mask.
3205   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
3206   // no longer need to compare against the legacy cost model.
3207   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3208   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3209   unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3210   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3211       Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
3212   if (!Reverse)
3213     return Cost;
3214 
3215   return Cost + Ctx.TTI.getShuffleCost(
3216                     TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3217                     cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3218 }
3219 
3220 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3221 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3222                                  VPSlotTracker &SlotTracker) const {
3223   O << Indent << "WIDEN ";
3224   printAsOperand(O, SlotTracker);
3225   O << " = vp.load ";
3226   printOperands(O, SlotTracker);
3227 }
3228 #endif
3229 
3230 void VPWidenStoreRecipe::execute(VPTransformState &State) {
3231   VPValue *StoredVPValue = getStoredValue();
3232   bool CreateScatter = !isConsecutive();
3233   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3234 
3235   auto &Builder = State.Builder;
3236 
3237   Value *Mask = nullptr;
3238   if (auto *VPMask = getMask()) {
3239     // Mask reversal is only needed for non-all-one masks; the reverse of a
3240     // null (all-one) mask is still a null (all-one) mask.
3241     Mask = State.get(VPMask);
3242     if (isReverse())
3243       Mask = Builder.CreateVectorReverse(Mask, "reverse");
3244   }
3245 
3246   Value *StoredVal = State.get(StoredVPValue);
3247   if (isReverse()) {
3248     // If we store to reverse consecutive memory locations, then we need
3249     // to reverse the order of elements in the stored value.
3250     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3251     // We don't want to update the value in the map as it might be used in
3252     // another expression. So don't call resetVectorValue(StoredVal).
3253   }
3254   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3255   Instruction *NewSI = nullptr;
3256   if (CreateScatter)
3257     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3258   else if (Mask)
3259     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3260   else
3261     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3262   applyMetadata(*NewSI);
3263 }
3264 
3265 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3266 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
3267                                VPSlotTracker &SlotTracker) const {
3268   O << Indent << "WIDEN store ";
3269   printOperands(O, SlotTracker);
3270 }
3271 #endif
3272 
3273 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3274   VPValue *StoredValue = getStoredValue();
3275   bool CreateScatter = !isConsecutive();
3276   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3277 
3278   auto &Builder = State.Builder;
3279 
3280   CallInst *NewSI = nullptr;
3281   Value *StoredVal = State.get(StoredValue);
3282   Value *EVL = State.get(getEVL(), VPLane(0));
3283   if (isReverse())
3284     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3285   Value *Mask = nullptr;
3286   if (VPValue *VPMask = getMask()) {
3287     Mask = State.get(VPMask);
3288     if (isReverse())
3289       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3290   } else {
3291     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3292   }
3293   Value *Addr = State.get(getAddr(), !CreateScatter);
3294   if (CreateScatter) {
3295     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3296                                     Intrinsic::vp_scatter,
3297                                     {StoredVal, Addr, Mask, EVL});
3298   } else {
3299     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3300                                     Intrinsic::vp_store,
3301                                     {StoredVal, Addr, Mask, EVL});
3302   }
3303   NewSI->addParamAttr(
3304       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
3305   applyMetadata(*NewSI);
3306 }
3307 
3308 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3309                                                    VPCostContext &Ctx) const {
3310   if (!Consecutive || IsMasked)
3311     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3312 
3313   // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
3314   // here because the EVL recipes use EVL to replace the tail mask, while the
3315   // legacy model always accounts for the cost of the mask.
3316   // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
3317   // no longer need to compare against the legacy cost model.
3318   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3319   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3320   unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3321   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3322       Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
3323   if (!Reverse)
3324     return Cost;
3325 
3326   return Cost + Ctx.TTI.getShuffleCost(
3327                     TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3328                     cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3329 }
3330 
3331 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3332 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3333                                   VPSlotTracker &SlotTracker) const {
3334   O << Indent << "WIDEN vp.store ";
3335   printOperands(O, SlotTracker);
3336 }
3337 #endif
3338 
3339 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3340                                      VectorType *DstVTy, const DataLayout &DL) {
3341   // Verify that V is a vector with the same number of elements as DstVTy.
3342   auto VF = DstVTy->getElementCount();
3343   auto *SrcVecTy = cast<VectorType>(V->getType());
3344   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3345   Type *SrcElemTy = SrcVecTy->getElementType();
3346   Type *DstElemTy = DstVTy->getElementType();
3347   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3348          "Vector elements must have same size");
3349 
3350   // Do a direct cast if element types are castable.
3351   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL))
3352     return Builder.CreateBitOrPointerCast(V, DstVTy);
3354   // V cannot be directly cast to the desired vector type. This may happen
3355   // when V is a floating point vector but DstVTy is a vector of pointers, or
3356   // vice versa. Handle it with a two-step bitcast through an intermediate
3357   // integer type, i.e. Ptr <-> Int <-> Float.
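  // E.g. with 64-bit pointers, casting <4 x double> to <4 x ptr> becomes
  //   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x ptr>.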
3358   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3359          "Only one type should be a pointer type");
3360   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3361          "Only one type should be a floating point type");
3362   Type *IntTy =
3363       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3364   auto *VecIntTy = VectorType::get(IntTy, VF);
3365   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3366   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3367 }
3368 
3369 /// Return a vector containing interleaved elements from multiple
3370 /// smaller input vectors.
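/// E.g. for factor 2, interleaving {<A0, A1>, <B0, B1>} produces
/// <A0, B0, A1, B1>.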
3371 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3372                                 const Twine &Name) {
3373   unsigned Factor = Vals.size();
3374   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3375 
3376   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3377 #ifndef NDEBUG
3378   for (Value *Val : Vals)
3379     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3380 #endif
3381 
3382   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3383   // we must use intrinsics to interleave.
3384   if (VecTy->isScalableTy()) {
3385     assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3386     VectorType *InterleaveTy =
3387         VectorType::get(VecTy->getElementType(),
3388                         VecTy->getElementCount().multiplyCoefficientBy(Factor));
3389     return Builder.CreateIntrinsic(InterleaveTy,
3390                                    getInterleaveIntrinsicID(Factor), Vals,
3391                                    /*FMFSource=*/nullptr, Name);
3392   }
3393 
3394   // Fixed length. Start by concatenating all vectors into a wide vector.
3395   Value *WideVec = concatenateVectors(Builder, Vals);
3396 
3397   // Interleave the elements into the wide vector.
3398   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3399   return Builder.CreateShuffleVector(
3400       WideVec, createInterleaveMask(NumElts, Factor), Name);
3401 }
3402 
3403 // Try to vectorize the interleave group that \p Instr belongs to.
3404 //
3405 // E.g. Translate following interleaved load group (factor = 3):
3406 //   for (i = 0; i < N; i+=3) {
3407 //     R = Pic[i];             // Member of index 0
3408 //     G = Pic[i+1];           // Member of index 1
3409 //     B = Pic[i+2];           // Member of index 2
3410 //     ... // do something to R, G, B
3411 //   }
3412 // To:
3413 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
3414 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
3415 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
3416 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
3417 //
3418 // Or translate following interleaved store group (factor = 3):
3419 //   for (i = 0; i < N; i+=3) {
3420 //     ... do something to R, G, B
3421 //     Pic[i]   = R;           // Member of index 0
3422 //     Pic[i+1] = G;           // Member of index 1
3423 //     Pic[i+2] = B;           // Member of index 2
3424 //   }
3425 // To:
3426 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3427 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3428 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3429 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
3430 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
3431 void VPInterleaveRecipe::execute(VPTransformState &State) {
3432   assert(!State.Lane && "Interleave group being replicated.");
3433   const InterleaveGroup<Instruction> *Group = IG;
3434   Instruction *Instr = Group->getInsertPos();
3435 
3436   // Prepare for the vector type of the interleaved load/store.
3437   Type *ScalarTy = getLoadStoreType(Instr);
3438   unsigned InterleaveFactor = Group->getFactor();
3439   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3440 
3441   VPValue *BlockInMask = getMask();
3442   VPValue *Addr = getAddr();
3443   Value *ResAddr = State.get(Addr, VPLane(0));
3444   Value *PoisonVec = PoisonValue::get(VecTy);
3445 
3446   auto CreateGroupMask = [&BlockInMask, &State,
3447                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
3448     if (State.VF.isScalable()) {
3449       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3450       assert(InterleaveFactor <= 8 &&
3451              "Unsupported deinterleave factor for scalable vectors");
3452       auto *ResBlockInMask = State.get(BlockInMask);
3453       SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3454       return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3455     }
3456 
3457     if (!BlockInMask)
3458       return MaskForGaps;
3459 
3460     Value *ResBlockInMask = State.get(BlockInMask);
3461     Value *ShuffledMask = State.Builder.CreateShuffleVector(
3462         ResBlockInMask,
3463         createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3464         "interleaved.mask");
3465     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3466                                                    ShuffledMask, MaskForGaps)
3467                        : ShuffledMask;
3468   };
3469 
3470   const DataLayout &DL = Instr->getDataLayout();
3471   // Vectorize the interleaved load group.
3472   if (isa<LoadInst>(Instr)) {
3473     Value *MaskForGaps = nullptr;
3474     if (NeedsMaskForGaps) {
3475       MaskForGaps =
3476           createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3477       assert(MaskForGaps && "Mask for Gaps is required but it is null");
3478     }
3479 
3480     Instruction *NewLoad;
3481     if (BlockInMask || MaskForGaps) {
3482       Value *GroupMask = CreateGroupMask(MaskForGaps);
3483       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3484                                                Group->getAlign(), GroupMask,
3485                                                PoisonVec, "wide.masked.vec");
3486     } else
3487       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3488                                                 Group->getAlign(), "wide.vec");
3489     Group->addMetadata(NewLoad);
3490 
3491     ArrayRef<VPValue *> VPDefs = definedValues();
3492     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3493     if (VecTy->isScalableTy()) {
3494       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3495       // so we must use intrinsics to deinterleave.
3496       assert(InterleaveFactor <= 8 &&
3497              "Unsupported deinterleave factor for scalable vectors");
3498       Value *Deinterleave = State.Builder.CreateIntrinsic(
3499           getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
3500           NewLoad,
3501           /*FMFSource=*/nullptr, "strided.vec");
3502 
3503       for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3504         Instruction *Member = Group->getMember(I);
3505         Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
3506         if (!Member) {
3507           // The group has a gap here, so this extracted value is unused.
3508           cast<Instruction>(StridedVec)->eraseFromParent();
3509           continue;
3510         }
3511         // If this member has different type, cast the result type.
3512         if (Member->getType() != ScalarTy) {
3513           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3514           StridedVec =
3515               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3516         }
3517 
3518         if (Group->isReverse())
3519           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3520 
3521         State.set(VPDefs[J], StridedVec);
3522         ++J;
3523       }
3524 
3525       return;
3526     }
3527     assert(!State.VF.isScalable() && "VF is assumed to be non-scalable.");
3528 
3529     // For each member in the group, shuffle out the appropriate data from the
3530     // wide loads.
3531     unsigned J = 0;
3532     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3533       Instruction *Member = Group->getMember(I);
3534 
3535       // Skip the gaps in the group.
3536       if (!Member)
3537         continue;
3538 
3539       auto StrideMask =
3540           createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
3541       Value *StridedVec =
3542           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
3543 
3544       // If this member has a different type, cast the result accordingly.
3545       if (Member->getType() != ScalarTy) {
3546         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3547         StridedVec =
3548             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3549       }
3550 
3551       if (Group->isReverse())
3552         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3553 
3554       State.set(VPDefs[J], StridedVec);
3555       ++J;
3556     }
3557     return;
3558   }
3559 
3560   // The subvector type for the current instruction.
3561   auto *SubVT = VectorType::get(ScalarTy, State.VF);
3562 
3563   // Vectorize the interleaved store group.
3564   Value *MaskForGaps =
3565       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3566   assert((!MaskForGaps || !State.VF.isScalable()) &&
3567          "masking gaps for scalable vectors is not yet supported.");
3568   ArrayRef<VPValue *> StoredValues = getStoredValues();
3569   // Collect the stored vector from each member.
3570   SmallVector<Value *, 4> StoredVecs;
3571   unsigned StoredIdx = 0;
3572   for (unsigned i = 0; i < InterleaveFactor; i++) {
3573     assert((Group->getMember(i) || MaskForGaps) &&
3574            "Fail to get a member from an interleaved store group");
3575     Instruction *Member = Group->getMember(i);
3576 
3577     // Skip the gaps in the group.
3578     if (!Member) {
3579       Value *Undef = PoisonValue::get(SubVT);
3580       StoredVecs.push_back(Undef);
3581       continue;
3582     }
3583 
3584     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3585     ++StoredIdx;
3586 
3587     if (Group->isReverse())
3588       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3589 
3590     // If this member has different type, cast it to a unified type.
3591 
3592     if (StoredVec->getType() != SubVT)
3593       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3594 
3595     StoredVecs.push_back(StoredVec);
3596   }
3597 
3598   // Interleave all the smaller vectors into one wider vector.
3599   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3600   Instruction *NewStoreInstr;
3601   if (BlockInMask || MaskForGaps) {
3602     Value *GroupMask = CreateGroupMask(MaskForGaps);
3603     NewStoreInstr = State.Builder.CreateMaskedStore(
3604         IVec, ResAddr, Group->getAlign(), GroupMask);
3605   } else
3606     NewStoreInstr =
3607         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3608 
3609   Group->addMetadata(NewStoreInstr);
3610 }
3611 
3612 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3613 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3614                                VPSlotTracker &SlotTracker) const {
3615   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3616   IG->getInsertPos()->printAsOperand(O, false);
3617   O << ", ";
3618   getAddr()->printAsOperand(O, SlotTracker);
3619   VPValue *Mask = getMask();
3620   if (Mask) {
3621     O << ", ";
3622     Mask->printAsOperand(O, SlotTracker);
3623   }
3624 
3625   unsigned OpIdx = 0;
3626   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3627     if (!IG->getMember(i))
3628       continue;
3629     if (getNumStoreOperands() > 0) {
3630       O << "\n" << Indent << "  store ";
3631       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3632       O << " to index " << i;
3633     } else {
3634       O << "\n" << Indent << "  ";
3635       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3636       O << " = load from index " << i;
3637     }
3638     ++OpIdx;
3639   }
3640 }
3641 #endif
3642 
3643 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3644                                                 VPCostContext &Ctx) const {
3645   Instruction *InsertPos = getInsertPos();
3646   // Find the VPValue index of the insert position, skipping gaps in the group.
3647   unsigned InsertPosIdx = 0;
3648   for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3649     if (auto *Member = IG->getMember(Idx)) {
3650       if (Member == InsertPos)
3651         break;
3652       InsertPosIdx++;
3653     }
3654   Type *ValTy = Ctx.Types.inferScalarType(
3655       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3656                                 : getStoredValues()[InsertPosIdx]);
3657   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3658   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3659 
3660   unsigned InterleaveFactor = IG->getFactor();
3661   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3662 
3663   // Holds the indices of existing members in the interleaved group.
3664   SmallVector<unsigned, 4> Indices;
3665   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3666     if (IG->getMember(IF))
3667       Indices.push_back(IF);
3668 
3669   // Calculate the cost of the whole interleaved group.
3670   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3671       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3672       IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
3673 
3674   if (!IG->isReverse())
3675     return Cost;
3676 
3677   return Cost + IG->getNumMembers() *
3678                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3679                                            VectorTy, VectorTy, {}, Ctx.CostKind,
3680                                            0);
3681 }
3682 
3683 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3684 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3685                                    VPSlotTracker &SlotTracker) const {
3686   O << Indent << "EMIT ";
3687   printAsOperand(O, SlotTracker);
3688   O << " = CANONICAL-INDUCTION ";
3689   printOperands(O, SlotTracker);
3690 }
3691 #endif
3692 
3693 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3694   return IsScalarAfterVectorization &&
3695          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3696 }
3697 
3698 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3699   assert(getInductionDescriptor().getKind() ==
3700              InductionDescriptor::IK_PtrInduction &&
3701          "Not a pointer induction according to InductionDescriptor!");
3702   assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
3703          "Unexpected type.");
3704   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3705          "Recipe should have been replaced");
3706 
3707   unsigned CurrentPart = getUnrollPart(*this);
3708 
3709   // Build a pointer phi
3710   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3711   Type *ScStValueType = ScalarStartValue->getType();
3712 
3713   BasicBlock *VectorPH =
3714       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3715   PHINode *NewPointerPhi = nullptr;
3716   if (CurrentPart == 0) {
3717     IRBuilder<>::InsertPointGuard Guard(State.Builder);
3718     if (State.Builder.GetInsertPoint() !=
3719         State.Builder.GetInsertBlock()->getFirstNonPHIIt())
3720       State.Builder.SetInsertPoint(
3721           State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3722     NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
3723     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3724     NewPointerPhi->setDebugLoc(getDebugLoc());
3725   } else {
3726     // The recipe has been unrolled. In that case, fetch the single pointer phi
3727     // shared among all unrolled parts of the recipe.
3728     auto *GEP =
3729         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3730     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3731   }
3732 
3733   // The pointer induction is performed using a GEP.
3734   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3735   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3736   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3737   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3738   // Add induction update using an incorrect block temporarily. The phi node
3739   // will be fixed after VPlan execution. Note that at this point the latch
3740   // block cannot be used, as it does not exist yet.
3741   // TODO: Model increment value in VPlan, by turning the recipe into a
3742   // multi-def and a subclass of VPHeaderPHIRecipe.
3743   if (CurrentPart == 0) {
3744     // The recipe represents the first part of the pointer induction. Create the
3745     // GEP to increment the phi across all unrolled parts.
3746     Value *NumUnrolledElems = State.get(getOperand(2), true);
3747 
3748     Value *InductionGEP = GetElementPtrInst::Create(
3749         State.Builder.getInt8Ty(), NewPointerPhi,
3750         State.Builder.CreateMul(
3751             ScalarStepValue,
3752             State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
3753         "ptr.ind", InductionLoc);
3754 
3755     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3756   }
3757 
3758   // Create actual address geps that use the pointer phi as base and a
3759   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
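  // Schematically, with iN the step type, unroll part P computes:
  //   %off        = splat(RuntimeVF * P) + stepvector
  //   %vector.gep = getelementptr i8, ptr %pointer.phi, <VF x iN> (%off * Step)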
3760   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3761   Value *StartOffsetScalar = State.Builder.CreateMul(
3762       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3763   Value *StartOffset =
3764       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
3765   // Create a vector of consecutive numbers from zero to VF.
3766   StartOffset = State.Builder.CreateAdd(
3767       StartOffset, State.Builder.CreateStepVector(VecPhiType));
3768 
3769   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3770          "scalar step must be the same across all parts");
3771   Value *GEP = State.Builder.CreateGEP(
3772       State.Builder.getInt8Ty(), NewPointerPhi,
3773       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3774                                                State.VF, ScalarStepValue)),
3775       "vector.gep");
3776   State.set(this, GEP);
3777 }
3778 
3779 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3780 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3781                                           VPSlotTracker &SlotTracker) const {
3782   assert((getNumOperands() == 3 || getNumOperands() == 5) &&
3783          "unexpected number of operands");
3784   O << Indent << "EMIT ";
3785   printAsOperand(O, SlotTracker);
3786   O << " = WIDEN-POINTER-INDUCTION ";
3787   getStartValue()->printAsOperand(O, SlotTracker);
3788   O << ", ";
3789   getStepValue()->printAsOperand(O, SlotTracker);
3790   O << ", ";
3791   getOperand(2)->printAsOperand(O, SlotTracker);
3792   if (getNumOperands() == 5) {
3793     O << ", ";
3794     getOperand(3)->printAsOperand(O, SlotTracker);
3795     O << ", ";
3796     getOperand(4)->printAsOperand(O, SlotTracker);
3797   }
3798 }
3799 #endif
3800 
3801 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3802   assert(!State.Lane && "cannot be used in per-lane");
3803   const DataLayout &DL = SE.getDataLayout();
3804   SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
3805   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3806                                  &*State.Builder.GetInsertPoint());
3807   State.set(this, Res, VPLane(0));
3808 }
3809 
3810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3811 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3812                                VPSlotTracker &SlotTracker) const {
3813   O << Indent << "EMIT ";
3814   printAsOperand(O, SlotTracker);
3815   O << " = EXPAND SCEV " << *Expr;
3816 }
3817 #endif
3818 
3819 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3820   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3821   Type *STy = CanonicalIV->getType();
3822   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3823   ElementCount VF = State.VF;
3824   Value *VStart = VF.isScalar()
3825                       ? CanonicalIV
3826                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3827   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3828   if (VF.isVector()) {
3829     VStep = Builder.CreateVectorSplat(VF, VStep);
3830     VStep =
3831         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3832   }
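  // E.g. for VF=4 and unroll part 0, this computes
  //   vec.iv = broadcast(%canonical.iv) + <0, 1, 2, 3>.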
3833   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3834   State.set(this, CanonicalVectorIV);
3835 }
3836 
3837 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3838 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3839                                      VPSlotTracker &SlotTracker) const {
3840   O << Indent << "EMIT ";
3841   printAsOperand(O, SlotTracker);
3842   O << " = WIDEN-CANONICAL-INDUCTION ";
3843   printOperands(O, SlotTracker);
3844 }
3845 #endif
3846 
3847 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3848   auto &Builder = State.Builder;
3849   // Create a vector from the initial value.
3850   auto *VectorInit = getStartValue()->getLiveInIRValue();
3851 
3852   Type *VecTy = State.VF.isScalar()
3853                     ? VectorInit->getType()
3854                     : VectorType::get(VectorInit->getType(), State.VF);
3855 
3856   BasicBlock *VectorPH =
3857       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3858   if (State.VF.isVector()) {
3859     auto *IdxTy = Builder.getInt32Ty();
3860     auto *One = ConstantInt::get(IdxTy, 1);
3861     IRBuilder<>::InsertPointGuard Guard(Builder);
3862     Builder.SetInsertPoint(VectorPH->getTerminator());
3863     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3864     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3865     VectorInit = Builder.CreateInsertElement(
3866         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3867   }
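  // E.g. for VF=4 the init value is <poison, poison, poison, %init>; only the
  // last lane carries the scalar initial value into the recurrence.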
3868 
3869   // Create a phi node for the new recurrence.
3870   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3871   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3872   Phi->addIncoming(VectorInit, VectorPH);
3873   State.set(this, Phi);
3874 }
3875 
3876 InstructionCost
3877 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3878                                              VPCostContext &Ctx) const {
3879   if (VF.isScalar())
3880     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3881 
3882   return 0;
3883 }
3884 
3885 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3886 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3887                                             VPSlotTracker &SlotTracker) const {
3888   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3889   printAsOperand(O, SlotTracker);
3890   O << " = phi ";
3891   printOperands(O, SlotTracker);
3892 }
3893 #endif
3894 
3895 void VPReductionPHIRecipe::execute(VPTransformState &State) {
3896   // Reductions do not have to start at zero. They can start with
3897   // any loop-invariant value.
3898   VPValue *StartVPV = getStartValue();
3899 
3900   // In order to support recurrences we need to be able to vectorize Phi nodes.
3901   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3902   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3903   // this value when we vectorize all of the instructions that use the PHI.
3904   BasicBlock *VectorPH =
3905       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3906   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3907   Value *StartV = State.get(StartVPV, ScalarPHI);
3908   Type *VecTy = StartV->getType();
3909 
3910   BasicBlock *HeaderBB = State.CFG.PrevBB;
3911   assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
3912          "recipe must be in the vector loop header");
3913   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3914   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3915   State.set(this, Phi, IsInLoop);
3916 
3917   Phi->addIncoming(StartV, VectorPH);
3918 }
3919 
3920 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3921 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3922                                  VPSlotTracker &SlotTracker) const {
3923   O << Indent << "WIDEN-REDUCTION-PHI ";
3924 
3925   printAsOperand(O, SlotTracker);
3926   O << " = phi ";
3927   printOperands(O, SlotTracker);
3928   if (VFScaleFactor != 1)
3929     O << " (VF scaled by 1/" << VFScaleFactor << ")";
3930 }
3931 #endif
3932 
3933 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3934   Value *Op0 = State.get(getOperand(0));
3935   Type *VecTy = Op0->getType();
3936   Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
3937   // Manually move it with the other PHIs in case PHI recipes above this one
3938   // also inserted non-phi instructions.
3939   // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
3940   // convertToConcreteRecipes.
3941   VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3942   State.set(this, VecPhi);
3943 }
3944 
3945 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3946 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3947                              VPSlotTracker &SlotTracker) const {
3948   O << Indent << "WIDEN-PHI ";
3949 
3950   printAsOperand(O, SlotTracker);
3951   O << " = phi ";
3952   printPhiOperands(O, SlotTracker);
3953 }
3954 #endif
3955 
3956 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3957 // remove VPActiveLaneMaskPHIRecipe.
3958 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3959   BasicBlock *VectorPH =
3960       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3961   Value *StartMask = State.get(getOperand(0));
3962   PHINode *Phi =
3963       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3964   Phi->addIncoming(StartMask, VectorPH);
3965   State.set(this, Phi);
3966 }
3967 
3968 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3969 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3970                                       VPSlotTracker &SlotTracker) const {
3971   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3972 
3973   printAsOperand(O, SlotTracker);
3974   O << " = phi ";
3975   printOperands(O, SlotTracker);
3976 }
3977 #endif
3978 
3979 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3980 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3981                                   VPSlotTracker &SlotTracker) const {
3982   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3983 
3984   printAsOperand(O, SlotTracker);
3985   O << " = phi ";
3986   printOperands(O, SlotTracker);
3987 }
3988 #endif
3989