//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

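// Holds one generated Value per unroll part, e.g. the per-part vectors
// combined by ComputeReductionResult below.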
using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPCanonicalIVPHISC:
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid())
      RecipeCost = InstructionCost(ForceTargetInstructionCost);
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
         (isa<VPInstruction>(this) &&
          cast<VPInstruction>(this)->getOpcode() == Instruction::PHI) ||
         isa<VPIRPhi>(this);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const {
  std::optional<unsigned> Opcode;
  VPValue *Op = getOperand(0);
  VPRecipeBase *OpR = Op->getDefiningRecipe();

  // If the partial reduction is predicated, operand 1 will be a select; cost
  // its true-value operand instead.
  using namespace llvm::VPlanPatternMatch;
  if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
    OpR = Op->getDefiningRecipe();
  }

  Type *InputTypeA = nullptr, *InputTypeB = nullptr;
  TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
                                  ExtBType = TTI::PR_None;

  auto GetExtendKind = [](VPRecipeBase *R) {
    if (!R)
      return TTI::PR_None;
    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
    if (!WidenCastR)
      return TTI::PR_None;
    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
      return TTI::PR_ZeroExtend;
    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
      return TTI::PR_SignExtend;
    return TTI::PR_None;
  };

  // Pick out the opcode and type/extend information from a widen recipe,
  // looking through a negation (sub 0, x) of the operand if present.
  auto HandleWiden = [&](VPWidenRecipe *Widen) {
    if (match(Widen,
              m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Op)))) {
      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
    }
    Opcode = Widen->getOpcode();
    VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
    VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
    InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
                                                 : Widen->getOperand(0));
    InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
                                                 : Widen->getOperand(1));
    ExtAType = GetExtendKind(ExtAR);
    ExtBType = GetExtendKind(ExtBR);
  };

  if (isa<VPWidenCastRecipe>(OpR)) {
    InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
    ExtAType = GetExtendKind(OpR);
  } else if (isa<VPReductionPHIRecipe>(OpR)) {
    auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
      InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
      ExtAType = GetExtendKind(RedPhiOp1R);
    } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
      HandleWiden(Widen);
  } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
    HandleWiden(Widen);
  } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
    return Reduction->computeCost(VF, Ctx);
  }
  auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
                                         PhiType, VF, ExtAType, ExtBType,
                                         Opcode, Ctx.CostKind);
}

void VPPartialReductionRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  assert(getOpcode() == Instruction::Add &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(1));
  Value *PhiVal = State.get(getOperand(0));
  assert(PhiVal && BinOpVal && "Phi and Mul must be set");

  Type *RetTy = PhiVal->getType();

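  // The partial-reduce intrinsic adds groups of lanes of BinOpVal into the
  // (typically narrower) accumulator vector PhiVal, so the result has PhiVal's
  // type.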
  CallInst *V = Builder.CreateIntrinsic(
      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
      {PhiVal, BinOpVal}, nullptr, "partial.reduce");

  State.set(this, V);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "PARTIAL-REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
}
#endif

FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

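// Recipes that have not been unrolled lack the extra part operand and
// implicitly belong to part 0.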
template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm

VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert((getNumOperandsForOpcode(Opcode) == -1u ||
          getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
         "number of operands does not match opcode");
}

#ifndef NDEBUG
unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::Not:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Store:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::BranchOnCount:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
    return 2;
  case Instruction::Select:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ReductionStartVector:
    return 3;
  case VPInstruction::ComputeFindIVResult:
    return 4;
  case Instruction::Call:
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
#endif

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

/// Create a conditional branch using \p Cond branching to the successors of \p
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
/// while the second successor may already have been created (if it is a header
/// block and VPBB is a latch).
static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
                                    VPTransformState &State) {
  // Replace the temporary unreachable terminator with a new conditional
  // branch, hooking it up to the backward destination (header) for latch
  // blocks now, and to forward destination(s) later when they are created.
  // The second successor may be backward, iff it is already in VPBB2IRBB.
  VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
  BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
  BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
  BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
  // The first successor is always forward; reset it to nullptr.
  CondBr->setSuccessor(0, nullptr);
  IRBB->getTerminator()->eraseFromParent();
  return CondBr;
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      applyFlags(*I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    if (getOperand(1)->isLiveIn()) {
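      // A constant live-in index selects the lane directly, avoiding an
      // extractelement instruction.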
      unsigned IdxToExtract =
          cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
      return State.get(getOperand(0), VPLane(IdxToExtract));
    }
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    llvm_unreachable("should be handled by VPPhi::execute");
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

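    // Lane i of the resulting <VF x i1> mask is set iff VIVElem0 + i <
    // ScalarTC.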
    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
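    // Compute TC > VF * UF ? TC - VF * UF : 0, i.e. the trip count minus one
    // full vector step, clamped at zero to avoid unsigned underflow.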
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

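    // get_vector_length returns an i32 count of lanes to process, no larger
    // than AVL and no larger than VF * vscale (the 'true' argument marks VF as
    // scalable).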
    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    auto *Br = createCondBranch(Cond, getParent(), State);
    applyMetadata(*Br);
    return Br;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);
    return createCondBranch(Cond, getParent(), State);
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::BuildStructVector: {
    // For struct types, we need to build a new 'wide' struct type, where each
    // element is widened, i.e., we create a struct of vectors.
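    // E.g., with VF = 4, {i32, float} becomes {<4 x i32>, <4 x float>}.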
    auto *StructTy =
        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
    for (const auto &[LaneIndex, Op] : enumerate(operands())) {
      for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
           FieldIndex++) {
        Value *ScalarValue =
            Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
        Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
        VectorValue =
            Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
        Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
      }
    }
    return Res;
  }
  case VPInstruction::BuildVector: {
    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    auto NumOfElements = ElementCount::getFixed(getNumOperands());
    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
    for (const auto &[Idx, Op] : enumerate(operands()))
      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
                                              State.Builder.getInt32(Idx));
    return Res;
  }
  case VPInstruction::ReductionStartVector: {
    if (State.VF.isScalar())
      return State.get(getOperand(0), true);
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());
    // If this start vector is scaled then it should produce a vector with fewer
    // elements than the VF.
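    // E.g., a scale factor of 4 with VF = 16 yields a 4-element start vector.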
    ElementCount VF = State.VF.divideCoefficientBy(
        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
    auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
    Constant *Zero = Builder.getInt32(0);
    return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
                                       Zero);
  }
  case VPInstruction::ComputeAnyOfResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    Value *ReducedPartRdx = State.get(getOperand(2));
    for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
      ReducedPartRdx = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(
              RecurKind::AnyOf),
          State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
    return createAnyOfReduction(Builder, ReducedPartRdx,
                                State.get(getOperand(1), VPLane(0)), OrigPhi);
  }
  case VPInstruction::ComputeFindIVResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.
    RecurKind RK = PhiR->getRecurrenceKind();
    assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "Unexpected reduction kind");
    assert(!PhiR->isInLoop() &&
           "In-loop FindLastIV reduction is not supported yet");

    // The recipe's operands are the reduction phi, the start value, the
    // sentinel value, followed by one operand for each part of the reduction.
    unsigned UF = getNumOperands() - 3;
    Value *ReducedPartRdx = State.get(getOperand(3));
    RecurKind MinMaxKind;
    bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
    if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
      MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
    else
      MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
    for (unsigned Part = 1; Part < UF; ++Part)
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      State.get(getOperand(3 + Part)));

    Value *Start = State.get(getOperand(1), true);
    Value *Sentinel = getOperand(2)->getLiveInIRValue();
    return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
                                     Sentinel);
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.

    RecurKind RK = PhiR->getRecurrenceKind();
    assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "should be handled by ComputeFindIVResult");

    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    if (hasFastMathFlags())
      Builder.setFastMathFlags(getFastMathFlags());

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
        else
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
              RdxPart, ReducedPartRdx, "bin.rdx");
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement: {
    unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
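    // Offset 1 selects the last lane (VF - 1), offset 2 the penultimate lane.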
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::AnyOf: {
    Value *Res = State.get(getOperand(0));
    for (VPValue *Op : drop_begin(operands()))
      Res = Builder.CreateOr(Res, State.get(Op));
    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
  }
  case VPInstruction::FirstActiveLane: {
    if (getNumOperands() == 1) {
      Value *Mask = State.get(getOperand(0));
      return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
                                                  true, Name);
    }
    // If there are multiple operands, create a chain of selects to pick the
    // first operand with an active lane and add the number of lanes of the
    // preceding operands.
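    // E.g., with two operands and no active lane in operand 0, the result is
    // RuntimeVF + cttz_elts(operand 1).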
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
    unsigned LastOpIdx = getNumOperands() - 1;
    Value *Res = nullptr;
    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
      Value *Current = Builder.CreateAdd(
          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
      if (Res) {
        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
        Res = Builder.CreateSelect(Cmp, Current, Res);
      } else {
        Res = Current;
      }
    }

    return Res;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    Type *ResTy = Ctx.Types.inferScalarType(this);
    if (!vputils::onlyFirstLaneUsed(this))
      ResTy = toVectorTy(ResTy, VF);

    if (!getUnderlyingValue()) {
      switch (getOpcode()) {
      case Instruction::FMul:
        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
      default:
        // TODO: Compute cost for VPInstructions without underlying values once
        // the legacy cost model has been retired.
        return 0;
      }
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
  }

  switch (getOpcode()) {
  case Instruction::ExtractElement: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
                                  Type::getInt64Ty(Ctx.LLVMCtx),
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    SmallVector<int> Mask(VF.getKnownMinValue());
    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
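    // The mask is <VF-1, VF, ..., 2*VF-2>: the last lane of the first vector
    // followed by the first VF-1 lanes of the second.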
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                  cast<VectorType>(VectorTy),
                                  cast<VectorType>(VectorTy), Mask,
                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExtractPenultimateElement:
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractLastElement ||
         getOpcode() == VPInstruction::ExtractPenultimateElement ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::ComputeAnyOfResult ||
         getOpcode() == VPInstruction::ComputeFindIVResult ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == Instruction::PHI || isScalarCast();
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
    return false;
  default:
    return true;
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    return true;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeFindIVResult:
    return Op == getOperand(1);
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExtractLastElement:
    O << "extract-last-element";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeFindIVResult:
    O << "compute-find-iv-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
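    // Materialize the sequence <0, 1, 2, ...> with the requested element type
    // and VF.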
    Value *StepVector =
        State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
    State.set(this, StepVector);
    break;
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
#endif

void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      State.TypeAnalysis.inferScalarType(this), 2, getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining them
    // are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
    NewPhi->addIncoming(IncV, PredBB);
  }
  State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::print(raw_ostream &O, const Twine &Indent,
                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}

void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's scope,
  // hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}

void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
  assert(isa<PHINode>(getInstruction()) &&
         "can only update exiting operands to phi nodes");
  assert(getNumOperands() > 0 && "must have at least one operand");
  VPValue *Exiting = getOperand(0);
  if (Exiting->isLiveIn())
    return;

  Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
  setOperand(0, Exiting);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif

void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isSingleScalar(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}

void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
         "Number of phi operands must match number of predecessors");
  unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
  R->removeOperand(Position);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
                    VPSlotTracker &SlotTracker) const {
  VPIRInstruction::print(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(
        enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
          Op.value()->printAsOperand(O, SlotTracker);
          O << " from ";
          getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
        });
    O << ")";
  }
}
#endif

VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer)
    : VPIRMetadata(I) {
  if (!LVer || !isa<LoadInst, StoreInst>(&I))
    return;
  const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
  if (AliasScopeMD)
    Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
  if (NoAliasMD)
    Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
}

void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(Kind, Node);
}

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
1418   // Collect the argument values to pass to the vector function variant.
1419   SmallVector<Value *, 4> Args;
1420   for (const auto &I : enumerate(args())) {
1421     Value *Arg;
1422     // Some vectorized function variants may also take a scalar argument,
1423     // e.g. linear parameters for pointers. This needs to be the scalar value
1424     // from the start of the respective part when interleaving.
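         // Hedged example: a variant created for `#pragma omp declare simd
         // linear(p)` might have the signature <4 x float>(<4 x float>, ptr);
         // the ptr parameter stays scalar, so lane 0 of its operand is passed.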
1425     if (!VFTy->getParamType(I.index())->isVectorTy())
1426       Arg = State.get(I.value(), VPLane(0));
1427     else
1428       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1429     Args.push_back(Arg);
1430   }
1431 
1432   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1433   SmallVector<OperandBundleDef, 1> OpBundles;
1434   if (CI)
1435     CI->getOperandBundlesAsDefs(OpBundles);
1436 
1437   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1438   applyFlags(*V);
1439   applyMetadata(*V);
1440   V->setCallingConv(Variant->getCallingConv());
1441 
1442   if (!V->getType()->isVoidTy())
1443     State.set(this, V);
1444 }
1445 
1446 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1447                                                VPCostContext &Ctx) const {
1448   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1449                                   Variant->getFunctionType()->params(),
1450                                   Ctx.CostKind);
1451 }
1452 
1453 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1454 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1455                               VPSlotTracker &SlotTracker) const {
1456   O << Indent << "WIDEN-CALL ";
1457 
1458   Function *CalledFn = getCalledScalarFunction();
1459   if (CalledFn->getReturnType()->isVoidTy())
1460     O << "void ";
1461   else {
1462     printAsOperand(O, SlotTracker);
1463     O << " = ";
1464   }
1465 
1466   O << "call";
1467   printFlags(O);
1468   O << " @" << CalledFn->getName() << "(";
1469   interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1470     Op->printAsOperand(O, SlotTracker);
1471   });
1472   O << ")";
1473 
1474   O << " (using library function";
1475   if (Variant->hasName())
1476     O << ": " << Variant->getName();
1477   O << ")";
1478 }
1479 #endif
1480 
1481 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1482   assert(State.VF.isVector() && "not widening");
1483 
1484   SmallVector<Type *, 2> TysForDecl;
1485   // Add return type if intrinsic is overloaded on it.
1486   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1487     TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1488   SmallVector<Value *, 4> Args;
1489   for (const auto &I : enumerate(operands())) {
1490     // Some intrinsics have a scalar argument - don't replace it with a
1491     // vector.
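         // For instance, llvm.powi keeps its i32 exponent scalar; with VF = 4
         // the widened call would be roughly (an illustrative sketch):
         //   call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %x, i32 %e)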
1492     Value *Arg;
1493     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1494                                            State.TTI))
1495       Arg = State.get(I.value(), VPLane(0));
1496     else
1497       Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1498     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1499                                                State.TTI))
1500       TysForDecl.push_back(Arg->getType());
1501     Args.push_back(Arg);
1502   }
1503 
1504   // Use vector version of the intrinsic.
1505   Module *M = State.Builder.GetInsertBlock()->getModule();
1506   Function *VectorF =
1507       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
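       // E.g. (illustrative) widening Intrinsic::smax on i32 with VF = 4 would
       // resolve to the declaration @llvm.smax.v4i32.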
1508   assert(VectorF &&
1509          "Can't retrieve vector intrinsic or vector-predication intrinsic.");
1510 
1511   auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1512   SmallVector<OperandBundleDef, 1> OpBundles;
1513   if (CI)
1514     CI->getOperandBundlesAsDefs(OpBundles);
1515 
1516   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1517 
1518   applyFlags(*V);
1519   applyMetadata(*V);
1520 
1521   if (!V->getType()->isVoidTy())
1522     State.set(this, V);
1523 }
1524 
1525 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1526                                                     VPCostContext &Ctx) const {
1527   // Some backends analyze intrinsic arguments to determine cost. Use the
1528   // underlying value for the operand if it has one. Otherwise try to use the
1529   // operand of the underlying call instruction, if there is one. Otherwise
1530   // clear Arguments.
1531   // TODO: Rework TTI interface to be independent of concrete IR values.
1532   SmallVector<const Value *> Arguments;
1533   for (const auto &[Idx, Op] : enumerate(operands())) {
1534     auto *V = Op->getUnderlyingValue();
1535     if (!V) {
1536       if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
1537         Arguments.push_back(UI->getArgOperand(Idx));
1538         continue;
1539       }
1540       Arguments.clear();
1541       break;
1542     }
1543     Arguments.push_back(V);
1544   }
1545 
1546   Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
1547   SmallVector<Type *> ParamTys;
1548   for (unsigned I = 0; I != getNumOperands(); ++I)
1549     ParamTys.push_back(
1550         toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
1551 
1552   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1553   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1554   IntrinsicCostAttributes CostAttrs(
1555       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1556       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
1557       InstructionCost::getInvalid(), &Ctx.TLI);
1558   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1559 }
1560 
1561 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1562   return Intrinsic::getBaseName(VectorIntrinsicID);
1563 }
1564 
1565 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1566   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1567   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1568     auto [Idx, V] = X;
1569     return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1570                                                          Idx, nullptr);
1571   });
1572 }
1573 
1574 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1575 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1576                                    VPSlotTracker &SlotTracker) const {
1577   O << Indent << "WIDEN-INTRINSIC ";
1578   if (ResultTy->isVoidTy()) {
1579     O << "void ";
1580   } else {
1581     printAsOperand(O, SlotTracker);
1582     O << " = ";
1583   }
1584 
1585   O << "call";
1586   printFlags(O);
1587   O << getIntrinsicName() << "(";
1588 
1589   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1590     Op->printAsOperand(O, SlotTracker);
1591   });
1592   O << ")";
1593 }
1594 #endif
1595 
1596 void VPHistogramRecipe::execute(VPTransformState &State) {
1597   IRBuilderBase &Builder = State.Builder;
1598 
1599   Value *Address = State.get(getOperand(0));
1600   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1601   VectorType *VTy = cast<VectorType>(Address->getType());
1602 
1603   // The histogram intrinsic requires a mask even if the recipe doesn't;
1604   // if the mask operand was omitted then all lanes should be executed and
1605   // we just need to synthesize an all-true mask.
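       // A sketch of the synthesized mask for a fixed VF of 4:
       //   <4 x i1> splat (i1 true), i.e. <i1 true, i1 true, i1 true, i1 true>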
1606   Value *Mask = nullptr;
1607   if (VPValue *VPMask = getMask())
1608     Mask = State.get(VPMask);
1609   else
1610     Mask =
1611         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1612 
1613   // If this is a subtract, we want to invert the increment amount. We may
1614   // add a separate intrinsic in the future, but for now we'll try this.
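       // Illustrative IR for the subtract case (the increment is scalar here;
       // names are hypothetical):
       //   %neg = sub i32 0, %inc.amt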
1615   if (Opcode == Instruction::Sub)
1616     IncAmt = Builder.CreateNeg(IncAmt);
1617   else
1618     assert(Opcode == Instruction::Add && "only add or sub supported for now");
1619 
1620   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1621                                 {VTy, IncAmt->getType()},
1622                                 {Address, IncAmt, Mask});
1623 }
1624 
1625 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1626                                                VPCostContext &Ctx) const {
1627   // FIXME: Take the gather and scatter into account as well. For now we're
1628   //        generating the same cost as the fallback path, but we'll likely
1629   //        need to create a new TTI method for determining the cost, including
1630   //        whether we can use base + vec-of-smaller-indices or just
1631   //        vec-of-pointers.
1632   assert(VF.isVector() && "Invalid VF for histogram cost");
1633   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1634   VPValue *IncAmt = getOperand(1);
1635   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1636   VectorType *VTy = VectorType::get(IncTy, VF);
1637 
1638   // Assume that a non-constant update value (or a constant != 1) requires
1639   // a multiply, and add that into the cost.
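       // Worked sketch: a bucket update of +3 pays for the multiply, the
       // histogram intrinsic and the add/sub accounted below; an update of +1
       // drops the multiply (MulCost becomes TCC_Free).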
1640   InstructionCost MulCost =
1641       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1642   if (IncAmt->isLiveIn()) {
1643     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1644 
1645     if (CI && CI->getZExtValue() == 1)
1646       MulCost = TTI::TCC_Free;
1647   }
1648 
1649   // Find the cost of the histogram operation itself.
1650   Type *PtrTy = VectorType::get(AddressTy, VF);
1651   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1652   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1653                               Type::getVoidTy(Ctx.LLVMCtx),
1654                               {PtrTy, IncTy, MaskTy});
1655 
1656   // Add the costs together with the add/sub operation.
1657   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1658          Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1659 }
1660 
1661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1663                               VPSlotTracker &SlotTracker) const {
1664   O << Indent << "WIDEN-HISTOGRAM buckets: ";
1665   getOperand(0)->printAsOperand(O, SlotTracker);
1666 
1667   if (Opcode == Instruction::Sub)
1668     O << ", dec: ";
1669   else {
1670     assert(Opcode == Instruction::Add);
1671     O << ", inc: ";
1672   }
1673   getOperand(1)->printAsOperand(O, SlotTracker);
1674 
1675   if (VPValue *Mask = getMask()) {
1676     O << ", mask: ";
1677     Mask->printAsOperand(O, SlotTracker);
1678   }
1679 }
1680 
1681 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1682                                 VPSlotTracker &SlotTracker) const {
1683   O << Indent << "WIDEN-SELECT ";
1684   printAsOperand(O, SlotTracker);
1685   O << " = select ";
1686   printFlags(O);
1687   getOperand(0)->printAsOperand(O, SlotTracker);
1688   O << ", ";
1689   getOperand(1)->printAsOperand(O, SlotTracker);
1690   O << ", ";
1691   getOperand(2)->printAsOperand(O, SlotTracker);
1692   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1693 }
1694 #endif
1695 
1696 void VPWidenSelectRecipe::execute(VPTransformState &State) {
1697   // The condition can be loop invariant but still defined inside the
1698   // loop. This means that we can't just use the original 'cond' value.
1699   // We have to take the 'vectorized' value and pick the first lane.
1700   // Instcombine will make this a no-op.
1701   auto *InvarCond =
1702       isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;
1703 
1704   Value *Cond = InvarCond ? InvarCond : State.get(getCond());
1705   Value *Op0 = State.get(getOperand(1));
1706   Value *Op1 = State.get(getOperand(2));
1707   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1708   State.set(this, Sel);
1709   if (auto *I = dyn_cast<Instruction>(Sel)) {
1710     if (isa<FPMathOperator>(I))
1711       applyFlags(*I);
1712     applyMetadata(*I);
1713   }
1714 }
1715 
1716 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1717                                                  VPCostContext &Ctx) const {
1718   SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1719   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1720   Type *ScalarTy = Ctx.Types.inferScalarType(this);
1721   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1722 
1723   VPValue *Op0, *Op1;
1724   using namespace llvm::VPlanPatternMatch;
1725   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1726       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1727        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1728     // select x, y, false --> x & y
1729     // select x, true, y --> x | y
1730     const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1731     const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1732 
1733     SmallVector<const Value *, 2> Operands;
1734     if (all_of(operands(),
1735                [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1736       Operands.append(SI->op_begin(), SI->op_end());
1737     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1738     return Ctx.TTI.getArithmeticInstrCost(
1739         IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
1740         Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1741   }
1742 
1743   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1744   if (!ScalarCond)
1745     CondTy = VectorType::get(CondTy, VF);
1746 
1747   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1748   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
1749     Pred = Cmp->getPredicate();
1750   return Ctx.TTI.getCmpSelInstrCost(
1751       Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1752       {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1753 }
1754 
1755 VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
1756   AllowReassoc = FMF.allowReassoc();
1757   NoNaNs = FMF.noNaNs();
1758   NoInfs = FMF.noInfs();
1759   NoSignedZeros = FMF.noSignedZeros();
1760   AllowReciprocal = FMF.allowReciprocal();
1761   AllowContract = FMF.allowContract();
1762   ApproxFunc = FMF.approxFunc();
1763 }
1764 
1765 #if !defined(NDEBUG)
1766 bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
1767   switch (OpType) {
1768   case OperationType::OverflowingBinOp:
1769     return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
1770            Opcode == Instruction::Mul ||
1771            Opcode == VPInstruction::CanonicalIVIncrementForPart;
1772   case OperationType::Trunc:
1773     return Opcode == Instruction::Trunc;
1774   case OperationType::DisjointOp:
1775     return Opcode == Instruction::Or;
1776   case OperationType::PossiblyExactOp:
1777     return Opcode == Instruction::AShr;
1778   case OperationType::GEPOp:
1779     return Opcode == Instruction::GetElementPtr ||
1780            Opcode == VPInstruction::PtrAdd;
1781   case OperationType::FPMathOp:
1782     return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1783            Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
1784            Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
1785            Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
1786            Opcode == VPInstruction::WideIVStep ||
1787            Opcode == VPInstruction::ReductionStartVector ||
1788            Opcode == VPInstruction::ComputeReductionResult;
1789   case OperationType::NonNegOp:
1790     return Opcode == Instruction::ZExt;
1792   case OperationType::Cmp:
1793     return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
1794   case OperationType::Other:
1795     return true;
1796   }
1797   llvm_unreachable("Unknown OperationType enum");
1798 }
1799 #endif
1800 
1801 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1802 void VPIRFlags::printFlags(raw_ostream &O) const {
1803   switch (OpType) {
1804   case OperationType::Cmp:
1805     O << " " << CmpInst::getPredicateName(getPredicate());
1806     break;
1807   case OperationType::DisjointOp:
1808     if (DisjointFlags.IsDisjoint)
1809       O << " disjoint";
1810     break;
1811   case OperationType::PossiblyExactOp:
1812     if (ExactFlags.IsExact)
1813       O << " exact";
1814     break;
1815   case OperationType::OverflowingBinOp:
1816     if (WrapFlags.HasNUW)
1817       O << " nuw";
1818     if (WrapFlags.HasNSW)
1819       O << " nsw";
1820     break;
1821   case OperationType::Trunc:
1822     if (TruncFlags.HasNUW)
1823       O << " nuw";
1824     if (TruncFlags.HasNSW)
1825       O << " nsw";
1826     break;
1827   case OperationType::FPMathOp:
1828     getFastMathFlags().print(O);
1829     break;
1830   case OperationType::GEPOp:
1831     if (GEPFlags.isInBounds())
1832       O << " inbounds";
1833     else if (GEPFlags.hasNoUnsignedSignedWrap())
1834       O << " nusw";
1835     if (GEPFlags.hasNoUnsignedWrap())
1836       O << " nuw";
1837     break;
1838   case OperationType::NonNegOp:
1839     if (NonNegFlags.NonNeg)
1840       O << " nneg";
1841     break;
1842   case OperationType::Other:
1843     break;
1844   }
1845   O << " ";
1846 }
1847 #endif
1848 
1849 void VPWidenRecipe::execute(VPTransformState &State) {
1850   auto &Builder = State.Builder;
1851   switch (Opcode) {
1852   case Instruction::Call:
1853   case Instruction::Br:
1854   case Instruction::PHI:
1855   case Instruction::GetElementPtr:
1856   case Instruction::Select:
1857     llvm_unreachable("This instruction is handled by a different recipe.");
1858   case Instruction::UDiv:
1859   case Instruction::SDiv:
1860   case Instruction::SRem:
1861   case Instruction::URem:
1862   case Instruction::Add:
1863   case Instruction::FAdd:
1864   case Instruction::Sub:
1865   case Instruction::FSub:
1866   case Instruction::FNeg:
1867   case Instruction::Mul:
1868   case Instruction::FMul:
1869   case Instruction::FDiv:
1870   case Instruction::FRem:
1871   case Instruction::Shl:
1872   case Instruction::LShr:
1873   case Instruction::AShr:
1874   case Instruction::And:
1875   case Instruction::Or:
1876   case Instruction::Xor: {
1877     // Just widen unops and binops.
1878     SmallVector<Value *, 2> Ops;
1879     for (VPValue *VPOp : operands())
1880       Ops.push_back(State.get(VPOp));
1881 
1882     Value *V = Builder.CreateNAryOp(Opcode, Ops);
1883 
1884     if (auto *VecOp = dyn_cast<Instruction>(V)) {
1885       applyFlags(*VecOp);
1886       applyMetadata(*VecOp);
1887     }
1888 
1889     // Use this vector value for all users of the original instruction.
1890     State.set(this, V);
1891     break;
1892   }
1893   case Instruction::ExtractValue: {
1894     assert(getNumOperands() == 2 && "expected single level extractvalue");
1895     Value *Op = State.get(getOperand(0));
1896     auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
1897     Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
1898     State.set(this, Extract);
1899     break;
1900   }
1901   case Instruction::Freeze: {
1902     Value *Op = State.get(getOperand(0));
1903     Value *Freeze = Builder.CreateFreeze(Op);
1904     State.set(this, Freeze);
1905     break;
1906   }
1907   case Instruction::ICmp:
1908   case Instruction::FCmp: {
1909     // Widen compares. Generate vector compares.
1910     bool FCmp = Opcode == Instruction::FCmp;
1911     Value *A = State.get(getOperand(0));
1912     Value *B = State.get(getOperand(1));
1913     Value *C = nullptr;
1914     if (FCmp) {
1915       // Propagate fast math flags.
1916       C = Builder.CreateFCmpFMF(
1917           getPredicate(), A, B,
1918           dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1919     } else {
1920       C = Builder.CreateICmp(getPredicate(), A, B);
1921     }
1922     if (auto *I = dyn_cast<Instruction>(C))
1923       applyMetadata(*I);
1924     State.set(this, C);
1925     break;
1926   }
1927   default:
1928     // This instruction is not vectorized by simple widening.
1929     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1930                       << Instruction::getOpcodeName(Opcode));
1931     llvm_unreachable("Unhandled instruction!");
1932   } // end of switch.
1933 
1934 #if !defined(NDEBUG)
1935   // Verify that VPlan type inference results agree with the type of the
1936   // generated values.
1937   assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1938              State.get(this)->getType() &&
1939          "inferred type and type from generated instructions do not match");
1940 #endif
1941 }
1942 
1943 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1944                                            VPCostContext &Ctx) const {
1945   switch (Opcode) {
1946   case Instruction::FNeg: {
1947     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1948     return Ctx.TTI.getArithmeticInstrCost(
1949         Opcode, VectorTy, Ctx.CostKind,
1950         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1951         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1952   }
1953 
1954   case Instruction::UDiv:
1955   case Instruction::SDiv:
1956   case Instruction::SRem:
1957   case Instruction::URem:
1958     // More complex computation, let the legacy cost-model handle this for now.
1959     return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1960   case Instruction::Add:
1961   case Instruction::FAdd:
1962   case Instruction::Sub:
1963   case Instruction::FSub:
1964   case Instruction::Mul:
1965   case Instruction::FMul:
1966   case Instruction::FDiv:
1967   case Instruction::FRem:
1968   case Instruction::Shl:
1969   case Instruction::LShr:
1970   case Instruction::AShr:
1971   case Instruction::And:
1972   case Instruction::Or:
1973   case Instruction::Xor: {
1974     VPValue *RHS = getOperand(1);
1975     // Certain instructions can be cheaper to vectorize if they have a constant
1976     // second vector operand. Shifts on x86 are one example of this.
1977     TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1978 
1979     if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1980         getOperand(1)->isDefinedOutsideLoopRegions())
1981       RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1982     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1983     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1984 
1985     SmallVector<const Value *, 4> Operands;
1986     if (CtxI)
1987       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1988     return Ctx.TTI.getArithmeticInstrCost(
1989         Opcode, VectorTy, Ctx.CostKind,
1990         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1991         RHSInfo, Operands, CtxI, &Ctx.TLI);
1992   }
1993   case Instruction::Freeze: {
1994     // Freeze has no dedicated cost entry; assume it costs the same as 'mul'.
1995     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1996     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
1997                                           Ctx.CostKind);
1998   }
1999   case Instruction::ExtractValue: {
2000     return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
2001                                              Ctx.CostKind);
2002   }
2003   case Instruction::ICmp:
2004   case Instruction::FCmp: {
2005     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
2006     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
2007     return Ctx.TTI.getCmpSelInstrCost(
2008         Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
2009         Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2010         {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
2011   }
2012   default:
2013     llvm_unreachable("Unsupported opcode for instruction");
2014   }
2015 }
2016 
2017 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2018 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
2019                           VPSlotTracker &SlotTracker) const {
2020   O << Indent << "WIDEN ";
2021   printAsOperand(O, SlotTracker);
2022   O << " = " << Instruction::getOpcodeName(Opcode);
2023   printFlags(O);
2024   printOperands(O, SlotTracker);
2025 }
2026 #endif
2027 
2028 void VPWidenCastRecipe::execute(VPTransformState &State) {
2029   auto &Builder = State.Builder;
2030   // Vectorize casts.
2031   assert(State.VF.isVector() && "Not vectorizing?");
2032   Type *DestTy = VectorType::get(getResultType(), State.VF);
2033   VPValue *Op = getOperand(0);
2034   Value *A = State.get(Op);
2035   Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2036   State.set(this, Cast);
2037   if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2038     applyFlags(*CastOp);
2039     applyMetadata(*CastOp);
2040   }
2041 }
2042 
2043 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2044                                                VPCostContext &Ctx) const {
2045   // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2046   // the legacy cost model, including truncates/extends when evaluating a
2047   // reduction in a smaller type.
2048   if (!getUnderlyingValue())
2049     return 0;
2050   // Computes the CastContextHint for a recipe that may access memory.
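       // For instance (per the lambda below), a cast feeding a non-consecutive
       // widened memory access is costed as GatherScatter, a reversed one as
       // Reversed, and a masked consecutive one as Masked.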
2051   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2052     if (VF.isScalar())
2053       return TTI::CastContextHint::Normal;
2054     if (isa<VPInterleaveRecipe>(R))
2055       return TTI::CastContextHint::Interleave;
2056     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2057       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2058                                              : TTI::CastContextHint::Normal;
2059     const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2060     if (WidenMemoryRecipe == nullptr)
2061       return TTI::CastContextHint::None;
2062     if (!WidenMemoryRecipe->isConsecutive())
2063       return TTI::CastContextHint::GatherScatter;
2064     if (WidenMemoryRecipe->isReverse())
2065       return TTI::CastContextHint::Reversed;
2066     if (WidenMemoryRecipe->isMasked())
2067       return TTI::CastContextHint::Masked;
2068     return TTI::CastContextHint::Normal;
2069   };
2070 
2071   VPValue *Operand = getOperand(0);
2072   TTI::CastContextHint CCH = TTI::CastContextHint::None;
2073   // For Trunc/FPTrunc, get the context from the only user.
2074   if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2075       !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
2076     if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2077       CCH = ComputeCCH(StoreRecipe);
2078   }
2079   // For Z/Sext, get the context from the operand.
2080   else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2081            Opcode == Instruction::FPExt) {
2082     if (Operand->isLiveIn())
2083       CCH = TTI::CastContextHint::Normal;
2084     else if (Operand->getDefiningRecipe())
2085       CCH = ComputeCCH(Operand->getDefiningRecipe());
2086   }
2087 
2088   auto *SrcTy =
2089       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2090   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2091   // Arm TTI will use the underlying instruction to determine the cost.
2092   return Ctx.TTI.getCastInstrCost(
2093       Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2094       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
2095 }
2096 
2097 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2098 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
2099                               VPSlotTracker &SlotTracker) const {
2100   O << Indent << "WIDEN-CAST ";
2101   printAsOperand(O, SlotTracker);
2102   O << " = " << Instruction::getOpcodeName(Opcode);
2103   printFlags(O);
2104   printOperands(O, SlotTracker);
2105   O << " to " << *getResultType();
2106 }
2107 #endif
2108 
2109 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2110                                                VPCostContext &Ctx) const {
2111   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2112 }
2113 
2114 /// A helper function that returns an integer or floating-point constant with
2115 /// value C.
2116 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
2117   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2118                            : ConstantFP::get(Ty, C);
2119 }
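     // For example, getSignedIntOrFpConstant(Int32Ty, -1) produces i32 -1,
     // while getSignedIntOrFpConstant(FloatTy, 3) produces float 3.0 via the
     // FP path (illustrative type names).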
2120 
2121 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2122 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2123                                           VPSlotTracker &SlotTracker) const {
2124   O << Indent;
2125   printAsOperand(O, SlotTracker);
2126   O << " = WIDEN-INDUCTION  ";
2127   printOperands(O, SlotTracker);
2128 
2129   if (auto *TI = getTruncInst())
2130     O << " (truncated to " << *TI->getType() << ")";
2131 }
2132 #endif
2133 
2134 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2135   // The step may be defined by a recipe in the preheader (e.g. if it requires
2136   // SCEV expansion), but for the canonical induction the step is required to be
2137   // 1, which is represented as a live-in.
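       // For example, an induction starting at the live-in constant 0 and
       // stepping by the live-in constant 1, with the same scalar type as the
       // canonical IV, is canonical; a step defined by a SCEV-expansion recipe
       // is not, even if it would expand to 1.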
2138   if (getStepValue()->getDefiningRecipe())
2139     return false;
2140   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
2141   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
2142   auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
2143   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2144          getScalarType() == CanIV->getScalarType();
2145 }
2146 
2147 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2148 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
2149                               VPSlotTracker &SlotTracker) const {
2150   O << Indent;
2151   printAsOperand(O, SlotTracker);
2152   O << " = DERIVED-IV ";
2153   getStartValue()->printAsOperand(O, SlotTracker);
2154   O << " + ";
2155   getOperand(1)->printAsOperand(O, SlotTracker);
2156   O << " * ";
2157   getStepValue()->printAsOperand(O, SlotTracker);
2158 }
2159 #endif
2160 
2161 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
2162   // Fast-math-flags propagate from the original induction instruction.
2163   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2164   if (hasFastMathFlags())
2165     State.Builder.setFastMathFlags(getFastMathFlags());
2166 
2167   // Compute scalar induction steps: BaseIV is the scalar induction variable
2168   // on which to base the steps, and Step is the size of each step.
2169 
2170   Value *BaseIV = State.get(getOperand(0), VPLane(0));
2171   Value *Step = State.get(getStepValue(), VPLane(0));
2172   IRBuilderBase &Builder = State.Builder;
2173 
2174   // Ensure step has the same type as that of scalar IV.
2175   Type *BaseIVTy = BaseIV->getType()->getScalarType();
2176   assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2177 
2178   // We build scalar steps for both integer and floating-point induction
2179   // variables. Here, we determine the kind of arithmetic we will perform.
2180   Instruction::BinaryOps AddOp;
2181   Instruction::BinaryOps MulOp;
2182   if (BaseIVTy->isIntegerTy()) {
2183     AddOp = Instruction::Add;
2184     MulOp = Instruction::Mul;
2185   } else {
2186     AddOp = InductionOpcode;
2187     MulOp = Instruction::FMul;
2188   }
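       // Both the vector and per-lane computations below implement, in effect
       // (a sketch):
       //   Result(Lane) = BaseIV + (StartIdx0 + Lane) * Step
       // e.g. BaseIV = 0, Step = 2, unroll part 0 and VF = 4 yield 0, 2, 4, 6.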
2189 
2190   // Determine the number of scalars we need to generate for each unroll
2191   // iteration.
2192   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2193   // Compute the scalar steps and save the results in State.
2194   Type *IntStepTy =
2195       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2196   Type *VecIVTy = nullptr;
2197   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2198   if (!FirstLaneOnly && State.VF.isScalable()) {
2199     VecIVTy = VectorType::get(BaseIVTy, State.VF);
2200     UnitStepVec =
2201         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2202     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2203     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2204   }
2205 
2206   unsigned StartLane = 0;
2207   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2208   if (State.Lane) {
2209     StartLane = State.Lane->getKnownLane();
2210     EndLane = StartLane + 1;
2211   }
2212   Value *StartIdx0;
2213   if (getUnrollPart(*this) == 0)
2214     StartIdx0 = ConstantInt::get(IntStepTy, 0);
2215   else {
2216     StartIdx0 = State.get(getOperand(2), true);
2217     if (getUnrollPart(*this) != 1) {
2218       StartIdx0 =
2219           Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2220                                                         getUnrollPart(*this)));
2221     }
2222     StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2223   }
2224 
2225   if (!FirstLaneOnly && State.VF.isScalable()) {
2226     auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2227     auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2228     if (BaseIVTy->isFloatingPointTy())
2229       InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2230     auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2231     auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2232     State.set(this, Add);
2233     // It's also useful to record the lane values for the known minimum
2234     // number of elements, so we do that below. This improves code quality
2235     // when, for example, extracting the first element.
2236   }
2237 
2238   if (BaseIVTy->isFloatingPointTy())
2239     StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2240 
2241   for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2242     Value *StartIdx = Builder.CreateBinOp(
2243         AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2244     // The step returned by `createStepForVF` is a runtime-evaluated value
2245     // when VF is scalable. Otherwise, it should be folded into a Constant.
2246     assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2247            "Expected StartIdx to be folded to a constant when VF is not "
2248            "scalable");
2249     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2250     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2251     State.set(this, Add, VPLane(Lane));
2252   }
2253 }
2254 
2255 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2256 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
2257                                   VPSlotTracker &SlotTracker) const {
2258   O << Indent;
2259   printAsOperand(O, SlotTracker);
2260   O << " = SCALAR-STEPS ";
2261   printOperands(O, SlotTracker);
2262 }
2263 #endif
2264 
2265 void VPWidenGEPRecipe::execute(VPTransformState &State) {
2266   assert(State.VF.isVector() && "not widening");
2267   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
2268   // Construct a vector GEP by widening the operands of the scalar GEP as
2269   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2270   // results in a vector of pointers when at least one operand of the GEP
2271   // is vector-typed. Thus, to keep the representation compact, we only use
2272   // vector-typed operands for loop-varying values.
2273 
2274   if (areAllOperandsInvariant()) {
2275     // If we are vectorizing, but the GEP has only loop-invariant operands,
2276     // the GEP we build (by only using vector-typed operands for
2277     // loop-varying values) would be a scalar pointer. Thus, to ensure we
2278     // produce a vector of pointers, we need to either arbitrarily pick an
2279     // operand to broadcast, or broadcast a clone of the original GEP.
2280     // Here, we broadcast a clone of the original.
2281     //
2282     // TODO: If at some point we decide to scalarize instructions having
2283     //       loop-invariant operands, this special case will no longer be
2284     //       required. We would add the scalarization decision to
2285     //       collectLoopScalars() and teach getVectorValue() to broadcast
2286     //       the lane-zero scalar value.
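         // Illustrative outcome for VF = 4: a single scalar GEP is generated
         // and then broadcast, e.g. via insertelement plus a zero-mask
         // shufflevector, so users still see a <4 x ptr> value.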
2287     SmallVector<Value *> Ops;
2288     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2289       Ops.push_back(State.get(getOperand(I), VPLane(0)));
2290 
2291     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
2292                                            ArrayRef(Ops).drop_front(), "",
2293                                            getGEPNoWrapFlags());
2294     Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2295     State.set(this, Splat);
2296   } else {
2297     // If the GEP has at least one loop-varying operand, we are sure to
2298     // produce a vector of pointers unless VF is scalar.
2299     // The pointer operand of the new GEP. If it's loop-invariant, we
2300     // won't broadcast it.
2301     auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
2302                                          : State.get(getOperand(0));
2303 
2304     // Collect all the indices for the new GEP. If any index is
2305     // loop-invariant, we won't broadcast it.
2306     SmallVector<Value *, 4> Indices;
2307     for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2308       VPValue *Operand = getOperand(I);
2309       if (isIndexLoopInvariant(I - 1))
2310         Indices.push_back(State.get(Operand, VPLane(0)));
2311       else
2312         Indices.push_back(State.get(Operand));
2313     }
2314 
2315     // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2316     // but it should be a vector otherwise.
2317     auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
2318                                            Indices, "", getGEPNoWrapFlags());
2319     assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2320            "NewGEP is not a pointer vector");
2321     State.set(this, NewGEP);
2322   }
2323 }
2324 
2325 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2326 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2327                              VPSlotTracker &SlotTracker) const {
2328   O << Indent << "WIDEN-GEP ";
2329   O << (isPointerLoopInvariant() ? "Inv" : "Var");
2330   for (size_t I = 0; I < getNumOperands() - 1; ++I)
2331     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2332 
2333   O << " ";
2334   printAsOperand(O, SlotTracker);
2335   O << " = getelementptr";
2336   printFlags(O);
2337   printOperands(O, SlotTracker);
2338 }
2339 #endif
2340 
2341 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2342                            unsigned CurrentPart, IRBuilderBase &Builder) {
2343   // Use i32 for the gep index type when the value is constant,
2344   // or query DataLayout for a more suitable index type otherwise.
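       // E.g. a non-unit-stride or a reverse scalable access uses the
       // DataLayout index type (typically i64 on 64-bit targets), while a
       // unit-stride access with a fixed VF keeps i32.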
2345   const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2346   return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2347              ? DL.getIndexType(Builder.getPtrTy(0))
2348              : Builder.getInt32Ty();
2349 }
2350 
2351 void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
2352   auto &Builder = State.Builder;
2353   unsigned CurrentPart = getUnrollPart(*this);
2354   bool IsUnitStride = Stride == 1 || Stride == -1;
2355   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2356                                 IsUnitStride, CurrentPart, Builder);
2357 
2358   // The wide store needs to start at the last vector element.
2359   Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2360   if (IndexTy != RunTimeVF->getType())
2361     RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2362   // NumElt = Stride * CurrentPart * RunTimeVF
2363   Value *NumElt = Builder.CreateMul(
2364       ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2365   // LastLane = Stride * (RunTimeVF - 1)
2366   Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2367   if (Stride != 1)
2368     LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
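       // Worked sketch (assuming Stride = -1, CurrentPart = 0, RunTimeVF = 4):
       //   NumElt = -1 * 0 * 4 = 0 and LastLane = -1 * (4 - 1) = -3, so the
       //   two GEPs below yield Ptr[-3], where the reversed 4-wide access
       //   begins.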
2369   Value *Ptr = State.get(getOperand(0), VPLane(0));
2370   Value *ResultPtr =
2371       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2372   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2373                                 getGEPNoWrapFlags());
2374 
2375   State.set(this, ResultPtr, /*IsScalar*/ true);
2376 }
2377 
2378 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2379 void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2380                                      VPSlotTracker &SlotTracker) const {
2381   O << Indent;
2382   printAsOperand(O, SlotTracker);
2383   O << " = vector-end-pointer";
2384   printFlags(O);
2385   printOperands(O, SlotTracker);
2386 }
2387 #endif
2388 
2389 void VPVectorPointerRecipe::execute(VPTransformState &State) {
2390   auto &Builder = State.Builder;
2391   unsigned CurrentPart = getUnrollPart(*this);
2392   Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2393                                 /*IsUnitStride*/ true, CurrentPart, Builder);
2394   Value *Ptr = State.get(getOperand(0), VPLane(0));
2395 
2396   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
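       // E.g. (hedged) for unroll part 1 with VF = vscale x 4, the increment
       // is vscale * 4, advancing the pointer by one vector's worth of
       // elements.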
2397   Value *ResultPtr =
2398       Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
2399 
2400   State.set(this, ResultPtr, /*IsScalar*/ true);
2401 }
2402 
2403 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2404 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2405                                   VPSlotTracker &SlotTracker) const {
2406   O << Indent;
2407   printAsOperand(O, SlotTracker);
2408   O << " = vector-pointer ";
2409 
2410   printOperands(O, SlotTracker);
2411 }
2412 #endif
2413 
2414 void VPBlendRecipe::execute(VPTransformState &State) {
2415   assert(isNormalized() && "Expected blend to be normalized!");
2416   // We know that all PHIs in non-header blocks are converted into
2417   // selects, so we don't have to worry about the insertion order and we
2418   // can just use the builder.
2419   // At this point we generate the predication tree. There may be
2420   // duplications since this is a simple linear scan, but future
2421   // optimizations will clean it up.
2422 
2423   unsigned NumIncoming = getNumIncomingValues();
2424 
2425   // Generate a sequence of selects of the form:
2426   // SELECT(Mask3, In3,
2427   //        SELECT(Mask2, In2,
2428   //               SELECT(Mask1, In1,
2429   //                      In0)))
2430   // Note that Mask0 is never used: lanes for which no path reaches this phi,
2431   // and which are therefore essentially undef, take their value from In0.
2432   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
2433   Value *Result = nullptr;
2434   for (unsigned In = 0; In < NumIncoming; ++In) {
2435     // We might have single edge PHIs (blocks) - use an identity
2436     // 'select' for the first PHI operand.
2437     Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
2438     if (In == 0)
2439       Result = In0; // Initialize with the first incoming value.
2440     else {
2441       // Select between the current value and the previous incoming edge
2442       // based on the incoming mask.
2443       Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
2444       Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
2445     }
2446   }
2447   State.set(this, Result, OnlyFirstLaneUsed);
2448 }
2449 
2450 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2451                                            VPCostContext &Ctx) const {
2452   // Handle cases where only the first lane is used in the same way as the
2453   // legacy cost model does.
2454   if (vputils::onlyFirstLaneUsed(this))
2455     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2456 
2457   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2458   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2459   return (getNumIncomingValues() - 1) *
2460          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2461                                     CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2462 }
2463 
2464 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2465 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2466                           VPSlotTracker &SlotTracker) const {
2467   O << Indent << "BLEND ";
2468   printAsOperand(O, SlotTracker);
2469   O << " =";
2470   if (getNumIncomingValues() == 1) {
2471     // Not a User of any mask: not really blending, this is a
2472     // single-predecessor phi.
2473     O << " ";
2474     getIncomingValue(0)->printAsOperand(O, SlotTracker);
2475   } else {
2476     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2477       O << " ";
2478       getIncomingValue(I)->printAsOperand(O, SlotTracker);
2479       if (I == 0)
2480         continue;
2481       O << "/";
2482       getMask(I)->printAsOperand(O, SlotTracker);
2483     }
2484   }
2485 }
2486 #endif
2487 
2488 void VPReductionRecipe::execute(VPTransformState &State) {
2489   assert(!State.Lane && "Reduction being replicated.");
2490   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2491   RecurKind Kind = getRecurrenceKind();
2492   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2493          "In-loop AnyOf reductions aren't currently supported");
2494   // Propagate the fast-math flags carried by the underlying instruction.
2495   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2496   State.Builder.setFastMathFlags(getFastMathFlags());
2497   Value *NewVecOp = State.get(getVecOp());
2498   if (VPValue *Cond = getCondOp()) {
2499     Value *NewCond = State.get(Cond, State.VF.isScalar());
2500     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2501     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2502 
2503     Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2504     if (State.VF.isVector())
2505       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2506 
2507     Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2508     NewVecOp = Select;
2509   }
2510   Value *NewRed;
2511   Value *NextInChain;
2512   if (IsOrdered) {
2513     if (State.VF.isVector())
2514       NewRed =
2515           createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2516     else
2517       NewRed = State.Builder.CreateBinOp(
2518           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2519           PrevInChain, NewVecOp);
2520     PrevInChain = NewRed;
2521     NextInChain = NewRed;
2522   } else {
2523     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2524     NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2525     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2526       NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2527     else
2528       NextInChain = State.Builder.CreateBinOp(
2529           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
2530           PrevInChain);
2531   }
2532   State.set(this, NextInChain, /*IsScalar*/ true);
2533 }
2534 
2535 void VPReductionEVLRecipe::execute(VPTransformState &State) {
2536   assert(!State.Lane && "Reduction being replicated.");
2537 
2538   auto &Builder = State.Builder;
2539   // Propagate the fast-math flags carried by the underlying instruction.
2540   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2541   Builder.setFastMathFlags(getFastMathFlags());
2542 
2543   RecurKind Kind = getRecurrenceKind();
2544   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2545   Value *VecOp = State.get(getVecOp());
2546   Value *EVL = State.get(getEVL(), VPLane(0));
2547 
2548   Value *Mask;
2549   if (VPValue *CondOp = getCondOp())
2550     Mask = State.get(CondOp);
2551   else
2552     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2553 
2554   Value *NewRed;
2555   if (isOrdered()) {
2556     NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2557   } else {
2558     NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2559     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2560       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2561     else
2562       NewRed = Builder.CreateBinOp(
2563           (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
2564           Prev);
2565   }
2566   State.set(this, NewRed, /*IsScalar*/ true);
2567 }
2568 
2569 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2570                                                VPCostContext &Ctx) const {
2571   RecurKind RdxKind = getRecurrenceKind();
2572   Type *ElementTy = Ctx.Types.inferScalarType(this);
2573   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2574   unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2575   FastMathFlags FMFs = getFastMathFlags();
2576   std::optional<FastMathFlags> OptionalFMF =
2577       ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2578 
2579   // TODO: Support any-of reductions.
2580   assert(
2581       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2582        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2583       "Any-of reduction not implemented in VPlan-based cost model currently.");
2584 
2585   // Note that TTI should model the cost of moving the result to a scalar
2586   // register and the BinOp cost in getMinMaxReductionCost().
2587   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
2588     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
2589     return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2590   }
2591 
2592   // Note that TTI should model the cost of moving the result to a scalar
2593   // register and the BinOp cost in getArithmeticReductionCost().
2594   return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2595                                             Ctx.CostKind);
2596 }
2597 
2598 VPExpressionRecipe::VPExpressionRecipe(
2599     ExpressionTypes ExpressionType,
2600     ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2601     : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2602       ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
2603                             ExpressionRecipes.begin(), ExpressionRecipes.end())
2604                             .takeVector()),
2605       ExpressionType(ExpressionType) {
2606   assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2607   assert(
2608       none_of(ExpressionRecipes,
2609               [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2610       "expression cannot contain recipes with side-effects");
2611 
2612   // Maintain a copy of the expression recipes as a set of users.
2613   SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2614   for (auto *R : ExpressionRecipes)
2615     ExpressionRecipesAsSetOfUsers.insert(R);
2616 
2617   // Recipes in the expression, except the last one, must only be used by
2618   // (other) recipes inside the expression. If there are other users, external
2619   // to the expression, use a clone of the recipe for external users.
2620   for (VPSingleDefRecipe *R : ExpressionRecipes) {
2621     if (R != ExpressionRecipes.back() &&
2622         any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2623           return !ExpressionRecipesAsSetOfUsers.contains(U);
2624         })) {
2625       // There are users outside of the expression. Clone the recipe and use the
2626       // clone for those external users.
2627       VPSingleDefRecipe *CopyForExtUsers = R->clone();
2628       R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2629                                                 VPUser &U, unsigned) {
2630         return !ExpressionRecipesAsSetOfUsers.contains(&U);
2631       });
2632       CopyForExtUsers->insertBefore(R);
2633     }
2634     if (R->getParent())
2635       R->removeFromParent();
2636   }
2637 
2638   // Internalize all external operands to the expression recipes. To do so,
2639   // create new temporary VPValues for all operands defined by a recipe outside
2640   // the expression. The original operands are added as operands of the
2641   // VPExpressionRecipe itself.
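       // Schematic example (not tied to a specific test): when the bundled
       // recipes compute reduce.add(ext(%a) * ext(%b)), the external values
       // %a and %b become operands of the VPExpressionRecipe itself, while the
       // inner recipes reference placeholder VPValues until decompose()
       // reconnects the originals.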
2642   for (auto *R : ExpressionRecipes) {
2643     for (const auto &[Idx, Op] : enumerate(R->operands())) {
2644       auto *Def = Op->getDefiningRecipe();
2645       if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2646         continue;
2647       addOperand(Op);
2648       LiveInPlaceholders.push_back(new VPValue());
2649       R->setOperand(Idx, LiveInPlaceholders.back());
2650     }
2651   }
2652 }
2653 
2654 void VPExpressionRecipe::decompose() {
2655   for (auto *R : ExpressionRecipes)
2656     R->insertBefore(this);
2657 
2658   for (const auto &[Idx, Op] : enumerate(operands()))
2659     LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2660 
2661   replaceAllUsesWith(ExpressionRecipes.back());
2662   ExpressionRecipes.clear();
2663 }
2664 
2665 InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
2666                                                 VPCostContext &Ctx) const {
2667   Type *RedTy = Ctx.Types.inferScalarType(this);
2668   auto *SrcVecTy = cast<VectorType>(
2669       toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2670   assert(RedTy->isIntegerTy() &&
2671          "VPExpressionRecipe only supports integer types currently.");
2672   switch (ExpressionType) {
2673   case ExpressionTypes::ExtendedReduction: {
2674     unsigned Opcode = RecurrenceDescriptor::getOpcode(
2675         cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2676     return Ctx.TTI.getExtendedReductionCost(
2677         Opcode,
2678         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2679             Instruction::ZExt,
2680         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
2681   }
2682   case ExpressionTypes::MulAccReduction:
2683     return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
2684 
2685   case ExpressionTypes::ExtMulAccReduction:
2686     return Ctx.TTI.getMulAccReductionCost(
2687         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2688             Instruction::ZExt,
2689         RedTy, SrcVecTy, Ctx.CostKind);
2690   }
2691   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2692 }
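
// As a worked example (purely illustrative): an ExtendedReduction bundle that
// models reduce.add (zext <16 x i8> %a to <16 x i32>) is costed by a single
//   getExtendedReductionCost(Instruction::Add, /*IsUnsigned=*/true,
//                            i32, <16 x i8>, std::nullopt, CostKind)
// query, rather than as separate extend and reduction costs.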
2693 
2694 bool VPExpressionRecipe::mayReadOrWriteMemory() const {
2695   return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2696     return R->mayReadFromMemory() || R->mayWriteToMemory();
2697   });
2698 }
2699 
2700 bool VPExpressionRecipe::mayHaveSideEffects() const {
2701   assert(
2702       none_of(ExpressionRecipes,
2703               [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2704       "expression cannot contain recipes with side-effects");
2705   return false;
2706 }
2707 
2708 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2709 
2710 void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
2711                                VPSlotTracker &SlotTracker) const {
2712   O << Indent << "EXPRESSION ";
2713   printAsOperand(O, SlotTracker);
2714   O << " = ";
2715   auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2716   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2717 
2718   switch (ExpressionType) {
2719   case ExpressionTypes::ExtendedReduction: {
2720     getOperand(1)->printAsOperand(O, SlotTracker);
2721     O << " +";
2722     O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
2723     getOperand(0)->printAsOperand(O, SlotTracker);
2724     Red->printFlags(O);
2725 
2726     auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2727     O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2728       << *Ext0->getResultType();
2729     if (Red->isConditional()) {
2730       O << ", ";
2731       Red->getCondOp()->printAsOperand(O, SlotTracker);
2732     }
2733     O << ")";
2734     break;
2735   }
2736   case ExpressionTypes::MulAccReduction:
2737   case ExpressionTypes::ExtMulAccReduction: {
2738     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
2739     O << " + ";
2740     O << "reduce."
2741       << Instruction::getOpcodeName(
2742              RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2743       << " (";
2744     O << "mul";
2745     bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
2746     auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
2747                                                : ExpressionRecipes[0]);
2748     Mul->printFlags(O);
2749     if (IsExtended)
2750       O << "(";
2751     getOperand(0)->printAsOperand(O, SlotTracker);
2752     if (IsExtended) {
2753       auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2754       O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2755         << *Ext0->getResultType() << "), (";
2756     } else {
2757       O << ", ";
2758     }
2759     getOperand(1)->printAsOperand(O, SlotTracker);
2760     if (IsExtended) {
2761       auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2762       O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2763         << *Ext1->getResultType() << ")";
2764     }
2765     if (Red->isConditional()) {
2766       O << ", ";
2767       Red->getCondOp()->printAsOperand(O, SlotTracker);
2768     }
2769     O << ")";
2770     break;
2771   }
2772   }
2773 }
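
// For reference, a sample of the output printed above for an extended
// multiply-accumulate reduction (VPValue names are illustrative):
//   EXPRESSION vp<%5> = vp<%acc> + reduce.add (mul (vp<%a> zext to i32),
//       (vp<%b> zext to i32))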
2774 
2775 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2776                               VPSlotTracker &SlotTracker) const {
2777   O << Indent << "REDUCE ";
2778   printAsOperand(O, SlotTracker);
2779   O << " = ";
2780   getChainOp()->printAsOperand(O, SlotTracker);
2781   O << " +";
2782   printFlags(O);
2783   O << " reduce."
2784     << Instruction::getOpcodeName(
2785            RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2786     << " (";
2787   getVecOp()->printAsOperand(O, SlotTracker);
2788   if (isConditional()) {
2789     O << ", ";
2790     getCondOp()->printAsOperand(O, SlotTracker);
2791   }
2792   O << ")";
2793 }
2794 
2795 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2796                                  VPSlotTracker &SlotTracker) const {
2797   O << Indent << "REDUCE ";
2798   printAsOperand(O, SlotTracker);
2799   O << " = ";
2800   getChainOp()->printAsOperand(O, SlotTracker);
2801   O << " +";
2802   printFlags(O);
2803   O << " vp.reduce."
2804     << Instruction::getOpcodeName(
2805            RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2806     << " (";
2807   getVecOp()->printAsOperand(O, SlotTracker);
2808   O << ", ";
2809   getEVL()->printAsOperand(O, SlotTracker);
2810   if (isConditional()) {
2811     O << ", ";
2812     getCondOp()->printAsOperand(O, SlotTracker);
2813   }
2814   O << ")";
2815 }
2816 
2817 #endif
2818 
2819 /// A helper function to scalarize a single Instruction in the innermost loop.
2820 /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
2821 /// operands from \p RepRecipe instead of \p Instr's operands.
2822 static void scalarizeInstruction(const Instruction *Instr,
2823                                  VPReplicateRecipe *RepRecipe,
2824                                  const VPLane &Lane, VPTransformState &State) {
2825   assert((!Instr->getType()->isAggregateType() ||
2826           canVectorizeTy(Instr->getType())) &&
2827          "Expected vectorizable or non-aggregate type.");
2828 
  // Does this instruction return a value?
2830   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2831 
2832   Instruction *Cloned = Instr->clone();
2833   if (!IsVoidRetTy) {
2834     Cloned->setName(Instr->getName() + ".cloned");
2835 #if !defined(NDEBUG)
2836     // Verify that VPlan type inference results agree with the type of the
2837     // generated values.
2838     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2839            "inferred type and type from generated instructions do not match");
2840 #endif
2841   }
2842 
2843   RepRecipe->applyFlags(*Cloned);
2844   RepRecipe->applyMetadata(*Cloned);
2845 
2846   if (auto DL = RepRecipe->getDebugLoc())
2847     State.setDebugLocFrom(DL);
2848 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2851   for (const auto &I : enumerate(RepRecipe->operands())) {
2852     auto InputLane = Lane;
2853     VPValue *Operand = I.value();
2854     if (vputils::isSingleScalar(Operand))
2855       InputLane = VPLane::getFirstLane();
2856     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2857   }
2858 
2859   // Place the cloned scalar in the new loop.
2860   State.Builder.Insert(Cloned);
2861 
2862   State.set(RepRecipe, Cloned, Lane);
2863 
  // If we just cloned a new assumption, add it to the assumption cache.
2865   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2866     State.AC->registerAssumption(II);
2867 
2868   assert(
2869       (RepRecipe->getParent()->getParent() ||
2870        !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2871        all_of(RepRecipe->operands(),
2872               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected the recipe to either be in a region or to have all of its "
      "operands defined outside the vectorized region.");
2875 }
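
// As a hypothetical example of the scalarization above: replicating
//   %add = add i32 %x, %y
// for lane 2 emits a clone named %add.cloned whose operands are the lane-2
// scalar values of the recipe's operands (or the lane-0 value for operands
// known to be single scalars).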
2876 
2877 void VPReplicateRecipe::execute(VPTransformState &State) {
2878   Instruction *UI = getUnderlyingInstr();
2879 
2880   if (!State.Lane) {
2881     assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
2882                              "must have already been unrolled");
2883     scalarizeInstruction(UI, this, VPLane(0), State);
2884     return;
2885   }
2886 
2887   assert((State.VF.isScalar() || !isSingleScalar()) &&
2888          "uniform recipe shouldn't be predicated");
2889   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
2890   scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert the scalar instance, packing it into a vector.
2892   if (State.VF.isVector() && shouldPack()) {
2893     Value *WideValue =
2894         State.Lane->isFirstLane()
2895             ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
2896             : State.get(this);
2897     State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
2898                                                         *State.Lane));
2899   }
2900 }
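
// When packing is required, the scalar lanes produced above are accumulated
// into a vector one insertelement at a time, e.g. for VF=4 (sketch):
//   lane 0: %v0 = insertelement <4 x i32> poison, i32 %s0, i64 0
//   ...
//   lane 3: %v3 = insertelement <4 x i32> %v2, i32 %s3, i64 3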
2901 
2902 bool VPReplicateRecipe::shouldPack() const {
  // Check whether the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe; in that case, also pack the scalar values in a
  // vector.
2905   return any_of(users(), [](const VPUser *U) {
2906     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
2907       return any_of(PredR->users(), [PredR](const VPUser *U) {
2908         return !U->usesScalars(PredR);
2909       });
2910     return false;
2911   });
2912 }
2913 
2914 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2915                                                VPCostContext &Ctx) const {
2916   Instruction *UI = cast<Instruction>(getUnderlyingValue());
2917   // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
2919   Ctx.SkipCostComputation.insert(UI);
2920 
2921   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2922   Type *ResultTy = Ctx.Types.inferScalarType(this);
2923   switch (UI->getOpcode()) {
2924   case Instruction::GetElementPtr:
2925     // We mark this instruction as zero-cost because the cost of GEPs in
2926     // vectorized code depends on whether the corresponding memory instruction
2927     // is scalarized or not. Therefore, we handle GEPs with the memory
2928     // instruction cost.
2929     return 0;
2930   case Instruction::Add:
2931   case Instruction::Sub:
2932   case Instruction::FAdd:
2933   case Instruction::FSub:
2934   case Instruction::Mul:
2935   case Instruction::FMul:
2936   case Instruction::FDiv:
2937   case Instruction::FRem:
2938   case Instruction::Shl:
2939   case Instruction::LShr:
2940   case Instruction::AShr:
2941   case Instruction::And:
2942   case Instruction::Or:
2943   case Instruction::Xor: {
2944     auto Op2Info = Ctx.getOperandInfo(getOperand(1));
2945     SmallVector<const Value *, 4> Operands(UI->operand_values());
2946     return Ctx.TTI.getArithmeticInstrCost(
2947                UI->getOpcode(), ResultTy, CostKind,
2948                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2949                Op2Info, Operands, UI, &Ctx.TLI) *
2950            (isSingleScalar() ? 1 : VF.getFixedValue());
2951   }
2952   }
2953 
2954   return Ctx.getLegacyCost(UI, VF);
2955 }
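
// E.g. a replicated (non-single-scalar) multiply at VF=4 is modeled above as
// 4x the scalar multiply cost, whereas a single-scalar clone is costed once.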
2956 
2957 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2958 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2959                               VPSlotTracker &SlotTracker) const {
2960   O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
2961 
2962   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2963     printAsOperand(O, SlotTracker);
2964     O << " = ";
2965   }
2966   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2967     O << "call";
2968     printFlags(O);
2969     O << "@" << CB->getCalledFunction()->getName() << "(";
2970     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2971                     O, [&O, &SlotTracker](VPValue *Op) {
2972                       Op->printAsOperand(O, SlotTracker);
2973                     });
2974     O << ")";
2975   } else {
2976     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2977     printFlags(O);
2978     printOperands(O, SlotTracker);
2979   }
2980 
2981   if (shouldPack())
2982     O << " (S->V)";
2983 }
2984 #endif
2985 
2986 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2987   assert(State.Lane && "Branch on Mask works only on single instance.");
2988 
2989   VPValue *BlockInMask = getOperand(0);
2990   Value *ConditionBit = State.get(BlockInMask, *State.Lane);
2991 
2992   // Replace the temporary unreachable terminator with a new conditional branch,
2993   // whose two destinations will be set later when they are created.
2994   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2995   assert(isa<UnreachableInst>(CurrentTerminator) &&
2996          "Expected to replace unreachable terminator with conditional branch.");
2997   auto CondBr =
2998       State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
2999   CondBr->setSuccessor(0, nullptr);
3000   CurrentTerminator->eraseFromParent();
3001 }
3002 
3003 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3004                                                   VPCostContext &Ctx) const {
3005   // The legacy cost model doesn't assign costs to branches for individual
3006   // replicate regions. Match the current behavior in the VPlan cost model for
3007   // now.
3008   return 0;
3009 }
3010 
3011 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
3012   assert(State.Lane && "Predicated instruction PHI works per instance.");
3013   Instruction *ScalarPredInst =
3014       cast<Instruction>(State.get(getOperand(0), *State.Lane));
3015   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3016   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3017   assert(PredicatingBB && "Predicated block has no single predecessor.");
3018   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
3019          "operand must be VPReplicateRecipe");
3020 
  // By the current pack/unpack logic, we need to generate only a single phi
  // node: if a vector value for the predicated instruction exists at this
  // point, it means the instruction has only vector users, and a phi for the
  // vector value is needed. In this case the recipe of the predicated
  // instruction is marked to also do that packing, thereby "hoisting" the
  // insert-element sequence. Otherwise, a phi node for the scalar value is
  // needed.
3027   if (State.hasVectorValue(getOperand(0))) {
3028     Value *VectorValue = State.get(getOperand(0));
3029     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
3030     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
3031     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
3032     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
3033     if (State.hasVectorValue(this))
3034       State.reset(this, VPhi);
3035     else
3036       State.set(this, VPhi);
3037     // NOTE: Currently we need to update the value of the operand, so the next
3038     // predicated iteration inserts its generated value in the correct vector.
3039     State.reset(getOperand(0), VPhi);
3040   } else {
3041     if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3042       return;
3043 
3044     Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3045     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3046     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3047                      PredicatingBB);
3048     Phi->addIncoming(ScalarPredInst, PredicatedBB);
3049     if (State.hasScalarValue(this, *State.Lane))
3050       State.reset(this, Phi, *State.Lane);
3051     else
3052       State.set(this, Phi, *State.Lane);
3053     // NOTE: Currently we need to update the value of the operand, so the next
3054     // predicated iteration inserts its generated value in the correct vector.
3055     State.reset(getOperand(0), Phi, *State.Lane);
3056   }
3057 }
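
// For the vector case above, the emitted pattern is essentially (sketch):
//   %vphi = phi <VF x ty> [ %unmodified.vec, %predicating.bb ],
//                         [ %inserted.vec, %predicated.bb ]
// i.e. the phi selects the vector with the newly inserted lane only when the
// predicated block was taken.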
3058 
3059 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3060 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3061                                 VPSlotTracker &SlotTracker) const {
3062   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3063   printAsOperand(O, SlotTracker);
3064   O << " = ";
3065   printOperands(O, SlotTracker);
3066 }
3067 #endif
3068 
3069 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
3070                                                  VPCostContext &Ctx) const {
3071   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3072   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3073   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3074                     ->getAddressSpace();
3075   unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3076                         ? Instruction::Load
3077                         : Instruction::Store;
3078 
3079   if (!Consecutive) {
3080     // TODO: Using the original IR may not be accurate.
3081     // Currently, ARM will use the underlying IR to calculate gather/scatter
3082     // instruction cost.
3083     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
3084     assert(!Reverse &&
           "Inconsecutive memory accesses should not be reversed.");
3086     return Ctx.TTI.getAddressComputationCost(Ty) +
3087            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3088                                           Ctx.CostKind, &Ingredient);
3089   }
3090 
3091   InstructionCost Cost = 0;
3092   if (IsMasked) {
3093     Cost +=
3094         Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
3095   } else {
3096     TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3097         isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
3098                                                            : getOperand(1));
3099     Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3100                                     OpInfo, &Ingredient);
3101   }
3102   if (!Reverse)
3103     return Cost;
3104 
  return Cost + Ctx.TTI.getShuffleCost(
                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3108 }
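
// E.g. a reversed, consecutive, unmasked load of <4 x i32> is costed above as
// the plain memory-op cost plus one SK_Reverse shuffle of <4 x i32>.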
3109 
3110 void VPWidenLoadRecipe::execute(VPTransformState &State) {
3111   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3112   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3113   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3114   bool CreateGather = !isConsecutive();
3115 
3116   auto &Builder = State.Builder;
3117   Value *Mask = nullptr;
3118   if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null value, and the reverse of an all-one mask is
    // again all-one.
3121     Mask = State.get(VPMask);
3122     if (isReverse())
3123       Mask = Builder.CreateVectorReverse(Mask, "reverse");
3124   }
3125 
3126   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3127   Value *NewLI;
3128   if (CreateGather) {
3129     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3130                                        "wide.masked.gather");
3131   } else if (Mask) {
3132     NewLI =
3133         Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3134                                  PoisonValue::get(DataTy), "wide.masked.load");
3135   } else {
3136     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3137   }
3138   applyMetadata(*cast<Instruction>(NewLI));
3139   if (Reverse)
3140     NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3141   State.set(this, NewLI);
3142 }
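
// For a consecutive masked load, the code above emits something like (types
// and names illustrative):
//   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
//       ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)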
3143 
3144 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3145 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
3146                               VPSlotTracker &SlotTracker) const {
3147   O << Indent << "WIDEN ";
3148   printAsOperand(O, SlotTracker);
3149   O << " = load ";
3150   printOperands(O, SlotTracker);
3151 }
3152 #endif
3153 
/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
3156 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3157                                      Value *EVL, const Twine &Name) {
3158   VectorType *ValTy = cast<VectorType>(Operand->getType());
3159   Value *AllTrueMask =
3160       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3161   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3162                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
3163 }
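
// The helper above emits, for example (illustrative types and names):
//   %vp.reverse = call <vscale x 4 x i32>
//       @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v,
//           <vscale x 4 x i1> splat (i1 true), i32 %evl)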
3164 
3165 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3166   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3167   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3168   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3169   bool CreateGather = !isConsecutive();
3170 
3171   auto &Builder = State.Builder;
3172   CallInst *NewLI;
3173   Value *EVL = State.get(getEVL(), VPLane(0));
3174   Value *Addr = State.get(getAddr(), !CreateGather);
3175   Value *Mask = nullptr;
3176   if (VPValue *VPMask = getMask()) {
3177     Mask = State.get(VPMask);
3178     if (isReverse())
3179       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3180   } else {
3181     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3182   }
3183 
3184   if (CreateGather) {
3185     NewLI =
3186         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3187                                 nullptr, "wide.masked.gather");
3188   } else {
3189     NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3190                                     {Addr, Mask, EVL}, nullptr, "vp.op.load");
3191   }
3192   NewLI->addParamAttr(
3193       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
3194   applyMetadata(*NewLI);
3195   Instruction *Res = NewLI;
3196   if (isReverse())
3197     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3198   State.set(this, Res);
3199 }
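
// For the consecutive case, the code above emits something like (sketch):
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//       ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)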
3200 
3201 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
3202                                                   VPCostContext &Ctx) const {
3203   if (!Consecutive || IsMasked)
3204     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3205 
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always includes the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
  // no longer need to compare against the legacy cost model.
3211   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3212   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3213   unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3214   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3215       Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
3216   if (!Reverse)
3217     return Cost;
3218 
3219   return Cost + Ctx.TTI.getShuffleCost(
3220                     TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3221                     cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3222 }
3223 
3224 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3225 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3226                                  VPSlotTracker &SlotTracker) const {
3227   O << Indent << "WIDEN ";
3228   printAsOperand(O, SlotTracker);
3229   O << " = vp.load ";
3230   printOperands(O, SlotTracker);
3231 }
3232 #endif
3233 
3234 void VPWidenStoreRecipe::execute(VPTransformState &State) {
3235   VPValue *StoredVPValue = getStoredValue();
3236   bool CreateScatter = !isConsecutive();
3237   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3238 
3239   auto &Builder = State.Builder;
3240 
3241   Value *Mask = nullptr;
3242   if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one masks; an all-one mask is
    // represented by a null value, and the reverse of an all-one mask is
    // again all-one.
3245     Mask = State.get(VPMask);
3246     if (isReverse())
3247       Mask = Builder.CreateVectorReverse(Mask, "reverse");
3248   }
3249 
3250   Value *StoredVal = State.get(StoredVPValue);
3251   if (isReverse()) {
3252     // If we store to reverse consecutive memory locations, then we need
3253     // to reverse the order of elements in the stored value.
3254     StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3255     // We don't want to update the value in the map as it might be used in
3256     // another expression. So don't call resetVectorValue(StoredVal).
3257   }
3258   Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3259   Instruction *NewSI = nullptr;
3260   if (CreateScatter)
3261     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3262   else if (Mask)
3263     NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3264   else
3265     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3266   applyMetadata(*NewSI);
3267 }
3268 
3269 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3270 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
3271                                VPSlotTracker &SlotTracker) const {
3272   O << Indent << "WIDEN store ";
3273   printOperands(O, SlotTracker);
3274 }
3275 #endif
3276 
3277 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3278   VPValue *StoredValue = getStoredValue();
3279   bool CreateScatter = !isConsecutive();
3280   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3281 
3282   auto &Builder = State.Builder;
3283 
3284   CallInst *NewSI = nullptr;
3285   Value *StoredVal = State.get(StoredValue);
3286   Value *EVL = State.get(getEVL(), VPLane(0));
3287   if (isReverse())
3288     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3289   Value *Mask = nullptr;
3290   if (VPValue *VPMask = getMask()) {
3291     Mask = State.get(VPMask);
3292     if (isReverse())
3293       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3294   } else {
3295     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3296   }
3297   Value *Addr = State.get(getAddr(), !CreateScatter);
3298   if (CreateScatter) {
3299     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3300                                     Intrinsic::vp_scatter,
3301                                     {StoredVal, Addr, Mask, EVL});
3302   } else {
3303     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3304                                     Intrinsic::vp_store,
3305                                     {StoredVal, Addr, Mask, EVL});
3306   }
3307   NewSI->addParamAttr(
3308       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
3309   applyMetadata(*NewSI);
3310 }
3311 
3312 InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3313                                                    VPCostContext &Ctx) const {
3314   if (!Consecutive || IsMasked)
3315     return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3316 
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
  // here because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always includes the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
  // no longer need to compare against the legacy cost model.
3322   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3323   const Align Alignment = getLoadStoreAlignment(&Ingredient);
3324   unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3325   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3326       Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
3327   if (!Reverse)
3328     return Cost;
3329 
3330   return Cost + Ctx.TTI.getShuffleCost(
3331                     TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3332                     cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3333 }
3334 
3335 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3336 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3337                                   VPSlotTracker &SlotTracker) const {
3338   O << Indent << "WIDEN vp.store ";
3339   printOperands(O, SlotTracker);
3340 }
3341 #endif
3342 
3343 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3344                                      VectorType *DstVTy, const DataLayout &DL) {
3345   // Verify that V is a vector type with same number of elements as DstVTy.
3346   auto VF = DstVTy->getElementCount();
3347   auto *SrcVecTy = cast<VectorType>(V->getType());
3348   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3349   Type *SrcElemTy = SrcVecTy->getElementType();
3350   Type *DstElemTy = DstVTy->getElementType();
3351   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3352          "Vector elements must have same size");
3353 
3354   // Do a direct cast if element types are castable.
3355   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3356     return Builder.CreateBitOrPointerCast(V, DstVTy);
3357   }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
3362   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3363          "Only one type should be a pointer type");
3364   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3365          "Only one type should be a floating point type");
3366   Type *IntTy =
3367       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3368   auto *VecIntTy = VectorType::get(IntTy, VF);
3369   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3370   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3371 }
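
// E.g. casting <4 x ptr> to <4 x double> on a target with 64-bit pointers
// goes through <4 x i64> (sketch):
//   %int.vec = ptrtoint <4 x ptr> %v to <4 x i64>
//   %res = bitcast <4 x i64> %int.vec to <4 x double>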
3372 
3373 /// Return a vector containing interleaved elements from multiple
3374 /// smaller input vectors.
3375 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3376                                 const Twine &Name) {
3377   unsigned Factor = Vals.size();
3378   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3379 
3380   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3381 #ifndef NDEBUG
3382   for (Value *Val : Vals)
3383     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3384 #endif
3385 
3386   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3387   // must use intrinsics to interleave.
3388   if (VecTy->isScalableTy()) {
3389     assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3390     VectorType *InterleaveTy =
3391         VectorType::get(VecTy->getElementType(),
3392                         VecTy->getElementCount().multiplyCoefficientBy(Factor));
3393     return Builder.CreateIntrinsic(InterleaveTy,
3394                                    getInterleaveIntrinsicID(Factor), Vals,
3395                                    /*FMFSource=*/nullptr, Name);
3396   }
3397 
3398   // Fixed length. Start by concatenating all vectors into a wide vector.
3399   Value *WideVec = concatenateVectors(Builder, Vals);
3400 
3401   // Interleave the elements into the wide vector.
3402   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3403   return Builder.CreateShuffleVector(
3404       WideVec, createInterleaveMask(NumElts, Factor), Name);
3405 }
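
// E.g. for two fixed-length <4 x i32> inputs {a0,a1,a2,a3} and {b0,b1,b2,b3},
// the concatenate-and-shuffle path above uses the interleave mask
// <0, 4, 1, 5, 2, 6, 3, 7> to produce {a0,b0,a1,b1,a2,b2,a3,b3}.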
3406 
3407 // Try to vectorize the interleave group that \p Instr belongs to.
3408 //
3409 // E.g. Translate following interleaved load group (factor = 3):
3410 //   for (i = 0; i < N; i+=3) {
3411 //     R = Pic[i];             // Member of index 0
3412 //     G = Pic[i+1];           // Member of index 1
3413 //     B = Pic[i+2];           // Member of index 2
3414 //     ... // do something to R, G, B
3415 //   }
3416 // To:
3417 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
3418 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
3419 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
3420 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
3421 //
3422 // Or translate following interleaved store group (factor = 3):
3423 //   for (i = 0; i < N; i+=3) {
3424 //     ... do something to R, G, B
3425 //     Pic[i]   = R;           // Member of index 0
3426 //     Pic[i+1] = G;           // Member of index 1
3427 //     Pic[i+2] = B;           // Member of index 2
3428 //   }
3429 // To:
3430 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3431 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3432 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3433 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
3434 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
3435 void VPInterleaveRecipe::execute(VPTransformState &State) {
3436   assert(!State.Lane && "Interleave group being replicated.");
3437   const InterleaveGroup<Instruction> *Group = IG;
3438   Instruction *Instr = Group->getInsertPos();
3439 
3440   // Prepare for the vector type of the interleaved load/store.
3441   Type *ScalarTy = getLoadStoreType(Instr);
3442   unsigned InterleaveFactor = Group->getFactor();
3443   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3444 
3445   VPValue *BlockInMask = getMask();
3446   VPValue *Addr = getAddr();
3447   Value *ResAddr = State.get(Addr, VPLane(0));
3448   Value *PoisonVec = PoisonValue::get(VecTy);
3449 
3450   auto CreateGroupMask = [&BlockInMask, &State,
3451                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
3452     if (State.VF.isScalable()) {
3453       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3454       assert(InterleaveFactor <= 8 &&
             "Unsupported interleave factor for scalable vectors");
3456       auto *ResBlockInMask = State.get(BlockInMask);
3457       SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3458       return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3459     }
3460 
3461     if (!BlockInMask)
3462       return MaskForGaps;
3463 
3464     Value *ResBlockInMask = State.get(BlockInMask);
3465     Value *ShuffledMask = State.Builder.CreateShuffleVector(
3466         ResBlockInMask,
3467         createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3468         "interleaved.mask");
3469     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3470                                                    ShuffledMask, MaskForGaps)
3471                        : ShuffledMask;
3472   };
3473 
3474   const DataLayout &DL = Instr->getDataLayout();
3475   // Vectorize the interleaved load group.
3476   if (isa<LoadInst>(Instr)) {
3477     Value *MaskForGaps = nullptr;
3478     if (NeedsMaskForGaps) {
3479       MaskForGaps =
3480           createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3481       assert(MaskForGaps && "Mask for Gaps is required but it is null");
3482     }
3483 
3484     Instruction *NewLoad;
3485     if (BlockInMask || MaskForGaps) {
3486       Value *GroupMask = CreateGroupMask(MaskForGaps);
3487       NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3488                                                Group->getAlign(), GroupMask,
3489                                                PoisonVec, "wide.masked.vec");
3490     } else
3491       NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3492                                                 Group->getAlign(), "wide.vec");
3493     Group->addMetadata(NewLoad);
3494 
3495     ArrayRef<VPValue *> VPDefs = definedValues();
3496     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3497     if (VecTy->isScalableTy()) {
3498       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3499       // so must use intrinsics to deinterleave.
3500       assert(InterleaveFactor <= 8 &&
3501              "Unsupported deinterleave factor for scalable vectors");
3502       Value *Deinterleave = State.Builder.CreateIntrinsic(
3503           getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
3504           NewLoad,
3505           /*FMFSource=*/nullptr, "strided.vec");
3506 
3507       for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3508         Instruction *Member = Group->getMember(I);
3509         Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
3510         if (!Member) {
3511           // This value is not needed as it's not used
3512           cast<Instruction>(StridedVec)->eraseFromParent();
3513           continue;
3514         }
        // If this member has a different type, cast the result to that type.
3516         if (Member->getType() != ScalarTy) {
3517           VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3518           StridedVec =
3519               createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3520         }
3521 
3522         if (Group->isReverse())
3523           StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3524 
3525         State.set(VPDefs[J], StridedVec);
3526         ++J;
3527       }
3528 
3529       return;
3530     }
3531     assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
3532 
3533     // For each member in the group, shuffle out the appropriate data from the
3534     // wide loads.
3535     unsigned J = 0;
3536     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3537       Instruction *Member = Group->getMember(I);
3538 
3539       // Skip the gaps in the group.
3540       if (!Member)
3541         continue;
3542 
3543       auto StrideMask =
3544           createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
3545       Value *StridedVec =
3546           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
3547 
3548       // If this member has different type, cast the result type.
3549       if (Member->getType() != ScalarTy) {
3550         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3551         StridedVec =
3552             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3553       }
3554 
3555       if (Group->isReverse())
3556         StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3557 
3558       State.set(VPDefs[J], StridedVec);
3559       ++J;
3560     }
3561     return;
3562   }
3563 
  // The subvector type for the current instruction.
3565   auto *SubVT = VectorType::get(ScalarTy, State.VF);
3566 
3567   // Vectorize the interleaved store group.
3568   Value *MaskForGaps =
3569       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3570   assert((!MaskForGaps || !State.VF.isScalable()) &&
3571          "masking gaps for scalable vectors is not yet supported.");
3572   ArrayRef<VPValue *> StoredValues = getStoredValues();
3573   // Collect the stored vector from each member.
3574   SmallVector<Value *, 4> StoredVecs;
3575   unsigned StoredIdx = 0;
3576   for (unsigned i = 0; i < InterleaveFactor; i++) {
3577     assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
3579     Instruction *Member = Group->getMember(i);
3580 
3581     // Skip the gaps in the group.
3582     if (!Member) {
      Value *Poison = PoisonValue::get(SubVT);
      StoredVecs.push_back(Poison);
3585       continue;
3586     }
3587 
3588     Value *StoredVec = State.get(StoredValues[StoredIdx]);
3589     ++StoredIdx;
3590 
3591     if (Group->isReverse())
3592       StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
3593 
    // If this member has a different type, cast it to the unified type.
3596     if (StoredVec->getType() != SubVT)
3597       StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
3598 
3599     StoredVecs.push_back(StoredVec);
3600   }
3601 
3602   // Interleave all the smaller vectors into one wider vector.
3603   Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
3604   Instruction *NewStoreInstr;
3605   if (BlockInMask || MaskForGaps) {
3606     Value *GroupMask = CreateGroupMask(MaskForGaps);
3607     NewStoreInstr = State.Builder.CreateMaskedStore(
3608         IVec, ResAddr, Group->getAlign(), GroupMask);
3609   } else
3610     NewStoreInstr =
3611         State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
3612 
3613   Group->addMetadata(NewStoreInstr);
3614 }
3615 
3616 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3617 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3618                                VPSlotTracker &SlotTracker) const {
3619   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3620   IG->getInsertPos()->printAsOperand(O, false);
3621   O << ", ";
3622   getAddr()->printAsOperand(O, SlotTracker);
3623   VPValue *Mask = getMask();
3624   if (Mask) {
3625     O << ", ";
3626     Mask->printAsOperand(O, SlotTracker);
3627   }
3628 
3629   unsigned OpIdx = 0;
3630   for (unsigned i = 0; i < IG->getFactor(); ++i) {
3631     if (!IG->getMember(i))
3632       continue;
3633     if (getNumStoreOperands() > 0) {
3634       O << "\n" << Indent << "  store ";
3635       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3636       O << " to index " << i;
3637     } else {
3638       O << "\n" << Indent << "  ";
3639       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3640       O << " = load from index " << i;
3641     }
3642     ++OpIdx;
3643   }
3644 }
3645 #endif
3646 
3647 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3648                                                 VPCostContext &Ctx) const {
3649   Instruction *InsertPos = getInsertPos();
3650   // Find the VPValue index of the interleave group. We need to skip gaps.
3651   unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3653     if (auto *Member = IG->getMember(Idx)) {
3654       if (Member == InsertPos)
3655         break;
3656       InsertPosIdx++;
3657     }
3658   Type *ValTy = Ctx.Types.inferScalarType(
3659       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
3660                                 : getStoredValues()[InsertPosIdx]);
3661   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3662   unsigned AS = getLoadStoreAddressSpace(InsertPos);
3663 
3664   unsigned InterleaveFactor = IG->getFactor();
3665   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
3666 
3667   // Holds the indices of existing members in the interleaved group.
3668   SmallVector<unsigned, 4> Indices;
3669   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3670     if (IG->getMember(IF))
3671       Indices.push_back(IF);
3672 
3673   // Calculate the cost of the whole interleaved group.
3674   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3675       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
3676       IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
3677 
3678   if (!IG->isReverse())
3679     return Cost;
3680 
3681   return Cost + IG->getNumMembers() *
3682                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
3683                                            VectorTy, VectorTy, {}, Ctx.CostKind,
3684                                            0);
3685 }
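
// E.g. a reversed two-member group of <4 x i32> values adds two SK_Reverse
// shuffle costs on top of the interleaved memory-op cost computed above.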
3686 
3687 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3688 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3689                                    VPSlotTracker &SlotTracker) const {
3690   O << Indent << "EMIT ";
3691   printAsOperand(O, SlotTracker);
3692   O << " = CANONICAL-INDUCTION ";
3693   printOperands(O, SlotTracker);
3694 }
3695 #endif
3696 
3697 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3698   return IsScalarAfterVectorization &&
3699          (!IsScalable || vputils::onlyFirstLaneUsed(this));
3700 }
3701 
3702 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3703   assert(getInductionDescriptor().getKind() ==
3704              InductionDescriptor::IK_PtrInduction &&
3705          "Not a pointer induction according to InductionDescriptor!");
3706   assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
3707          "Unexpected type.");
3708   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3709          "Recipe should have been replaced");
3710 
3711   unsigned CurrentPart = getUnrollPart(*this);
3712 
3713   // Build a pointer phi
3714   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3715   Type *ScStValueType = ScalarStartValue->getType();
3716 
3717   BasicBlock *VectorPH =
3718       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3719   PHINode *NewPointerPhi = nullptr;
3720   if (CurrentPart == 0) {
3721     IRBuilder<>::InsertPointGuard Guard(State.Builder);
3722     if (State.Builder.GetInsertPoint() !=
3723         State.Builder.GetInsertBlock()->getFirstNonPHIIt())
3724       State.Builder.SetInsertPoint(
3725           State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3726     NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
3727     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
3728     NewPointerPhi->setDebugLoc(getDebugLoc());
3729   } else {
3730     // The recipe has been unrolled. In that case, fetch the single pointer phi
3731     // shared among all unrolled parts of the recipe.
3732     auto *GEP =
3733         cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
3734     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
3735   }
3736 
  // A pointer induction, performed by using a GEP.
3738   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3739   Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
3740   Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
3741   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
3742   // Add induction update using an incorrect block temporarily. The phi node
3743   // will be fixed after VPlan execution. Note that at this point the latch
3744   // block cannot be used, as it does not exist yet.
3745   // TODO: Model increment value in VPlan, by turning the recipe into a
3746   // multi-def and a subclass of VPHeaderPHIRecipe.
3747   if (CurrentPart == 0) {
3748     // The recipe represents the first part of the pointer induction. Create the
3749     // GEP to increment the phi across all unrolled parts.
3750     Value *NumUnrolledElems = State.get(getOperand(2), true);
3751 
3752     Value *InductionGEP = GetElementPtrInst::Create(
3753         State.Builder.getInt8Ty(), NewPointerPhi,
3754         State.Builder.CreateMul(
3755             ScalarStepValue,
3756             State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
3757         "ptr.ind", InductionLoc);
3758 
3759     NewPointerPhi->addIncoming(InductionGEP, VectorPH);
3760   }
3761 
3762   // Create actual address geps that use the pointer phi as base and a
3763   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3764   Type *VecPhiType = VectorType::get(PhiType, State.VF);
3765   Value *StartOffsetScalar = State.Builder.CreateMul(
3766       RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
3767   Value *StartOffset =
3768       State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
  // Create a vector of consecutive numbers <0, 1, ..., VF-1>.
3770   StartOffset = State.Builder.CreateAdd(
3771       StartOffset, State.Builder.CreateStepVector(VecPhiType));
3772 
3773   assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3774          "scalar step must be the same across all parts");
3775   Value *GEP = State.Builder.CreateGEP(
3776       State.Builder.getInt8Ty(), NewPointerPhi,
3777       State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
3778                                                State.VF, ScalarStepValue)),
3779       "vector.gep");
3780   State.set(this, GEP);
3781 }
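
// A sketch of the pattern emitted above for part 0 (names illustrative):
//   %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, ... ]
//   %vector.gep = getelementptr i8, ptr %pointer.phi, <VF x iN> %offsets
//   %ptr.ind = getelementptr i8, ptr %pointer.phi, iN %total.step
// where %offsets = (CurrentPart * RuntimeVF + <0, ..., VF-1>) * %step.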
3782 
3783 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3784 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3785                                           VPSlotTracker &SlotTracker) const {
3786   assert((getNumOperands() == 3 || getNumOperands() == 5) &&
3787          "unexpected number of operands");
3788   O << Indent << "EMIT ";
3789   printAsOperand(O, SlotTracker);
3790   O << " = WIDEN-POINTER-INDUCTION ";
3791   getStartValue()->printAsOperand(O, SlotTracker);
3792   O << ", ";
3793   getStepValue()->printAsOperand(O, SlotTracker);
3794   O << ", ";
3795   getOperand(2)->printAsOperand(O, SlotTracker);
3796   if (getNumOperands() == 5) {
3797     O << ", ";
3798     getOperand(3)->printAsOperand(O, SlotTracker);
3799     O << ", ";
3800     getOperand(4)->printAsOperand(O, SlotTracker);
3801   }
3802 }
3803 #endif
3804 
3805 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3806   assert(!State.Lane && "cannot be used in per-lane");
3807   const DataLayout &DL = SE.getDataLayout();
3808   SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
3809   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
3810                                  &*State.Builder.GetInsertPoint());
3811   State.set(this, Res, VPLane(0));
3812 }
3813 
3814 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3815 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3816                                VPSlotTracker &SlotTracker) const {
3817   O << Indent << "EMIT ";
3818   printAsOperand(O, SlotTracker);
3819   O << " = EXPAND SCEV " << *Expr;
3820 }
3821 #endif
3822 
3823 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3824   Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
3825   Type *STy = CanonicalIV->getType();
3826   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3827   ElementCount VF = State.VF;
3828   Value *VStart = VF.isScalar()
3829                       ? CanonicalIV
3830                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
3831   Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
3832   if (VF.isVector()) {
3833     VStep = Builder.CreateVectorSplat(VF, VStep);
3834     VStep =
3835         Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
3836   }
3837   Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
3838   State.set(this, CanonicalVectorIV);
3839 }
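
// E.g. with VF=4 and unroll part 0, the code above computes, in essence,
//   vec.iv = broadcast(canonical IV) + <0, 1, 2, 3>
// while unroll part N adds a further splat of N * VF.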
3840 
3841 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3842 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3843                                      VPSlotTracker &SlotTracker) const {
3844   O << Indent << "EMIT ";
3845   printAsOperand(O, SlotTracker);
3846   O << " = WIDEN-CANONICAL-INDUCTION ";
3847   printOperands(O, SlotTracker);
3848 }
3849 #endif
3850 
3851 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3852   auto &Builder = State.Builder;
3853   // Create a vector from the initial value.
3854   auto *VectorInit = getStartValue()->getLiveInIRValue();
3855 
3856   Type *VecTy = State.VF.isScalar()
3857                     ? VectorInit->getType()
3858                     : VectorType::get(VectorInit->getType(), State.VF);
3859 
3860   BasicBlock *VectorPH =
3861       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3862   if (State.VF.isVector()) {
3863     auto *IdxTy = Builder.getInt32Ty();
3864     auto *One = ConstantInt::get(IdxTy, 1);
3865     IRBuilder<>::InsertPointGuard Guard(Builder);
3866     Builder.SetInsertPoint(VectorPH->getTerminator());
3867     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
3868     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3869     VectorInit = Builder.CreateInsertElement(
3870         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
3871   }
3872 
3873   // Create a phi node for the new recurrence.
3874   PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
3875   Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
3876   Phi->addIncoming(VectorInit, VectorPH);
3877   State.set(this, Phi);
3878 }
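
// E.g. for VF=4, the initialization above places the scalar start value in
// the last lane: %vector.recur.init = <poison, poison, poison, %init>.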
3879 
3880 InstructionCost
3881 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3882                                              VPCostContext &Ctx) const {
3883   if (VF.isScalar())
3884     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3885 
3886   return 0;
3887 }
3888 
3889 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3890 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3891                                             VPSlotTracker &SlotTracker) const {
3892   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3893   printAsOperand(O, SlotTracker);
3894   O << " = phi ";
3895   printOperands(O, SlotTracker);
3896 }
3897 #endif
3898 
3899 void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with any
  // loop-invariant value.
3902   VPValue *StartVPV = getStartValue();
3903 
3904   // In order to support recurrences we need to be able to vectorize Phi nodes.
3905   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3906   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3907   // this value when we vectorize all of the instructions that use the PHI.
3908   BasicBlock *VectorPH =
3909       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3910   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3911   Value *StartV = State.get(StartVPV, ScalarPHI);
3912   Type *VecTy = StartV->getType();
3913 
3914   BasicBlock *HeaderBB = State.CFG.PrevBB;
3915   assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
3916          "recipe must be in the vector loop header");
3917   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
3918   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
3919   State.set(this, Phi, IsInLoop);
3920 
3921   Phi->addIncoming(StartV, VectorPH);
3922 }
3923 
3924 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3925 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3926                                  VPSlotTracker &SlotTracker) const {
3927   O << Indent << "WIDEN-REDUCTION-PHI ";
3928 
3929   printAsOperand(O, SlotTracker);
3930   O << " = phi ";
3931   printOperands(O, SlotTracker);
3932   if (VFScaleFactor != 1)
3933     O << " (VF scaled by 1/" << VFScaleFactor << ")";
3934 }
3935 #endif
3936 
3937 void VPWidenPHIRecipe::execute(VPTransformState &State) {
3938   Value *Op0 = State.get(getOperand(0));
3939   Type *VecTy = Op0->getType();
3940   Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
3941   // Manually move it with the other PHIs in case PHI recipes above this one
3942   // also inserted non-phi instructions.
3943   // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
3944   // convertToConcreteRecipes.
3945   VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3946   State.set(this, VecPhi);
3947 }
3948 
3949 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3950 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3951                              VPSlotTracker &SlotTracker) const {
3952   O << Indent << "WIDEN-PHI ";
3953 
3954   printAsOperand(O, SlotTracker);
3955   O << " = phi ";
3956   printPhiOperands(O, SlotTracker);
3957 }
3958 #endif
3959 
3960 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3961 // remove VPActiveLaneMaskPHIRecipe.
3962 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3963   BasicBlock *VectorPH =
3964       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3965   Value *StartMask = State.get(getOperand(0));
3966   PHINode *Phi =
3967       State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
3968   Phi->addIncoming(StartMask, VectorPH);
3969   State.set(this, Phi);
3970 }
3971 
3972 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3973 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3974                                       VPSlotTracker &SlotTracker) const {
3975   O << Indent << "ACTIVE-LANE-MASK-PHI ";
3976 
3977   printAsOperand(O, SlotTracker);
3978   O << " = phi ";
3979   printOperands(O, SlotTracker);
3980 }
3981 #endif
3982 
3983 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3984 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3985                                   VPSlotTracker &SlotTracker) const {
3986   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3987 
3988   printAsOperand(O, SlotTracker);
3989   O << " = phi ";
3990   printOperands(O, SlotTracker);
3991 }
3992 #endif
3993