//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the declarations of the Vectorization Plan base classes:
/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
///    VPBlockBase, together implementing a Hierarchical CFG;
/// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
///    within VPBasicBlocks;
/// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that
///    also inherit from VPValue;
/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
///    instruction;
/// 5. The VPlan class holding a candidate for vectorization;
/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
/// These are documented in docs/VectorizationPlan.rst.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

namespace llvm {

class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
class raw_ostream;
class RecurrenceDescriptor;
class SCEV;
class Type;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
class LoopVectorizationCostModel;
class LoopVersioning;

struct VPCostContext;

namespace Intrinsic {
typedef unsigned ID;
}

/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step);

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *CurLoop = nullptr);

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
inline unsigned getReciprocalPredBlockProb() { return 2; }

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.:
/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
  // A power of 2.
  const ElementCount Start;

  // A power of 2. If End <= Start, the range is empty.
  ElementCount End;

  bool isEmpty() const {
    return End.getKnownMinValue() <= Start.getKnownMinValue();
  }

  VFRange(const ElementCount &Start, const ElementCount &End)
      : Start(Start), End(End) {
    assert(Start.isScalable() == End.isScalable() &&
           "Both Start and End should have the same scalable flag");
    assert(isPowerOf2_32(Start.getKnownMinValue()) &&
           "Expected Start to be a power of 2");
    assert(isPowerOf2_32(End.getKnownMinValue()) &&
           "Expected End to be a power of 2");
  }

  /// Iterator to iterate over vectorization factors in a VFRange.
  class iterator
      : public iterator_facade_base<iterator, std::forward_iterator_tag,
                                    ElementCount> {
    ElementCount VF;

  public:
    iterator(ElementCount VF) : VF(VF) {}

    bool operator==(const iterator &Other) const { return VF == Other.VF; }

    ElementCount operator*() const { return VF; }

    iterator &operator++() {
      VF *= 2;
      return *this;
    }
  };

  iterator begin() { return iterator(Start); }
  iterator end() {
    assert(isPowerOf2_32(End.getKnownMinValue()));
    return iterator(End);
  }
};
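
// A minimal usage sketch (illustrative only, not part of the interface):
// iterating a VFRange visits each power-of-2 VF from Start up to but not
// including End, e.g. the range [1, 16) yields the fixed VFs 1, 2, 4 and 8.
//
//   VFRange Range(ElementCount::getFixed(1), ElementCount::getFixed(16));
//   for (ElementCount VF : Range) // visits 1, 2, 4, 8
//     considerVF(VF);             // `considerVF` is a hypothetical callback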

using VPlanPtr = std::unique_ptr<VPlan>;

/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
/// the vectorizer.

/// VPLane provides a way to access lanes in both fixed width and scalable
/// vectors, where for the latter the lane index sometimes needs calculating
/// as a runtime expression.
class VPLane {
public:
  /// Kind describes how to interpret Lane.
  enum class Kind : uint8_t {
    /// For First, Lane is the index into the first N elements of a
    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
    First,
    /// For ScalableLast, Lane is the offset from the start of the last
    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
    ScalableLast
  };

private:
  /// in [0..VF)
  unsigned Lane;

  /// Indicates how the Lane should be interpreted, as described above.
  Kind LaneKind;

public:
  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}

  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }

  static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
    assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
           "trying to extract with invalid offset");
    unsigned LaneOffset = VF.getKnownMinValue() - Offset;
    Kind LaneKind;
    if (VF.isScalable())
      // In this case 'LaneOffset' refers to the offset from the start of the
      // last subvector with VF.getKnownMinValue() elements.
      LaneKind = VPLane::Kind::ScalableLast;
    else
      LaneKind = VPLane::Kind::First;
    return VPLane(LaneOffset, LaneKind);
  }

  static VPLane getLastLaneForVF(const ElementCount &VF) {
    return getLaneFromEnd(VF, 1);
  }

  /// Returns a compile-time known value for the lane index and asserts if the
  /// lane can only be calculated at runtime.
  unsigned getKnownLane() const {
    assert(LaneKind == Kind::First);
    return Lane;
  }

  /// Returns an expression describing the lane index that can be used at
  /// runtime.
  Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;

  /// Returns the Kind of lane offset.
  Kind getKind() const { return LaneKind; }

  /// Returns true if this is the first lane of the whole vector.
  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }

  /// Maps the lane to a cache index based on \p VF.
  unsigned mapToCacheIndex(const ElementCount &VF) const {
    switch (LaneKind) {
    case VPLane::Kind::ScalableLast:
      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
      return VF.getKnownMinValue() + Lane;
    default:
      assert(Lane < VF.getKnownMinValue());
      return Lane;
    }
  }

  /// Returns the maximum number of lanes that we are able to consider
  /// caching for \p VF.
  static unsigned getNumCachedLanes(const ElementCount &VF) {
    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
  }
};
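
// Worked example (illustrative only): for a scalable VF <vscale x 4>,
// getLaneFromEnd(VF, 1) produces VPLane(3, Kind::ScalableLast), i.e. the
// last lane `(vscale - 1) * 4 + 3`. Its cache index is 4 + 3 == 7, so up to
// getNumCachedLanes(VF) == 8 slots may be cached: indices [0..3] for
// Kind::First lanes and [4..7] for Kind::ScalableLast lanes.
//
//   auto VF = ElementCount::getScalable(4);
//   VPLane Last = VPLane::getLastLaneForVF(VF);
//   unsigned Idx = Last.mapToCacheIndex(VF); // == 7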

/// VPIteration represents a single point in the iteration space of the output
/// (vectorized and/or unrolled) IR loop.
struct VPIteration {
  /// in [0..UF)
  unsigned Part;

  VPLane Lane;

  VPIteration(unsigned Part, unsigned Lane,
              VPLane::Kind Kind = VPLane::Kind::First)
      : Part(Part), Lane(Lane, Kind) {}

  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}

  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
};

/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
  VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                   DominatorTree *DT, IRBuilderBase &Builder,
                   InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);

  /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
  ElementCount VF;
  unsigned UF;

  /// Hold the indices to generate specific scalar instructions. Null indicates
  /// that all instances are to be generated, using either scalar or vector
  /// instructions.
  std::optional<VPIteration> Instance;

  struct DataState {
    /// A type for vectorized values in the new loop. Each value from the
    /// original loop, when vectorized, is represented by UF vector values in
    /// the new unrolled loop, where UF is the unroll factor.
    typedef SmallVector<Value *, 2> PerPartValuesTy;

    DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;

    using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
    DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
  } Data;

  /// Get the generated vector Value for a given VPValue \p Def and a given \p
  /// Part if \p IsScalar is false, otherwise return the generated scalar
  /// for \p Part. \see set.
  Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);

  /// Get the generated Value for a given VPValue and given Part and Lane.
  Value *get(VPValue *Def, const VPIteration &Instance);

  bool hasVectorValue(VPValue *Def, unsigned Part) {
    auto I = Data.PerPartOutput.find(Def);
    return I != Data.PerPartOutput.end() && Part < I->second.size() &&
           I->second[Part];
  }

  bool hasScalarValue(VPValue *Def, VPIteration Instance) {
    auto I = Data.PerPartScalars.find(Def);
    if (I == Data.PerPartScalars.end())
      return false;
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    return Instance.Part < I->second.size() &&
           CacheIdx < I->second[Instance.Part].size() &&
           I->second[Instance.Part][CacheIdx];
  }

  /// Set the generated vector Value for a given VPValue and a given Part, if \p
  /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
  void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
    if (IsScalar) {
      set(Def, V, VPIteration(Part, 0));
      return;
    }
    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
           "scalar values must be stored as (Part, 0)");
    if (!Data.PerPartOutput.count(Def)) {
      DataState::PerPartValuesTy Entry(UF);
      Data.PerPartOutput[Def] = Entry;
    }
    Data.PerPartOutput[Def][Part] = V;
  }

  /// Reset an existing vector value for \p Def and a given \p Part.
  void reset(VPValue *Def, Value *V, unsigned Part) {
    auto Iter = Data.PerPartOutput.find(Def);
    assert(Iter != Data.PerPartOutput.end() &&
           "need to overwrite existing value");
    Iter->second[Part] = V;
  }

  /// Set the generated scalar \p V for \p Def and the given \p Instance.
  void set(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.insert({Def, {}});
    auto &PerPartVec = Iter.first->second;
    if (PerPartVec.size() <= Instance.Part)
      PerPartVec.resize(Instance.Part + 1);
    auto &Scalars = PerPartVec[Instance.Part];
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    if (Scalars.size() <= CacheIdx)
      Scalars.resize(CacheIdx + 1);
    assert(!Scalars[CacheIdx] && "should overwrite existing value");
    Scalars[CacheIdx] = V;
  }

  /// Reset an existing scalar value for \p Def and a given \p Instance.
  void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.find(Def);
    assert(Iter != Data.PerPartScalars.end() &&
           "need to overwrite existing value");
    assert(Instance.Part < Iter->second.size() &&
           "need to overwrite existing value");
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    assert(CacheIdx < Iter->second[Instance.Part].size() &&
           "need to overwrite existing value");
    Iter->second[Instance.Part][CacheIdx] = V;
  }

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Value *To, Instruction *From);

  /// Set the debug location in the builder using the debug location \p DL.
  void setDebugLocFrom(DebugLoc DL);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);

  /// Hold state information used when constructing the CFG of the output IR,
  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
  struct CFGState {
    /// The previous VPBasicBlock visited. Initially set to null.
    VPBasicBlock *PrevVPBB = nullptr;

    /// The previous IR BasicBlock created or used. Initially set to the new
    /// header BasicBlock.
    BasicBlock *PrevBB = nullptr;

    /// The last IR BasicBlock in the output IR. Set to the exit block of the
    /// vector loop.
    BasicBlock *ExitBB = nullptr;

    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
    /// of replication, maps the BasicBlock of the last replica created.
    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;

    /// Updater for the DominatorTree.
    DomTreeUpdater DTU;

    CFGState(DominatorTree *DT)
        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}

    /// Returns the BasicBlock* mapped to the pre-header of the loop region
    /// containing \p R.
    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
  } CFG;

  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
  LoopInfo *LI;

  /// Hold a reference to the IRBuilder used to generate output IR code.
  IRBuilderBase &Builder;

  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
  InnerLoopVectorizer *ILV;

  /// Pointer to the VPlan for which code is generated.
  VPlan *Plan;

  /// The loop object for the current parent region, or nullptr.
  Loop *CurrentVectorLoop = nullptr;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  LoopVersioning *LVer = nullptr;

  /// Map SCEVs to their expanded values. Populated when executing
  /// VPExpandSCEVRecipes.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  /// VPlan-based type analysis.
  VPTypeAnalysis TypeAnalysis;
};
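
// Sketch of how recipes typically use this state during execute() (the names
// `State`, `Def`, `V` and `S` are placeholders, not part of the interface):
// a recipe publishes the value it generated for a part, and user recipes
// retrieve it later, with scalars cached per (Part, Lane) via mapToCacheIndex.
//
//   State.set(Def, V, Part);                 // publish a per-part vector
//   Value *W = State.get(Def, Part);         // consume it in a user recipe
//   State.set(Def, S, VPIteration(Part, 0)); // publish lane 0's scalar
//   Value *L = State.get(Def, VPIteration(Part, 0));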

/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
  friend class VPBlockUtils;

  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).

  /// An optional name for the block.
  std::string Name;

  /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
  /// it is a topmost VPBlockBase.
  VPRegionBlock *Parent = nullptr;

  /// List of predecessor blocks.
  SmallVector<VPBlockBase *, 1> Predecessors;

  /// List of successor blocks.
  SmallVector<VPBlockBase *, 1> Successors;

  /// VPlan containing the block. Can only be set on the entry block of the
  /// plan.
  VPlan *Plan = nullptr;

  /// Add \p Successor as the last successor to this block.
  void appendSuccessor(VPBlockBase *Successor) {
    assert(Successor && "Cannot add nullptr successor!");
    Successors.push_back(Successor);
  }

  /// Add \p Predecessor as the last predecessor to this block.
  void appendPredecessor(VPBlockBase *Predecessor) {
    assert(Predecessor && "Cannot add nullptr predecessor!");
    Predecessors.push_back(Predecessor);
  }

  /// Remove \p Predecessor from the predecessors of this block.
  void removePredecessor(VPBlockBase *Predecessor) {
    auto Pos = find(Predecessors, Predecessor);
    assert(Pos && "Predecessor does not exist");
    Predecessors.erase(Pos);
  }

  /// Remove \p Successor from the successors of this block.
  void removeSuccessor(VPBlockBase *Successor) {
    auto Pos = find(Successors, Successor);
    assert(Pos && "Successor does not exist");
    Successors.erase(Pos);
  }

protected:
  VPBlockBase(const unsigned char SC, const std::string &N)
      : SubclassID(SC), Name(N) {}

public:
  /// An enumeration for keeping track of the concrete subclasses of
  /// VPBlockBase that are actually instantiated. Values of this enumeration
  /// are kept in the SubclassID field of the VPBlockBase objects. They are
  /// used for concrete type identification.
  using VPBlockTy = enum { VPRegionBlockSC, VPBasicBlockSC, VPIRBasicBlockSC };

  using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;

  virtual ~VPBlockBase() = default;

  const std::string &getName() const { return Name; }

  void setName(const Twine &newName) { Name = newName.str(); }

  /// \return an ID for the concrete type of this object.
  /// This is used to implement the classof checks. This should not be used
  /// for any other purpose, as the values may change as LLVM evolves.
  unsigned getVPBlockID() const { return SubclassID; }

  VPRegionBlock *getParent() { return Parent; }
  const VPRegionBlock *getParent() const { return Parent; }

  /// \return A pointer to the plan containing the current block.
  VPlan *getPlan();
  const VPlan *getPlan() const;

  /// Sets the pointer of the plan containing the block. The block must be the
  /// entry block into the VPlan.
  void setPlan(VPlan *ParentPlan);

  void setParent(VPRegionBlock *P) { Parent = P; }

  /// \return the VPBasicBlock that is the entry of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getEntryBasicBlock() const;
  VPBasicBlock *getEntryBasicBlock();

  /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getExitingBasicBlock() const;
  VPBasicBlock *getExitingBasicBlock();

  const VPBlocksTy &getSuccessors() const { return Successors; }
  VPBlocksTy &getSuccessors() { return Successors; }

  iterator_range<VPBlockBase **> successors() { return Successors; }

  const VPBlocksTy &getPredecessors() const { return Predecessors; }
  VPBlocksTy &getPredecessors() { return Predecessors; }

  /// \return the successor of this VPBlockBase if it has a single successor.
  /// Otherwise return a null pointer.
  VPBlockBase *getSingleSuccessor() const {
    return (Successors.size() == 1 ? *Successors.begin() : nullptr);
  }

  /// \return the predecessor of this VPBlockBase if it has a single
  /// predecessor. Otherwise return a null pointer.
  VPBlockBase *getSinglePredecessor() const {
    return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
  }

  size_t getNumSuccessors() const { return Successors.size(); }
  size_t getNumPredecessors() const { return Predecessors.size(); }

  /// An Enclosing Block of a block B is any block containing B, including B
  /// itself. \return the closest enclosing block starting from "this", which
  /// has successors. \return the root enclosing block if all enclosing blocks
  /// have no successors.
  VPBlockBase *getEnclosingBlockWithSuccessors();

  /// \return the closest enclosing block starting from "this", which has
  /// predecessors. \return the root enclosing block if all enclosing blocks
  /// have no predecessors.
  VPBlockBase *getEnclosingBlockWithPredecessors();

  /// \return the successors either attached directly to this VPBlockBase or, if
  /// this VPBlockBase is the exit block of a VPRegionBlock and has no
  /// successors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has successors and return them. If no such
  /// VPRegionBlock exists, return the (empty) successors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalSuccessors() {
    return getEnclosingBlockWithSuccessors()->getSuccessors();
  }

  /// \return the hierarchical successor of this VPBlockBase if it has a single
  /// hierarchical successor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalSuccessor() {
    return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
  }

  /// \return the predecessors either attached directly to this VPBlockBase or,
  /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
  /// predecessors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has predecessors and return them. If no such
  /// VPRegionBlock exists, return the (empty) predecessors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalPredecessors() {
    return getEnclosingBlockWithPredecessors()->getPredecessors();
  }

  /// \return the hierarchical predecessor of this VPBlockBase if it has a
  /// single hierarchical predecessor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalPredecessor() {
    return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
  }

  /// Set a given VPBlockBase \p Successor as the single successor of this
  /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
  /// This VPBlockBase must have no successors.
  void setOneSuccessor(VPBlockBase *Successor) {
    assert(Successors.empty() && "Setting one successor when others exist.");
    assert(Successor->getParent() == getParent() &&
           "connected blocks must have the same parent");
    appendSuccessor(Successor);
  }

  /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
  /// successors of this VPBlockBase. This VPBlockBase is not added as
  /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
  /// successors.
  void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
    assert(Successors.empty() && "Setting two successors when others exist.");
    appendSuccessor(IfTrue);
    appendSuccessor(IfFalse);
  }

  /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
  /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
  /// as successor of any VPBasicBlock in \p NewPreds.
  void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
    assert(Predecessors.empty() && "Block predecessors already set.");
    for (auto *Pred : NewPreds)
      appendPredecessor(Pred);
  }

  /// Set each VPBasicBlock in \p NewSuccs as successor of this VPBlockBase.
  /// This VPBlockBase must have no successors. This VPBlockBase is not added
  /// as predecessor of any VPBasicBlock in \p NewSuccs.
  void setSuccessors(ArrayRef<VPBlockBase *> NewSuccs) {
    assert(Successors.empty() && "Block successors already set.");
    for (auto *Succ : NewSuccs)
      appendSuccessor(Succ);
  }

  /// Remove all the predecessors of this block.
  void clearPredecessors() { Predecessors.clear(); }

  /// Remove all the successors of this block.
  void clearSuccessors() { Successors.clear(); }

  /// The method which generates the output IR that corresponds to this
  /// VPBlockBase, thereby "executing" the VPlan.
  virtual void execute(VPTransformState *State) = 0;

  /// Return the cost of the block.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;

  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
  static void deleteCFG(VPBlockBase *Entry);

  /// Return true if it is legal to hoist instructions into this block.
  bool isLegalToHoistInto() {
    // There are currently no constraints that prevent an instruction to be
    // hoisted into a VPBlockBase.
    return true;
  }

  /// Replace all operands of VPUsers in the block with \p NewValue and also
  /// replaces all uses of VPValues defined in the block with NewValue.
  virtual void dropAllReferences(VPValue *NewValue) = 0;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printAsOperand(raw_ostream &OS, bool PrintType) const {
    OS << getName();
  }

  /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
  /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  virtual void print(raw_ostream &O, const Twine &Indent,
                     VPSlotTracker &SlotTracker) const = 0;

  /// Print plain-text dump of this VPlan to \p O.
  void print(raw_ostream &O) const {
    VPSlotTracker SlotTracker(getPlan());
    print(O, "", SlotTracker);
  }

  /// Print the successors of this block to \p O, prefixing all lines with \p
  /// Indent.
  void printSuccessors(raw_ostream &O, const Twine &Indent) const;

  /// Dump this VPBlockBase to dbgs().
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif

  /// Clone the current block and its recipes without updating the operands of
  /// the cloned recipes, including all blocks in the single-entry single-exit
  /// region for VPRegionBlocks.
  virtual VPBlockBase *clone() = 0;
};
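
// A minimal wiring sketch (illustrative; `VPBB0`..`VPBB2` are placeholder
// blocks): a conditional split is modeled by giving a block two successors
// and registering the reverse edges explicitly, since these setters update
// only one side of each edge.
//
//   VPBB0->setTwoSuccessors(VPBB1, VPBB2); // VPBB0 branches to VPBB1/VPBB2
//   VPBB1->setPredecessors({VPBB0});       // reverse edges are not implied
//   VPBB2->setPredecessors({VPBB0});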

/// A value that is used outside the VPlan. The operand of the user needs to be
/// added to the associated phi node. The incoming block from VPlan is
/// determined by where the VPValue is defined: if it is defined by a recipe
/// outside a region, its parent block is used, otherwise the middle block is
/// used.
class VPLiveOut : public VPUser {
  PHINode *Phi;

public:
  VPLiveOut(PHINode *Phi, VPValue *Op)
      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::LiveOut;
  }

  /// Fix the wrapped phi node. This means adding an incoming value to exit
  /// block phi's from the vector loop via middle block (values from scalar loop
  /// already reach these phi's), and updating the value to scalar header phi's
  /// from the scalar preheader.
  void fixPhi(VPlan &Plan, VPTransformState &State);

  /// Returns true if the VPLiveOut uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  PHINode *getPhi() const { return Phi; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPLiveOut to \p O.
  void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
#endif
};

/// Struct to hold various analyses needed for cost computations.
struct VPCostContext {
  const TargetTransformInfo &TTI;
  VPTypeAnalysis Types;
  LLVMContext &LLVMCtx;
  LoopVectorizationCostModel &CM;
  SmallPtrSet<Instruction *, 8> SkipCostComputation;

  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy,
                LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM)
      : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}

  /// Return the cost for \p UI with \p VF using the legacy cost model as
  /// fallback until computing the cost of all recipes migrates to VPlan.
  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;

  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
  /// has already been pre-computed.
  bool skipCostComputation(Instruction *UI, bool IsVector) const;
};

/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// recipes must inherit from VPSingleDefRecipe instead of inheriting from both
/// VPRecipeBase and VPValue separately.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
                     public VPDef,
                     public VPUser {
  friend VPBasicBlock;
  friend class VPBlockUtils;

  /// Each VPRecipe belongs to a single VPBasicBlock.
  VPBasicBlock *Parent = nullptr;

  /// The debug location for the recipe.
  DebugLoc DL;

public:
  VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}

  template <typename IterT>
  VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
  virtual ~VPRecipeBase() = default;

  /// Clone the current recipe.
  virtual VPRecipeBase *clone() = 0;

  /// \return the VPBasicBlock which this VPRecipe belongs to.
  VPBasicBlock *getParent() { return Parent; }
  const VPBasicBlock *getParent() const { return Parent; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRecipe, thereby "executing" the VPlan.
  virtual void execute(VPTransformState &State) = 0;

  /// Return the cost of this recipe, taking into account if the cost
  /// computation should be skipped and the ForceTargetInstructionCost flag.
  /// Also takes care of printing the cost for debugging.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  /// Insert an unlinked recipe into a basic block immediately before
  /// the specified recipe.
  void insertBefore(VPRecipeBase *InsertPos);
  /// Insert an unlinked recipe into \p BB immediately before the insertion
  /// point \p IP.
  void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);

  /// Insert an unlinked Recipe into a basic block immediately after
  /// the specified Recipe.
  void insertAfter(VPRecipeBase *InsertPos);

  /// Unlink this recipe from its current VPBasicBlock and insert it into
  /// the VPBasicBlock that MovePos lives in, right after MovePos.
  void moveAfter(VPRecipeBase *MovePos);

  /// Unlink this recipe and insert into BB before I.
  ///
  /// \pre I is a valid iterator into BB.
  void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);

  /// This method unlinks 'this' from the containing basic block, but does not
  /// delete it.
  void removeFromParent();

  /// This method unlinks 'this' from the containing basic block and deletes it.
  ///
  /// \returns an iterator pointing to the element after the erased one
  iplist<VPRecipeBase>::iterator eraseFromParent();

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPDef *D) {
    // All VPDefs are also VPRecipeBases.
    return true;
  }

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::Recipe;
  }

  /// Returns true if the recipe may have side-effects.
  bool mayHaveSideEffects() const;

  /// Returns true for PHI-like recipes.
  bool isPhi() const {
    return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
  }

  /// Returns true if the recipe may read from memory.
  bool mayReadFromMemory() const;

  /// Returns true if the recipe may write to memory.
  bool mayWriteToMemory() const;

  /// Returns true if the recipe may read from or write to memory.
  bool mayReadOrWriteMemory() const {
    return mayReadFromMemory() || mayWriteToMemory();
  }

  /// Returns the debug location of the recipe.
  DebugLoc getDebugLoc() const { return DL; }

protected:
  /// Compute the cost of this recipe using the legacy cost model and the
  /// underlying instructions.
  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
};
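
// Sketch of the insertion/movement API (illustrative; `R` and `Other` are
// placeholder recipes): a freshly created recipe is unlinked until placed,
// and can later be relocated or erased through the same interface.
//
//   R->insertBefore(Other); // link R right before Other in Other's block
//   R->moveAfter(Other);    // unlink R and re-insert it after Other
//   R->eraseFromParent();   // unlink and delete R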

// Helper macro to define common classof implementations for recipes.
#define VP_CLASSOF_IMPL(VPDefID)                                               \
  static inline bool classof(const VPDef *D) {                                 \
    return D->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPValue *V) {                               \
    auto *R = V->getDefiningRecipe();                                          \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPUser *U) {                                \
    auto *R = dyn_cast<VPRecipeBase>(U);                                       \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPRecipeBase *R) {                          \
    return R->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPSingleDefRecipe *R) {                     \
    return R->getVPDefID() == VPDefID;                                         \
  }
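
// The classof overloads generated by VP_CLASSOF_IMPL enable the usual LLVM
// RTTI helpers across the VPValue/VPUser/VPDef hierarchies. For example
// (illustrative only; `VPV` is a placeholder, non-null VPValue*):
//
//   if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) // via classof(VPValue*)
//     unsigned Opc = WidenR->getOpcode();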

/// VPSingleDefRecipe is a base class for recipes modeling a sequence of one or
/// more output IR instructions that define a single result VPValue.
/// Note that VPRecipeBase must be inherited from before VPValue.
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}

  static inline bool classof(const VPRecipeBase *R) {
    switch (R->getVPDefID()) {
    case VPRecipeBase::VPDerivedIVSC:
    case VPRecipeBase::VPEVLBasedIVPHISC:
    case VPRecipeBase::VPExpandSCEVSC:
    case VPRecipeBase::VPInstructionSC:
    case VPRecipeBase::VPReductionEVLSC:
    case VPRecipeBase::VPReductionSC:
    case VPRecipeBase::VPReplicateSC:
    case VPRecipeBase::VPScalarIVStepsSC:
    case VPRecipeBase::VPVectorPointerSC:
    case VPRecipeBase::VPWidenCallSC:
    case VPRecipeBase::VPWidenCanonicalIVSC:
    case VPRecipeBase::VPWidenCastSC:
    case VPRecipeBase::VPWidenGEPSC:
    case VPRecipeBase::VPWidenSC:
    case VPRecipeBase::VPWidenSelectSC:
    case VPRecipeBase::VPBlendSC:
    case VPRecipeBase::VPPredInstPHISC:
    case VPRecipeBase::VPCanonicalIVPHISC:
    case VPRecipeBase::VPActiveLaneMaskPHISC:
    case VPRecipeBase::VPFirstOrderRecurrencePHISC:
    case VPRecipeBase::VPWidenPHISC:
    case VPRecipeBase::VPWidenIntOrFpInductionSC:
    case VPRecipeBase::VPWidenPointerInductionSC:
    case VPRecipeBase::VPReductionPHISC:
    case VPRecipeBase::VPScalarCastSC:
      return true;
    case VPRecipeBase::VPInterleaveSC:
    case VPRecipeBase::VPBranchOnMaskSC:
    case VPRecipeBase::VPWidenLoadEVLSC:
    case VPRecipeBase::VPWidenLoadSC:
    case VPRecipeBase::VPWidenStoreEVLSC:
    case VPRecipeBase::VPWidenStoreSC:
      // TODO: Widened stores don't define a value, but widened loads do. Split
      // the recipes to be able to make widened loads VPSingleDefRecipes.
      return false;
    }
    llvm_unreachable("Unhandled VPDefID");
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  virtual VPSingleDefRecipe *clone() override = 0;

  /// Returns the underlying instruction.
  Instruction *getUnderlyingInstr() {
    return cast<Instruction>(getUnderlyingValue());
  }
  const Instruction *getUnderlyingInstr() const {
    return cast<Instruction>(getUnderlyingValue());
  }
};

/// Class to record LLVM IR flags for a recipe along with the recipe itself.
class VPRecipeWithIRFlags : public VPSingleDefRecipe {
  enum class OperationType : unsigned char {
    Cmp,
    OverflowingBinOp,
    DisjointOp,
    PossiblyExactOp,
    GEPOp,
    FPMathOp,
    NonNegOp,
    Other
  };

public:
  struct WrapFlagsTy {
    char HasNUW : 1;
    char HasNSW : 1;

    WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
  };

  struct DisjointFlagsTy {
    char IsDisjoint : 1;
    DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
  };

protected:
  struct GEPFlagsTy {
    char IsInBounds : 1;
    GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
  };

private:
  struct ExactFlagsTy {
    char IsExact : 1;
  };
  struct NonNegFlagsTy {
    char NonNeg : 1;
  };
  struct FastMathFlagsTy {
    char AllowReassoc : 1;
    char NoNaNs : 1;
    char NoInfs : 1;
    char NoSignedZeros : 1;
    char AllowReciprocal : 1;
    char AllowContract : 1;
    char ApproxFunc : 1;

    FastMathFlagsTy(const FastMathFlags &FMF);
  };

  OperationType OpType;

  union {
    CmpInst::Predicate CmpPredicate;
    WrapFlagsTy WrapFlags;
    DisjointFlagsTy DisjointFlags;
    ExactFlagsTy ExactFlags;
    GEPFlagsTy GEPFlags;
    NonNegFlagsTy NonNegFlags;
    FastMathFlagsTy FMFs;
    unsigned AllFlags;
  };

protected:
  void transferFlags(VPRecipeWithIRFlags &Other) {
    OpType = Other.OpType;
    AllFlags = Other.AllFlags;
  }

public:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL) {
    OpType = OperationType::Other;
    AllFlags = 0;
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
      : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
    if (auto *Op = dyn_cast<CmpInst>(&I)) {
      OpType = OperationType::Cmp;
      CmpPredicate = Op->getPredicate();
    } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
      OpType = OperationType::DisjointOp;
      DisjointFlags.IsDisjoint = Op->isDisjoint();
    } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
      OpType = OperationType::OverflowingBinOp;
      WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
    } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
      OpType = OperationType::PossiblyExactOp;
      ExactFlags.IsExact = Op->isExact();
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      OpType = OperationType::GEPOp;
      GEPFlags.IsInBounds = GEP->isInBounds();
    } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
      OpType = OperationType::NonNegOp;
      NonNegFlags.NonNeg = PNNI->hasNonNeg();
    } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
      OpType = OperationType::FPMathOp;
      FMFs = Op->getFastMathFlags();
    } else {
      OpType = OperationType::Other;
      AllFlags = 0;
    }
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      CmpInst::Predicate Pred, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp),
        CmpPredicate(Pred) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      WrapFlagsTy WrapFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL),
        OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      FastMathFlags FMFs, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
        FMFs(FMFs) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
        DisjointFlags(DisjointFlags) {}

protected:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      GEPFlagsTy GEPFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
        GEPFlags(GEPFlags) {}

public:
  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Drop all poison-generating flags.
  void dropPoisonGeneratingFlags() {
    // NOTE: This needs to be kept in-sync with
    // Instruction::dropPoisonGeneratingFlags.
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      WrapFlags.HasNUW = false;
      WrapFlags.HasNSW = false;
      break;
    case OperationType::DisjointOp:
      DisjointFlags.IsDisjoint = false;
      break;
    case OperationType::PossiblyExactOp:
      ExactFlags.IsExact = false;
      break;
    case OperationType::GEPOp:
      GEPFlags.IsInBounds = false;
      break;
    case OperationType::FPMathOp:
      FMFs.NoNaNs = false;
      FMFs.NoInfs = false;
      break;
    case OperationType::NonNegOp:
      NonNegFlags.NonNeg = false;
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  /// Set the IR flags for \p I.
  void setFlags(Instruction *I) const {
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
      I->setHasNoSignedWrap(WrapFlags.HasNSW);
      break;
    case OperationType::DisjointOp:
      cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
      break;
    case OperationType::PossiblyExactOp:
      I->setIsExact(ExactFlags.IsExact);
      break;
    case OperationType::GEPOp:
      // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan.
      cast<GetElementPtrInst>(I)->setNoWrapFlags(
          GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds()
                              : GEPNoWrapFlags::none());
      break;
    case OperationType::FPMathOp:
      I->setHasAllowReassoc(FMFs.AllowReassoc);
      I->setHasNoNaNs(FMFs.NoNaNs);
      I->setHasNoInfs(FMFs.NoInfs);
      I->setHasNoSignedZeros(FMFs.NoSignedZeros);
      I->setHasAllowReciprocal(FMFs.AllowReciprocal);
      I->setHasAllowContract(FMFs.AllowContract);
      I->setHasApproxFunc(FMFs.ApproxFunc);
      break;
    case OperationType::NonNegOp:
      I->setNonNeg(NonNegFlags.NonNeg);
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  CmpInst::Predicate getPredicate() const {
    assert(OpType == OperationType::Cmp &&
           "recipe doesn't have a compare predicate");
    return CmpPredicate;
  }

  bool isInBounds() const {
    assert(OpType == OperationType::GEPOp &&
           "recipe doesn't have inbounds flag");
    return GEPFlags.IsInBounds;
  }

  /// Returns true if the recipe has fast-math flags.
  bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

  FastMathFlags getFastMathFlags() const;

  bool hasNoUnsignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NUW flag");
    return WrapFlags.HasNUW;
  }

  bool hasNoSignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NSW flag");
    return WrapFlags.HasNSW;
  }

  bool isDisjoint() const {
    assert(OpType == OperationType::DisjointOp &&
           "recipe doesn't have a disjoint flag");
    return DisjointFlags.IsDisjoint;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printFlags(raw_ostream &O) const;
#endif
};
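
// Flag round-trip sketch (illustrative; `R` and `NewI` are placeholders):
// flags are captured from the input IR instruction at recipe-construction
// time and re-applied to the generated output IR, optionally after dropping
// poison-generating flags when a transform has weakened their guarantees.
//
//   // R was built from an `add nuw nsw` instruction.
//   R->dropPoisonGeneratingFlags(); // clears NUW/NSW on the recipe only
//   R->setFlags(NewI);              // NewI ends up as a plain `add`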
1224 
1225 /// This is a concrete Recipe that models a single VPlan-level instruction.
1226 /// While as any Recipe it may generate a sequence of IR instructions when
1227 /// executed, these instructions would always form a single-def expression as
1228 /// the VPInstruction is also a single def-use vertex.
1229 class VPInstruction : public VPRecipeWithIRFlags {
1230   friend class VPlanSlp;
1231 
1232 public:
1233   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
1234   enum {
1235     FirstOrderRecurrenceSplice =
1236         Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
1237                                       // values of a first-order recurrence.
1238     Not,
1239     SLPLoad,
1240     SLPStore,
1241     ActiveLaneMask,
1242     ExplicitVectorLength,
1243     /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
1244     /// The first operand is the incoming value from the predecessor in VPlan,
1245     /// the second operand is the incoming value for all other predecessors
1246     /// (which are currently not modeled in VPlan).
1247     ResumePhi,
1248     CalculateTripCountMinusVF,
1249     // Increment the canonical IV separately for each unrolled part.
1250     CanonicalIVIncrementForPart,
1251     BranchOnCount,
1252     BranchOnCond,
1253     ComputeReductionResult,
1254     // Takes the VPValue to extract from as first operand and the lane or part
1255     // to extract as second operand, counting from the end starting with 1 for
1256     // last. The second operand must be a positive constant and <= VF when
1257     // extracting from a vector or <= UF when extracting from an unrolled
1258     // scalar.
1259     ExtractFromEnd,
1260     LogicalAnd, // Non-poison propagating logical And.
1261     // Add an offset in bytes (second operand) to a base pointer (first
1262     // operand). Only generates scalar values (either for the first lane only or
1263     // for all lanes, depending on its uses).
1264     PtrAdd,
1265   };
1266 
1267 private:
1268   typedef unsigned char OpcodeTy;
1269   OpcodeTy Opcode;
1270 
1271   /// An optional name that can be used for the generated IR instruction.
1272   const std::string Name;
1273 
1274   /// Returns true if this VPInstruction generates scalar values for all lanes.
1275   /// Most VPInstructions generate a single value per part, either vector or
1276   /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
1277   /// values per all lanes, stemming from an original ingredient. This method
1278   /// identifies the (rare) cases of VPInstructions that do so as well, w/o an
1279   /// underlying ingredient.
1280   bool doesGeneratePerAllLanes() const;
1281 
1282   /// Returns true if we can generate a scalar for the first lane only if
1283   /// needed.
1284   bool canGenerateScalarForFirstLane() const;
1285 
1286   /// Utility methods serving execute(): generates a single instance of the
1287   /// modeled instruction for a given part. \returns the generated value for \p
1288   /// Part. In some cases an existing value is returned rather than a generated
1289   /// one.
1290   Value *generatePerPart(VPTransformState &State, unsigned Part);
1291 
1292   /// Utility methods serving execute(): generates a scalar single instance of
1293   /// the modeled instruction for a given lane. \returns the scalar generated
1294   /// value for lane \p Lane.
1295   Value *generatePerLane(VPTransformState &State, const VPIteration &Lane);
1296 
1297 #if !defined(NDEBUG)
1298   /// Return true if the VPInstruction is a floating point math operation, i.e.
1299   /// has fast-math flags.
1300   bool isFPMathOp() const;
1301 #endif
1302 
1303 public:
1304   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
1305                 const Twine &Name = "")
1306       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
1307         Opcode(Opcode), Name(Name.str()) {}
1308 
1309   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1310                 DebugLoc DL = {}, const Twine &Name = "")
1311       : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
1312 
1313   VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
1314                 VPValue *B, DebugLoc DL = {}, const Twine &Name = "");
1315 
1316   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1317                 WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
1318       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
1319         Opcode(Opcode), Name(Name.str()) {}
1320 
1321   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1322                 DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
1323                 const Twine &Name = "")
1324       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
1325         Opcode(Opcode), Name(Name.str()) {
1326     assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
1327   }
1328 
1329   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1330                 FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
1331 
1332   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
1333 
1334   VPInstruction *clone() override {
1335     SmallVector<VPValue *, 2> Operands(operands());
1336     auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
1337     New->transferFlags(*this);
1338     return New;
1339   }
1340 
1341   unsigned getOpcode() const { return Opcode; }
1342 
1343   /// Generate the instruction.
1344   /// TODO: We currently execute only per-part unless a specific instance is
1345   /// provided.
1346   void execute(VPTransformState &State) override;
1347 
1348 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1349   /// Print the VPInstruction to \p O.
1350   void print(raw_ostream &O, const Twine &Indent,
1351              VPSlotTracker &SlotTracker) const override;
1352 
1353   /// Print the VPInstruction to dbgs() (for debugging).
1354   LLVM_DUMP_METHOD void dump() const;
1355 #endif
1356 
1357   /// Return true if this instruction may modify memory.
1358   bool mayWriteToMemory() const {
1359     // TODO: we can use attributes of the called function to rule out memory
1360     //       modifications.
1361     return Opcode == Instruction::Store || Opcode == Instruction::Call ||
1362            Opcode == Instruction::Invoke || Opcode == SLPStore;
1363   }
1364 
1365   bool hasResult() const {
1366     // CallInst may or may not have a result, depending on the called function.
1367     // Conservatively return calls have results for now.
1368     switch (getOpcode()) {
1369     case Instruction::Ret:
1370     case Instruction::Br:
1371     case Instruction::Store:
1372     case Instruction::Switch:
1373     case Instruction::IndirectBr:
1374     case Instruction::Resume:
1375     case Instruction::CatchRet:
1376     case Instruction::Unreachable:
1377     case Instruction::Fence:
1378     case Instruction::AtomicRMW:
1379     case VPInstruction::BranchOnCond:
1380     case VPInstruction::BranchOnCount:
1381       return false;
1382     default:
1383       return true;
1384     }
1385   }
1386 
1387   /// Returns true if the recipe only uses the first lane of operand \p Op.
1388   bool onlyFirstLaneUsed(const VPValue *Op) const override;
1389 
1390   /// Returns true if the recipe only uses the first part of operand \p Op.
1391   bool onlyFirstPartUsed(const VPValue *Op) const override;
1392 
1393   /// Returns true if this VPInstruction produces a scalar value from a vector,
1394   /// e.g. by performing a reduction or extracting a lane.
1395   bool isVectorToScalar() const;
1396 
1397   /// Returns true if this VPInstruction's operands are single scalars and the
1398   /// result is also a single scalar.
1399   bool isSingleScalar() const;
1400 };
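
     // Illustrative sketch only (assumed context; A and B are hypothetical
     // VPValues defined elsewhere): creating a planned add and cloning it.
     //
     //   VPInstruction *Add =
     //       new VPInstruction(Instruction::Add, {A, B}, DebugLoc(), "add");
     //   VPInstruction *Copy = Add->clone(); // copies opcode, operands, flags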
1401 
1402 /// VPWidenRecipe is a recipe for producing a widened instruction using the
1403 /// opcode and operands of the recipe. This recipe covers most of the
1404 /// traditional vectorization cases where each recipe transforms into a
1405 /// vectorized version of itself.
1406 class VPWidenRecipe : public VPRecipeWithIRFlags {
1407   unsigned Opcode;
1408 
1409 public:
1410   template <typename IterT>
1411   VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
1412       : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
1413         Opcode(I.getOpcode()) {}
1414 
1415   ~VPWidenRecipe() override = default;
1416 
1417   VPWidenRecipe *clone() override {
1418     auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
1419     R->transferFlags(*this);
1420     return R;
1421   }
1422 
1423   VP_CLASSOF_IMPL(VPDef::VPWidenSC)
1424 
1425   /// Produce a widened instruction using the opcode and operands of the recipe,
1426   /// processing State.VF elements.
1427   void execute(VPTransformState &State) override;
1428 
1429   unsigned getOpcode() const { return Opcode; }
1430 
1431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1432   /// Print the recipe.
1433   void print(raw_ostream &O, const Twine &Indent,
1434              VPSlotTracker &SlotTracker) const override;
1435 #endif
1436 };
1437 
1438 /// VPWidenCastRecipe is a recipe to create vector cast instructions.
1439 class VPWidenCastRecipe : public VPRecipeWithIRFlags {
1440   /// Cast instruction opcode.
1441   Instruction::CastOps Opcode;
1442 
1443   /// Result type for the cast.
1444   Type *ResultTy;
1445 
1446 public:
1447   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1448                     CastInst &UI)
1449       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
1450         ResultTy(ResultTy) {
1451     assert(UI.getOpcode() == Opcode &&
1452            "opcode of underlying cast doesn't match");
1453   }
1454 
1455   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1456       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
1457         ResultTy(ResultTy) {}
1458 
1459   ~VPWidenCastRecipe() override = default;
1460 
1461   VPWidenCastRecipe *clone() override {
1462     if (auto *UV = getUnderlyingValue())
1463       return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
1464                                    *cast<CastInst>(UV));
1465 
1466     return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
1467   }
1468 
1469   VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
1470 
1471   /// Produce widened copies of the cast.
1472   void execute(VPTransformState &State) override;
1473 
1474 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1475   /// Print the recipe.
1476   void print(raw_ostream &O, const Twine &Indent,
1477              VPSlotTracker &SlotTracker) const override;
1478 #endif
1479 
1480   Instruction::CastOps getOpcode() const { return Opcode; }
1481 
1482   /// Returns the result type of the cast.
1483   Type *getResultType() const { return ResultTy; }
1484 };
1485 
1486 /// VPScalarCastRecipe is a recipe to create scalar cast instructions.
1487 class VPScalarCastRecipe : public VPSingleDefRecipe {
1488   Instruction::CastOps Opcode;
1489 
1490   Type *ResultTy;
1491 
1492   Value *generate(VPTransformState &State, unsigned Part);
1493 
1494 public:
1495   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1496       : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode),
1497         ResultTy(ResultTy) {}
1498 
1499   ~VPScalarCastRecipe() override = default;
1500 
1501   VPScalarCastRecipe *clone() override {
1502     return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy);
1503   }
1504 
1505   VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
1506 
1507   void execute(VPTransformState &State) override;
1508 
1509 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1510   void print(raw_ostream &O, const Twine &Indent,
1511              VPSlotTracker &SlotTracker) const override;
1512 #endif
1513 
1514   /// Returns the result type of the cast.
1515   Type *getResultType() const { return ResultTy; }
1516 
1517   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1518     // At the moment, only uniform codegen is implemented.
1519     assert(is_contained(operands(), Op) &&
1520            "Op must be an operand of the recipe");
1521     return true;
1522   }
1523 };
1524 
1525 /// A recipe for widening Call instructions.
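     /// The operands are the call arguments followed by the called scalar
     /// function, e.g. a call foo(a, b) is modeled with operands {a, b, foo}.
     /// A sketch of the accessors under that layout (illustrative only; use()
     /// is a hypothetical consumer):
     /// \code
     ///   Function *Callee = R.getCalledScalarFunction(); // last operand: foo
     ///   for (VPValue *Arg : R.arg_operands())           // {a, b}, excludes foo
     ///     use(Arg);
     /// \endcode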
1526 class VPWidenCallRecipe : public VPSingleDefRecipe {
1527   /// ID of the vector intrinsic to call when widening the call. If set to
1528   /// Intrinsic::not_intrinsic, a library call will be used instead.
1529   Intrinsic::ID VectorIntrinsicID;
1530   /// If this recipe represents a library call, Variant stores a pointer to
1531   /// the chosen function. There is a 1:1 mapping between a given VF and the
1532   /// chosen vectorized variant, so there will be a different VPlan for each
1533   /// VF with a valid variant.
1534   Function *Variant;
1535 
1536 public:
1537   template <typename IterT>
1538   VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
1539                     Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
1540                     Function *Variant = nullptr)
1541       : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
1542         VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
1543     assert(
1544         isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
1545         "last operand must be the called function");
1546   }
1547 
1548   ~VPWidenCallRecipe() override = default;
1549 
1550   VPWidenCallRecipe *clone() override {
1551     return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
1552                                  VectorIntrinsicID, getDebugLoc(), Variant);
1553   }
1554 
1555   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
1556 
1557   /// Produce a widened version of the call instruction.
1558   void execute(VPTransformState &State) override;
1559 
1560   Function *getCalledScalarFunction() const {
1561     return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
1562   }
1563 
1564   operand_range arg_operands() {
1565     return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1566   }
1567   const_operand_range arg_operands() const {
1568     return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1569   }
1570 
1571 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1572   /// Print the recipe.
1573   void print(raw_ostream &O, const Twine &Indent,
1574              VPSlotTracker &SlotTracker) const override;
1575 #endif
1576 };
1577 
1578 /// A recipe for widening select instructions.
1579 struct VPWidenSelectRecipe : public VPSingleDefRecipe {
1580   template <typename IterT>
1581   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
1582       : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
1583                           I.getDebugLoc()) {}
1584 
1585   ~VPWidenSelectRecipe() override = default;
1586 
1587   VPWidenSelectRecipe *clone() override {
1588     return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()),
1589                                    operands());
1590   }
1591 
1592   VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
1593 
1594   /// Produce a widened version of the select instruction.
1595   void execute(VPTransformState &State) override;
1596 
1597 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1598   /// Print the recipe.
1599   void print(raw_ostream &O, const Twine &Indent,
1600              VPSlotTracker &SlotTracker) const override;
1601 #endif
1602 
1603   VPValue *getCond() const {
1604     return getOperand(0);
1605   }
1606 
1607   bool isInvariantCond() const {
1608     return getCond()->isDefinedOutsideVectorRegions();
1609   }
1610 };
1611 
1612 /// A recipe for handling GEP instructions.
1613 class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
1614   bool isPointerLoopInvariant() const {
1615     return getOperand(0)->isDefinedOutsideVectorRegions();
1616   }
1617 
1618   bool isIndexLoopInvariant(unsigned I) const {
1619     return getOperand(I + 1)->isDefinedOutsideVectorRegions();
1620   }
1621 
1622   bool areAllOperandsInvariant() const {
1623     return all_of(operands(), [](VPValue *Op) {
1624       return Op->isDefinedOutsideVectorRegions();
1625     });
1626   }
1627 
1628 public:
1629   template <typename IterT>
1630   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
1631       : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {}
1632 
1633   ~VPWidenGEPRecipe() override = default;
1634 
1635   VPWidenGEPRecipe *clone() override {
1636     return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()),
1637                                 operands());
1638   }
1639 
1640   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
1641 
1642   /// Generate the gep nodes.
1643   void execute(VPTransformState &State) override;
1644 
1645 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1646   /// Print the recipe.
1647   void print(raw_ostream &O, const Twine &Indent,
1648              VPSlotTracker &SlotTracker) const override;
1649 #endif
1650 };
1651 
1652 /// A recipe to compute the pointers for widened memory accesses of IndexedTy
1653 /// for all parts. If IsReverse is true, compute pointers for accessing the
1654 /// input in reverse order per part.
1655 class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
1656   Type *IndexedTy;
1657   bool IsReverse;
1658 
1659 public:
1660   VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
1661                         bool IsInBounds, DebugLoc DL)
1662       : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
1663                             GEPFlagsTy(IsInBounds), DL),
1664         IndexedTy(IndexedTy), IsReverse(IsReverse) {}
1665 
1666   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
1667 
1668   void execute(VPTransformState &State) override;
1669 
1670   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1671     assert(is_contained(operands(), Op) &&
1672            "Op must be an operand of the recipe");
1673     return true;
1674   }
1675 
1676   VPVectorPointerRecipe *clone() override {
1677     return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse,
1678                                      isInBounds(), getDebugLoc());
1679   }
1680 
1681 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1682   /// Print the recipe.
1683   void print(raw_ostream &O, const Twine &Indent,
1684              VPSlotTracker &SlotTracker) const override;
1685 #endif
1686 };
1687 
1688 /// A pure virtual base class for all recipes modeling header phis, including
1689 /// phis for first-order recurrences, pointer inductions and reductions. The
1690 /// start value is the first operand of the recipe and the incoming value from
1691 /// the backedge is the second operand.
1692 ///
1693 /// Inductions are modeled using the following sub-classes:
1694 ///  * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
1695 ///    starting at a specified value (zero for the main vector loop, the resume
1696 ///    value for the epilogue vector loop) and stepping by 1. The induction
1697 ///    controls exiting of the vector loop by comparing against the vector trip
1698 ///    count. Produces a single scalar PHI for the induction value per
1699 ///    iteration.
1700 ///  * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
1701 ///    floating point inductions with arbitrary start and step values. Produces
1702 ///    a vector PHI per-part.
1703 ///  * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding
1704 ///    value of an IV with different start and step values. Produces a single
1705 ///    scalar value per iteration.
1706 ///  * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a
1707 ///    canonical or derived induction.
1708 ///  * VPWidenPointerInductionRecipe: Generate vector and scalar values for a
1709 ///    pointer induction. Produces either a vector PHI per-part or scalar values
1710 ///    per-lane based on the canonical induction.
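     ///
     /// A sketch of the common accessors, given a header phi recipe R
     /// (illustrative; note that VPWidenIntOrFpInductionRecipe overrides the
     /// backedge accessors to be unreachable):
     /// \code
     ///   VPValue *Start = R.getStartValue();    // getOperand(0), or nullptr
     ///   VPValue *Back  = R.getBackedgeValue(); // getOperand(1)
     /// \endcode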
1711 class VPHeaderPHIRecipe : public VPSingleDefRecipe {
1712 protected:
1713   VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
1714                     VPValue *Start = nullptr, DebugLoc DL = {})
1715       : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>(), UnderlyingInstr, DL) {
1716     if (Start)
1717       addOperand(Start);
1718   }
1719 
1720 public:
1721   ~VPHeaderPHIRecipe() override = default;
1722 
1723   /// Method to support type inquiry through isa, cast, and dyn_cast.
1724   static inline bool classof(const VPRecipeBase *B) {
1725     return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC &&
1726            B->getVPDefID() <= VPDef::VPLastHeaderPHISC;
1727   }
1728   static inline bool classof(const VPValue *V) {
1729     auto *B = V->getDefiningRecipe();
1730     return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
1731            B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC;
1732   }
1733 
1734   /// Generate the phi nodes.
1735   void execute(VPTransformState &State) override = 0;
1736 
1737 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1738   /// Print the recipe.
1739   void print(raw_ostream &O, const Twine &Indent,
1740              VPSlotTracker &SlotTracker) const override = 0;
1741 #endif
1742 
1743   /// Returns the start value of the phi, if one is set.
1744   VPValue *getStartValue() {
1745     return getNumOperands() == 0 ? nullptr : getOperand(0);
1746   }
1747   VPValue *getStartValue() const {
1748     return getNumOperands() == 0 ? nullptr : getOperand(0);
1749   }
1750 
1751   /// Update the start value of the recipe.
1752   void setStartValue(VPValue *V) { setOperand(0, V); }
1753 
1754   /// Returns the incoming value from the loop backedge.
1755   virtual VPValue *getBackedgeValue() {
1756     return getOperand(1);
1757   }
1758 
1759   /// Returns the backedge value as a recipe. The backedge value is guaranteed
1760   /// to be a recipe.
1761   virtual VPRecipeBase &getBackedgeRecipe() {
1762     return *getBackedgeValue()->getDefiningRecipe();
1763   }
1764 };
1765 
1766 /// A recipe for handling phi nodes of integer and floating-point inductions,
1767 /// producing their vector values.
1768 class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
1769   PHINode *IV;
1770   TruncInst *Trunc;
1771   const InductionDescriptor &IndDesc;
1772 
1773 public:
1774   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1775                                 const InductionDescriptor &IndDesc)
1776       : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
1777         Trunc(nullptr), IndDesc(IndDesc) {
1778     addOperand(Step);
1779   }
1780 
1781   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1782                                 const InductionDescriptor &IndDesc,
1783                                 TruncInst *Trunc)
1784       : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
1785         IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
1786     addOperand(Step);
1787   }
1788 
1789   ~VPWidenIntOrFpInductionRecipe() override = default;
1790 
1791   VPWidenIntOrFpInductionRecipe *clone() override {
1792     return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(),
1793                                              getStepValue(), IndDesc, Trunc);
1794   }
1795 
1796   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
1797 
1798   /// Generate the vectorized and scalarized versions of the phi node as
1799   /// needed by their users.
1800   void execute(VPTransformState &State) override;
1801 
1802 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1803   /// Print the recipe.
1804   void print(raw_ostream &O, const Twine &Indent,
1805              VPSlotTracker &SlotTracker) const override;
1806 #endif
1807 
1808   VPValue *getBackedgeValue() override {
1809     // TODO: All operands of base recipe must exist and be at same index in
1810     // derived recipe.
1811     llvm_unreachable(
1812         "VPWidenIntOrFpInductionRecipe generates its own backedge value");
1813   }
1814 
1815   VPRecipeBase &getBackedgeRecipe() override {
1816     // TODO: All operands of base recipe must exist and be at same index in
1817     // derived recipe.
1818     llvm_unreachable(
1819         "VPWidenIntOrFpInductionRecipe generates its own backedge value");
1820   }
1821 
1822   /// Returns the step value of the induction.
1823   VPValue *getStepValue() { return getOperand(1); }
1824   const VPValue *getStepValue() const { return getOperand(1); }
1825 
1826   /// Returns the first defined value as a TruncInst, if it is one, or nullptr
1827   /// otherwise.
1828   TruncInst *getTruncInst() { return Trunc; }
1829   const TruncInst *getTruncInst() const { return Trunc; }
1830 
1831   PHINode *getPHINode() { return IV; }
1832 
1833   /// Returns the induction descriptor for the recipe.
1834   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1835 
1836   /// Returns true if the induction is canonical, i.e. it starts at 0, is
1837   /// incremented by UF * VF (i.e. the original IV is incremented by 1), and has
1838   /// the same type as the canonical induction.
1839   bool isCanonical() const;
1840 
1841   /// Returns the scalar type of the induction.
1842   Type *getScalarType() const {
1843     return Trunc ? Trunc->getType() : IV->getType();
1844   }
1845 };
1846 
1847 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
1848   const InductionDescriptor &IndDesc;
1849 
1850   bool IsScalarAfterVectorization;
1851 
1852 public:
1853   /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
1854   /// Start and step \p Step.
1855   VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
1856                                 const InductionDescriptor &IndDesc,
1857                                 bool IsScalarAfterVectorization)
1858       : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi),
1859         IndDesc(IndDesc),
1860         IsScalarAfterVectorization(IsScalarAfterVectorization) {
1861     addOperand(Start);
1862     addOperand(Step);
1863   }
1864 
1865   ~VPWidenPointerInductionRecipe() override = default;
1866 
1867   VPWidenPointerInductionRecipe *clone() override {
1868     return new VPWidenPointerInductionRecipe(
1869         cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
1870         IndDesc, IsScalarAfterVectorization);
1871   }
1872 
1873   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
1874 
1875   /// Generate vector values for the pointer induction.
1876   void execute(VPTransformState &State) override;
1877 
1878   /// Returns true if only scalar values will be generated.
1879   bool onlyScalarsGenerated(bool IsScalable);
1880 
1881   /// Returns the induction descriptor for the recipe.
1882   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1883 
1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1885   /// Print the recipe.
1886   void print(raw_ostream &O, const Twine &Indent,
1887              VPSlotTracker &SlotTracker) const override;
1888 #endif
1889 };
1890 
1891 /// A recipe for handling phis that are widened in the vector loop.
1892 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
1893 /// managed in the recipe directly.
1894 class VPWidenPHIRecipe : public VPSingleDefRecipe {
1895   /// List of incoming blocks. Only used in the VPlan native path.
1896   SmallVector<VPBasicBlock *, 2> IncomingBlocks;
1897 
1898 public:
1899   /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
1900   VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
1901       : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi) {
1902     if (Start)
1903       addOperand(Start);
1904   }
1905 
1906   VPWidenPHIRecipe *clone() override {
1907     llvm_unreachable("cloning not implemented yet");
1908   }
1909 
1910   ~VPWidenPHIRecipe() override = default;
1911 
1912   VP_CLASSOF_IMPL(VPDef::VPWidenPHISC)
1913 
1914   /// Generate the phi/select nodes.
1915   void execute(VPTransformState &State) override;
1916 
1917 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1918   /// Print the recipe.
1919   void print(raw_ostream &O, const Twine &Indent,
1920              VPSlotTracker &SlotTracker) const override;
1921 #endif
1922 
1923   /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
1924   void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
1925     addOperand(IncomingV);
1926     IncomingBlocks.push_back(IncomingBlock);
1927   }
1928 
1929   /// Returns the \p I th incoming VPBasicBlock.
1930   VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
1931 
1932   /// Returns the \p I th incoming VPValue.
1933   VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
1934 };
1935 
1936 /// A recipe for handling first-order recurrence phis. The start value is the
1937 /// first operand of the recipe and the incoming value from the backedge is the
1938 /// second operand.
1939 struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
1940   VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
1941       : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {}
1942 
1943   VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC)
1944 
1945   static inline bool classof(const VPHeaderPHIRecipe *R) {
1946     return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC;
1947   }
1948 
1949   VPFirstOrderRecurrencePHIRecipe *clone() override {
1950     return new VPFirstOrderRecurrencePHIRecipe(
1951         cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
1952   }
1953 
1954   void execute(VPTransformState &State) override;
1955 
1956 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1957   /// Print the recipe.
1958   void print(raw_ostream &O, const Twine &Indent,
1959              VPSlotTracker &SlotTracker) const override;
1960 #endif
1961 };
1962 
1963 /// A recipe for handling reduction phis. The start value is the first operand
1964 /// of the recipe and the incoming value from the backedge is the second
1965 /// operand.
1966 class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
1967   /// Descriptor for the reduction.
1968   const RecurrenceDescriptor &RdxDesc;
1969 
1970   /// The phi is part of an in-loop reduction.
1971   bool IsInLoop;
1972 
1973   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
1974   bool IsOrdered;
1975 
1976 public:
1977   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
1978   /// RdxDesc.
1979   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
1980                        VPValue &Start, bool IsInLoop = false,
1981                        bool IsOrdered = false)
1982       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
1983         RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
1984     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
1985   }
1986 
1987   ~VPReductionPHIRecipe() override = default;
1988 
1989   VPReductionPHIRecipe *clone() override {
1990     auto *R =
1991         new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
1992                                  *getOperand(0), IsInLoop, IsOrdered);
1993     R->addOperand(getBackedgeValue());
1994     return R;
1995   }
1996 
1997   VP_CLASSOF_IMPL(VPDef::VPReductionPHISC)
1998 
1999   static inline bool classof(const VPHeaderPHIRecipe *R) {
2000     return R->getVPDefID() == VPDef::VPReductionPHISC;
2001   }
2002 
2003   /// Generate the phi/select nodes.
2004   void execute(VPTransformState &State) override;
2005 
2006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2007   /// Print the recipe.
2008   void print(raw_ostream &O, const Twine &Indent,
2009              VPSlotTracker &SlotTracker) const override;
2010 #endif
2011 
2012   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2013     return RdxDesc;
2014   }
2015 
2016   /// Returns true if the phi is part of an ordered reduction.
2017   bool isOrdered() const { return IsOrdered; }
2018 
2019   /// Returns true if the phi is part of an in-loop reduction.
2020   bool isInLoop() const { return IsInLoop; }
2021 };
2022 
2023 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
2024 /// instructions.
2025 class VPBlendRecipe : public VPSingleDefRecipe {
2026 public:
2027   /// The blend operation is a User of the incoming values and of their
2028   /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first
2029   /// incoming value does not have a mask associated.
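       ///
       /// For example, a phi with incoming values I0, I1, I2 and masks M1, M2
       /// for the latter two is modeled with operands {I0, I1, M1, I2, M2}. A
       /// sketch of the resulting indexing for such a recipe R (illustrative):
       /// \code
       ///   R.getNumIncomingValues(); // (5 + 1) / 2 == 3
       ///   R.getIncomingValue(2);    // getOperand(2 * 2 - 1): I2
       ///   R.getMask(2);             // getOperand(2 * 2):     M2
       /// \endcode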
2030   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
2031       : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
2032     assert((Operands.size() + 1) % 2 == 0 &&
2033            "Expected an odd number of operands");
2034   }
2035 
2036   VPBlendRecipe *clone() override {
2037     SmallVector<VPValue *> Ops(operands());
2038     return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
2039   }
2040 
2041   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
2042 
2043   /// Return the number of incoming values, taking into account that the first
2044   /// incoming value has no mask.
2045   unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
2046 
2047   /// Return incoming value number \p Idx.
2048   VPValue *getIncomingValue(unsigned Idx) const {
2049     return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
2050   }
2051 
2052   /// Return mask number \p Idx.
2053   VPValue *getMask(unsigned Idx) const {
2054     assert(Idx > 0 && "First index has no mask associated.");
2055     return getOperand(Idx * 2);
2056   }
2057 
2058   /// Generate the phi/select nodes.
2059   void execute(VPTransformState &State) override;
2060 
2061 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2062   /// Print the recipe.
2063   void print(raw_ostream &O, const Twine &Indent,
2064              VPSlotTracker &SlotTracker) const override;
2065 #endif
2066 
2067   /// Returns true if the recipe only uses the first lane of operand \p Op.
2068   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2069     assert(is_contained(operands(), Op) &&
2070            "Op must be an operand of the recipe");
2071     // Recurses through Blend recipes only; must terminate at header phis at
2072     // the latest.
2073     return all_of(users(),
2074                   [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
2075   }
2076 };
2077 
2078 /// VPInterleaveRecipe is a recipe for transforming an interleave group of loads
2079 /// or stores into one wide load/store and shuffles. The first operand of a
2080 /// VPInterleave recipe is the address, followed by the stored values, followed
2081 /// by an optional mask.
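     ///
     /// For example, a masked store group with two members is modeled with
     /// operands {Addr, V0, V1, Mask}; a sketch of the accessors under that
     /// layout (illustrative):
     /// \code
     ///   R.getAddr();         // operand 0
     ///   R.getStoredValues(); // {V0, V1}, getNumStoreOperands() == 2
     ///   R.getMask();         // last operand, or nullptr if unmasked
     /// \endcode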
2082 class VPInterleaveRecipe : public VPRecipeBase {
2083   const InterleaveGroup<Instruction> *IG;
2084 
2085   /// Indicates if the interleave group is in a conditional block and requires a
2086   /// mask.
2087   bool HasMask = false;
2088 
2089   /// Indicates if gaps between members of the group need to be masked out or if
2090   /// unused gaps can be loaded speculatively.
2091   bool NeedsMaskForGaps = false;
2092 
2093 public:
2094   VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
2095                      ArrayRef<VPValue *> StoredValues, VPValue *Mask,
2096                      bool NeedsMaskForGaps)
2097       : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
2098         NeedsMaskForGaps(NeedsMaskForGaps) {
2099     for (unsigned i = 0; i < IG->getFactor(); ++i)
2100       if (Instruction *I = IG->getMember(i)) {
2101         if (I->getType()->isVoidTy())
2102           continue;
2103         new VPValue(I, this);
2104       }
2105 
2106     for (auto *SV : StoredValues)
2107       addOperand(SV);
2108     if (Mask) {
2109       HasMask = true;
2110       addOperand(Mask);
2111     }
2112   }
2113   ~VPInterleaveRecipe() override = default;
2114 
2115   VPInterleaveRecipe *clone() override {
2116     return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
2117                                   NeedsMaskForGaps);
2118   }
2119 
2120   VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
2121 
2122   /// Return the address accessed by this recipe.
2123   VPValue *getAddr() const {
2124     return getOperand(0); // Address is the 1st, mandatory operand.
2125   }
2126 
2127   /// Return the mask used by this recipe. Note that a full mask is represented
2128   /// by a nullptr.
2129   VPValue *getMask() const {
2130     // Mask is optional and therefore the last operand.
2131     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
2132   }
2133 
2134   /// Return the VPValues stored by this interleave group. If it is a load
2135   /// interleave group, return an empty ArrayRef.
2136   ArrayRef<VPValue *> getStoredValues() const {
2137     // The first operand is the address, followed by the stored values, followed
2138     // by an optional mask.
2139     return ArrayRef<VPValue *>(op_begin(), getNumOperands())
2140         .slice(1, getNumStoreOperands());
2141   }
2142 
2143   /// Generate the wide load or store, and shuffles.
2144   void execute(VPTransformState &State) override;
2145 
2146 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2147   /// Print the recipe.
2148   void print(raw_ostream &O, const Twine &Indent,
2149              VPSlotTracker &SlotTracker) const override;
2150 #endif
2151 
2152   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
2153 
2154   /// Returns the number of stored operands of this interleave group. Returns 0
2155   /// for load interleave groups.
2156   unsigned getNumStoreOperands() const {
2157     return getNumOperands() - (HasMask ? 2 : 1);
2158   }
2159 
2160   /// The recipe only uses the first lane of the address.
2161   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2162     assert(is_contained(operands(), Op) &&
2163            "Op must be an operand of the recipe");
2164     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
2165   }
2166 
2167   Instruction *getInsertPos() const { return IG->getInsertPos(); }
2168 };
2169 
2170 /// A recipe to represent in-loop reduction operations, performing a reduction on
2171 /// a vector operand into a scalar value, and adding the result to a chain.
2172 /// The Operands are {ChainOp, VecOp, [Condition]}.
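     ///
     /// A sketch of the accessors for a conditional reduction with operands
     /// {ChainOp, VecOp, CondOp} (illustrative):
     /// \code
     ///   R.getChainOp(); // operand 0: scalar chain being accumulated
     ///   R.getVecOp();   // operand 1: vector value to reduce
     ///   R.getCondOp();  // last operand if conditional, otherwise nullptr
     /// \endcode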
2173 class VPReductionRecipe : public VPSingleDefRecipe {
2174   /// The recurrence descriptor for the reduction in question.
2175   const RecurrenceDescriptor &RdxDesc;
2176   bool IsOrdered;
2177   /// Whether the reduction is conditional.
2178   bool IsConditional = false;
2179 
2180 protected:
2181   VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
2182                     Instruction *I, ArrayRef<VPValue *> Operands,
2183                     VPValue *CondOp, bool IsOrdered)
2184       : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
2185     if (CondOp) {
2186       IsConditional = true;
2187       addOperand(CondOp);
2188     }
2189   }
2190 
2191 public:
2192   VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
2193                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2194                     bool IsOrdered)
2195       : VPReductionRecipe(VPDef::VPReductionSC, R, I,
2196                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2197                           IsOrdered) {}
2198 
2199   ~VPReductionRecipe() override = default;
2200 
2201   VPReductionRecipe *clone() override {
2202     return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
2203                                  getVecOp(), getCondOp(), IsOrdered);
2204   }
2205 
2206   static inline bool classof(const VPRecipeBase *R) {
2207     return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2208            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
2209   }
2210 
2211   static inline bool classof(const VPUser *U) {
2212     auto *R = dyn_cast<VPRecipeBase>(U);
2213     return R && classof(R);
2214   }
2215 
2216   /// Generate the reduction in the loop
2217   void execute(VPTransformState &State) override;
2218 
2219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2220   /// Print the recipe.
2221   void print(raw_ostream &O, const Twine &Indent,
2222              VPSlotTracker &SlotTracker) const override;
2223 #endif
2224 
2225   /// Return the recurrence descriptor for the in-loop reduction.
2226   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2227     return RdxDesc;
2228   }
2229   /// Return true if the in-loop reduction is ordered.
2230   bool isOrdered() const { return IsOrdered; }
2231   /// Return true if the in-loop reduction is conditional.
2232   bool isConditional() const { return IsConditional; }
2233   /// The VPValue of the scalar Chain being accumulated.
2234   VPValue *getChainOp() const { return getOperand(0); }
2235   /// The VPValue of the vector value to be reduced.
2236   VPValue *getVecOp() const { return getOperand(1); }
2237   /// The VPValue of the condition for the block.
2238   VPValue *getCondOp() const {
2239     return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
2240   }
2241 };
2242 
2243 /// A recipe to represent in-loop reduction operations with vector-predication
2244 /// intrinsics, performing a reduction on a vector operand with the explicit
2245 /// vector length (EVL) into a scalar value, and adding the result to a chain.
2246 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
2247 class VPReductionEVLRecipe : public VPReductionRecipe {
2248 public:
2249   VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
2250       : VPReductionRecipe(
2251             VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
2252             cast_or_null<Instruction>(R->getUnderlyingValue()),
2253             ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
2254             R->isOrdered()) {}
2255 
2256   ~VPReductionEVLRecipe() override = default;
2257 
2258   VPReductionEVLRecipe *clone() override {
2259     llvm_unreachable("cloning not implemented yet");
2260   }
2261 
2262   VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2263 
2264   /// Generate the reduction in the loop
2265   void execute(VPTransformState &State) override;
2266 
2267 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2268   /// Print the recipe.
2269   void print(raw_ostream &O, const Twine &Indent,
2270              VPSlotTracker &SlotTracker) const override;
2271 #endif
2272 
2273   /// The VPValue of the explicit vector length.
2274   VPValue *getEVL() const { return getOperand(2); }
2275 
2276   /// Returns true if the recipe only uses the first lane of operand \p Op.
2277   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2278     assert(is_contained(operands(), Op) &&
2279            "Op must be an operand of the recipe");
2280     return Op == getEVL();
2281   }
2282 };
2283 
2284 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
2285 /// copies of the original scalar type, one per lane, instead of producing a
2286 /// single copy of widened type for all lanes. If the instruction is known to
2287 /// be uniform, only one copy, for lane zero, will be generated.
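     ///
     /// If a mask is provided it becomes the last operand, e.g. a predicated
     /// replicate of I with operands {Op0, Op1} and mask M is modeled as
     /// {Op0, Op1, M}. A sketch (illustrative):
     /// \code
     ///   VPValue *M = R.isPredicated() ? R.getMask() : nullptr;
     /// \endcode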
2288 class VPReplicateRecipe : public VPRecipeWithIRFlags {
2289   /// Indicator if only a single replica per lane is needed.
2290   bool IsUniform;
2291 
2292   /// Indicator if the replicas are also predicated.
2293   bool IsPredicated;
2294 
2295 public:
2296   template <typename IterT>
2297   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
2298                     bool IsUniform, VPValue *Mask = nullptr)
2299       : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
2300         IsUniform(IsUniform), IsPredicated(Mask) {
2301     if (Mask)
2302       addOperand(Mask);
2303   }
2304 
2305   ~VPReplicateRecipe() override = default;
2306 
2307   VPReplicateRecipe *clone() override {
2308     auto *Copy =
2309         new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
2310                               isPredicated() ? getMask() : nullptr);
2311     Copy->transferFlags(*this);
2312     return Copy;
2313   }
2314 
2315   VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
2316 
2317   /// Generate replicas of the desired Ingredient. Replicas will be generated
2318   /// for all parts and lanes unless a specific part and lane are specified in
2319   /// \p State.
2320   void execute(VPTransformState &State) override;
2321 
2322 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2323   /// Print the recipe.
2324   void print(raw_ostream &O, const Twine &Indent,
2325              VPSlotTracker &SlotTracker) const override;
2326 #endif
2327 
2328   bool isUniform() const { return IsUniform; }
2329 
2330   bool isPredicated() const { return IsPredicated; }
2331 
2332   /// Returns true if the recipe only uses the first lane of operand \p Op.
2333   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2334     assert(is_contained(operands(), Op) &&
2335            "Op must be an operand of the recipe");
2336     return isUniform();
2337   }
2338 
2339   /// Returns true if the recipe uses scalars of operand \p Op.
2340   bool usesScalars(const VPValue *Op) const override {
2341     assert(is_contained(operands(), Op) &&
2342            "Op must be an operand of the recipe");
2343     return true;
2344   }
2345 
2346   /// Returns true if the recipe is used by a widened recipe via an intervening
2347   /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
2348   /// in a vector.
2349   bool shouldPack() const;
2350 
2351   /// Return the mask of a predicated VPReplicateRecipe.
2352   VPValue *getMask() {
2353     assert(isPredicated() && "Trying to get the mask of an unpredicated recipe");
2354     return getOperand(getNumOperands() - 1);
2355   }
2356 
2357   unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
2358 };
2359 
2360 /// A recipe for generating conditional branches on the bits of a mask.
2361 class VPBranchOnMaskRecipe : public VPRecipeBase {
2362 public:
2363   VPBranchOnMaskRecipe(VPValue *BlockInMask)
2364       : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
2365     if (BlockInMask) // nullptr means all-one mask.
2366       addOperand(BlockInMask);
2367   }
2368 
2369   VPBranchOnMaskRecipe *clone() override {
2370     return new VPBranchOnMaskRecipe(getOperand(0));
2371   }
2372 
2373   VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
2374 
2375   /// Generate the extraction of the appropriate bit from the block mask and the
2376   /// conditional branch.
2377   void execute(VPTransformState &State) override;
2378 
2379 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2380   /// Print the recipe.
2381   void print(raw_ostream &O, const Twine &Indent,
2382              VPSlotTracker &SlotTracker) const override {
2383     O << Indent << "BRANCH-ON-MASK ";
2384     if (VPValue *Mask = getMask())
2385       Mask->printAsOperand(O, SlotTracker);
2386     else
2387       O << " All-One";
2388   }
2389 #endif
2390 
2391   /// Return the mask used by this recipe. Note that a full mask is represented
2392   /// by a nullptr.
2393   VPValue *getMask() const {
2394     assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
2395     // Mask is optional.
2396     return getNumOperands() == 1 ? getOperand(0) : nullptr;
2397   }
2398 
2399   /// Returns true if the recipe uses scalars of operand \p Op.
2400   bool usesScalars(const VPValue *Op) const override {
2401     assert(is_contained(operands(), Op) &&
2402            "Op must be an operand of the recipe");
2403     return true;
2404   }
2405 };
2406 
2407 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
2408 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
2409 /// order to merge values that are set under such a branch and feed their uses.
2410 /// The phi nodes can be scalar or vector depending on the users of the value.
2411 /// This recipe works in concert with VPBranchOnMaskRecipe.
2412 class VPPredInstPHIRecipe : public VPSingleDefRecipe {
2413 public:
2414   /// Construct a VPPredInstPHIRecipe given \p PredV, whose value needs phi
2415   /// nodes after merging back from a Branch-on-Mask.
2416   VPPredInstPHIRecipe(VPValue *PredV)
2417       : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
2418   ~VPPredInstPHIRecipe() override = default;
2419 
2420   VPPredInstPHIRecipe *clone() override {
2421     return new VPPredInstPHIRecipe(getOperand(0));
2422   }
2423 
2424   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
2425 
2426   /// Generates phi nodes for live-outs as needed to retain SSA form.
2427   void execute(VPTransformState &State) override;
2428 
2429 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2430   /// Print the recipe.
2431   void print(raw_ostream &O, const Twine &Indent,
2432              VPSlotTracker &SlotTracker) const override;
2433 #endif
2434 
2435   /// Returns true if the recipe uses scalars of operand \p Op.
2436   bool usesScalars(const VPValue *Op) const override {
2437     assert(is_contained(operands(), Op) &&
2438            "Op must be an operand of the recipe");
2439     return true;
2440   }
2441 };
2442 
2443 /// A common base class for widening memory operations. An optional mask can be
2444 /// provided as the last operand.
2445 class VPWidenMemoryRecipe : public VPRecipeBase {
2446 protected:
2447   Instruction &Ingredient;
2448 
2449   /// Whether the accessed addresses are consecutive.
2450   bool Consecutive;
2451 
2452   /// Whether the consecutive accessed addresses are in reverse order.
2453   bool Reverse;
2454 
2455   /// Whether the memory access is masked.
2456   bool IsMasked = false;
2457 
2458   void setMask(VPValue *Mask) {
2459     assert(!IsMasked && "cannot re-set mask");
2460     if (!Mask)
2461       return;
2462     addOperand(Mask);
2463     IsMasked = true;
2464   }
2465 
2466   VPWidenMemoryRecipe(const unsigned char SC, Instruction &I,
2467                       std::initializer_list<VPValue *> Operands,
2468                       bool Consecutive, bool Reverse, DebugLoc DL)
2469       : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
2470         Reverse(Reverse) {
2471     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
2472   }
2473 
2474 public:
2475   VPWidenMemoryRecipe *clone() override {
2476     llvm_unreachable("cloning not supported");
2477   }
2478 
2479   static inline bool classof(const VPRecipeBase *R) {
2480     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
2481            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
2482            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
2483            R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
2484   }
2485 
2486   static inline bool classof(const VPUser *U) {
2487     auto *R = dyn_cast<VPRecipeBase>(U);
2488     return R && classof(R);
2489   }
2490 
2491   /// Return whether the loaded-from / stored-to addresses are consecutive.
2492   bool isConsecutive() const { return Consecutive; }
2493 
2494   /// Return whether the consecutive loaded/stored addresses are in reverse
2495   /// order.
2496   bool isReverse() const { return Reverse; }
2497 
2498   /// Return the address accessed by this recipe.
2499   VPValue *getAddr() const { return getOperand(0); }
2500 
2501   /// Returns true if the recipe is masked.
2502   bool isMasked() const { return IsMasked; }
2503 
2504   /// Return the mask used by this recipe. Note that a full mask is represented
2505   /// by a nullptr.
2506   VPValue *getMask() const {
2507     // Mask is optional and therefore the last operand.
2508     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
2509   }
2510 
2511   /// Generate the wide load/store.
2512   void execute(VPTransformState &State) override {
2513     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
2514   }
2515 
2516   Instruction &getIngredient() const { return Ingredient; }
2517 };
2518 
2519 /// A recipe for widening load operations, using the address to load from and an
2520 /// optional mask.
2521 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
2522   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
2523                     bool Consecutive, bool Reverse, DebugLoc DL)
2524       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
2525                             Reverse, DL),
2526         VPValue(this, &Load) {
2527     setMask(Mask);
2528   }
2529 
2530   VPWidenLoadRecipe *clone() override {
2531     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
2532                                  getMask(), Consecutive, Reverse,
2533                                  getDebugLoc());
2534   }
2535 
2536   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC)
2537 
2538   /// Generate a wide load or gather.
2539   void execute(VPTransformState &State) override;
2540 
2541 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2542   /// Print the recipe.
2543   void print(raw_ostream &O, const Twine &Indent,
2544              VPSlotTracker &SlotTracker) const override;
2545 #endif
2546 
2547   /// Returns true if the recipe only uses the first lane of operand \p Op.
2548   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2549     assert(is_contained(operands(), Op) &&
2550            "Op must be an operand of the recipe");
2551     // Widened, consecutive load operations only demand the first lane of
2552     // their address.
2553     return Op == getAddr() && isConsecutive();
2554   }
2555 };
2556 
2557 /// A recipe for widening load operations with vector-predication intrinsics,
2558 /// using the address to load from, the explicit vector length and an optional
2559 /// mask.
2560 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
2561   VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
2562       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
2563                             {L->getAddr(), EVL}, L->isConsecutive(),
2564                             L->isReverse(), L->getDebugLoc()),
2565         VPValue(this, &getIngredient()) {
2566     setMask(Mask);
2567   }
2568 
2569   VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
2570 
2571   /// Return the EVL operand.
2572   VPValue *getEVL() const { return getOperand(1); }
2573 
2574   /// Generate the wide load or gather.
2575   void execute(VPTransformState &State) override;
2576 
2577 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2578   /// Print the recipe.
2579   void print(raw_ostream &O, const Twine &Indent,
2580              VPSlotTracker &SlotTracker) const override;
2581 #endif
2582 
2583   /// Returns true if the recipe only uses the first lane of operand \p Op.
2584   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2585     assert(is_contained(operands(), Op) &&
2586            "Op must be an operand of the recipe");
2587     // Widened loads only demand the first lane of EVL and consecutive loads
2588     // only demand the first lane of their address.
2589     return Op == getEVL() || (Op == getAddr() && isConsecutive());
2590   }
2591 };
2592 
2593 /// A recipe for widening store operations, using the stored value, the address
2594 /// to store to and an optional mask.
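     ///
     /// The operands are {Addr, StoredVal, [Mask]}; a sketch of the accessors
     /// (illustrative):
     /// \code
     ///   S.getAddr();        // operand 0
     ///   S.getStoredValue(); // operand 1
     ///   S.getMask();        // last operand if masked, otherwise nullptr
     /// \endcode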
2595 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
2596   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
2597                      VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
2598       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
2599                             Consecutive, Reverse, DL) {
2600     setMask(Mask);
2601   }
2602 
2603   VPWidenStoreRecipe *clone() override {
2604     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
2605                                   getStoredValue(), getMask(), Consecutive,
2606                                   Reverse, getDebugLoc());
2607   }
2608 
2609   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC)
2610 
2611   /// Return the value stored by this recipe.
2612   VPValue *getStoredValue() const { return getOperand(1); }
2613 
2614   /// Generate a wide store or scatter.
2615   void execute(VPTransformState &State) override;
2616 
2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2618   /// Print the recipe.
2619   void print(raw_ostream &O, const Twine &Indent,
2620              VPSlotTracker &SlotTracker) const override;
2621 #endif
2622 
2623   /// Returns true if the recipe only uses the first lane of operand \p Op.
2624   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2625     assert(is_contained(operands(), Op) &&
2626            "Op must be an operand of the recipe");
2627     // Widened, consecutive stores only demand the first lane of their address,
2628     // unless the same operand is also stored.
2629     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
2630   }
2631 };
2632 
2633 /// A recipe for widening store operations with vector-predication intrinsics,
2634 /// using the value to store, the address to store to, the explicit vector
2635 /// length and an optional mask.
2636 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
2637   VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
2638       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
2639                             {S->getAddr(), S->getStoredValue(), EVL},
2640                             S->isConsecutive(), S->isReverse(),
2641                             S->getDebugLoc()) {
2642     setMask(Mask);
2643   }
2644 
2645   VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
2646 
2647   /// Return the value stored by this recipe.
2648   VPValue *getStoredValue() const { return getOperand(1); }
2649 
2650   /// Return the EVL operand.
2651   VPValue *getEVL() const { return getOperand(2); }
2652 
2653   /// Generate the wide store or scatter.
2654   void execute(VPTransformState &State) override;
2655 
2656 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2657   /// Print the recipe.
2658   void print(raw_ostream &O, const Twine &Indent,
2659              VPSlotTracker &SlotTracker) const override;
2660 #endif
2661 
2662   /// Returns true if the recipe only uses the first lane of operand \p Op.
2663   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2664     assert(is_contained(operands(), Op) &&
2665            "Op must be an operand of the recipe");
2666     if (Op == getEVL()) {
2667       assert(getStoredValue() != Op && "unexpected store of EVL");
2668       return true;
2669     }
2670     // Widened, consecutive memory operations only demand the first lane of
2671     // their address, unless the same operand is also stored. The latter can
2672     // happen with opaque pointers.
2673     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
2674   }
2675 };
2676 
2677 /// Recipe to expand a SCEV expression.
2678 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
2679   const SCEV *Expr;
2680   ScalarEvolution &SE;
2681 
2682 public:
2683   VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
2684       : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {}
2685 
2686   ~VPExpandSCEVRecipe() override = default;
2687 
2688   VPExpandSCEVRecipe *clone() override {
2689     return new VPExpandSCEVRecipe(Expr, SE);
2690   }
2691 
2692   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
2693 
2694   /// Generate a canonical vector induction variable of the vector loop, with
2695   void execute(VPTransformState &State) override;
2696 
2697 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2698   /// Print the recipe.
2699   void print(raw_ostream &O, const Twine &Indent,
2700              VPSlotTracker &SlotTracker) const override;
2701 #endif
2702 
2703   const SCEV *getSCEV() const { return Expr; }
2704 };
2705 
2706 /// Canonical scalar induction phi of the vector loop, starting at the specified
2707 /// start value (either 0 or the resume value when vectorizing the epilogue
2708 /// loop). VPWidenCanonicalIVRecipe represents the vector version of the
2709 /// canonical induction variable.
2710 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
2711 public:
2712   VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
2713       : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
2714 
2715   ~VPCanonicalIVPHIRecipe() override = default;
2716 
2717   VPCanonicalIVPHIRecipe *clone() override {
2718     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
2719     R->addOperand(getBackedgeValue());
2720     return R;
2721   }
2722 
2723   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
2724 
2725   static inline bool classof(const VPHeaderPHIRecipe *D) {
2726     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
2727   }
2728 
2729   /// Generate the canonical scalar induction phi of the vector loop.
2730   void execute(VPTransformState &State) override;
2731 
2732 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2733   /// Print the recipe.
2734   void print(raw_ostream &O, const Twine &Indent,
2735              VPSlotTracker &SlotTracker) const override;
2736 #endif
2737 
2738   /// Returns the scalar type of the induction.
2739   Type *getScalarType() const {
2740     return getStartValue()->getLiveInIRValue()->getType();
2741   }
2742 
2743   /// Returns true if the recipe only uses the first lane of operand \p Op.
2744   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2745     assert(is_contained(operands(), Op) &&
2746            "Op must be an operand of the recipe");
2747     return true;
2748   }
2749 
2750   /// Returns true if the recipe only uses the first part of operand \p Op.
2751   bool onlyFirstPartUsed(const VPValue *Op) const override {
2752     assert(is_contained(operands(), Op) &&
2753            "Op must be an operand of the recipe");
2754     return true;
2755   }
2756 
2757   /// Check if the induction described by \p Kind, \p Start and \p Step is
2758   /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
2759   bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
2760                    VPValue *Step) const;
2761 };
2762 
2763 /// A recipe for generating the active lane mask for the vector loop that is
2764 /// used to predicate the vector operations.
2765 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2766 /// remove VPActiveLaneMaskPHIRecipe.
2767 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
2768 public:
2769   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
2770       : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
2771                           DL) {}
2772 
2773   ~VPActiveLaneMaskPHIRecipe() override = default;
2774 
2775   VPActiveLaneMaskPHIRecipe *clone() override {
2776     return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2777   }
2778 
2779   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
2780 
2781   static inline bool classof(const VPHeaderPHIRecipe *D) {
2782     return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
2783   }
2784 
2785   /// Generate the active lane mask phi of the vector loop.
2786   void execute(VPTransformState &State) override;
2787 
2788 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2789   /// Print the recipe.
2790   void print(raw_ostream &O, const Twine &Indent,
2791              VPSlotTracker &SlotTracker) const override;
2792 #endif
2793 };
2794 
2795 /// A recipe for generating the phi node for the current index of elements,
2796 /// adjusted in accordance with the EVL value. It starts at the start value of
2797 /// the canonical induction and gets incremented by EVL in each iteration of the
2798 /// vector loop.
2799 class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
2800 public:
2801   VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
2802       : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
2803 
2804   ~VPEVLBasedIVPHIRecipe() override = default;
2805 
2806   VPEVLBasedIVPHIRecipe *clone() override {
2807     llvm_unreachable("cloning not implemented yet");
2808   }
2809 
2810   VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
2811 
2812   static inline bool classof(const VPHeaderPHIRecipe *D) {
2813     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
2814   }
2815 
2816   /// Generate the phi node that advances the EVL-based IV across iterations.
2817   /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
2818   void execute(VPTransformState &State) override;
2819 
2820   /// Returns true if the recipe only uses the first lane of operand \p Op.
2821   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2822     assert(is_contained(operands(), Op) &&
2823            "Op must be an operand of the recipe");
2824     return true;
2825   }
2826 
2827 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2828   /// Print the recipe.
2829   void print(raw_ostream &O, const Twine &Indent,
2830              VPSlotTracker &SlotTracker) const override;
2831 #endif
2832 };
2833 
2834 /// A Recipe for widening the canonical induction variable of the vector loop.
2835 class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
2836 public:
2837   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
2838       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
2839 
2840   ~VPWidenCanonicalIVRecipe() override = default;
2841 
2842   VPWidenCanonicalIVRecipe *clone() override {
2843     return new VPWidenCanonicalIVRecipe(
2844         cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
2845   }
2846 
2847   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
2848 
2849   /// Generate a canonical vector induction variable of the vector loop, with
2850   /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
2851   /// step = <VF*UF, VF*UF, ..., VF*UF>.
2852   void execute(VPTransformState &State) override;
2853 
2854 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2855   /// Print the recipe.
2856   void print(raw_ostream &O, const Twine &Indent,
2857              VPSlotTracker &SlotTracker) const override;
2858 #endif
2859 };
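
// For example, with VF = 4 and UF = 2 the widened canonical IV produces
//   Part 0: <0, 1, 2, 3>    Part 1: <4, 5, 6, 7>
// and both parts are stepped by <8, 8, 8, 8> (VF * UF) per vector iteration.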
2860 
2861 /// A recipe for converting the input value \p IV to the corresponding value of
2862 /// an IV with different start and step values, computed as Start + IV * Step.
2864 class VPDerivedIVRecipe : public VPSingleDefRecipe {
2865   /// Kind of the induction.
2866   const InductionDescriptor::InductionKind Kind;
2867   /// If not nullptr, the floating point induction binary operator. Must be set
2868   /// for floating point inductions.
2869   const FPMathOperator *FPBinOp;
2870 
2871 public:
2872   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
2873                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
2874       : VPDerivedIVRecipe(
2875             IndDesc.getKind(),
2876             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
2877             Start, CanonicalIV, Step) {}
2878 
2879   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
2880                     const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
2881                     VPValue *Step)
2882       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
2883         FPBinOp(FPBinOp) {}
2884 
2885   ~VPDerivedIVRecipe() override = default;
2886 
2887   VPDerivedIVRecipe *clone() override {
2888     return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
2889                                  getStepValue());
2890   }
2891 
2892   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
2893 
2894   /// Generate the transformed value of the induction at offset StartValue (1st
2895   /// operand) + IV (2nd operand) * StepValue (3rd operand).
2896   void execute(VPTransformState &State) override;
2897 
2898 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2899   /// Print the recipe.
2900   void print(raw_ostream &O, const Twine &Indent,
2901              VPSlotTracker &SlotTracker) const override;
2902 #endif
2903 
2904   Type *getScalarType() const {
2905     return getStartValue()->getLiveInIRValue()->getType();
2906   }
2907 
2908   VPValue *getStartValue() const { return getOperand(0); }
2909   VPValue *getStepValue() const { return getOperand(2); }
2910 
2911   /// Returns true if the recipe only uses the first lane of operand \p Op.
2912   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2913     assert(is_contained(operands(), Op) &&
2914            "Op must be an operand of the recipe");
2915     return true;
2916   }
2917 };
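
// For example, deriving an IV with Start = 10 and Step = 3 from the canonical
// IV 0, 1, 2, ... yields 10, 13, 16, ... via Start + IV * Step.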
2918 
2919 /// A recipe for handling phi nodes of integer and floating-point inductions,
2920 /// producing their scalar values.
2921 class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
2922   Instruction::BinaryOps InductionOpcode;
2923 
2924 public:
2925   VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
2926                         Instruction::BinaryOps Opcode, FastMathFlags FMFs)
2927       : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
2928                             ArrayRef<VPValue *>({IV, Step}), FMFs),
2929         InductionOpcode(Opcode) {}
2930 
2931   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
2932                         VPValue *Step)
2933       : VPScalarIVStepsRecipe(
2934             IV, Step, IndDesc.getInductionOpcode(),
2935             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
2936                 ? IndDesc.getInductionBinOp()->getFastMathFlags()
2937                 : FastMathFlags()) {}
2938 
2939   ~VPScalarIVStepsRecipe() override = default;
2940 
2941   VPScalarIVStepsRecipe *clone() override {
2942     return new VPScalarIVStepsRecipe(
2943         getOperand(0), getOperand(1), InductionOpcode,
2944         hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
2945   }
2946 
2947   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
2948 
2949   /// Generate the scalarized versions of the phi node as needed by its users.
2950   void execute(VPTransformState &State) override;
2951 
2952 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2953   /// Print the recipe.
2954   void print(raw_ostream &O, const Twine &Indent,
2955              VPSlotTracker &SlotTracker) const override;
2956 #endif
2957 
2958   VPValue *getStepValue() const { return getOperand(1); }
2959 
2960   /// Returns true if the recipe only uses the first lane of operand \p Op.
2961   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2962     assert(is_contained(operands(), Op) &&
2963            "Op must be an operand of the recipe");
2964     return true;
2965   }
2966 };
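
// A wiring sketch (with hypothetical 'DerivedIV', 'Step' and 'VPBB'): integer
// inductions step via Instruction::Add and need no fast-math flags.
//
//   auto *Steps = new VPScalarIVStepsRecipe(DerivedIV, Step, Instruction::Add,
//                                           FastMathFlags());
//   VPBB->appendRecipe(Steps);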
2967 
2968 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
2969 /// holds a sequence of zero or more VPRecipes, each representing a sequence of
2970 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
2971 class VPBasicBlock : public VPBlockBase {
2972 public:
2973   using RecipeListTy = iplist<VPRecipeBase>;
2974 
2975 protected:
2976   /// The VPRecipes held in the order of output instructions to generate.
2977   RecipeListTy Recipes;
2978 
2979   VPBasicBlock(const unsigned char BlockSC, const Twine &Name = "")
2980       : VPBlockBase(BlockSC, Name.str()) {}
2981 
2982 public:
2983   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
2984       : VPBlockBase(VPBasicBlockSC, Name.str()) {
2985     if (Recipe)
2986       appendRecipe(Recipe);
2987   }
2988 
2989   ~VPBasicBlock() override {
2990     while (!Recipes.empty())
2991       Recipes.pop_back();
2992   }
2993 
2994   /// Instruction iterators...
2995   using iterator = RecipeListTy::iterator;
2996   using const_iterator = RecipeListTy::const_iterator;
2997   using reverse_iterator = RecipeListTy::reverse_iterator;
2998   using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
2999 
3000   //===--------------------------------------------------------------------===//
3001   /// Recipe iterator methods
3002   ///
3003   inline iterator begin() { return Recipes.begin(); }
3004   inline const_iterator begin() const { return Recipes.begin(); }
3005   inline iterator end() { return Recipes.end(); }
3006   inline const_iterator end() const { return Recipes.end(); }
3007 
3008   inline reverse_iterator rbegin() { return Recipes.rbegin(); }
3009   inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
3010   inline reverse_iterator rend() { return Recipes.rend(); }
3011   inline const_reverse_iterator rend() const { return Recipes.rend(); }
3012 
3013   inline size_t size() const { return Recipes.size(); }
3014   inline bool empty() const { return Recipes.empty(); }
3015   inline const VPRecipeBase &front() const { return Recipes.front(); }
3016   inline VPRecipeBase &front() { return Recipes.front(); }
3017   inline const VPRecipeBase &back() const { return Recipes.back(); }
3018   inline VPRecipeBase &back() { return Recipes.back(); }
3019 
3020   /// Returns a reference to the list of recipes.
3021   RecipeListTy &getRecipeList() { return Recipes; }
3022 
3023   /// Returns a pointer to a member of the recipe list.
3024   static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
3025     return &VPBasicBlock::Recipes;
3026   }
3027 
3028   /// Method to support type inquiry through isa, cast, and dyn_cast.
3029   static inline bool classof(const VPBlockBase *V) {
3030     return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC ||
3031            V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3032   }
3033 
3034   void insert(VPRecipeBase *Recipe, iterator InsertPt) {
3035     assert(Recipe && "No recipe to append.");
3036     assert(!Recipe->Parent && "Recipe already in VPlan");
3037     Recipe->Parent = this;
3038     Recipes.insert(InsertPt, Recipe);
3039   }
3040 
3041   /// Augment the existing recipes of a VPBasicBlock with an additional
3042   /// \p Recipe as the last recipe.
3043   void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
3044 
3045   /// The method which generates the output IR instructions that correspond to
3046   /// this VPBasicBlock, thereby "executing" the VPlan.
3047   void execute(VPTransformState *State) override;
3048 
3049   /// Return the cost of this VPBasicBlock.
3050   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3051 
3052   /// Return the position of the first non-phi node recipe in the block.
3053   iterator getFirstNonPhi();
3054 
3055   /// Returns an iterator range over the PHI-like recipes in the block.
3056   iterator_range<iterator> phis() {
3057     return make_range(begin(), getFirstNonPhi());
3058   }
3059 
3060   void dropAllReferences(VPValue *NewValue) override;
3061 
3062   /// Split current block at \p SplitAt by inserting a new block between the
3063   /// current block and its successors and moving all recipes starting at
3064   /// \p SplitAt to the new block. Returns the new block.
3065   VPBasicBlock *splitAt(iterator SplitAt);
3066 
3067   VPRegionBlock *getEnclosingLoopRegion();
3068 
3069 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3070   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
3071   /// SlotTracker is used to print unnamed VPValues using consecutive numbers.
3072   ///
3073   /// Note that the numbering is applied to the whole VPlan, so printing
3074   /// individual blocks is consistent with the whole VPlan printing.
3075   void print(raw_ostream &O, const Twine &Indent,
3076              VPSlotTracker &SlotTracker) const override;
3077   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3078 #endif
3079 
3080   /// If the block has multiple successors, return the branch recipe terminating
3081   /// the block. If the block has no successors or a single one, return nullptr.
3082   VPRecipeBase *getTerminator();
3083   const VPRecipeBase *getTerminator() const;
3084 
3085   /// Returns true if the block is exiting its parent region.
3086   bool isExiting() const;
3087 
3088   /// Clone the current block and its recipes, without updating the operands of
3089   /// the cloned recipes.
3090   VPBasicBlock *clone() override {
3091     auto *NewBlock = new VPBasicBlock(getName());
3092     for (VPRecipeBase &R : *this)
3093       NewBlock->appendRecipe(R.clone());
3094     return NewBlock;
3095   }
3096 
3097 protected:
3098   /// Execute the recipes in the IR basic block \p BB.
3099   void executeRecipes(VPTransformState *State, BasicBlock *BB);
3100 
3101 private:
3102   /// Create an IR BasicBlock to hold the output instructions generated by this
3103   /// VPBasicBlock, and return it. Update the CFGState accordingly.
3104   BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
3105 };
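
// Common traversal sketches over a block 'VPBB' (hypothetical):
//
//   for (VPRecipeBase &R : *VPBB)          // all recipes, in order
//     ...
//   for (VPRecipeBase &P : VPBB->phis())   // only the leading PHI-like recipes
//     ...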
3106 
3107 /// A special type of VPBasicBlock that wraps an existing IR basic block.
3108 /// Recipes of the block get added before the first non-phi instruction in the
3109 /// wrapped block.
3110 /// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
3111 /// preheader block.
3112 class VPIRBasicBlock : public VPBasicBlock {
3113   BasicBlock *IRBB;
3114 
3115 public:
3116   VPIRBasicBlock(BasicBlock *IRBB)
3117       : VPBasicBlock(VPIRBasicBlockSC,
3118                      (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
3119         IRBB(IRBB) {}
3120 
3121   ~VPIRBasicBlock() override = default;
3122 
3123   static inline bool classof(const VPBlockBase *V) {
3124     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3125   }
3126 
3127   /// The method which generates the output IR instructions that correspond to
3128   /// this VPBasicBlock, thereby "executing" the VPlan.
3129   void execute(VPTransformState *State) override;
3130 
3131   VPIRBasicBlock *clone() override {
3132     auto *NewBlock = new VPIRBasicBlock(IRBB);
3133     for (VPRecipeBase &R : Recipes)
3134       NewBlock->appendRecipe(R.clone());
3135     return NewBlock;
3136   }
3137 
3138   BasicBlock *getIRBasicBlock() const { return IRBB; }
3139 };
3140 
3141 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
3142 /// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
3143 /// A VPRegionBlock may indicate that its contents are to be replicated several
3144 /// times. This is designed to support predicated scalarization, in which a
3145 /// scalar if-then code structure needs to be generated VF * UF times. Having
3146 /// this replication indicator helps to keep a single model for multiple
3147 /// candidate VF's. The actual replication takes place only once the desired VF
3148 /// and UF have been determined.
3149 class VPRegionBlock : public VPBlockBase {
3150   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
3151   VPBlockBase *Entry;
3152 
3153   /// Hold the Single Exiting block of the SESE region modelled by the
3154   /// VPRegionBlock.
3155   VPBlockBase *Exiting;
3156 
3157   /// An indicator whether this region is to generate multiple replicated
3158   /// instances of output IR corresponding to its VPBlockBases.
3159   bool IsReplicator;
3160 
3161 public:
3162   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
3163                 const std::string &Name = "", bool IsReplicator = false)
3164       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
3165         IsReplicator(IsReplicator) {
3166     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
3167     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
3168     Entry->setParent(this);
3169     Exiting->setParent(this);
3170   }
3171   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
3172       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
3173         IsReplicator(IsReplicator) {}
3174 
3175   ~VPRegionBlock() override {
3176     if (Entry) {
3177       VPValue DummyValue;
3178       Entry->dropAllReferences(&DummyValue);
3179       deleteCFG(Entry);
3180     }
3181   }
3182 
3183   /// Method to support type inquiry through isa, cast, and dyn_cast.
3184   static inline bool classof(const VPBlockBase *V) {
3185     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
3186   }
3187 
3188   const VPBlockBase *getEntry() const { return Entry; }
3189   VPBlockBase *getEntry() { return Entry; }
3190 
3191   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
3192   /// EntryBlock must have no predecessors.
3193   void setEntry(VPBlockBase *EntryBlock) {
3194     assert(EntryBlock->getPredecessors().empty() &&
3195            "Entry block cannot have predecessors.");
3196     Entry = EntryBlock;
3197     EntryBlock->setParent(this);
3198   }
3199 
3200   const VPBlockBase *getExiting() const { return Exiting; }
3201   VPBlockBase *getExiting() { return Exiting; }
3202 
3203   /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
3204   /// ExitingBlock must have no successors.
3205   void setExiting(VPBlockBase *ExitingBlock) {
3206     assert(ExitingBlock->getSuccessors().empty() &&
3207            "Exit block cannot have successors.");
3208     Exiting = ExitingBlock;
3209     ExitingBlock->setParent(this);
3210   }
3211 
3212   /// Returns the pre-header VPBasicBlock of the loop region.
3213   VPBasicBlock *getPreheaderVPBB() {
3214     assert(!isReplicator() && "should only get pre-header of loop regions");
3215     return getSinglePredecessor()->getExitingBasicBlock();
3216   }
3217 
3218   /// An indicator whether this region is to generate multiple replicated
3219   /// instances of output IR corresponding to its VPBlockBases.
3220   bool isReplicator() const { return IsReplicator; }
3221 
3222   /// The method which generates the output IR instructions that correspond to
3223   /// this VPRegionBlock, thereby "executing" the VPlan.
3224   void execute(VPTransformState *State) override;
3225 
3226   // Return the cost of this region.
3227   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3228 
3229   void dropAllReferences(VPValue *NewValue) override;
3230 
3231 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3232   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
3233   /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
3234   /// consequtive numbers.
3235   ///
3236   /// Note that the numbering is applied to the whole VPlan, so printing
3237   /// individual regions is consistent with the whole VPlan printing.
3238   void print(raw_ostream &O, const Twine &Indent,
3239              VPSlotTracker &SlotTracker) const override;
3240   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3241 #endif
3242 
3243   /// Clone all blocks in the single-entry single-exit region of the block and
3244   /// their recipes without updating the operands of the cloned recipes.
3245   VPRegionBlock *clone() override;
3246 };
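
// A construction sketch (with hypothetical 'Header' and 'Latch' blocks): the
// entry must have no predecessors and the exiting block no successors before
// they are wrapped into a region.
//
//   auto *LoopRegion = new VPRegionBlock(Header, Latch, "vector loop",
//                                        /*IsReplicator=*/false);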
3247 
3248 /// VPlan models a candidate for vectorization, encoding various decisions taken
3249 /// to produce efficient output IR, including which branches, basic-blocks and
3250 /// output IR instructions to generate, and their cost. VPlan holds a
3251 /// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
3252 /// VPBasicBlock.
3253 class VPlan {
3254   friend class VPlanPrinter;
3255   friend class VPSlotTracker;
3256 
3257   /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
3258   /// preheader of the vector loop.
3259   VPBasicBlock *Entry;
3260 
3261   /// VPBasicBlock corresponding to the original preheader. Used to place
3262   /// VPExpandSCEV recipes for expressions used during skeleton creation and the
3263   /// rest of VPlan execution.
3264   VPBasicBlock *Preheader;
3265 
3266   /// Holds the VFs applicable to this VPlan.
3267   SmallSetVector<ElementCount, 2> VFs;
3268 
3269   /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
3270   /// any UF.
3271   SmallSetVector<unsigned, 2> UFs;
3272 
3273   /// Holds the name of the VPlan, for printing.
3274   std::string Name;
3275 
3276   /// Represents the trip count of the original loop, for folding
3277   /// the tail.
3278   VPValue *TripCount = nullptr;
3279 
3280   /// Represents the backedge taken count of the original loop, for folding
3281   /// the tail. It equals TripCount - 1.
3282   VPValue *BackedgeTakenCount = nullptr;
3283 
3284   /// Represents the vector trip count.
3285   VPValue VectorTripCount;
3286 
3287   /// Represents the loop-invariant VF * UF of the vector loop region.
3288   VPValue VFxUF;
3289 
3290   /// Holds a mapping between Values and their corresponding VPValue inside
3291   /// VPlan.
3292   Value2VPValueTy Value2VPValue;
3293 
3294   /// Contains all the external definitions created for this VPlan. External
3295   /// definitions are VPValues that hold a pointer to their underlying IR.
3296   SmallVector<VPValue *, 16> VPLiveInsToFree;
3297 
3298   /// Values used outside the plan. It contains live-outs that need fixing. Any
3299   /// live-out that is fixed outside VPlan needs to be removed. The remaining
3300   /// live-outs are fixed via VPLiveOut::fixPhi.
3301   MapVector<PHINode *, VPLiveOut *> LiveOuts;
3302 
3303   /// Mapping from SCEVs to the VPValues representing their expansions.
3304   /// NOTE: This mapping is temporary and will be removed once all users have
3305   /// been modeled in VPlan directly.
3306   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
3307 
3308 public:
3309   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
3310   /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to
3311   /// be disconnected, as the bypass blocks between them are not yet modeled in
3312   /// VPlan.
3313   VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry)
3314       : VPlan(Preheader, Entry) {
3315     TripCount = TC;
3316   }
3317 
3318   /// Construct a VPlan with original preheader \p Preheader and \p Entry to
3319   /// the plan. At the moment, \p Preheader and \p Entry need to be
3320   /// disconnected, as the bypass blocks between them are not yet modeled in
3321   /// VPlan.
3322   VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry)
3323       : Entry(Entry), Preheader(Preheader) {
3324     Entry->setPlan(this);
3325     Preheader->setPlan(this);
3326     assert(Preheader->getNumSuccessors() == 0 &&
3327            Preheader->getNumPredecessors() == 0 &&
3328            "preheader must be disconnected");
3329   }
3330 
3331   ~VPlan();
3332 
3333   /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping
3334   /// original scalar pre-header), which contains SCEV expansions that need
3335   /// to happen before the CFG is modified; a VPBasicBlock for the vector
3336   /// pre-header, followed by a region for the vector loop, followed by the
3337   /// middle VPBasicBlock. If a check is needed to guard executing the scalar
3338   /// epilogue loop, it will be added to the middle block, together with
3339   /// VPBasicBlocks for the scalar preheader and exit blocks.
3340   static VPlanPtr createInitialVPlan(const SCEV *TripCount,
3341                                      ScalarEvolution &PSE,
3342                                      bool RequiresScalarEpilogueCheck,
3343                                      bool TailFolded, Loop *TheLoop);
3344 
3345   /// Prepare the plan for execution, setting up the required live-in values.
3346   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
3347                         Value *CanonicalIVStartValue, VPTransformState &State);
3348 
3349   /// Generate the IR code for this VPlan.
3350   void execute(VPTransformState *State);
3351 
3352   /// Return the cost of this plan.
3353   InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
3354 
3355   VPBasicBlock *getEntry() { return Entry; }
3356   const VPBasicBlock *getEntry() const { return Entry; }
3357 
3358   /// The trip count of the original loop.
3359   VPValue *getTripCount() const {
3360     assert(TripCount && "trip count needs to be set before accessing it");
3361     return TripCount;
3362   }
3363 
3364   /// Resets the trip count for the VPlan. The caller must make sure all uses of
3365   /// the original trip count have been replaced.
3366   void resetTripCount(VPValue *NewTripCount) {
3367     assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
3368            "TripCount always must be set");
3369     TripCount = NewTripCount;
3370   }
3371 
3372   /// The backedge taken count of the original loop.
3373   VPValue *getOrCreateBackedgeTakenCount() {
3374     if (!BackedgeTakenCount)
3375       BackedgeTakenCount = new VPValue();
3376     return BackedgeTakenCount;
3377   }
3378 
3379   /// The vector trip count.
3380   VPValue &getVectorTripCount() { return VectorTripCount; }
3381 
3382   /// Returns VF * UF of the vector loop region.
3383   VPValue &getVFxUF() { return VFxUF; }
3384 
3385   void addVF(ElementCount VF) { VFs.insert(VF); }
3386 
3387   void setVF(ElementCount VF) {
3388     assert(hasVF(VF) && "Cannot set VF not already in plan");
3389     VFs.clear();
3390     VFs.insert(VF);
3391   }
3392 
3393   bool hasVF(ElementCount VF) { return VFs.count(VF); }
3394   bool hasScalableVF() {
3395     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
3396   }
3397 
3398   /// Returns an iterator range over all VFs of the plan.
3399   iterator_range<SmallSetVector<ElementCount, 2>::iterator>
3400   vectorFactors() const {
3401     return {VFs.begin(), VFs.end()};
3402   }
3403 
3404   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
3405 
3406   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
3407 
3408   void setUF(unsigned UF) {
3409     assert(hasUF(UF) && "Cannot set the UF not already in plan");
3410     UFs.clear();
3411     UFs.insert(UF);
3412   }
3413 
3414   /// Return a string with the name of the plan and the applicable VFs and UFs.
3415   std::string getName() const;
3416 
3417   void setName(const Twine &newName) { Name = newName.str(); }
3418 
3419   /// Gets the live-in VPValue for \p V or adds a new live-in (if none exists
3420   /// yet) for \p V.
3421   VPValue *getOrAddLiveIn(Value *V) {
3422     assert(V && "Trying to get or add the VPValue of a null Value");
3423     if (!Value2VPValue.count(V)) {
3424       VPValue *VPV = new VPValue(V);
3425       VPLiveInsToFree.push_back(VPV);
3426       assert(VPV->isLiveIn() && "VPV must be a live-in.");
3427       assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
3428       Value2VPValue[V] = VPV;
3429     }
3430 
3431     assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
3432     assert(Value2VPValue[V]->isLiveIn() &&
3433            "Only live-ins should be in mapping");
3434     return Value2VPValue[V];
3435   }
3436 
3437   /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
3438   VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }
3439 
3440 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3441   /// Print the live-ins of this VPlan to \p O.
3442   void printLiveIns(raw_ostream &O) const;
3443 
3444   /// Print this VPlan to \p O.
3445   void print(raw_ostream &O) const;
3446 
3447   /// Print this VPlan in DOT format to \p O.
3448   void printDOT(raw_ostream &O) const;
3449 
3450   /// Dump the plan to stderr (for debugging).
3451   LLVM_DUMP_METHOD void dump() const;
3452 #endif
3453 
3454   /// Returns the VPRegionBlock of the vector loop.
3455   VPRegionBlock *getVectorLoopRegion() {
3456     return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
3457   }
3458   const VPRegionBlock *getVectorLoopRegion() const {
3459     return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
3460   }
3461 
3462   /// Returns the canonical induction recipe of the vector loop.
3463   VPCanonicalIVPHIRecipe *getCanonicalIV() {
3464     VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
3465     if (EntryVPBB->empty()) {
3466       // VPlan native path.
3467       EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
3468     }
3469     return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
3470   }
3471 
3472   void addLiveOut(PHINode *PN, VPValue *V);
3473 
3474   void removeLiveOut(PHINode *PN) {
3475     delete LiveOuts[PN];
3476     LiveOuts.erase(PN);
3477   }
3478 
3479   const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
3480     return LiveOuts;
3481   }
3482 
3483   VPValue *getSCEVExpansion(const SCEV *S) const {
3484     return SCEVToExpansion.lookup(S);
3485   }
3486 
3487   void addSCEVExpansion(const SCEV *S, VPValue *V) {
3488     assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
3489     SCEVToExpansion[S] = V;
3490   }
3491 
3492   /// \return The block corresponding to the original preheader.
3493   VPBasicBlock *getPreheader() { return Preheader; }
3494   const VPBasicBlock *getPreheader() const { return Preheader; }
3495 
3496   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
3497   /// recipes to refer to the clones, and return it.
3498   VPlan *duplicate();
3499 };
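
// A live-in handling sketch (with a hypothetical IR value 'IncomingV'):
// repeated queries return the same wrapper VPValue, which the VPlan
// destructor frees.
//
//   VPValue *LiveIn = Plan.getOrAddLiveIn(IncomingV);
//   assert(LiveIn == Plan.getLiveIn(IncomingV) && "lookup must be stable");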
3500 
3501 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3502 /// VPlanPrinter prints a given VPlan to a given output stream. The printing is
3503 /// indented and follows the dot format.
3504 class VPlanPrinter {
3505   raw_ostream &OS;
3506   const VPlan &Plan;
3507   unsigned Depth = 0;
3508   unsigned TabWidth = 2;
3509   std::string Indent;
3510   unsigned BID = 0;
3511   SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
3512 
3513   VPSlotTracker SlotTracker;
3514 
3515   /// Handle indentation.
3516   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
3517 
3518   /// Print a given \p Block of the Plan.
3519   void dumpBlock(const VPBlockBase *Block);
3520 
3521   /// Print the information related to the CFG edges going out of a given
3522   /// \p Block, followed by printing the successor blocks themselves.
3523   void dumpEdges(const VPBlockBase *Block);
3524 
3525   /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
3526   /// its successor blocks.
3527   void dumpBasicBlock(const VPBasicBlock *BasicBlock);
3528 
3529   /// Print a given \p Region of the Plan.
3530   void dumpRegion(const VPRegionBlock *Region);
3531 
3532   unsigned getOrCreateBID(const VPBlockBase *Block) {
3533     return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
3534   }
3535 
3536   Twine getOrCreateName(const VPBlockBase *Block);
3537 
3538   Twine getUID(const VPBlockBase *Block);
3539 
3540   /// Print the information related to a CFG edge between two VPBlockBases.
3541   void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
3542                 const Twine &Label);
3543 
3544 public:
3545   VPlanPrinter(raw_ostream &O, const VPlan &P)
3546       : OS(O), Plan(P), SlotTracker(&P) {}
3547 
3548   LLVM_DUMP_METHOD void dump();
3549 };
3550 
3551 struct VPlanIngredient {
3552   const Value *V;
3553 
3554   VPlanIngredient(const Value *V) : V(V) {}
3555 
3556   void print(raw_ostream &O) const;
3557 };
3558 
3559 inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
3560   I.print(OS);
3561   return OS;
3562 }
3563 
3564 inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
3565   Plan.print(OS);
3566   return OS;
3567 }
3568 #endif
3569 
3570 //===----------------------------------------------------------------------===//
3571 // VPlan Utilities
3572 //===----------------------------------------------------------------------===//
3573 
3574 /// Class that provides utilities for VPBlockBases in VPlan.
3575 class VPBlockUtils {
3576 public:
3577   VPBlockUtils() = delete;
3578 
3579   /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
3580   /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
3581   /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
3582   /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
3583   /// have neither successors nor predecessors.
3584   static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
3585     assert(NewBlock->getSuccessors().empty() &&
3586            NewBlock->getPredecessors().empty() &&
3587            "Can't insert new block with predecessors or successors.");
3588     NewBlock->setParent(BlockPtr->getParent());
3589     SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
3590     for (VPBlockBase *Succ : Succs) {
3591       disconnectBlocks(BlockPtr, Succ);
3592       connectBlocks(NewBlock, Succ);
3593     }
3594     connectBlocks(BlockPtr, NewBlock);
3595   }
3596 
3597   /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
3598   /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
3599   /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
3600   /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors
3601   /// and \p IfTrue and \p IfFalse must have neither successors nor
3602   /// predecessors.
3603   static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
3604                                    VPBlockBase *BlockPtr) {
3605     assert(IfTrue->getSuccessors().empty() &&
3606            "Can't insert IfTrue with successors.");
3607     assert(IfFalse->getSuccessors().empty() &&
3608            "Can't insert IfFalse with successors.");
3609     BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
3610     IfTrue->setPredecessors({BlockPtr});
3611     IfFalse->setPredecessors({BlockPtr});
3612     IfTrue->setParent(BlockPtr->getParent());
3613     IfFalse->setParent(BlockPtr->getParent());
3614   }
3615 
3616   /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
3617   /// the successors of \p From and \p From to the predecessors of \p To. Both
3618   /// VPBlockBases must have the same parent, which can be null. Both
3619   /// VPBlockBases can be already connected to other VPBlockBases.
3620   static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
3621     assert((From->getParent() == To->getParent()) &&
3622            "Can't connect two block with different parents");
3623     assert(From->getNumSuccessors() < 2 &&
3624            "Blocks can't have more than two successors.");
3625     From->appendSuccessor(To);
3626     To->appendPredecessor(From);
3627   }
3628 
3629   /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
3630   /// from the successors of \p From and \p From from the predecessors of \p To.
3631   static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
3632     assert(To && "Successor to disconnect is null.");
3633     From->removeSuccessor(To);
3634     To->removePredecessor(From);
3635   }
3636 
3637   /// Return an iterator range over \p Range which only includes \p BlockTy
3638   /// blocks. The accesses are cast to \p BlockTy.
3639   template <typename BlockTy, typename T>
3640   static auto blocksOnly(const T &Range) {
3641     // Create BaseTy with correct const-ness based on BlockTy.
3642     using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
3643                                       const VPBlockBase, VPBlockBase>;
3644 
3645     // We need to first create an iterator range over (const) BlockTy & instead
3646     // of (const) BlockTy * for filter_range to work properly.
3647     auto Mapped =
3648         map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
3649     auto Filter = make_filter_range(
3650         Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
3651     return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
3652       return cast<BlockTy>(&Block);
3653     });
3654   }
3655 };
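
// A CFG-editing sketch (with a hypothetical block 'Pred'): insertBlockAfter
// splices the new block onto Pred's outgoing edges, so afterwards the chain
// is Pred -> NewBB -> former successors of Pred.
//
//   auto *NewBB = new VPBasicBlock("new.block");
//   VPBlockUtils::insertBlockAfter(NewBB, Pred);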
3656 
3657 class VPInterleavedAccessInfo {
3658   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
3659       InterleaveGroupMap;
3660 
3661   /// Type for mapping of instruction based interleave groups to VPInstruction
3662   /// interleave groups
3663   using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
3664                              InterleaveGroup<VPInstruction> *>;
3665 
3666   /// Recursively traverse \p Region and populate VPlan-based interleave groups
3667   /// based on \p IAI.
3668   void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
3669                    InterleavedAccessInfo &IAI);
3670   /// Recursively traverse \p Block and populate VPlan based interleave groups
3671   /// based on \p IAI.
3672   void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
3673                   InterleavedAccessInfo &IAI);
3674 
3675 public:
3676   VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
3677 
3678   ~VPInterleavedAccessInfo() {
3679     SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
3680     // Avoid releasing a pointer twice.
3681     for (auto &I : InterleaveGroupMap)
3682       DelSet.insert(I.second);
3683     for (auto *Ptr : DelSet)
3684       delete Ptr;
3685   }
3686 
3687   /// Get the interleave group that \p Instr belongs to.
3688   ///
3689   /// \returns nullptr if \p Instr does not belong to any interleave group.
3690   InterleaveGroup<VPInstruction> *
3691   getInterleaveGroup(VPInstruction *Instr) const {
3692     return InterleaveGroupMap.lookup(Instr);
3693   }
3694 };
3695 
3696 /// Class that maps (parts of) an existing VPlan to trees of combined
3697 /// VPInstructions.
3698 class VPlanSlp {
3699   enum class OpMode { Failed, Load, Opcode };
3700 
3701   /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
3702   /// DenseMap keys.
3703   struct BundleDenseMapInfo {
3704     static SmallVector<VPValue *, 4> getEmptyKey() {
3705       return {reinterpret_cast<VPValue *>(-1)};
3706     }
3707 
3708     static SmallVector<VPValue *, 4> getTombstoneKey() {
3709       return {reinterpret_cast<VPValue *>(-2)};
3710     }
3711 
3712     static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
3713       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3714     }
3715 
3716     static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
3717                         const SmallVector<VPValue *, 4> &RHS) {
3718       return LHS == RHS;
3719     }
3720   };
3721 
3722   /// Mapping of values in the original VPlan to a combined VPInstruction.
3723   DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
3724       BundleToCombined;
3725 
3726   VPInterleavedAccessInfo &IAI;
3727 
3728   /// Basic block to operate on. For now, only instructions in a single BB are
3729   /// considered.
3730   const VPBasicBlock &BB;
3731 
3732   /// Indicates whether we managed to combine all visited instructions or not.
3733   bool CompletelySLP = true;
3734 
3735   /// Width of the widest combined bundle in bits.
3736   unsigned WidestBundleBits = 0;
3737 
3738   using MultiNodeOpTy =
3739       typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
3740 
3741   // Input operand bundles for the current multi node. Each multi node operand
3742   // bundle contains values not matching the multi node's opcode. They will
3743   // be reordered in reorderMultiNodeOps, once we have completed building a
3744   // multi node.
3745   SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
3746 
3747   /// Indicates whether we are currently building a multi node.
3748   bool MultiNodeActive = false;
3749 
3750   /// Check if we can vectorize Operands together.
3751   bool areVectorizable(ArrayRef<VPValue *> Operands) const;
3752 
3753   /// Add combined instruction \p New for the bundle \p Operands.
3754   void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
3755 
3756   /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
3757   VPInstruction *markFailed();
3758 
3759   /// Reorder operands in the multi node to maximize sequential memory access
3760   /// and commutative operations.
3761   SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
3762 
3763   /// Choose the best candidate to use for the lane after \p Last. The set of
3764   /// candidates to choose from are values with an opcode matching \p Last's
3765   /// or loads consecutive to \p Last.
3766   std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
3767                                        SmallPtrSetImpl<VPValue *> &Candidates,
3768                                        VPInterleavedAccessInfo &IAI);
3769 
3770 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3771   /// Print bundle \p Values to dbgs().
3772   void dumpBundle(ArrayRef<VPValue *> Values);
3773 #endif
3774 
3775 public:
3776   VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
3777 
3778   ~VPlanSlp() = default;
3779 
3780   /// Tries to build an SLP tree rooted at \p Operands and returns a
3781   /// VPInstruction combining \p Operands, if they can be combined.
3782   VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
3783 
3784   /// Return the width of the widest combined bundle in bits.
3785   unsigned getWidestBundleBits() const { return WidestBundleBits; }
3786 
3787   /// Return true if all visited instructions can be combined.
3788   bool isCompletelySLP() const { return CompletelySLP; }
3789 };
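
// A usage sketch (with hypothetical 'IAI', 'VPBB' and a bundle of store
// VPValues 'StoreBundle'):
//
//   VPlanSlp Slp(IAI, *VPBB);
//   VPInstruction *Root = Slp.buildGraph(StoreBundle);
//   if (Root && Slp.isCompletelySLP())
//     ... // widen the tree, using Slp.getWidestBundleBits() for the VF choice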
3790 
3791 namespace vputils {
3792 
3793 /// Returns true if only the first lane of \p Def is used.
3794 bool onlyFirstLaneUsed(const VPValue *Def);
3795 
3796 /// Returns true if only the first part of \p Def is used.
3797 bool onlyFirstPartUsed(const VPValue *Def);
3798 
3799 /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
3800 /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
3801 /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
3802 /// pre-header already contains a recipe expanding \p Expr, return it. If not,
3803 /// create a new one.
3804 VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
3805                                        ScalarEvolution &SE);
3806 
3807 /// Returns true if \p VPV is uniform after vectorization.
3808 inline bool isUniformAfterVectorization(VPValue *VPV) {
3809   // A value defined outside the vector region must be uniform after
3810   // vectorization inside a vector region.
3811   if (VPV->isDefinedOutsideVectorRegions())
3812     return true;
3813   VPRecipeBase *Def = VPV->getDefiningRecipe();
3814   assert(Def && "Must have definition for value defined inside vector region");
3815   if (auto *Rep = dyn_cast<VPReplicateRecipe>(Def))
3816     return Rep->isUniform();
3817   if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
3818     return all_of(GEP->operands(), isUniformAfterVectorization);
3819   if (auto *VPI = dyn_cast<VPInstruction>(Def))
3820     return VPI->isSingleScalar() || VPI->isVectorToScalar();
3821   return false;
3822 }
3823 
3824 /// Return true if \p V is a header mask in \p Plan.
3825 bool isHeaderMask(VPValue *V, VPlan &Plan);
3826 } // end namespace vputils
3827 
3828 } // end namespace llvm
3829 
3830 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
3831