//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the declarations of the Vectorization Plan base classes:
/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
///    VPBlockBase, together implementing a Hierarchical CFG;
/// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
///    within VPBasicBlocks;
/// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that
///    also inherit from VPValue;
/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
///    instruction;
/// 5. The VPlan class holding a candidate for vectorization;
/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
/// These are documented in docs/VectorizationPlan.rst.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

namespace llvm {

class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
class raw_ostream;
class RecurrenceDescriptor;
class SCEV;
class Type;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
class LoopVectorizationCostModel;
class LoopVersioning;

struct VPCostContext;

namespace Intrinsic {
typedef unsigned ID;
}

/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step);

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *CurLoop = nullptr);

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
inline unsigned getReciprocalPredBlockProb() { return 2; }

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
  // A power of 2.
  const ElementCount Start;

  // A power of 2. If End <= Start, the range is empty.
  ElementCount End;

  bool isEmpty() const {
    return End.getKnownMinValue() <= Start.getKnownMinValue();
  }

  VFRange(const ElementCount &Start, const ElementCount &End)
      : Start(Start), End(End) {
    assert(Start.isScalable() == End.isScalable() &&
           "Both Start and End should have the same scalable flag");
    assert(isPowerOf2_32(Start.getKnownMinValue()) &&
           "Expected Start to be a power of 2");
    assert(isPowerOf2_32(End.getKnownMinValue()) &&
           "Expected End to be a power of 2");
  }

  /// Iterator to iterate over vectorization factors in a VFRange.
  class iterator
      : public iterator_facade_base<iterator, std::forward_iterator_tag,
                                    ElementCount> {
    ElementCount VF;

  public:
    iterator(ElementCount VF) : VF(VF) {}

    bool operator==(const iterator &Other) const { return VF == Other.VF; }

    ElementCount operator*() const { return VF; }

    iterator &operator++() {
      VF *= 2;
      return *this;
    }
  };

  iterator begin() { return iterator(Start); }
  iterator end() {
    assert(isPowerOf2_32(End.getKnownMinValue()));
    return iterator(End);
  }
};
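
// Illustrative sketch (not part of the interface): iterating a VFRange visits
// the powers of 2 from Start up to, but excluding, End, e.g. for fixed-width
// factors:
//   VFRange Range(ElementCount::getFixed(1), ElementCount::getFixed(16));
//   for (ElementCount VF : Range) // visits 1, 2, 4, 8
//     ...;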

using VPlanPtr = std::unique_ptr<VPlan>;

/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
/// the vectorizer.

/// VPLane provides a way to access lanes in both fixed width and scalable
/// vectors, where for the latter the lane index sometimes needs calculating
/// as a runtime expression.
class VPLane {
public:
  /// Kind describes how to interpret Lane.
  enum class Kind : uint8_t {
    /// For First, Lane is the index into the first N elements of a
    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
    First,
    /// For ScalableLast, Lane is the offset from the start of the last
    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
    ScalableLast
  };

private:
  /// in [0..VF)
  unsigned Lane;

  /// Indicates how the Lane should be interpreted, as described above.
  Kind LaneKind;

public:
  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}

  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }

  static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
    assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
           "trying to extract with invalid offset");
    unsigned LaneOffset = VF.getKnownMinValue() - Offset;
    Kind LaneKind;
    if (VF.isScalable())
      // In this case 'LaneOffset' refers to the offset from the start of the
      // last subvector with VF.getKnownMinValue() elements.
      LaneKind = VPLane::Kind::ScalableLast;
    else
      LaneKind = VPLane::Kind::First;
    return VPLane(LaneOffset, LaneKind);
  }

  static VPLane getLastLaneForVF(const ElementCount &VF) {
    return getLaneFromEnd(VF, 1);
  }
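
  // For example (illustrative): with a fixed VF of 4, getLaneFromEnd(VF, 1)
  // yields VPLane(3, Kind::First), i.e. the last element; with a scalable VF
  // of vscale x 4 it yields VPLane(3, Kind::ScalableLast), i.e. lane
  // (vscale - 1) * 4 + 3.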

  /// Returns a compile-time known value for the lane index and asserts if the
  /// lane can only be calculated at runtime.
  unsigned getKnownLane() const {
    assert(LaneKind == Kind::First);
    return Lane;
  }

  /// Returns an expression describing the lane index that can be used at
  /// runtime.
  Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;

  /// Returns the Kind of lane offset.
  Kind getKind() const { return LaneKind; }

  /// Returns true if this is the first lane of the whole vector.
  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }

  /// Maps the lane to a cache index based on \p VF.
  unsigned mapToCacheIndex(const ElementCount &VF) const {
    switch (LaneKind) {
    case VPLane::Kind::ScalableLast:
      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
      return VF.getKnownMinValue() + Lane;
    default:
      assert(Lane < VF.getKnownMinValue());
      return Lane;
    }
  }

  /// Returns the maximum number of lanes that we are able to consider
  /// caching for \p VF.
  static unsigned getNumCachedLanes(const ElementCount &VF) {
    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
  }
};
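
// Illustrative example of the cache layout: for VF = vscale x 4,
// getNumCachedLanes returns 8; lanes with Kind::First map to indices [0..3]
// and lanes with Kind::ScalableLast map to indices [4..7], so the two kinds
// never collide in per-lane caches keyed by mapToCacheIndex.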

/// VPIteration represents a single point in the iteration space of the output
/// (vectorized and/or unrolled) IR loop.
struct VPIteration {
  /// in [0..UF)
  unsigned Part;

  VPLane Lane;

  VPIteration(unsigned Part, unsigned Lane,
              VPLane::Kind Kind = VPLane::Kind::First)
      : Part(Part), Lane(Lane, Kind) {}

  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}

  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
};
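
// For example, VPIteration(0, 0) denotes lane 0 of the first unrolled part and
// is the only instance (with the default Kind::First) for which
// isFirstIteration() returns true.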

/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
  VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                   DominatorTree *DT, IRBuilderBase &Builder,
                   InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);

  /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
  ElementCount VF;
  unsigned UF;

  /// Hold the indices to generate specific scalar instructions. Null indicates
  /// that all instances are to be generated, using either scalar or vector
  /// instructions.
  std::optional<VPIteration> Instance;

  struct DataState {
    /// A type for vectorized values in the new loop. Each value from the
    /// original loop, when vectorized, is represented by UF vector values in
    /// the new unrolled loop, where UF is the unroll factor.
    typedef SmallVector<Value *, 2> PerPartValuesTy;

    DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;

    using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
    DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
  } Data;

  /// Get the generated vector Value for a given VPValue \p Def and a given \p
  /// Part if \p IsScalar is false, otherwise return the generated scalar
  /// for \p Part. \See set.
  Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);

  /// Get the generated Value for a given VPValue and given Part and Lane.
  Value *get(VPValue *Def, const VPIteration &Instance);

  bool hasVectorValue(VPValue *Def, unsigned Part) {
    auto I = Data.PerPartOutput.find(Def);
    return I != Data.PerPartOutput.end() && Part < I->second.size() &&
           I->second[Part];
  }

  bool hasScalarValue(VPValue *Def, VPIteration Instance) {
    auto I = Data.PerPartScalars.find(Def);
    if (I == Data.PerPartScalars.end())
      return false;
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    return Instance.Part < I->second.size() &&
           CacheIdx < I->second[Instance.Part].size() &&
           I->second[Instance.Part][CacheIdx];
  }

  /// Set the generated vector Value for a given VPValue and a given Part, if \p
  /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
  void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
    if (IsScalar) {
      set(Def, V, VPIteration(Part, 0));
      return;
    }
    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
           "scalar values must be stored as (Part, 0)");
    if (!Data.PerPartOutput.count(Def)) {
      DataState::PerPartValuesTy Entry(UF);
      Data.PerPartOutput[Def] = Entry;
    }
    Data.PerPartOutput[Def][Part] = V;
  }

  /// Reset an existing vector value for \p Def and a given \p Part.
  void reset(VPValue *Def, Value *V, unsigned Part) {
    auto Iter = Data.PerPartOutput.find(Def);
    assert(Iter != Data.PerPartOutput.end() &&
           "need to overwrite existing value");
    Iter->second[Part] = V;
  }

  /// Set the generated scalar \p V for \p Def and the given \p Instance.
  void set(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.insert({Def, {}});
    auto &PerPartVec = Iter.first->second;
    if (PerPartVec.size() <= Instance.Part)
      PerPartVec.resize(Instance.Part + 1);
    auto &Scalars = PerPartVec[Instance.Part];
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    if (Scalars.size() <= CacheIdx)
      Scalars.resize(CacheIdx + 1);
    assert(!Scalars[CacheIdx] && "should overwrite existing value");
    Scalars[CacheIdx] = V;
  }

  /// Reset an existing scalar value for \p Def and a given \p Instance.
  void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.find(Def);
    assert(Iter != Data.PerPartScalars.end() &&
           "need to overwrite existing value");
    assert(Instance.Part < Iter->second.size() &&
           "need to overwrite existing value");
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    assert(CacheIdx < Iter->second[Instance.Part].size() &&
           "need to overwrite existing value");
    Iter->second[Instance.Part][CacheIdx] = V;
  }
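
  // Illustrative sketch of the per-part protocol (names assumed): with UF = 2,
  // a recipe defining VPValue *Def stores one vector per part via
  //   State.set(Def, V0, /*Part=*/0); State.set(Def, V1, /*Part=*/1);
  // after which State.get(Def, 1) returns V1 and State.hasVectorValue(Def, P)
  // is true for P = 0 and 1.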

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Value *To, Instruction *From);

  /// Set the debug location in the builder using the debug location \p DL.
  void setDebugLocFrom(DebugLoc DL);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);

  /// Hold state information used when constructing the CFG of the output IR,
  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
  struct CFGState {
    /// The previous VPBasicBlock visited. Initially set to null.
    VPBasicBlock *PrevVPBB = nullptr;

    /// The previous IR BasicBlock created or used. Initially set to the new
    /// header BasicBlock.
    BasicBlock *PrevBB = nullptr;

    /// The last IR BasicBlock in the output IR. Set to the exit block of the
    /// vector loop.
    BasicBlock *ExitBB = nullptr;

    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
    /// of replication, maps the BasicBlock of the last replica created.
    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;

    /// Updater for the DominatorTree.
    DomTreeUpdater DTU;

    CFGState(DominatorTree *DT)
        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}

    /// Returns the BasicBlock* mapped to the pre-header of the loop region
    /// containing \p R.
    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
  } CFG;

  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
  LoopInfo *LI;

  /// Hold a reference to the IRBuilder used to generate output IR code.
  IRBuilderBase &Builder;

  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
  InnerLoopVectorizer *ILV;

  /// Pointer to the VPlan for which code is generated.
  VPlan *Plan;

  /// The loop object for the current parent region, or nullptr.
  Loop *CurrentVectorLoop = nullptr;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  LoopVersioning *LVer = nullptr;

  /// Map SCEVs to their expanded values. Populated when executing
  /// VPExpandSCEVRecipes.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  /// VPlan-based type analysis.
  VPTypeAnalysis TypeAnalysis;
};

/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
  friend class VPBlockUtils;

  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).

  /// An optional name for the block.
  std::string Name;

  /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
  /// it is a topmost VPBlockBase.
  VPRegionBlock *Parent = nullptr;

  /// List of predecessor blocks.
  SmallVector<VPBlockBase *, 1> Predecessors;

  /// List of successor blocks.
  SmallVector<VPBlockBase *, 1> Successors;

  /// VPlan containing the block. Can only be set on the entry block of the
  /// plan.
  VPlan *Plan = nullptr;

  /// Add \p Successor as the last successor to this block.
  void appendSuccessor(VPBlockBase *Successor) {
    assert(Successor && "Cannot add nullptr successor!");
    Successors.push_back(Successor);
  }

  /// Add \p Predecessor as the last predecessor to this block.
  void appendPredecessor(VPBlockBase *Predecessor) {
    assert(Predecessor && "Cannot add nullptr predecessor!");
    Predecessors.push_back(Predecessor);
  }

  /// Remove \p Predecessor from the predecessors of this block.
  void removePredecessor(VPBlockBase *Predecessor) {
    auto Pos = find(Predecessors, Predecessor);
    assert(Pos && "Predecessor does not exist");
    Predecessors.erase(Pos);
  }

  /// Remove \p Successor from the successors of this block.
  void removeSuccessor(VPBlockBase *Successor) {
    auto Pos = find(Successors, Successor);
    assert(Pos && "Successor does not exist");
    Successors.erase(Pos);
  }

protected:
  VPBlockBase(const unsigned char SC, const std::string &N)
      : SubclassID(SC), Name(N) {}

public:
  /// An enumeration for keeping track of the concrete subclass of VPBlockBase
  /// that are actually instantiated. Values of this enumeration are kept in the
  /// SubclassID field of the VPBlockBase objects. They are used for concrete
  /// type identification.
  using VPBlockTy = enum { VPRegionBlockSC, VPBasicBlockSC, VPIRBasicBlockSC };

  using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;

  virtual ~VPBlockBase() = default;

  const std::string &getName() const { return Name; }

  void setName(const Twine &newName) { Name = newName.str(); }

  /// \return an ID for the concrete type of this object.
  /// This is used to implement the classof checks. This should not be used
  /// for any other purpose, as the values may change as LLVM evolves.
  unsigned getVPBlockID() const { return SubclassID; }

  VPRegionBlock *getParent() { return Parent; }
  const VPRegionBlock *getParent() const { return Parent; }

  /// \return A pointer to the plan containing the current block.
  VPlan *getPlan();
  const VPlan *getPlan() const;

  /// Sets the pointer of the plan containing the block. The block must be the
  /// entry block into the VPlan.
  void setPlan(VPlan *ParentPlan);

  void setParent(VPRegionBlock *P) { Parent = P; }

  /// \return the VPBasicBlock that is the entry of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getEntryBasicBlock() const;
  VPBasicBlock *getEntryBasicBlock();

  /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getExitingBasicBlock() const;
  VPBasicBlock *getExitingBasicBlock();

  const VPBlocksTy &getSuccessors() const { return Successors; }
  VPBlocksTy &getSuccessors() { return Successors; }

  iterator_range<VPBlockBase **> successors() { return Successors; }

  const VPBlocksTy &getPredecessors() const { return Predecessors; }
  VPBlocksTy &getPredecessors() { return Predecessors; }

  /// \return the successor of this VPBlockBase if it has a single successor.
  /// Otherwise return a null pointer.
  VPBlockBase *getSingleSuccessor() const {
    return (Successors.size() == 1 ? *Successors.begin() : nullptr);
  }

  /// \return the predecessor of this VPBlockBase if it has a single
  /// predecessor. Otherwise return a null pointer.
  VPBlockBase *getSinglePredecessor() const {
    return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
  }

  size_t getNumSuccessors() const { return Successors.size(); }
  size_t getNumPredecessors() const { return Predecessors.size(); }

  /// An Enclosing Block of a block B is any block containing B, including B
  /// itself. \return the closest enclosing block starting from "this", which
  /// has successors. \return the root enclosing block if all enclosing blocks
  /// have no successors.
  VPBlockBase *getEnclosingBlockWithSuccessors();

  /// \return the closest enclosing block starting from "this", which has
  /// predecessors. \return the root enclosing block if all enclosing blocks
  /// have no predecessors.
  VPBlockBase *getEnclosingBlockWithPredecessors();

  /// \return the successors either attached directly to this VPBlockBase or, if
  /// this VPBlockBase is the exit block of a VPRegionBlock and has no
  /// successors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has successors and return them. If no such
  /// VPRegionBlock exists, return the (empty) successors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalSuccessors() {
    return getEnclosingBlockWithSuccessors()->getSuccessors();
  }

  /// \return the hierarchical successor of this VPBlockBase if it has a single
  /// hierarchical successor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalSuccessor() {
    return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
  }

  /// \return the predecessors either attached directly to this VPBlockBase or,
  /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
  /// predecessors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has predecessors and return them. If no such
  /// VPRegionBlock exists, return the (empty) predecessors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalPredecessors() {
    return getEnclosingBlockWithPredecessors()->getPredecessors();
  }

  /// \return the hierarchical predecessor of this VPBlockBase if it has a
  /// single hierarchical predecessor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalPredecessor() {
    return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
  }

  /// Set a given VPBlockBase \p Successor as the single successor of this
  /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
  /// This VPBlockBase must have no successors.
  void setOneSuccessor(VPBlockBase *Successor) {
    assert(Successors.empty() && "Setting one successor when others exist.");
    assert(Successor->getParent() == getParent() &&
           "connected blocks must have the same parent");
    appendSuccessor(Successor);
  }

  /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
  /// successors of this VPBlockBase. This VPBlockBase is not added as
  /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
  /// successors.
  void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
    assert(Successors.empty() && "Setting two successors when others exist.");
    appendSuccessor(IfTrue);
    appendSuccessor(IfFalse);
  }

  /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
  /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
  /// as successor of any VPBasicBlock in \p NewPreds.
  void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
    assert(Predecessors.empty() && "Block predecessors already set.");
    for (auto *Pred : NewPreds)
      appendPredecessor(Pred);
  }

  /// Set each VPBasicBlock in \p NewSuccs as successor of this VPBlockBase.
  /// This VPBlockBase must have no successors. This VPBlockBase is not added
  /// as predecessor of any VPBasicBlock in \p NewSuccs.
  void setSuccessors(ArrayRef<VPBlockBase *> NewSuccs) {
    assert(Successors.empty() && "Block successors already set.");
    for (auto *Succ : NewSuccs)
      appendSuccessor(Succ);
  }
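
  // Illustrative sketch: because the setters above deliberately update only
  // one side of an edge, wiring a two-way branch takes both directions, e.g.
  //   BB->setTwoSuccessors(IfTrue, IfFalse);
  //   IfTrue->setPredecessors({BB});
  //   IfFalse->setPredecessors({BB});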

  /// Remove all the predecessors of this block.
  void clearPredecessors() { Predecessors.clear(); }

  /// Remove all the successors of this block.
  void clearSuccessors() { Successors.clear(); }

  /// The method which generates the output IR that corresponds to this
  /// VPBlockBase, thereby "executing" the VPlan.
  virtual void execute(VPTransformState *State) = 0;

  /// Return the cost of the block.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;

  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
  static void deleteCFG(VPBlockBase *Entry);

  /// Return true if it is legal to hoist instructions into this block.
  bool isLegalToHoistInto() {
    // There are currently no constraints that prevent an instruction from
    // being hoisted into a VPBlockBase.
    return true;
  }

  /// Replace all operands of VPUsers in the block with \p NewValue and also
  /// replaces all uses of VPValues defined in the block with NewValue.
  virtual void dropAllReferences(VPValue *NewValue) = 0;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printAsOperand(raw_ostream &OS, bool PrintType) const {
    OS << getName();
  }

  /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
  /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  virtual void print(raw_ostream &O, const Twine &Indent,
                     VPSlotTracker &SlotTracker) const = 0;

  /// Print plain-text dump of this VPlan to \p O.
  void print(raw_ostream &O) const {
    VPSlotTracker SlotTracker(getPlan());
    print(O, "", SlotTracker);
  }

  /// Print the successors of this block to \p O, prefixing all lines with \p
  /// Indent.
  void printSuccessors(raw_ostream &O, const Twine &Indent) const;

  /// Dump this VPBlockBase to dbgs().
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif

  /// Clone the current block and its recipes without updating the operands of
  /// the cloned recipes, including all blocks in the single-entry single-exit
  /// region for VPRegionBlocks.
  virtual VPBlockBase *clone() = 0;
};

/// A value that is used outside the VPlan. The operand of the user needs to be
/// added to the associated phi node. The incoming block from VPlan is
/// determined by where the VPValue is defined: if it is defined by a recipe
/// outside a region, its parent block is used, otherwise the middle block is
/// used.
class VPLiveOut : public VPUser {
  PHINode *Phi;

public:
  VPLiveOut(PHINode *Phi, VPValue *Op)
      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::LiveOut;
  }

  /// Fix the wrapped phi node. This means adding an incoming value to exit
  /// block phi's from the vector loop via middle block (values from scalar loop
  /// already reach these phi's), and updating the value to scalar header phi's
  /// from the scalar preheader.
  void fixPhi(VPlan &Plan, VPTransformState &State);

  /// Returns true if the VPLiveOut uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  PHINode *getPhi() const { return Phi; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPLiveOut to \p O.
  void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
#endif
};

/// Struct to hold various analyses needed for cost computations.
struct VPCostContext {
  const TargetTransformInfo &TTI;
  VPTypeAnalysis Types;
  LLVMContext &LLVMCtx;
  LoopVectorizationCostModel &CM;
  SmallPtrSet<Instruction *, 8> SkipCostComputation;

  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy,
                LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM)
      : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}

  /// Return the cost for \p UI with \p VF using the legacy cost model as
  /// fallback until computing the cost of all recipes migrates to VPlan.
  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;

  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
  /// has already been pre-computed.
  bool skipCostComputation(Instruction *UI, bool IsVector) const;
};

/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// recipes must inherit from VPSingleDefRecipe instead of inheriting from both
/// VPRecipeBase and VPValue separately.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
                     public VPDef,
                     public VPUser {
  friend VPBasicBlock;
  friend class VPBlockUtils;

  /// Each VPRecipe belongs to a single VPBasicBlock.
  VPBasicBlock *Parent = nullptr;

  /// The debug location for the recipe.
  DebugLoc DL;

public:
  VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}

  template <typename IterT>
  VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
  virtual ~VPRecipeBase() = default;

  /// Clone the current recipe.
  virtual VPRecipeBase *clone() = 0;

  /// \return the VPBasicBlock which this VPRecipe belongs to.
  VPBasicBlock *getParent() { return Parent; }
  const VPBasicBlock *getParent() const { return Parent; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRecipe, thereby "executing" the VPlan.
  virtual void execute(VPTransformState &State) = 0;

  /// Return the cost of this recipe, taking into account if the cost
  /// computation should be skipped and the ForceTargetInstructionCost flag.
  /// Also takes care of printing the cost for debugging.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  /// Insert an unlinked recipe into a basic block immediately before
  /// the specified recipe.
  void insertBefore(VPRecipeBase *InsertPos);
  /// Insert an unlinked recipe into \p BB immediately before the insertion
  /// point \p IP.
  void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);

  /// Insert an unlinked Recipe into a basic block immediately after
  /// the specified Recipe.
  void insertAfter(VPRecipeBase *InsertPos);

  /// Unlink this recipe from its current VPBasicBlock and insert it into
  /// the VPBasicBlock that MovePos lives in, right after MovePos.
  void moveAfter(VPRecipeBase *MovePos);

  /// Unlink this recipe and insert into BB before I.
  ///
  /// \pre I is a valid iterator into BB.
  void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);

  /// This method unlinks 'this' from the containing basic block, but does not
  /// delete it.
  void removeFromParent();

  /// This method unlinks 'this' from the containing basic block and deletes it.
  ///
  /// \returns an iterator pointing to the element after the erased one
  iplist<VPRecipeBase>::iterator eraseFromParent();

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPDef *D) {
    // All VPDefs are also VPRecipeBases.
    return true;
  }

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::Recipe;
  }

  /// Returns true if the recipe may have side-effects.
  bool mayHaveSideEffects() const;

  /// Returns true for PHI-like recipes.
  bool isPhi() const {
    return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
  }

  /// Returns true if the recipe may read from memory.
  bool mayReadFromMemory() const;

  /// Returns true if the recipe may write to memory.
  bool mayWriteToMemory() const;

  /// Returns true if the recipe may read from or write to memory.
  bool mayReadOrWriteMemory() const {
    return mayReadFromMemory() || mayWriteToMemory();
  }

  /// Returns the debug location of the recipe.
  DebugLoc getDebugLoc() const { return DL; }

protected:
  /// Compute the cost of this recipe using the legacy cost model and the
  /// underlying instructions.
  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
};

// Helper macro to define common classof implementations for recipes.
#define VP_CLASSOF_IMPL(VPDefID)                                               \
  static inline bool classof(const VPDef *D) {                                 \
    return D->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPValue *V) {                               \
    auto *R = V->getDefiningRecipe();                                          \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPUser *U) {                                \
    auto *R = dyn_cast<VPRecipeBase>(U);                                       \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPRecipeBase *R) {                          \
    return R->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPSingleDefRecipe *R) {                     \
    return R->getVPDefID() == VPDefID;                                         \
  }
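
// For example, VPWidenRecipe below instantiates
// VP_CLASSOF_IMPL(VPDef::VPWidenSC), which lets isa<VPWidenRecipe>(X) and
// cast<VPWidenRecipe>(X) work uniformly on VPDef, VPValue, VPUser,
// VPRecipeBase and VPSingleDefRecipe pointers.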

/// VPSingleDefRecipe is a base class for recipes modeling a sequence of one or
/// more output IR instructions that define a single result VPValue.
/// Note that VPRecipeBase must be inherited from before VPValue.
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}

  static inline bool classof(const VPRecipeBase *R) {
    switch (R->getVPDefID()) {
    case VPRecipeBase::VPDerivedIVSC:
    case VPRecipeBase::VPEVLBasedIVPHISC:
    case VPRecipeBase::VPExpandSCEVSC:
    case VPRecipeBase::VPInstructionSC:
    case VPRecipeBase::VPReductionEVLSC:
    case VPRecipeBase::VPReductionSC:
    case VPRecipeBase::VPReplicateSC:
    case VPRecipeBase::VPScalarIVStepsSC:
    case VPRecipeBase::VPVectorPointerSC:
    case VPRecipeBase::VPWidenCallSC:
    case VPRecipeBase::VPWidenCanonicalIVSC:
    case VPRecipeBase::VPWidenCastSC:
    case VPRecipeBase::VPWidenGEPSC:
    case VPRecipeBase::VPWidenSC:
    case VPRecipeBase::VPWidenSelectSC:
    case VPRecipeBase::VPBlendSC:
    case VPRecipeBase::VPPredInstPHISC:
    case VPRecipeBase::VPCanonicalIVPHISC:
    case VPRecipeBase::VPActiveLaneMaskPHISC:
    case VPRecipeBase::VPFirstOrderRecurrencePHISC:
    case VPRecipeBase::VPWidenPHISC:
    case VPRecipeBase::VPWidenIntOrFpInductionSC:
    case VPRecipeBase::VPWidenPointerInductionSC:
    case VPRecipeBase::VPReductionPHISC:
    case VPRecipeBase::VPScalarCastSC:
      return true;
    case VPRecipeBase::VPInterleaveSC:
    case VPRecipeBase::VPBranchOnMaskSC:
    case VPRecipeBase::VPWidenLoadEVLSC:
    case VPRecipeBase::VPWidenLoadSC:
    case VPRecipeBase::VPWidenStoreEVLSC:
    case VPRecipeBase::VPWidenStoreSC:
      // TODO: Widened stores don't define a value, but widened loads do. Split
      // the recipes to be able to make widened loads VPSingleDefRecipes.
      return false;
    }
    llvm_unreachable("Unhandled VPDefID");
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  virtual VPSingleDefRecipe *clone() override = 0;

  /// Returns the underlying instruction.
  Instruction *getUnderlyingInstr() {
    return cast<Instruction>(getUnderlyingValue());
  }
  const Instruction *getUnderlyingInstr() const {
    return cast<Instruction>(getUnderlyingValue());
  }
};

/// Class to record LLVM IR flags for a recipe along with it.
class VPRecipeWithIRFlags : public VPSingleDefRecipe {
  enum class OperationType : unsigned char {
    Cmp,
    OverflowingBinOp,
    DisjointOp,
    PossiblyExactOp,
    GEPOp,
    FPMathOp,
    NonNegOp,
    Other
  };

public:
  struct WrapFlagsTy {
    char HasNUW : 1;
    char HasNSW : 1;

    WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
  };

  struct DisjointFlagsTy {
    char IsDisjoint : 1;
    DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
  };

protected:
  struct GEPFlagsTy {
    char IsInBounds : 1;
    GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
  };

private:
  struct ExactFlagsTy {
    char IsExact : 1;
  };
  struct NonNegFlagsTy {
    char NonNeg : 1;
  };
  struct FastMathFlagsTy {
    char AllowReassoc : 1;
    char NoNaNs : 1;
    char NoInfs : 1;
    char NoSignedZeros : 1;
    char AllowReciprocal : 1;
    char AllowContract : 1;
    char ApproxFunc : 1;

    FastMathFlagsTy(const FastMathFlags &FMF);
  };

  OperationType OpType;

  union {
    CmpInst::Predicate CmpPredicate;
    WrapFlagsTy WrapFlags;
    DisjointFlagsTy DisjointFlags;
    ExactFlagsTy ExactFlags;
    GEPFlagsTy GEPFlags;
    NonNegFlagsTy NonNegFlags;
    FastMathFlagsTy FMFs;
    unsigned AllFlags;
  };

protected:
  void transferFlags(VPRecipeWithIRFlags &Other) {
    OpType = Other.OpType;
    AllFlags = Other.AllFlags;
  }

public:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL) {
    OpType = OperationType::Other;
    AllFlags = 0;
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
      : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
    if (auto *Op = dyn_cast<CmpInst>(&I)) {
      OpType = OperationType::Cmp;
      CmpPredicate = Op->getPredicate();
    } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
      OpType = OperationType::DisjointOp;
      DisjointFlags.IsDisjoint = Op->isDisjoint();
    } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
      OpType = OperationType::OverflowingBinOp;
      WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
    } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
      OpType = OperationType::PossiblyExactOp;
      ExactFlags.IsExact = Op->isExact();
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      OpType = OperationType::GEPOp;
      GEPFlags.IsInBounds = GEP->isInBounds();
    } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
      OpType = OperationType::NonNegOp;
      NonNegFlags.NonNeg = PNNI->hasNonNeg();
    } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
      OpType = OperationType::FPMathOp;
      FMFs = Op->getFastMathFlags();
    } else {
      OpType = OperationType::Other;
      AllFlags = 0;
    }
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      CmpInst::Predicate Pred, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp),
        CmpPredicate(Pred) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      WrapFlagsTy WrapFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL),
        OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      FastMathFlags FMFs, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
        FMFs(FMFs) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
        DisjointFlags(DisjointFlags) {}

protected:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      GEPFlagsTy GEPFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
        GEPFlags(GEPFlags) {}

public:
  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Drop all poison-generating flags.
  void dropPoisonGeneratingFlags() {
    // NOTE: This needs to be kept in-sync with
    // Instruction::dropPoisonGeneratingFlags.
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      WrapFlags.HasNUW = false;
      WrapFlags.HasNSW = false;
      break;
    case OperationType::DisjointOp:
      DisjointFlags.IsDisjoint = false;
      break;
    case OperationType::PossiblyExactOp:
      ExactFlags.IsExact = false;
      break;
    case OperationType::GEPOp:
      GEPFlags.IsInBounds = false;
      break;
    case OperationType::FPMathOp:
      FMFs.NoNaNs = false;
      FMFs.NoInfs = false;
      break;
    case OperationType::NonNegOp:
      NonNegFlags.NonNeg = false;
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }
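
  // For example, a recipe created from an 'add nuw nsw' holds
  // OperationType::OverflowingBinOp with both wrap flags set;
  // dropPoisonGeneratingFlags() clears them, so a later setFlags(I) emits a
  // plain 'add'.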

  /// Set the IR flags for \p I.
  void setFlags(Instruction *I) const {
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
      I->setHasNoSignedWrap(WrapFlags.HasNSW);
      break;
    case OperationType::DisjointOp:
      cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
      break;
    case OperationType::PossiblyExactOp:
      I->setIsExact(ExactFlags.IsExact);
      break;
    case OperationType::GEPOp:
      // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan.
      cast<GetElementPtrInst>(I)->setNoWrapFlags(
          GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds()
                              : GEPNoWrapFlags::none());
      break;
    case OperationType::FPMathOp:
      I->setHasAllowReassoc(FMFs.AllowReassoc);
      I->setHasNoNaNs(FMFs.NoNaNs);
      I->setHasNoInfs(FMFs.NoInfs);
      I->setHasNoSignedZeros(FMFs.NoSignedZeros);
      I->setHasAllowReciprocal(FMFs.AllowReciprocal);
      I->setHasAllowContract(FMFs.AllowContract);
      I->setHasApproxFunc(FMFs.ApproxFunc);
      break;
    case OperationType::NonNegOp:
      I->setNonNeg(NonNegFlags.NonNeg);
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  CmpInst::Predicate getPredicate() const {
    assert(OpType == OperationType::Cmp &&
           "recipe doesn't have a compare predicate");
    return CmpPredicate;
  }

  bool isInBounds() const {
    assert(OpType == OperationType::GEPOp &&
           "recipe doesn't have inbounds flag");
    return GEPFlags.IsInBounds;
  }

  /// Returns true if the recipe has fast-math flags.
  bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

  FastMathFlags getFastMathFlags() const;

  bool hasNoUnsignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NUW flag");
    return WrapFlags.HasNUW;
  }

  bool hasNoSignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NSW flag");
    return WrapFlags.HasNSW;
  }

  bool isDisjoint() const {
    assert(OpType == OperationType::DisjointOp &&
           "recipe doesn't have a disjoint flag");
    return DisjointFlags.IsDisjoint;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printFlags(raw_ostream &O) const;
#endif
};

/// This is a concrete Recipe that models a single VPlan-level instruction.
/// While, as with any Recipe, it may generate a sequence of IR instructions
/// when executed, these instructions would always form a single-def expression
/// as the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeWithIRFlags {
  friend class VPlanSlp;

public:
  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
  enum {
    FirstOrderRecurrenceSplice =
        Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
                                      // values of a first-order recurrence.
    Not,
    SLPLoad,
    SLPStore,
    ActiveLaneMask,
    ExplicitVectorLength,
    /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
    /// The first operand is the incoming value from the predecessor in VPlan,
    /// the second operand is the incoming value for all other predecessors
    /// (which are currently not modeled in VPlan).
    ResumePhi,
    CalculateTripCountMinusVF,
    // Increment the canonical IV separately for each unrolled part.
    CanonicalIVIncrementForPart,
    BranchOnCount,
    BranchOnCond,
    ComputeReductionResult,
    // Takes the VPValue to extract from as first operand and the lane or part
    // to extract as second operand, counting from the end starting with 1 for
    // last. The second operand must be a positive constant and <= VF when
    // extracting from a vector or <= UF when extracting from an unrolled
    // scalar.
    ExtractFromEnd,
    LogicalAnd, // Non-poison propagating logical And.
    // Add an offset in bytes (second operand) to a base pointer (first
    // operand). Only generates scalar values (either for the first lane only or
    // for all lanes, depending on its uses).
    PtrAdd,
  };

private:
  typedef unsigned char OpcodeTy;
  OpcodeTy Opcode;

  /// An optional name that can be used for the generated IR instruction.
  const std::string Name;

  /// Returns true if this VPInstruction generates scalar values for all lanes.
  /// Most VPInstructions generate a single value per part, either vector or
  /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
  /// values for all lanes, stemming from an original ingredient. This method
  /// identifies the (rare) cases of VPInstructions that do so as well, w/o an
  /// underlying ingredient.
  bool doesGeneratePerAllLanes() const;

  /// Returns true if we can generate a scalar for the first lane only if
  /// needed.
  bool canGenerateScalarForFirstLane() const;

  /// Utility methods serving execute(): generates a single instance of the
  /// modeled instruction for a given part. \returns the generated value for \p
  /// Part. In some cases an existing value is returned rather than a generated
  /// one.
  Value *generatePerPart(VPTransformState &State, unsigned Part);

  /// Utility methods serving execute(): generates a scalar single instance of
  /// the modeled instruction for a given lane. \returns the scalar generated
  /// value for lane \p Lane.
  Value *generatePerLane(VPTransformState &State, const VPIteration &Lane);

#if !defined(NDEBUG)
  /// Return true if the VPInstruction is a floating point math operation, i.e.
  /// has fast-math flags.
  bool isFPMathOp() const;
#endif

public:
  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DebugLoc DL = {}, const Twine &Name = "")
      : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}

  VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
                VPValue *B, DebugLoc DL = {}, const Twine &Name = "");

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
        Opcode(Opcode), Name(Name.str()) {
    assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
  }

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");

  VP_CLASSOF_IMPL(VPDef::VPInstructionSC)

  VPInstruction *clone() override {
    SmallVector<VPValue *, 2> Operands(operands());
    auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
    New->transferFlags(*this);
    return New;
  }

  unsigned getOpcode() const { return Opcode; }

  /// Generate the instruction.
  /// TODO: We currently execute only per-part unless a specific instance is
  /// provided.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPInstruction to \p O.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;

  /// Print the VPInstruction to dbgs() (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif

  /// Return true if this instruction may modify memory.
  bool mayWriteToMemory() const {
    // TODO: we can use attributes of the called function to rule out memory
    // modifications.
    return Opcode == Instruction::Store || Opcode == Instruction::Call ||
           Opcode == Instruction::Invoke || Opcode == SLPStore;
  }

  bool hasResult() const {
    // CallInst may or may not have a result, depending on the called function.
    // Conservatively assume calls have results for now.
    switch (getOpcode()) {
    case Instruction::Ret:
    case Instruction::Br:
    case Instruction::Store:
    case Instruction::Switch:
    case Instruction::IndirectBr:
    case Instruction::Resume:
    case Instruction::CatchRet:
    case Instruction::Unreachable:
    case Instruction::Fence:
    case Instruction::AtomicRMW:
    case VPInstruction::BranchOnCond:
    case VPInstruction::BranchOnCount:
      return false;
    default:
      return true;
    }
  }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override;

  /// Returns true if the recipe only uses the first part of operand \p Op.
  bool onlyFirstPartUsed(const VPValue *Op) const override;

  /// Returns true if this VPInstruction produces a scalar value from a vector,
  /// e.g. by performing a reduction or extracting a lane.
  bool isVectorToScalar() const;

  /// Returns true if this VPInstruction's operands are single scalars and the
  /// result is also a single scalar.
  bool isSingleScalar() const;
};
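
// Illustrative sketch (names assumed): VPlan transforms create these recipes
// directly and then link them into a block, e.g.
//   auto *NotCond = new VPInstruction(VPInstruction::Not, {Cond}, DL);
//   NotCond->insertBefore(Term);
// where Cond is some existing VPValue * and Term a recipe in a VPBasicBlock.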

/// VPWidenRecipe is a recipe for producing a widened instruction using the
/// opcode and operands of the recipe. This recipe covers most of the
/// traditional vectorization cases where each recipe transforms into a
/// vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags {
  unsigned Opcode;

public:
  template <typename IterT>
  VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
        Opcode(I.getOpcode()) {}

  ~VPWidenRecipe() override = default;

  VPWidenRecipe *clone() override {
    auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
    R->transferFlags(*this);
    return R;
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenSC)

  /// Produce a widened instruction using the opcode and operands of the recipe,
  /// processing State.VF elements.
  void execute(VPTransformState &State) override;

  unsigned getOpcode() const { return Opcode; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// VPWidenCastRecipe is a recipe to create vector cast instructions.
class VPWidenCastRecipe : public VPRecipeWithIRFlags {
  /// Cast instruction opcode.
  Instruction::CastOps Opcode;

  /// Result type for the cast.
  Type *ResultTy;

public:
  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
                    CastInst &UI)
      : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
        ResultTy(ResultTy) {
    assert(UI.getOpcode() == Opcode &&
           "opcode of underlying cast doesn't match");
  }

  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
      : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
        ResultTy(ResultTy) {}

  ~VPWidenCastRecipe() override = default;

  VPWidenCastRecipe *clone() override {
    if (auto *UV = getUnderlyingValue())
      return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
                                   *cast<CastInst>(UV));

    return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)

  /// Produce widened copies of the cast.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  Instruction::CastOps getOpcode() const { return Opcode; }

  /// Returns the result type of the cast.
  Type *getResultType() const { return ResultTy; }
};

/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
class VPScalarCastRecipe : public VPSingleDefRecipe {
  Instruction::CastOps Opcode;

  Type *ResultTy;

  Value *generate(VPTransformState &State, unsigned Part);

public:
  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode),
        ResultTy(ResultTy) {}

  ~VPScalarCastRecipe() override = default;

  VPScalarCastRecipe *clone() override {
    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy);
  }

  VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)

  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Returns the result type of the cast.
  Type *getResultType() const { return ResultTy; }

  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    // At the moment, only uniform codegen is implemented.
1519 assert(is_contained(operands(), Op) &&
1520 "Op must be an operand of the recipe");
1521 return true;
1522 }
1523 };
1524
1525 /// A recipe for widening Call instructions.
1526 class VPWidenCallRecipe : public VPSingleDefRecipe {
1527 /// ID of the vector intrinsic to call when widening the call. If set the
1528 /// Intrinsic::not_intrinsic, a library call will be used instead.
1529 Intrinsic::ID VectorIntrinsicID;
1530 /// If this recipe represents a library call, Variant stores a pointer to
1531 /// the chosen function. There is a 1:1 mapping between a given VF and the
1532 /// chosen vectorized variant, so there will be a different vplan for each
1533 /// VF with a valid variant.
1534 Function *Variant;
1535
1536 public:
1537 template <typename IterT>
1538 VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
1539 Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
1540 Function *Variant = nullptr)
VPSingleDefRecipe(VPDef::VPWidenCallSC,CallArguments,UV,DL)1541 : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
1542 VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
1543 assert(
1544 isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
1545 "last operand must be the called function");
1546 }
1547
1548 ~VPWidenCallRecipe() override = default;
1549
clone()1550 VPWidenCallRecipe *clone() override {
1551 return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
1552 VectorIntrinsicID, getDebugLoc(), Variant);
1553 }
1554
1555 VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
1556
1557 /// Produce a widened version of the call instruction.
1558 void execute(VPTransformState &State) override;
1559
getCalledScalarFunction()1560 Function *getCalledScalarFunction() const {
1561 return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
1562 }
1563
arg_operands()1564 operand_range arg_operands() {
1565 return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1566 }
arg_operands()1567 const_operand_range arg_operands() const {
1568 return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1569 }
1570
1571 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1572 /// Print the recipe.
1573 void print(raw_ostream &O, const Twine &Indent,
1574 VPSlotTracker &SlotTracker) const override;
1575 #endif
1576 };
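
// Editorial usage sketch, not part of the original header: the operand layout
// of VPWidenCallRecipe keeps the call arguments first and the called function
// last, so argument iteration and callee lookup compose as below. The recipe
// pointer R and its operands are hypothetical.
//
//   VPWidenCallRecipe *R = ...; // operands: [Arg0, Arg1, Callee]
//   Function *Callee = R->getCalledScalarFunction();
//   for (VPValue *ArgOp : R->arg_operands()) {
//     // Visits Arg0 and Arg1 only; arg_operands() stops at
//     // getNumOperands() - 1 and so excludes the trailing Callee operand.
//   }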

/// A recipe for widening select instructions.
struct VPWidenSelectRecipe : public VPSingleDefRecipe {
  template <typename IterT>
  VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
      : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
                          I.getDebugLoc()) {}

  ~VPWidenSelectRecipe() override = default;

  VPWidenSelectRecipe *clone() override {
    return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()),
                                   operands());
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)

  /// Produce a widened version of the select instruction.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  VPValue *getCond() const {
    return getOperand(0);
  }

  bool isInvariantCond() const {
    return getCond()->isDefinedOutsideVectorRegions();
  }
};

/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
  bool isPointerLoopInvariant() const {
    return getOperand(0)->isDefinedOutsideVectorRegions();
  }

  bool isIndexLoopInvariant(unsigned I) const {
    return getOperand(I + 1)->isDefinedOutsideVectorRegions();
  }

  bool areAllOperandsInvariant() const {
    return all_of(operands(), [](VPValue *Op) {
      return Op->isDefinedOutsideVectorRegions();
    });
  }

public:
  template <typename IterT>
  VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {}

  ~VPWidenGEPRecipe() override = default;

  VPWidenGEPRecipe *clone() override {
    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()),
                                operands());
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)

  /// Generate the gep nodes.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe to compute the pointers for widened memory accesses of IndexedTy
/// for all parts. If IsReverse is true, compute pointers for accessing the
/// input in reverse order per part.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
  Type *IndexedTy;
  bool IsReverse;

public:
  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
                        bool IsInBounds, DebugLoc DL)
      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
                            GEPFlagsTy(IsInBounds), DL),
        IndexedTy(IndexedTy), IsReverse(IsReverse) {}

  VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)

  void execute(VPTransformState &State) override;

  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  VPVectorPointerRecipe *clone() override {
    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse,
                                     isInBounds(), getDebugLoc());
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
/// the backedge is the second operand.
///
/// Inductions are modeled using the following sub-classes:
/// * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
///   starting at a specified value (zero for the main vector loop, the resume
///   value for the epilogue vector loop) and stepping by 1. The induction
///   controls exiting of the vector loop by comparing against the vector trip
///   count. Produces a single scalar PHI for the induction value per
///   iteration.
/// * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
///   floating point inductions with arbitrary start and step values. Produces
///   a vector PHI per-part.
/// * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding
///   value of an IV with different start and step values. Produces a single
///   scalar value per iteration.
/// * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a
///   canonical or derived induction.
/// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a
///   pointer induction. Produces either a vector PHI per-part or scalar values
///   per-lane based on the canonical induction.
class VPHeaderPHIRecipe : public VPSingleDefRecipe {
protected:
  VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
                    VPValue *Start = nullptr, DebugLoc DL = {})
      : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>(), UnderlyingInstr, DL) {
    if (Start)
      addOperand(Start);
  }

public:
  ~VPHeaderPHIRecipe() override = default;

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPRecipeBase *B) {
    return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC &&
           B->getVPDefID() <= VPDef::VPLastHeaderPHISC;
  }
  static inline bool classof(const VPValue *V) {
    auto *B = V->getDefiningRecipe();
    return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
           B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC;
  }

  /// Generate the phi nodes.
  void execute(VPTransformState &State) override = 0;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override = 0;
#endif

  /// Returns the start value of the phi, if one is set.
  VPValue *getStartValue() {
    return getNumOperands() == 0 ? nullptr : getOperand(0);
  }
  VPValue *getStartValue() const {
    return getNumOperands() == 0 ? nullptr : getOperand(0);
  }

  /// Update the start value of the recipe.
  void setStartValue(VPValue *V) { setOperand(0, V); }

  /// Returns the incoming value from the loop backedge.
  virtual VPValue *getBackedgeValue() {
    return getOperand(1);
  }

  /// Returns the backedge value as a recipe. The backedge value is guaranteed
  /// to be a recipe.
  virtual VPRecipeBase &getBackedgeRecipe() {
    return *getBackedgeValue()->getDefiningRecipe();
  }
};
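
// Editorial usage sketch, not part of the original header: generic code can
// rely on the shared operand convention above without knowing the concrete
// header-phi sub-class. PhiR is hypothetical; note that sub-classes which
// produce their own backedge value (e.g. VPWidenIntOrFpInductionRecipe below)
// override getBackedgeValue() to be unreachable.
//
//   VPHeaderPHIRecipe *PhiR = ...;
//   VPValue *Start = PhiR->getStartValue();       // operand 0, may be null
//   VPValue *Backedge = PhiR->getBackedgeValue(); // operand 1
//   PhiR->setStartValue(Start);                   // overwrites operand 0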

/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector values.
class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
  PHINode *IV;
  TruncInst *Trunc;
  const InductionDescriptor &IndDesc;

public:
  VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
                                const InductionDescriptor &IndDesc)
      : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
        Trunc(nullptr), IndDesc(IndDesc) {
    addOperand(Step);
  }

  VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
                                const InductionDescriptor &IndDesc,
                                TruncInst *Trunc)
      : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
        IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
    addOperand(Step);
  }

  ~VPWidenIntOrFpInductionRecipe() override = default;

  VPWidenIntOrFpInductionRecipe *clone() override {
    return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(),
                                             getStepValue(), IndDesc, Trunc);
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)

  /// Generate the vectorized and scalarized versions of the phi node as
  /// needed by its users.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  VPValue *getBackedgeValue() override {
    // TODO: All operands of base recipe must exist and be at same index in
    // derived recipe.
    llvm_unreachable(
        "VPWidenIntOrFpInductionRecipe generates its own backedge value");
  }

  VPRecipeBase &getBackedgeRecipe() override {
    // TODO: All operands of base recipe must exist and be at same index in
    // derived recipe.
    llvm_unreachable(
        "VPWidenIntOrFpInductionRecipe generates its own backedge value");
  }

  /// Returns the step value of the induction.
  VPValue *getStepValue() { return getOperand(1); }
  const VPValue *getStepValue() const { return getOperand(1); }

  /// Returns the first defined value as TruncInst, if it is one, or nullptr
  /// otherwise.
  TruncInst *getTruncInst() { return Trunc; }
  const TruncInst *getTruncInst() const { return Trunc; }

  PHINode *getPHINode() { return IV; }

  /// Returns the induction descriptor for the recipe.
  const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }

  /// Returns true if the induction is canonical, i.e. starting at 0 and
  /// incremented by UF * VF (= the original IV is incremented by 1) and has
  /// the same type as the canonical induction.
  bool isCanonical() const;

  /// Returns the scalar type of the induction.
  Type *getScalarType() const {
    return Trunc ? Trunc->getType() : IV->getType();
  }
};

class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
  const InductionDescriptor &IndDesc;

  bool IsScalarAfterVectorization;

public:
  /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value
  /// \p Start.
  VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
                                const InductionDescriptor &IndDesc,
                                bool IsScalarAfterVectorization)
      : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi),
        IndDesc(IndDesc),
        IsScalarAfterVectorization(IsScalarAfterVectorization) {
    addOperand(Start);
    addOperand(Step);
  }

  ~VPWidenPointerInductionRecipe() override = default;

  VPWidenPointerInductionRecipe *clone() override {
    return new VPWidenPointerInductionRecipe(
        cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
        IndDesc, IsScalarAfterVectorization);
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)

  /// Generate vector values for the pointer induction.
  void execute(VPTransformState &State) override;

  /// Returns true if only scalar values will be generated.
  bool onlyScalarsGenerated(bool IsScalable);

  /// Returns the induction descriptor for the recipe.
  const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe for handling phis that are widened in the vector loop.
/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
/// managed in the recipe directly.
class VPWidenPHIRecipe : public VPSingleDefRecipe {
  /// List of incoming blocks. Only used in the VPlan native path.
  SmallVector<VPBasicBlock *, 2> IncomingBlocks;

public:
  /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
  VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
      : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi) {
    if (Start)
      addOperand(Start);
  }

  VPWidenPHIRecipe *clone() override {
    llvm_unreachable("cloning not implemented yet");
  }

  ~VPWidenPHIRecipe() override = default;

  VP_CLASSOF_IMPL(VPDef::VPWidenPHISC)

  /// Generate the phi/select nodes.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
  void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
    addOperand(IncomingV);
    IncomingBlocks.push_back(IncomingBlock);
  }

  /// Returns the \p I th incoming VPBasicBlock.
  VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }

  /// Returns the \p I th incoming VPValue.
  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
};

/// A recipe for handling first-order recurrence phis. The start value is the
/// first operand of the recipe and the incoming value from the backedge is the
/// second operand.
struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
  VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
      : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {}

  VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC)

  static inline bool classof(const VPHeaderPHIRecipe *R) {
    return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC;
  }

  VPFirstOrderRecurrencePHIRecipe *clone() override {
    return new VPFirstOrderRecurrencePHIRecipe(
        cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
  }

  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe for handling reduction phis. The start value is the first operand
/// of the recipe and the incoming value from the backedge is the second
/// operand.
class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
  /// Descriptor for the reduction.
  const RecurrenceDescriptor &RdxDesc;

  /// The phi is part of an in-loop reduction.
  bool IsInLoop;

  /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
  bool IsOrdered;

public:
  /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by
  /// \p RdxDesc.
  VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                       VPValue &Start, bool IsInLoop = false,
                       bool IsOrdered = false)
      : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
    assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
  }

  ~VPReductionPHIRecipe() override = default;

  VPReductionPHIRecipe *clone() override {
    auto *R =
        new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
                                 *getOperand(0), IsInLoop, IsOrdered);
    R->addOperand(getBackedgeValue());
    return R;
  }

  VP_CLASSOF_IMPL(VPDef::VPReductionPHISC)

  static inline bool classof(const VPHeaderPHIRecipe *R) {
    return R->getVPDefID() == VPDef::VPReductionPHISC;
  }

  /// Generate the phi/select nodes.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  const RecurrenceDescriptor &getRecurrenceDescriptor() const {
    return RdxDesc;
  }

  /// Returns true, if the phi is part of an ordered reduction.
  bool isOrdered() const { return IsOrdered; }

  /// Returns true, if the phi is part of an in-loop reduction.
  bool isInLoop() const { return IsInLoop; }
};

/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
public:
  /// The blend operation is a User of the incoming values and of their
  /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first
  /// incoming value does not have an associated mask; a sketch of how the
  /// operands map onto the accessors follows the class definition.
  VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
      : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
    assert((Operands.size() + 1) % 2 == 0 &&
           "Expected an odd number of operands");
  }

  VPBlendRecipe *clone() override {
    SmallVector<VPValue *> Ops(operands());
    return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
  }

  VP_CLASSOF_IMPL(VPDef::VPBlendSC)

  /// Return the number of incoming values, taking into account that the first
  /// incoming value has no mask.
  unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }

  /// Return incoming value number \p Idx.
  VPValue *getIncomingValue(unsigned Idx) const {
    return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
  }

  /// Return mask number \p Idx.
  VPValue *getMask(unsigned Idx) const {
    assert(Idx > 0 && "First index has no mask associated.");
    return getOperand(Idx * 2);
  }

  /// Generate the phi/select nodes.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    // Recursing through Blend recipes only; the recursion must terminate at
    // header phis at the latest.
    return all_of(users(),
                  [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
  }
};
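
// Editorial sketch, not part of the original header: how the interleaved
// [I0, I1, M1, I2, M2, ...] layout maps onto the accessors, for a
// hypothetical blend of three incoming values:
//
//   // Operands: [I0, I1, M1, I2, M2], so getNumOperands() == 5 and
//   // getNumIncomingValues() == (5 + 1) / 2 == 3.
//   //   getIncomingValue(0) == I0; getMask(0) asserts (I0 has no mask)
//   //   getIncomingValue(1) == operand 1 == I1; getMask(1) == operand 2 == M1
//   //   getIncomingValue(2) == operand 3 == I2; getMask(2) == operand 4 == M2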

/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles. The first operand of a
/// VPInterleave recipe is the address, followed by the stored values, followed
/// by an optional mask.
class VPInterleaveRecipe : public VPRecipeBase {
  const InterleaveGroup<Instruction> *IG;

  /// Indicates if the interleave group is in a conditional block and requires
  /// a mask.
  bool HasMask = false;

  /// Indicates if gaps between members of the group need to be masked out or
  /// if unused gaps can be loaded speculatively.
  bool NeedsMaskForGaps = false;

public:
  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
                     bool NeedsMaskForGaps)
      : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
        NeedsMaskForGaps(NeedsMaskForGaps) {
    // Create a VPValue for each non-void member of the group.
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *I = IG->getMember(i)) {
        if (I->getType()->isVoidTy())
          continue;
        new VPValue(I, this);
      }

    for (auto *SV : StoredValues)
      addOperand(SV);
    if (Mask) {
      HasMask = true;
      addOperand(Mask);
    }
  }
  ~VPInterleaveRecipe() override = default;

  VPInterleaveRecipe *clone() override {
    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
                                  NeedsMaskForGaps);
  }

  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)

  /// Return the address accessed by this recipe.
  VPValue *getAddr() const {
    return getOperand(0); // Address is the 1st, mandatory operand.
  }

  /// Return the mask used by this recipe. Note that a full mask is represented
  /// by a nullptr.
  VPValue *getMask() const {
    // Mask is optional and therefore the last operand.
    return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
  }

  /// Return the VPValues stored by this interleave group. If it is a load
  /// interleave group, return an empty ArrayRef.
  ArrayRef<VPValue *> getStoredValues() const {
    // The first operand is the address, followed by the stored values,
    // followed by an optional mask.
    return ArrayRef<VPValue *>(op_begin(), getNumOperands())
        .slice(1, getNumStoreOperands());
  }

  /// Generate the wide load or store, and shuffles.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }

  /// Returns the number of stored operands of this interleave group. Returns
  /// 0 for load interleave groups.
  unsigned getNumStoreOperands() const {
    return getNumOperands() - (HasMask ? 2 : 1);
  }

  /// The recipe only uses the first lane of the address.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
  }

  Instruction *getInsertPos() const { return IG->getInsertPos(); }
};
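
// Editorial sketch, not part of the original header: operand bookkeeping for
// a hypothetical interleaved store group with factor 2 and a mask, matching
// getNumStoreOperands() == getNumOperands() - (HasMask ? 2 : 1):
//
//   // Operands: {Addr, StoredVal0, StoredVal1, Mask}, 4 in total.
//   //   getAddr() == operand 0
//   //   getNumStoreOperands() == 4 - 2 == 2
//   //   getStoredValues() == {StoredVal0, StoredVal1}
//   //   getMask() == operand 3
//   // A load group has operands {Addr} plus an optional mask, and
//   // getStoredValues() is empty.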

/// A recipe to represent in-loop reduction operations, performing a reduction
/// on a vector operand into a scalar value, and adding the result to a chain.
/// The Operands are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPSingleDefRecipe {
  /// The recurrence descriptor for the reduction in question.
  const RecurrenceDescriptor &RdxDesc;
  bool IsOrdered;
  /// Whether the reduction is conditional.
  bool IsConditional = false;

protected:
  VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
                    Instruction *I, ArrayRef<VPValue *> Operands,
                    VPValue *CondOp, bool IsOrdered)
      : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
    if (CondOp) {
      IsConditional = true;
      addOperand(CondOp);
    }
  }

public:
  VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
                    VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
                    bool IsOrdered)
      : VPReductionRecipe(VPDef::VPReductionSC, R, I,
                          ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
                          IsOrdered) {}

  ~VPReductionRecipe() override = default;

  VPReductionRecipe *clone() override {
    return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
                                 getVecOp(), getCondOp(), IsOrdered);
  }

  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
           R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Generate the reduction in the loop.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Return the recurrence descriptor for the in-loop reduction.
  const RecurrenceDescriptor &getRecurrenceDescriptor() const {
    return RdxDesc;
  }
  /// Return true if the in-loop reduction is ordered.
  bool isOrdered() const { return IsOrdered; }
  /// Return true if the in-loop reduction is conditional.
  bool isConditional() const { return IsConditional; }
  /// The VPValue of the scalar Chain being accumulated.
  VPValue *getChainOp() const { return getOperand(0); }
  /// The VPValue of the vector value to be reduced.
  VPValue *getVecOp() const { return getOperand(1); }
  /// The VPValue of the condition for the block.
  VPValue *getCondOp() const {
    return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
  }
};
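
// Editorial sketch, not part of the original header: operand positions for
// the {ChainOp, VecOp, [Condition]} layout. The condition, if present, is
// appended last, which is why getCondOp() reads getNumOperands() - 1:
//
//   // Unconditional: operands == {Chain, Vec}
//   //   getChainOp() == operand 0, getVecOp() == operand 1,
//   //   getCondOp() == nullptr
//   // Conditional: operands == {Chain, Vec, Cond}
//   //   getCondOp() == operand 2
//   // For VPReductionEVLRecipe below, EVL is inserted before the optional
//   // condition: {Chain, Vec, EVL, [Cond]}.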

/// A recipe to represent in-loop reduction operations with vector-predication
/// intrinsics, performing a reduction on a vector operand with the explicit
/// vector length (EVL) into a scalar value, and adding the result to a chain.
/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
class VPReductionEVLRecipe : public VPReductionRecipe {
public:
  VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
      : VPReductionRecipe(
            VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
            cast_or_null<Instruction>(R->getUnderlyingValue()),
            ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
            R->isOrdered()) {}

  ~VPReductionEVLRecipe() override = default;

  VPReductionEVLRecipe *clone() override {
    llvm_unreachable("cloning not implemented yet");
  }

  VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)

  /// Generate the reduction in the loop.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// The VPValue of the explicit vector length.
  VPValue *getEVL() const { return getOperand(2); }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return Op == getEVL();
  }
};

/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to
/// be uniform, only one copy, for lane zero, will be generated.
class VPReplicateRecipe : public VPRecipeWithIRFlags {
  /// Indicator if only a single replica per lane is needed.
  bool IsUniform;

  /// Indicator if the replicas are also predicated.
  bool IsPredicated;

public:
  template <typename IterT>
  VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
                    bool IsUniform, VPValue *Mask = nullptr)
      : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
        IsUniform(IsUniform), IsPredicated(Mask) {
    if (Mask)
      addOperand(Mask);
  }

  ~VPReplicateRecipe() override = default;

  VPReplicateRecipe *clone() override {
    auto *Copy =
        new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
                              isPredicated() ? getMask() : nullptr);
    Copy->transferFlags(*this);
    return Copy;
  }

  VP_CLASSOF_IMPL(VPDef::VPReplicateSC)

  /// Generate replicas of the desired Ingredient. Replicas will be generated
  /// for all parts and lanes unless a specific part and lane are specified in
  /// the \p State.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  bool isUniform() const { return IsUniform; }

  bool isPredicated() const { return IsPredicated; }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return isUniform();
  }

  /// Returns true if the recipe uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  /// Returns true if the recipe is used by a widened recipe via an intervening
  /// VPPredInstPHIRecipe. In this case, the scalar values should also be
  /// packed in a vector.
  bool shouldPack() const;

  /// Return the mask of a predicated VPReplicateRecipe.
  VPValue *getMask() {
    assert(isPredicated() &&
           "Trying to get the mask of an unpredicated recipe");
    return getOperand(getNumOperands() - 1);
  }

  unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
};

/// A recipe for generating conditional branches on the bits of a mask.
class VPBranchOnMaskRecipe : public VPRecipeBase {
public:
  VPBranchOnMaskRecipe(VPValue *BlockInMask)
      : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
    if (BlockInMask) // nullptr means all-one mask.
      addOperand(BlockInMask);
  }

  VPBranchOnMaskRecipe *clone() override {
    return new VPBranchOnMaskRecipe(getOperand(0));
  }

  VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)

  /// Generate the extraction of the appropriate bit from the block mask and
  /// the conditional branch.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override {
    O << Indent << "BRANCH-ON-MASK ";
    if (VPValue *Mask = getMask())
      Mask->printAsOperand(O, SlotTracker);
    else
      O << " All-One";
  }
#endif

  /// Return the mask used by this recipe. Note that a full mask is represented
  /// by a nullptr.
  VPValue *getMask() const {
    assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
    // Mask is optional.
    return getNumOperands() == 1 ? getOperand(0) : nullptr;
  }

  /// Returns true if the recipe uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }
};

/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
/// control converges back from a Branch-on-Mask. The phi nodes are needed in
/// order to merge values that are set under such a branch and feed their uses.
/// The phi nodes can be scalar or vector depending on the users of the value.
/// This recipe works in concert with VPBranchOnMaskRecipe.
class VPPredInstPHIRecipe : public VPSingleDefRecipe {
public:
  /// Construct a VPPredInstPHIRecipe given \p PredV, whose value needs phi
  /// nodes after merging back from a Branch-on-Mask.
  VPPredInstPHIRecipe(VPValue *PredV)
      : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
  ~VPPredInstPHIRecipe() override = default;

  VPPredInstPHIRecipe *clone() override {
    return new VPPredInstPHIRecipe(getOperand(0));
  }

  VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)

  /// Generates phi nodes for live-outs as needed to retain SSA form.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Returns true if the recipe uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }
};

/// A common base class for widening memory operations. An optional mask can be
/// provided as the last operand.
class VPWidenMemoryRecipe : public VPRecipeBase {
protected:
  Instruction &Ingredient;

  /// Whether the accessed addresses are consecutive.
  bool Consecutive;

  /// Whether the consecutive accessed addresses are in reverse order.
  bool Reverse;

  /// Whether the memory access is masked.
  bool IsMasked = false;

  void setMask(VPValue *Mask) {
    assert(!IsMasked && "cannot re-set mask");
    if (!Mask)
      return;
    addOperand(Mask);
    IsMasked = true;
  }

  VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                      std::initializer_list<VPValue *> Operands,
                      bool Consecutive, bool Reverse, DebugLoc DL)
      : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
        Reverse(Reverse) {
    assert((Consecutive || !Reverse) && "Reverse implies consecutive");
  }

public:
  VPWidenMemoryRecipe *clone() override {
    llvm_unreachable("cloning not supported");
  }

  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Return whether the loaded-from / stored-to addresses are consecutive.
  bool isConsecutive() const { return Consecutive; }

  /// Return whether the consecutive loaded/stored addresses are in reverse
  /// order.
  bool isReverse() const { return Reverse; }

  /// Return the address accessed by this recipe.
  VPValue *getAddr() const { return getOperand(0); }

  /// Returns true if the recipe is masked.
  bool isMasked() const { return IsMasked; }

  /// Return the mask used by this recipe. Note that a full mask is represented
  /// by a nullptr.
  VPValue *getMask() const {
    // Mask is optional and therefore the last operand.
    return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
  }

  /// Generate the wide load/store.
  void execute(VPTransformState &State) override {
    llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
  }

  Instruction &getIngredient() const { return Ingredient; }
};
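
// Editorial sketch, not part of the original header: because setMask() always
// appends the optional mask as the last operand, the base-class accessors
// hold for every widened memory recipe. For the concrete recipes declared
// below:
//
//   // Masked store: operands == {Addr, StoredVal, Mask}
//   //   getAddr() == operand 0, isMasked() == true,
//   //   getMask() == operand(getNumOperands() - 1) == Mask
//   // Unmasked load: operands == {Addr}, getMask() == nullptr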
2518
2519 /// A recipe for widening load operations, using the address to load from and an
2520 /// optional mask.
2521 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipefinal2522 VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
2523 bool Consecutive, bool Reverse, DebugLoc DL)
2524 : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
2525 Reverse, DL),
2526 VPValue(this, &Load) {
2527 setMask(Mask);
2528 }
2529
clonefinal2530 VPWidenLoadRecipe *clone() override {
2531 return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
2532 getMask(), Consecutive, Reverse,
2533 getDebugLoc());
2534 }
2535
2536 VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
2537
2538 /// Generate a wide load or gather.
2539 void execute(VPTransformState &State) override;
2540
2541 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2542 /// Print the recipe.
2543 void print(raw_ostream &O, const Twine &Indent,
2544 VPSlotTracker &SlotTracker) const override;
2545 #endif
2546
2547 /// Returns true if the recipe only uses the first lane of operand \p Op.
onlyFirstLaneUsedfinal2548 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2549 assert(is_contained(operands(), Op) &&
2550 "Op must be an operand of the recipe");
2551 // Widened, consecutive loads operations only demand the first lane of
2552 // their address.
2553 return Op == getAddr() && isConsecutive();
2554 }
2555 };
2556
2557 /// A recipe for widening load operations with vector-predication intrinsics,
2558 /// using the address to load from, the explicit vector length and an optional
2559 /// mask.
2560 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipefinal2561 VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
2562 : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
2563 {L->getAddr(), EVL}, L->isConsecutive(),
2564 L->isReverse(), L->getDebugLoc()),
2565 VPValue(this, &getIngredient()) {
2566 setMask(Mask);
2567 }
2568
VP_CLASSOF_IMPLfinal2569 VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
2570
2571 /// Return the EVL operand.
2572 VPValue *getEVL() const { return getOperand(1); }
2573
2574 /// Generate the wide load or gather.
2575 void execute(VPTransformState &State) override;
2576
2577 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2578 /// Print the recipe.
2579 void print(raw_ostream &O, const Twine &Indent,
2580 VPSlotTracker &SlotTracker) const override;
2581 #endif
2582
2583 /// Returns true if the recipe only uses the first lane of operand \p Op.
onlyFirstLaneUsedfinal2584 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2585 assert(is_contained(operands(), Op) &&
2586 "Op must be an operand of the recipe");
2587 // Widened loads only demand the first lane of EVL and consecutive loads
2588 // only demand the first lane of their address.
2589 return Op == getEVL() || (Op == getAddr() && isConsecutive());
2590 }
2591 };
2592
2593 /// A recipe for widening store operations, using the stored value, the address
2594 /// to store to and an optional mask.
2595 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipefinal2596 VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
2597 VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
2598 : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
2599 Consecutive, Reverse, DL) {
2600 setMask(Mask);
2601 }
2602
clonefinal2603 VPWidenStoreRecipe *clone() override {
2604 return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
2605 getStoredValue(), getMask(), Consecutive,
2606 Reverse, getDebugLoc());
2607 }
2608
2609 VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
2610
2611 /// Return the value stored by this recipe.
getStoredValuefinal2612 VPValue *getStoredValue() const { return getOperand(1); }
2613
2614 /// Generate a wide store or scatter.
2615 void execute(VPTransformState &State) override;
2616
2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2618 /// Print the recipe.
2619 void print(raw_ostream &O, const Twine &Indent,
2620 VPSlotTracker &SlotTracker) const override;
2621 #endif
2622
2623 /// Returns true if the recipe only uses the first lane of operand \p Op.
onlyFirstLaneUsedfinal2624 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2625 assert(is_contained(operands(), Op) &&
2626 "Op must be an operand of the recipe");
2627 // Widened, consecutive stores only demand the first lane of their address,
2628 // unless the same operand is also stored.
2629 return Op == getAddr() && isConsecutive() && Op != getStoredValue();
2630 }
2631 };
2632
2633 /// A recipe for widening store operations with vector-predication intrinsics,
2634 /// using the value to store, the address to store to, the explicit vector
2635 /// length and an optional mask.
2636 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipefinal2637 VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
2638 : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
2639 {S->getAddr(), S->getStoredValue(), EVL},
2640 S->isConsecutive(), S->isReverse(),
2641 S->getDebugLoc()) {
2642 setMask(Mask);
2643 }
2644
VP_CLASSOF_IMPLfinal2645 VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
2646
2647 /// Return the address accessed by this recipe.
2648 VPValue *getStoredValue() const { return getOperand(1); }
2649
2650 /// Return the EVL operand.
getEVLfinal2651 VPValue *getEVL() const { return getOperand(2); }
2652
2653 /// Generate the wide store or scatter.
2654 void execute(VPTransformState &State) override;
2655
2656 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2657 /// Print the recipe.
2658 void print(raw_ostream &O, const Twine &Indent,
2659 VPSlotTracker &SlotTracker) const override;
2660 #endif
2661
2662 /// Returns true if the recipe only uses the first lane of operand \p Op.
onlyFirstLaneUsedfinal2663 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2664 assert(is_contained(operands(), Op) &&
2665 "Op must be an operand of the recipe");
2666 if (Op == getEVL()) {
2667 assert(getStoredValue() != Op && "unexpected store of EVL");
2668 return true;
2669 }
2670 // Widened, consecutive memory operations only demand the first lane of
2671 // their address, unless the same operand is also stored. That latter can
2672 // happen with opaque pointers.
2673 return Op == getAddr() && isConsecutive() && Op != getStoredValue();
2674 }
2675 };
2676
2677 /// Recipe to expand a SCEV expression.
2678 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
2679 const SCEV *Expr;
2680 ScalarEvolution &SE;
2681
2682 public:
VPExpandSCEVRecipe(const SCEV * Expr,ScalarEvolution & SE)2683 VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
2684 : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {}
2685
2686 ~VPExpandSCEVRecipe() override = default;
2687
clone()2688 VPExpandSCEVRecipe *clone() override {
2689 return new VPExpandSCEVRecipe(Expr, SE);
2690 }
2691
2692 VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
2693
2694 /// Generate a canonical vector induction variable of the vector loop, with
2695 void execute(VPTransformState &State) override;
2696
2697 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2698 /// Print the recipe.
2699 void print(raw_ostream &O, const Twine &Indent,
2700 VPSlotTracker &SlotTracker) const override;
2701 #endif
2702
getSCEV()2703 const SCEV *getSCEV() const { return Expr; }
2704 };
2705
2706 /// Canonical scalar induction phi of the vector loop. Starting at the specified
2707 /// start value (either 0 or the resume value when vectorizing the epilogue
2708 /// loop). VPWidenCanonicalIVRecipe represents the vector version of the
2709 /// canonical induction variable.
2710 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
2711 public:
VPCanonicalIVPHIRecipe(VPValue * StartV,DebugLoc DL)2712 VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
2713 : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
2714
2715 ~VPCanonicalIVPHIRecipe() override = default;
2716
clone()2717 VPCanonicalIVPHIRecipe *clone() override {
2718 auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
2719 R->addOperand(getBackedgeValue());
2720 return R;
2721 }
2722
VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)2723 VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
2724
2725 static inline bool classof(const VPHeaderPHIRecipe *D) {
2726 return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
2727 }
2728
2729 /// Generate the canonical scalar induction phi of the vector loop.
2730 void execute(VPTransformState &State) override;
2731
2732 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2733 /// Print the recipe.
2734 void print(raw_ostream &O, const Twine &Indent,
2735 VPSlotTracker &SlotTracker) const override;
2736 #endif
2737
2738 /// Returns the scalar type of the induction.
getScalarType()2739 Type *getScalarType() const {
2740 return getStartValue()->getLiveInIRValue()->getType();
2741 }
2742
2743 /// Returns true if the recipe only uses the first lane of operand \p Op.
onlyFirstLaneUsed(const VPValue * Op)2744 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2745 assert(is_contained(operands(), Op) &&
2746 "Op must be an operand of the recipe");
2747 return true;
2748 }
2749
2750 /// Returns true if the recipe only uses the first part of operand \p Op.
onlyFirstPartUsed(const VPValue * Op)2751 bool onlyFirstPartUsed(const VPValue *Op) const override {
2752 assert(is_contained(operands(), Op) &&
2753 "Op must be an operand of the recipe");
2754 return true;
2755 }
2756
2757 /// Check if the induction described by \p Kind, /p Start and \p Step is
2758 /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
2759 bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
2760 VPValue *Step) const;
2761 };
2762
2763 /// A recipe for generating the active lane mask for the vector loop that is
2764 /// used to predicate the vector operations.
2765 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2766 /// remove VPActiveLaneMaskPHIRecipe.
2767 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
2768 public:
VPActiveLaneMaskPHIRecipe(VPValue * StartMask,DebugLoc DL)2769 VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
2770 : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
2771 DL) {}
2772
2773 ~VPActiveLaneMaskPHIRecipe() override = default;
2774
clone()2775 VPActiveLaneMaskPHIRecipe *clone() override {
2776 return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2777 }
2778
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)2779 VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
2780
2781 static inline bool classof(const VPHeaderPHIRecipe *D) {
2782 return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
2783 }
2784
2785 /// Generate the active lane mask phi of the vector loop.
2786 void execute(VPTransformState &State) override;
2787
2788 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2789 /// Print the recipe.
2790 void print(raw_ostream &O, const Twine &Indent,
2791 VPSlotTracker &SlotTracker) const override;
2792 #endif
2793 };
2794
2795 /// A recipe for generating the phi node for the current index of elements,
2796 /// adjusted in accordance with EVL value. It starts at the start value of the
2797 /// canonical induction and gets incremented by EVL in each iteration of the
2798 /// vector loop.
class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
public:
  VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
      : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}

  ~VPEVLBasedIVPHIRecipe() override = default;

  VPEVLBasedIVPHIRecipe *clone() override {
    llvm_unreachable("cloning not implemented yet");
  }

  VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)

  static inline bool classof(const VPHeaderPHIRecipe *D) {
    return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
  }

  /// Generate the phi correctly handling the IV based on EVL over iterations.
  /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
  void execute(VPTransformState &State) override;

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
public:
  VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
      : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}

  ~VPWidenCanonicalIVRecipe() override = default;

  VPWidenCanonicalIVRecipe *clone() override {
    return new VPWidenCanonicalIVRecipe(
        cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)

  /// Generate a canonical vector induction variable of the vector loop, with
  /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
  /// step = <VF*UF, VF*UF, ..., VF*UF>.
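  ///
  /// For example (illustrative), with VF = 4 and UF = 2 this yields
  /// start = {<0,1,2,3>, <4,5,6,7>} and step = <8,8,8,8>, i.e. both parts
  /// advance by VF*UF = 8 elements per vector iteration.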
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe for converting the input value \p IV to the corresponding value of
/// an IV with different start and step values, using Start + IV * Step.
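/// For example (illustrative), given a canonical IV taking the values
/// 0, 1, 2, ..., a derived IV with Start = 10 and Step = 3 takes the values
/// 10, 13, 16, ...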
class VPDerivedIVRecipe : public VPSingleDefRecipe {
  /// Kind of the induction.
  const InductionDescriptor::InductionKind Kind;
  /// If not nullptr, the floating point induction binary operator. Must be set
  /// for floating point inductions.
  const FPMathOperator *FPBinOp;

public:
  VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
      : VPDerivedIVRecipe(
            IndDesc.getKind(),
            dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
            Start, CanonicalIV, Step) {}

  VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
                    const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
                    VPValue *Step)
      : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
        FPBinOp(FPBinOp) {}

  ~VPDerivedIVRecipe() override = default;

  VPDerivedIVRecipe *clone() override {
    return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
                                 getStepValue());
  }

  VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
  /// Generate the transformed value of the induction at offset StartValue (1.
  /// operand) + IV (2. operand) * StepValue (3. operand).
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  Type *getScalarType() const {
    return getStartValue()->getLiveInIRValue()->getType();
  }

  VPValue *getStartValue() const { return getOperand(0); }
  VPValue *getStepValue() const { return getOperand(2); }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }
};

/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
  Instruction::BinaryOps InductionOpcode;

public:
  VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
                        Instruction::BinaryOps Opcode, FastMathFlags FMFs)
      : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
                            ArrayRef<VPValue *>({IV, Step}), FMFs),
        InductionOpcode(Opcode) {}

  VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
                        VPValue *Step)
      : VPScalarIVStepsRecipe(
            IV, Step, IndDesc.getInductionOpcode(),
            dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
                ? IndDesc.getInductionBinOp()->getFastMathFlags()
                : FastMathFlags()) {}

  ~VPScalarIVStepsRecipe() override = default;

  VPScalarIVStepsRecipe *clone() override {
    return new VPScalarIVStepsRecipe(
        getOperand(0), getOperand(1), InductionOpcode,
        hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
  }

  VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)

  /// Generate the scalarized versions of the phi node as needed by its users.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  VPValue *getStepValue() const { return getOperand(1); }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }
};

/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
/// holds a sequence of zero or more VPRecipes, each representing a sequence of
/// output IR instructions. All PHI-like recipes must come before any non-PHI
/// recipes.
class VPBasicBlock : public VPBlockBase {
public:
  using RecipeListTy = iplist<VPRecipeBase>;

protected:
  /// The VPRecipes held in the order of output instructions to generate.
  RecipeListTy Recipes;

  VPBasicBlock(const unsigned char BlockSC, const Twine &Name = "")
      : VPBlockBase(BlockSC, Name.str()) {}

public:
  VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
      : VPBlockBase(VPBasicBlockSC, Name.str()) {
    if (Recipe)
      appendRecipe(Recipe);
  }

  ~VPBasicBlock() override {
    while (!Recipes.empty())
      Recipes.pop_back();
  }

  /// Instruction iterators...
  using iterator = RecipeListTy::iterator;
  using const_iterator = RecipeListTy::const_iterator;
  using reverse_iterator = RecipeListTy::reverse_iterator;
  using const_reverse_iterator = RecipeListTy::const_reverse_iterator;

  //===--------------------------------------------------------------------===//
  /// Recipe iterator methods
  ///
  inline iterator begin() { return Recipes.begin(); }
  inline const_iterator begin() const { return Recipes.begin(); }
  inline iterator end() { return Recipes.end(); }
  inline const_iterator end() const { return Recipes.end(); }

  inline reverse_iterator rbegin() { return Recipes.rbegin(); }
  inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
  inline reverse_iterator rend() { return Recipes.rend(); }
  inline const_reverse_iterator rend() const { return Recipes.rend(); }

  inline size_t size() const { return Recipes.size(); }
  inline bool empty() const { return Recipes.empty(); }
  inline const VPRecipeBase &front() const { return Recipes.front(); }
  inline VPRecipeBase &front() { return Recipes.front(); }
  inline const VPRecipeBase &back() const { return Recipes.back(); }
  inline VPRecipeBase &back() { return Recipes.back(); }

  /// Returns a reference to the list of recipes.
  RecipeListTy &getRecipeList() { return Recipes; }

  /// Returns a pointer to a member of the recipe list.
  static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
    return &VPBasicBlock::Recipes;
  }

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPBlockBase *V) {
    return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC ||
           V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
  }

  void insert(VPRecipeBase *Recipe, iterator InsertPt) {
    assert(Recipe && "No recipe to append.");
    assert(!Recipe->Parent && "Recipe already in VPlan");
    Recipe->Parent = this;
    Recipes.insert(InsertPt, Recipe);
  }

  /// Augment the existing recipes of a VPBasicBlock with an additional
  /// \p Recipe as the last recipe.
  void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }

  /// The method which generates the output IR instructions that correspond to
  /// this VPBasicBlock, thereby "executing" the VPlan.
  void execute(VPTransformState *State) override;

  /// Return the cost of this VPBasicBlock.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;

  /// Return the position of the first non-phi node recipe in the block.
  iterator getFirstNonPhi();

  /// Returns an iterator range over the PHI-like recipes in the block.
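  ///
  /// For example (illustrative; \p VPBB is a placeholder block), the header
  /// phi recipes of a block can be visited as:
  ///   for (VPRecipeBase &R : VPBB->phis())
  ///     if (auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&R))
  ///       ...;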
  iterator_range<iterator> phis() {
    return make_range(begin(), getFirstNonPhi());
  }

  void dropAllReferences(VPValue *NewValue) override;

  /// Split current block at \p SplitAt by inserting a new block between the
  /// current block and its successors and moving all recipes starting at
  /// SplitAt to the new block. Returns the new block.
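  ///
  /// For example (illustrative; \p R is a placeholder recipe), to split the
  /// block right after \p R:
  ///   VPBasicBlock *NewSucc = VPBB->splitAt(std::next(R.getIterator()));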
  VPBasicBlock *splitAt(iterator SplitAt);

  VPRegionBlock *getEnclosingLoopRegion();

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
  /// SlotTracker is used to print unnamed VPValue's using consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
  using VPBlockBase::print; // Get the print(raw_ostream &O) version.
#endif

  /// If the block has multiple successors, return the branch recipe
  /// terminating the block. If there are no successors or only a single
  /// successor, return nullptr.
  VPRecipeBase *getTerminator();
  const VPRecipeBase *getTerminator() const;
  /// Returns true if the block is exiting its parent region.
  bool isExiting() const;

  /// Clone the current block and its recipes, without updating the operands
  /// of the cloned recipes.
  VPBasicBlock *clone() override {
    auto *NewBlock = new VPBasicBlock(getName());
    for (VPRecipeBase &R : *this)
      NewBlock->appendRecipe(R.clone());
    return NewBlock;
  }

protected:
  /// Execute the recipes in the IR basic block \p BB.
  void executeRecipes(VPTransformState *State, BasicBlock *BB);

private:
  /// Create an IR BasicBlock to hold the output instructions generated by this
  /// VPBasicBlock, and return it. Update the CFGState accordingly.
  BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
};

/// A special type of VPBasicBlock that wraps an existing IR basic block.
/// Recipes of the block get added before the first non-phi instruction in the
/// wrapped block.
/// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
/// preheader block.
class VPIRBasicBlock : public VPBasicBlock {
  BasicBlock *IRBB;

public:
  VPIRBasicBlock(BasicBlock *IRBB)
      : VPBasicBlock(VPIRBasicBlockSC,
                     (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
        IRBB(IRBB) {}

  ~VPIRBasicBlock() override {}

  static inline bool classof(const VPBlockBase *V) {
    return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
  }

  /// The method which generates the output IR instructions that correspond to
  /// this VPBasicBlock, thereby "executing" the VPlan.
  void execute(VPTransformState *State) override;

  VPIRBasicBlock *clone() override {
    auto *NewBlock = new VPIRBasicBlock(IRBB);
    for (VPRecipeBase &R : Recipes)
      NewBlock->appendRecipe(R.clone());
    return NewBlock;
  }

  BasicBlock *getIRBasicBlock() const { return IRBB; }
};

/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
/// A VPRegionBlock may indicate that its contents are to be replicated several
/// times. This is designed to support predicated scalarization, in which a
/// scalar if-then code structure needs to be generated VF * UF times. Having
/// this replication indicator helps to keep a single model for multiple
/// candidate VF's. The actual replication takes place only once the desired VF
/// and UF have been determined.
class VPRegionBlock : public VPBlockBase {
  /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
  VPBlockBase *Entry;

  /// Hold the Single Exiting block of the SESE region modelled by the
  /// VPRegionBlock.
  VPBlockBase *Exiting;

  /// An indicator whether this region is to generate multiple replicated
  /// instances of output IR corresponding to its VPBlockBases.
  bool IsReplicator;

public:
  VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
                const std::string &Name = "", bool IsReplicator = false)
      : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
        IsReplicator(IsReplicator) {
    assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
    assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
    Entry->setParent(this);
    Exiting->setParent(this);
  }
  VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
      : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
        IsReplicator(IsReplicator) {}

  ~VPRegionBlock() override {
    if (Entry) {
      VPValue DummyValue;
      Entry->dropAllReferences(&DummyValue);
      deleteCFG(Entry);
    }
  }

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPBlockBase *V) {
    return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
  }

  const VPBlockBase *getEntry() const { return Entry; }
  VPBlockBase *getEntry() { return Entry; }

  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
  /// EntryBlock must have no predecessors.
  void setEntry(VPBlockBase *EntryBlock) {
    assert(EntryBlock->getPredecessors().empty() &&
           "Entry block cannot have predecessors.");
    Entry = EntryBlock;
    EntryBlock->setParent(this);
  }

  const VPBlockBase *getExiting() const { return Exiting; }
  VPBlockBase *getExiting() { return Exiting; }

  /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
  /// ExitingBlock must have no successors.
  void setExiting(VPBlockBase *ExitingBlock) {
    assert(ExitingBlock->getSuccessors().empty() &&
           "Exit block cannot have successors.");
    Exiting = ExitingBlock;
    ExitingBlock->setParent(this);
  }

  /// Returns the pre-header VPBasicBlock of the loop region.
  VPBasicBlock *getPreheaderVPBB() {
    assert(!isReplicator() && "should only get pre-header of loop regions");
    return getSinglePredecessor()->getExitingBasicBlock();
  }

  /// An indicator whether this region is to generate multiple replicated
  /// instances of output IR corresponding to its VPBlockBases.
  bool isReplicator() const { return IsReplicator; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRegionBlock, thereby "executing" the VPlan.
  void execute(VPTransformState *State) override;

  /// Return the cost of this region.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;

  void dropAllReferences(VPValue *NewValue) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
  /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual regions is consistent with the whole VPlan printing.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
  using VPBlockBase::print; // Get the print(raw_ostream &O) version.
#endif

  /// Clone all blocks in the single-entry single-exit region of the block and
  /// their recipes without updating the operands of the cloned recipes.
  VPRegionBlock *clone() override;
};

/// VPlan models a candidate for vectorization, encoding various decisions
/// taken to produce efficient output IR, including which branches,
/// basic-blocks and output IR instructions to generate, and their cost. VPlan
/// holds a Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an
/// Entry VPBasicBlock.
class VPlan {
  friend class VPlanPrinter;
  friend class VPSlotTracker;

  /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
  /// preheader of the vector loop.
  VPBasicBlock *Entry;

  /// VPBasicBlock corresponding to the original preheader. Used to place
  /// VPExpandSCEV recipes for expressions used during skeleton creation and the
  /// rest of VPlan execution.
  VPBasicBlock *Preheader;

  /// Holds the VFs applicable to this VPlan.
  SmallSetVector<ElementCount, 2> VFs;

  /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
  /// any UF.
  SmallSetVector<unsigned, 2> UFs;

  /// Holds the name of the VPlan, for printing.
  std::string Name;

  /// Represents the trip count of the original loop, for folding
  /// the tail.
  VPValue *TripCount = nullptr;

  /// Represents the backedge taken count of the original loop, for folding
  /// the tail. It equals TripCount - 1.
  VPValue *BackedgeTakenCount = nullptr;

  /// Represents the vector trip count.
  VPValue VectorTripCount;

  /// Represents the loop-invariant VF * UF of the vector loop region.
  VPValue VFxUF;

  /// Holds a mapping between Values and their corresponding VPValue inside
  /// VPlan.
  Value2VPValueTy Value2VPValue;

  /// Contains all the external definitions created for this VPlan. External
  /// definitions are VPValues that hold a pointer to their underlying IR.
  SmallVector<VPValue *, 16> VPLiveInsToFree;

  /// Values used outside the plan. It contains live-outs that need fixing. Any
  /// live-out that is fixed outside VPlan needs to be removed. The remaining
  /// live-outs are fixed via VPLiveOut::fixPhi.
  MapVector<PHINode *, VPLiveOut *> LiveOuts;

  /// Mapping from SCEVs to the VPValues representing their expansions.
  /// NOTE: This mapping is temporary and will be removed once all users have
  /// been modeled in VPlan directly.
  DenseMap<const SCEV *, VPValue *> SCEVToExpansion;

public:
  /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
  /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to
  /// be disconnected, as the bypass blocks between them are not yet modeled in
  /// VPlan.
  VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry)
      : VPlan(Preheader, Entry) {
    TripCount = TC;
  }

  /// Construct a VPlan with original preheader \p Preheader and \p Entry to
  /// the plan. At the moment, \p Preheader and \p Entry need to be
  /// disconnected, as the bypass blocks between them are not yet modeled in
  /// VPlan.
  VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry)
      : Entry(Entry), Preheader(Preheader) {
    Entry->setPlan(this);
    Preheader->setPlan(this);
    assert(Preheader->getNumSuccessors() == 0 &&
           Preheader->getNumPredecessors() == 0 &&
           "preheader must be disconnected");
  }

  ~VPlan();
  /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping the
  /// original scalar pre-header) which contains SCEV expansions that need
  /// to happen before the CFG is modified; a VPBasicBlock for the vector
  /// pre-header, followed by a region for the vector loop, followed by the
  /// middle VPBasicBlock. If a check is needed to guard executing the scalar
  /// epilogue loop, it will be added to the middle block, together with
  /// VPBasicBlocks for the scalar preheader and exit blocks.
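  ///
  /// Illustrative sketch of the resulting skeleton (block names are
  /// indicative only):
  ///   ir-bb<scalar-preheader>   // SCEV expansions
  ///   vector.ph                 // vector pre-header
  ///   <region: vector loop>
  ///   middle.block              // + optional scalar-epilogue guard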
  static VPlanPtr createInitialVPlan(const SCEV *TripCount,
                                     ScalarEvolution &PSE,
                                     bool RequiresScalarEpilogueCheck,
                                     bool TailFolded, Loop *TheLoop);

  /// Prepare the plan for execution, setting up the required live-in values.
  void prepareToExecute(Value *TripCount, Value *VectorTripCount,
                        Value *CanonicalIVStartValue, VPTransformState &State);

  /// Generate the IR code for this VPlan.
  void execute(VPTransformState *State);

  /// Return the cost of this plan.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  VPBasicBlock *getEntry() { return Entry; }
  const VPBasicBlock *getEntry() const { return Entry; }

  /// The trip count of the original loop.
  VPValue *getTripCount() const {
    assert(TripCount && "trip count needs to be set before accessing it");
    return TripCount;
  }

  /// Resets the trip count for the VPlan. The caller must make sure all uses
  /// of the original trip count have been replaced.
  void resetTripCount(VPValue *NewTripCount) {
    assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
           "TripCount always must be set");
    TripCount = NewTripCount;
  }

  /// The backedge taken count of the original loop.
  VPValue *getOrCreateBackedgeTakenCount() {
    if (!BackedgeTakenCount)
      BackedgeTakenCount = new VPValue();
    return BackedgeTakenCount;
  }

  /// The vector trip count.
  VPValue &getVectorTripCount() { return VectorTripCount; }

  /// Returns VF * UF of the vector loop region.
  VPValue &getVFxUF() { return VFxUF; }

  void addVF(ElementCount VF) { VFs.insert(VF); }

  void setVF(ElementCount VF) {
    assert(hasVF(VF) && "Cannot set VF not already in plan");
    VFs.clear();
    VFs.insert(VF);
  }

  bool hasVF(ElementCount VF) { return VFs.count(VF); }
  bool hasScalableVF() {
    return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
  }

  /// Returns an iterator range over all VFs of the plan.
  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
  vectorFactors() const {
    return {VFs.begin(), VFs.end()};
  }

  bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }

  bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }

  void setUF(unsigned UF) {
    assert(hasUF(UF) && "Cannot set the UF not already in plan");
    UFs.clear();
    UFs.insert(UF);
  }

  /// Return a string with the name of the plan and the applicable VFs and UFs.
  std::string getName() const;

  void setName(const Twine &newName) { Name = newName.str(); }

  /// Gets the live-in VPValue for \p V or adds a new live-in (if none exists
  /// yet) for \p V.
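  ///
  /// For example (illustrative; \p Plan and \p IndDesc are placeholders), to
  /// map the IR start value of an induction to its VPlan live-in:
  ///   VPValue *StartV = Plan.getOrAddLiveIn(IndDesc.getStartValue());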
  VPValue *getOrAddLiveIn(Value *V) {
    assert(V && "Trying to get or add the VPValue of a null Value");
    if (!Value2VPValue.count(V)) {
      VPValue *VPV = new VPValue(V);
      VPLiveInsToFree.push_back(VPV);
      assert(VPV->isLiveIn() && "VPV must be a live-in.");
      assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
      Value2VPValue[V] = VPV;
    }

    assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
    assert(Value2VPValue[V]->isLiveIn() &&
           "Only live-ins should be in mapping");
    return Value2VPValue[V];
  }

  /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
  VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the live-ins of this VPlan to \p O.
  void printLiveIns(raw_ostream &O) const;

  /// Print this VPlan to \p O.
  void print(raw_ostream &O) const;

  /// Print this VPlan in DOT format to \p O.
  void printDOT(raw_ostream &O) const;

  /// Dump the plan to stderr (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif

  /// Returns the VPRegionBlock of the vector loop.
  VPRegionBlock *getVectorLoopRegion() {
    return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
  }
  const VPRegionBlock *getVectorLoopRegion() const {
    return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
  }

  /// Returns the canonical induction recipe of the vector loop.
  VPCanonicalIVPHIRecipe *getCanonicalIV() {
    VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
    if (EntryVPBB->empty()) {
      // VPlan native path.
      EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
    }
    return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
  }

  void addLiveOut(PHINode *PN, VPValue *V);

  void removeLiveOut(PHINode *PN) {
    delete LiveOuts[PN];
    LiveOuts.erase(PN);
  }

  const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
    return LiveOuts;
  }

  VPValue *getSCEVExpansion(const SCEV *S) const {
    return SCEVToExpansion.lookup(S);
  }

  void addSCEVExpansion(const SCEV *S, VPValue *V) {
    assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
    SCEVToExpansion[S] = V;
  }

  /// \return The block corresponding to the original preheader.
  VPBasicBlock *getPreheader() { return Preheader; }
  const VPBasicBlock *getPreheader() const { return Preheader; }

  /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
  /// recipes to refer to the clones, and return it.
  VPlan *duplicate();
};

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
/// indented and follows the dot format.
class VPlanPrinter {
  raw_ostream &OS;
  const VPlan &Plan;
  unsigned Depth = 0;
  unsigned TabWidth = 2;
  std::string Indent;
  unsigned BID = 0;
  SmallDenseMap<const VPBlockBase *, unsigned> BlockID;

  VPSlotTracker SlotTracker;

  /// Handle indentation.
  void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }

  /// Print a given \p Block of the Plan.
  void dumpBlock(const VPBlockBase *Block);

  /// Print the information related to the CFG edges going out of a given
  /// \p Block, followed by printing the successor blocks themselves.
  void dumpEdges(const VPBlockBase *Block);

  /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
  /// its successor blocks.
  void dumpBasicBlock(const VPBasicBlock *BasicBlock);

  /// Print a given \p Region of the Plan.
  void dumpRegion(const VPRegionBlock *Region);

  unsigned getOrCreateBID(const VPBlockBase *Block) {
    return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
  }

  Twine getOrCreateName(const VPBlockBase *Block);

  Twine getUID(const VPBlockBase *Block);

  /// Print the information related to a CFG edge between two VPBlockBases.
  void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
                const Twine &Label);

public:
  VPlanPrinter(raw_ostream &O, const VPlan &P)
      : OS(O), Plan(P), SlotTracker(&P) {}

  LLVM_DUMP_METHOD void dump();
};

struct VPlanIngredient {
  const Value *V;

  VPlanIngredient(const Value *V) : V(V) {}

  void print(raw_ostream &O) const;
};

inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
  I.print(OS);
  return OS;
}

inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
  Plan.print(OS);
  return OS;
}
#endif

//===----------------------------------------------------------------------===//
// VPlan Utilities
//===----------------------------------------------------------------------===//

/// Class that provides utilities for VPBlockBases in VPlan.
class VPBlockUtils {
public:
  VPBlockUtils() = delete;

  /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
  /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
  /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
  /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
  /// have neither successors nor predecessors.
  static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
    assert(NewBlock->getSuccessors().empty() &&
           NewBlock->getPredecessors().empty() &&
           "Can't insert new block with predecessors or successors.");
    NewBlock->setParent(BlockPtr->getParent());
    SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
    for (VPBlockBase *Succ : Succs) {
      disconnectBlocks(BlockPtr, Succ);
      connectBlocks(NewBlock, Succ);
    }
    connectBlocks(BlockPtr, NewBlock);
  }

  /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
  /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and
  /// \p BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p
  /// BlockPtr parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no
  /// successors and \p IfTrue and \p IfFalse must have neither successors nor
  /// predecessors.
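  ///
  /// For example (illustrative; CondBB, IfTrue, IfFalse and MergeBB are
  /// placeholder blocks), an if/else diamond that rejoins in MergeBB can be
  /// built as:
  ///   VPBlockUtils::insertTwoBlocksAfter(IfTrue, IfFalse, CondBB);
  ///   VPBlockUtils::connectBlocks(IfTrue, MergeBB);
  ///   VPBlockUtils::connectBlocks(IfFalse, MergeBB);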
  static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
                                   VPBlockBase *BlockPtr) {
    assert(IfTrue->getSuccessors().empty() &&
           "Can't insert IfTrue with successors.");
    assert(IfFalse->getSuccessors().empty() &&
           "Can't insert IfFalse with successors.");
    BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
    IfTrue->setPredecessors({BlockPtr});
    IfFalse->setPredecessors({BlockPtr});
    IfTrue->setParent(BlockPtr->getParent());
    IfFalse->setParent(BlockPtr->getParent());
  }

  /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
  /// the successors of \p From and \p From to the predecessors of \p To. Both
  /// VPBlockBases must have the same parent, which can be null. Both
  /// VPBlockBases can be already connected to other VPBlockBases.
  static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
    assert((From->getParent() == To->getParent()) &&
           "Can't connect two blocks with different parents");
    assert(From->getNumSuccessors() < 2 &&
           "Blocks can't have more than two successors.");
    From->appendSuccessor(To);
    To->appendPredecessor(From);
  }

  /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
  /// from the successors of \p From and \p From from the predecessors of \p To.
  static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
    assert(To && "Successor to disconnect is null.");
    From->removeSuccessor(To);
    To->removePredecessor(From);
  }

  /// Return an iterator range over \p Range which only includes \p BlockTy
  /// blocks. The accessed blocks are cast to \p BlockTy.
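  ///
  /// For example (illustrative), to visit only the VPBasicBlocks among a
  /// block's successors:
  ///   for (VPBasicBlock *VPBB :
  ///        VPBlockUtils::blocksOnly<VPBasicBlock>(Block->getSuccessors()))
  ///     ...;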
  template <typename BlockTy, typename T>
  static auto blocksOnly(const T &Range) {
    // Create BaseTy with correct const-ness based on BlockTy.
    using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
                                      const VPBlockBase, VPBlockBase>;

    // We need to first create an iterator range over (const) BlockTy & instead
    // of (const) BlockTy * for filter_range to work properly.
    auto Mapped =
        map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
    auto Filter = make_filter_range(
        Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
    return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
      return cast<BlockTy>(&Block);
    });
  }
};

class VPInterleavedAccessInfo {
  DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
      InterleaveGroupMap;

  /// Type for mapping of instruction based interleave groups to VPInstruction
  /// interleave groups.
  using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
                             InterleaveGroup<VPInstruction> *>;

  /// Recursively traverse \p Region and populate VPlan based interleave groups
  /// based on \p IAI.
  void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
                   InterleavedAccessInfo &IAI);
  /// Recursively traverse \p Block and populate VPlan based interleave groups
  /// based on \p IAI.
  void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
                  InterleavedAccessInfo &IAI);

public:
  VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);

  ~VPInterleavedAccessInfo() {
    SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
    // Avoid releasing a pointer twice.
    for (auto &I : InterleaveGroupMap)
      DelSet.insert(I.second);
    for (auto *Ptr : DelSet)
      delete Ptr;
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if \p Instr does not belong to such a group.
  InterleaveGroup<VPInstruction> *
  getInterleaveGroup(VPInstruction *Instr) const {
    return InterleaveGroupMap.lookup(Instr);
  }
};

/// Class that maps (parts of) an existing VPlan to trees of combined
/// VPInstructions.
class VPlanSlp {
  enum class OpMode { Failed, Load, Opcode };

  /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
  /// DenseMap keys.
  struct BundleDenseMapInfo {
    static SmallVector<VPValue *, 4> getEmptyKey() {
      return {reinterpret_cast<VPValue *>(-1)};
    }

    static SmallVector<VPValue *, 4> getTombstoneKey() {
      return {reinterpret_cast<VPValue *>(-2)};
    }

    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
                        const SmallVector<VPValue *, 4> &RHS) {
      return LHS == RHS;
    }
  };

  /// Mapping of values in the original VPlan to a combined VPInstruction.
  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
      BundleToCombined;

  VPInterleavedAccessInfo &IAI;

  /// Basic block to operate on. For now, only instructions in a single BB are
  /// considered.
  const VPBasicBlock &BB;

  /// Indicates whether we managed to combine all visited instructions or not.
  bool CompletelySLP = true;

  /// Width of the widest combined bundle in bits.
  unsigned WidestBundleBits = 0;

  using MultiNodeOpTy =
      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;

  // Input operand bundles for the current multi node. Each multi node operand
  // bundle contains values not matching the multi node's opcode. They will
  // be reordered in reorderMultiNodeOps, once we have completed building a
  // multi node.
  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;

  /// Indicates whether we are currently building a multi node.
  bool MultiNodeActive = false;

  /// Check if we can vectorize Operands together.
  bool areVectorizable(ArrayRef<VPValue *> Operands) const;

  /// Add combined instruction \p New for the bundle \p Operands.
  void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);

  /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
  VPInstruction *markFailed();

  /// Reorder operands in the multi node to maximize sequential memory access
  /// and commutative operations.
  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();

  /// Choose the best candidate to use for the lane after \p Last. The set of
  /// candidates to choose from are values with an opcode matching \p Last's
  /// or loads consecutive to \p Last.
  std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
                                       SmallPtrSetImpl<VPValue *> &Candidates,
                                       VPInterleavedAccessInfo &IAI);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print bundle \p Values to dbgs().
  void dumpBundle(ArrayRef<VPValue *> Values);
#endif

public:
  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}

  ~VPlanSlp() = default;

  /// Tries to build an SLP tree rooted at \p Operands and returns a
  /// VPInstruction combining \p Operands, if they can be combined.
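  ///
  /// For example (illustrative; IAI, VPBB and StoreOperands are placeholders):
  ///   VPlanSlp Slp(IAI, *VPBB);
  ///   VPInstruction *Root = Slp.buildGraph(StoreOperands);
  ///   if (Root && Slp.isCompletelySLP())
  ///     ...;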
  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);

  /// Return the width of the widest combined bundle in bits.
  unsigned getWidestBundleBits() const { return WidestBundleBits; }

  /// Return true if all visited instructions can be combined.
  bool isCompletelySLP() const { return CompletelySLP; }
};

namespace vputils {

/// Returns true if only the first lane of \p Def is used.
bool onlyFirstLaneUsed(const VPValue *Def);

/// Returns true if only the first part of \p Def is used.
bool onlyFirstPartUsed(const VPValue *Def);

/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
/// pre-header already contains a recipe expanding \p Expr, return it. If not,
/// create a new one.
VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
                                       ScalarEvolution &SE);

/// Returns true if \p VPV is uniform after vectorization.
inline bool isUniformAfterVectorization(VPValue *VPV) {
  // A value defined outside the vector region must be uniform after
  // vectorization inside a vector region.
  if (VPV->isDefinedOutsideVectorRegions())
    return true;
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  assert(Def && "Must have definition for value defined inside vector region");
  if (auto *Rep = dyn_cast<VPReplicateRecipe>(Def))
    return Rep->isUniform();
  if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
    return all_of(GEP->operands(), isUniformAfterVectorization);
  if (auto *VPI = dyn_cast<VPInstruction>(Def))
    return VPI->isSingleScalar() || VPI->isVectorToScalar();
  return false;
}

/// Return true if \p V is a header mask in \p Plan.
bool isHeaderMask(VPValue *V, VPlan &Plan);
} // end namespace vputils

} // end namespace llvm

#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H