//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the declarations of the Vectorization Plan base classes:
/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
///    VPBlockBase, together implementing a Hierarchical CFG;
/// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
///    within VPBasicBlocks;
/// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that
///    also inherit from VPValue;
/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
///    instruction;
/// 5. The VPlan class holding a candidate for vectorization;
/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
/// These are documented in docs/VectorizationPlan.rst.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

namespace llvm {

class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
class raw_ostream;
class RecurrenceDescriptor;
class SCEV;
class Type;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
class LoopVectorizationCostModel;
class LoopVersioning;

struct VPCostContext;

namespace Intrinsic {
typedef unsigned ID;
}

/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step);

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *CurLoop = nullptr);

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
inline unsigned getReciprocalPredBlockProb() { return 2; }

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
  // A power of 2.
  const ElementCount Start;

  // A power of 2. If End <= Start, the range is empty.
  ElementCount End;

  bool isEmpty() const {
    return End.getKnownMinValue() <= Start.getKnownMinValue();
  }

  VFRange(const ElementCount &Start, const ElementCount &End)
      : Start(Start), End(End) {
    assert(Start.isScalable() == End.isScalable() &&
           "Both Start and End should have the same scalable flag");
    assert(isPowerOf2_32(Start.getKnownMinValue()) &&
           "Expected Start to be a power of 2");
    assert(isPowerOf2_32(End.getKnownMinValue()) &&
           "Expected End to be a power of 2");
  }

  /// Iterator to iterate over vectorization factors in a VFRange.
  class iterator
      : public iterator_facade_base<iterator, std::forward_iterator_tag,
                                    ElementCount> {
    ElementCount VF;

  public:
    iterator(ElementCount VF) : VF(VF) {}

    bool operator==(const iterator &Other) const { return VF == Other.VF; }

    ElementCount operator*() const { return VF; }

    iterator &operator++() {
      VF *= 2;
      return *this;
    }
  };

  iterator begin() { return iterator(Start); }
  iterator end() {
    assert(isPowerOf2_32(End.getKnownMinValue()));
    return iterator(End);
  }
};
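// An illustrative sketch, not part of this header's API: because both bounds
// are powers of 2, the iterator's doubling increment visits exactly the
// factors in [Start, End).
//
//   VFRange Range(ElementCount::getFixed(1), ElementCount::getFixed(16));
//   for (ElementCount VF : Range)
//     ...; // visits VFs 1, 2, 4 and 8 in turn.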
using VPlanPtr = std::unique_ptr<VPlan>;

/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
/// the vectorizer.

/// VPLane provides a way to access lanes in both fixed width and scalable
/// vectors, where for the latter the lane index sometimes needs calculating
/// as a runtime expression.
class VPLane {
public:
  /// Kind describes how to interpret Lane.
  enum class Kind : uint8_t {
    /// For First, Lane is the index into the first N elements of a
    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
    First,
    /// For ScalableLast, Lane is the offset from the start of the last
    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
    ScalableLast
  };

private:
  /// in [0..VF)
  unsigned Lane;

  /// Indicates how the Lane should be interpreted, as described above.
  Kind LaneKind;

public:
  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}

  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }

  static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
    assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
           "trying to extract with invalid offset");
    unsigned LaneOffset = VF.getKnownMinValue() - Offset;
    Kind LaneKind;
    if (VF.isScalable())
      // In this case 'LaneOffset' refers to the offset from the start of the
      // last subvector with VF.getKnownMinValue() elements.
      LaneKind = VPLane::Kind::ScalableLast;
    else
      LaneKind = VPLane::Kind::First;
    return VPLane(LaneOffset, LaneKind);
  }

  static VPLane getLastLaneForVF(const ElementCount &VF) {
    return getLaneFromEnd(VF, 1);
  }

  /// Returns a compile-time known value for the lane index and asserts if the
  /// lane can only be calculated at runtime.
  unsigned getKnownLane() const {
    assert(LaneKind == Kind::First);
    return Lane;
  }

  /// Returns an expression describing the lane index that can be used at
  /// runtime.
  Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;

  /// Returns the Kind of lane offset.
  Kind getKind() const { return LaneKind; }

  /// Returns true if this is the first lane of the whole vector.
  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }

  /// Maps the lane to a cache index based on \p VF.
  unsigned mapToCacheIndex(const ElementCount &VF) const {
    switch (LaneKind) {
    case VPLane::Kind::ScalableLast:
      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
      return VF.getKnownMinValue() + Lane;
    default:
      assert(Lane < VF.getKnownMinValue());
      return Lane;
    }
  }

  /// Returns the maximum number of lanes that we are able to consider
  /// caching for \p VF.
  static unsigned getNumCachedLanes(const ElementCount &VF) {
    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
  }
};

/// VPIteration represents a single point in the iteration space of the output
/// (vectorized and/or unrolled) IR loop.
struct VPIteration {
  /// in [0..UF)
  unsigned Part;

  VPLane Lane;

  VPIteration(unsigned Part, unsigned Lane,
              VPLane::Kind Kind = VPLane::Kind::First)
      : Part(Part), Lane(Lane, Kind) {}

  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}

  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
};
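// An illustrative sketch of the cache-index mapping above, not part of the
// API: for a scalable VF with a known minimum of 4, Kind::First lanes map to
// cache indices 0-3 and Kind::ScalableLast lanes to indices 4-7, so
// getNumCachedLanes returns 8.
//
//   ElementCount VF = ElementCount::getScalable(4);
//   VPLane Last = VPLane::getLastLaneForVF(VF);   // (Lane 3, ScalableLast)
//   unsigned Idx = Last.mapToCacheIndex(VF);      // 4 + 3 == 7
//   unsigned Num = VPLane::getNumCachedLanes(VF); // 8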
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
  VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                   DominatorTree *DT, IRBuilderBase &Builder,
                   InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);

  /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
  ElementCount VF;
  unsigned UF;

  /// Hold the indices to generate specific scalar instructions. Null indicates
  /// that all instances are to be generated, using either scalar or vector
  /// instructions.
  std::optional<VPIteration> Instance;

  struct DataState {
    /// A type for vectorized values in the new loop. Each value from the
    /// original loop, when vectorized, is represented by UF vector values in
    /// the new unrolled loop, where UF is the unroll factor.
    typedef SmallVector<Value *, 2> PerPartValuesTy;

    DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;

    using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
    DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
  } Data;

  /// Get the generated vector Value for a given VPValue \p Def and a given \p
  /// Part if \p IsScalar is false, otherwise return the generated scalar
  /// for \p Part. \see set.
  Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);

  /// Get the generated Value for a given VPValue and given Part and Lane.
  Value *get(VPValue *Def, const VPIteration &Instance);

  bool hasVectorValue(VPValue *Def, unsigned Part) {
    auto I = Data.PerPartOutput.find(Def);
    return I != Data.PerPartOutput.end() && Part < I->second.size() &&
           I->second[Part];
  }

  bool hasScalarValue(VPValue *Def, VPIteration Instance) {
    auto I = Data.PerPartScalars.find(Def);
    if (I == Data.PerPartScalars.end())
      return false;
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    return Instance.Part < I->second.size() &&
           CacheIdx < I->second[Instance.Part].size() &&
           I->second[Instance.Part][CacheIdx];
  }

  /// Set the generated vector Value for a given VPValue and a given Part, if \p
  /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
  void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
    if (IsScalar) {
      set(Def, V, VPIteration(Part, 0));
      return;
    }
    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
           "scalar values must be stored as (Part, 0)");
    if (!Data.PerPartOutput.count(Def)) {
      DataState::PerPartValuesTy Entry(UF);
      Data.PerPartOutput[Def] = Entry;
    }
    Data.PerPartOutput[Def][Part] = V;
  }

  /// Reset an existing vector value for \p Def and a given \p Part.
  void reset(VPValue *Def, Value *V, unsigned Part) {
    auto Iter = Data.PerPartOutput.find(Def);
    assert(Iter != Data.PerPartOutput.end() &&
           "need to overwrite existing value");
    Iter->second[Part] = V;
  }

  /// Set the generated scalar \p V for \p Def and the given \p Instance.
  void set(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.insert({Def, {}});
    auto &PerPartVec = Iter.first->second;
    if (PerPartVec.size() <= Instance.Part)
      PerPartVec.resize(Instance.Part + 1);
    auto &Scalars = PerPartVec[Instance.Part];
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    if (Scalars.size() <= CacheIdx)
      Scalars.resize(CacheIdx + 1);
    assert(!Scalars[CacheIdx] &&
           "should not overwrite existing value; use reset()");
    Scalars[CacheIdx] = V;
  }

  /// Reset an existing scalar value for \p Def and a given \p Instance.
  void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
    auto Iter = Data.PerPartScalars.find(Def);
    assert(Iter != Data.PerPartScalars.end() &&
           "need to overwrite existing value");
    assert(Instance.Part < Iter->second.size() &&
           "need to overwrite existing value");
    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
    assert(CacheIdx < Iter->second[Instance.Part].size() &&
           "need to overwrite existing value");
    Iter->second[Instance.Part][CacheIdx] = V;
  }
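  // An illustrative sketch of the set/get contract above, not part of the
  // API; `State`, `Def`, `WideV` and `ScalarV` are assumed to be in scope:
  //
  //   State.set(Def, WideV, /*Part=*/0);            // vector value, part 0
  //   State.set(Def, ScalarV, VPIteration(1, 0));   // scalar, (part 1, lane 0)
  //   Value *V = State.get(Def, 0);                 // returns WideV
  //   Value *S = State.get(Def, VPIteration(1, 0)); // returns ScalarV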
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Value *To, Instruction *From);

  /// Set the debug location in the builder using the debug location \p DL.
  void setDebugLocFrom(DebugLoc DL);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);

  /// Hold state information used when constructing the CFG of the output IR,
  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
  struct CFGState {
    /// The previous VPBasicBlock visited. Initially set to null.
    VPBasicBlock *PrevVPBB = nullptr;

    /// The previous IR BasicBlock created or used. Initially set to the new
    /// header BasicBlock.
    BasicBlock *PrevBB = nullptr;

    /// The last IR BasicBlock in the output IR. Set to the exit block of the
    /// vector loop.
    BasicBlock *ExitBB = nullptr;

    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
    /// of replication, maps the BasicBlock of the last replica created.
    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;

    /// Updater for the DominatorTree.
    DomTreeUpdater DTU;

    CFGState(DominatorTree *DT)
        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}

    /// Returns the BasicBlock* mapped to the pre-header of the loop region
    /// containing \p R.
    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
  } CFG;

  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
  LoopInfo *LI;

  /// Hold a reference to the IRBuilder used to generate output IR code.
  IRBuilderBase &Builder;

  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
  InnerLoopVectorizer *ILV;

  /// Pointer to the VPlan for which code is generated.
  VPlan *Plan;

  /// The loop object for the current parent region, or nullptr.
  Loop *CurrentVectorLoop = nullptr;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  LoopVersioning *LVer = nullptr;

  /// Map SCEVs to their expanded values. Populated when executing
  /// VPExpandSCEVRecipes.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  /// VPlan-based type analysis.
  VPTypeAnalysis TypeAnalysis;
};

/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
  friend class VPBlockUtils;

  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).

  /// An optional name for the block.
  std::string Name;

  /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
  /// it is a topmost VPBlockBase.
  VPRegionBlock *Parent = nullptr;

  /// List of predecessor blocks.
  SmallVector<VPBlockBase *, 1> Predecessors;

  /// List of successor blocks.
  SmallVector<VPBlockBase *, 1> Successors;

  /// VPlan containing the block. Can only be set on the entry block of the
  /// plan.
  VPlan *Plan = nullptr;

  /// Add \p Successor as the last successor to this block.
  void appendSuccessor(VPBlockBase *Successor) {
    assert(Successor && "Cannot add nullptr successor!");
    Successors.push_back(Successor);
  }

  /// Add \p Predecessor as the last predecessor to this block.
  void appendPredecessor(VPBlockBase *Predecessor) {
    assert(Predecessor && "Cannot add nullptr predecessor!");
    Predecessors.push_back(Predecessor);
  }

  /// Remove \p Predecessor from the predecessors of this block.
  void removePredecessor(VPBlockBase *Predecessor) {
    auto Pos = find(Predecessors, Predecessor);
    assert(Pos != Predecessors.end() && "Predecessor does not exist");
    Predecessors.erase(Pos);
  }

  /// Remove \p Successor from the successors of this block.
  void removeSuccessor(VPBlockBase *Successor) {
    auto Pos = find(Successors, Successor);
    assert(Pos != Successors.end() && "Successor does not exist");
    Successors.erase(Pos);
  }

protected:
  VPBlockBase(const unsigned char SC, const std::string &N)
      : SubclassID(SC), Name(N) {}

public:
  /// An enumeration for keeping track of the concrete subclasses of
  /// VPBlockBase that are actually instantiated. Values of this enumeration
  /// are kept in the SubclassID field of the VPBlockBase objects. They are
  /// used for concrete type identification.
  using VPBlockTy = enum { VPRegionBlockSC, VPBasicBlockSC, VPIRBasicBlockSC };

  using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;

  virtual ~VPBlockBase() = default;

  const std::string &getName() const { return Name; }

  void setName(const Twine &newName) { Name = newName.str(); }

  /// \return an ID for the concrete type of this object.
  /// This is used to implement the classof checks. This should not be used
  /// for any other purpose, as the values may change as LLVM evolves.
  unsigned getVPBlockID() const { return SubclassID; }

  VPRegionBlock *getParent() { return Parent; }
  const VPRegionBlock *getParent() const { return Parent; }

  /// \return A pointer to the plan containing the current block.
  VPlan *getPlan();
  const VPlan *getPlan() const;

  /// Sets the pointer of the plan containing the block. The block must be the
  /// entry block into the VPlan.
  void setPlan(VPlan *ParentPlan);

  void setParent(VPRegionBlock *P) { Parent = P; }

  /// \return the VPBasicBlock that is the entry of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getEntryBasicBlock() const;
  VPBasicBlock *getEntryBasicBlock();

  /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getExitingBasicBlock() const;
  VPBasicBlock *getExitingBasicBlock();

  const VPBlocksTy &getSuccessors() const { return Successors; }
  VPBlocksTy &getSuccessors() { return Successors; }

  iterator_range<VPBlockBase **> successors() { return Successors; }

  const VPBlocksTy &getPredecessors() const { return Predecessors; }
  VPBlocksTy &getPredecessors() { return Predecessors; }

  /// \return the successor of this VPBlockBase if it has a single successor.
  /// Otherwise return a null pointer.
  VPBlockBase *getSingleSuccessor() const {
    return (Successors.size() == 1 ? *Successors.begin() : nullptr);
  }
  /// \return the predecessor of this VPBlockBase if it has a single
  /// predecessor. Otherwise return a null pointer.
  VPBlockBase *getSinglePredecessor() const {
    return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
  }

  size_t getNumSuccessors() const { return Successors.size(); }
  size_t getNumPredecessors() const { return Predecessors.size(); }

  /// An Enclosing Block of a block B is any block containing B, including B
  /// itself. \return the closest enclosing block starting from "this", which
  /// has successors. \return the root enclosing block if all enclosing blocks
  /// have no successors.
  VPBlockBase *getEnclosingBlockWithSuccessors();

  /// \return the closest enclosing block starting from "this", which has
  /// predecessors. \return the root enclosing block if all enclosing blocks
  /// have no predecessors.
  VPBlockBase *getEnclosingBlockWithPredecessors();

  /// \return the successors either attached directly to this VPBlockBase or, if
  /// this VPBlockBase is the exit block of a VPRegionBlock and has no
  /// successors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has successors and return them. If no such
  /// VPRegionBlock exists, return the (empty) successors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalSuccessors() {
    return getEnclosingBlockWithSuccessors()->getSuccessors();
  }

  /// \return the hierarchical successor of this VPBlockBase if it has a single
  /// hierarchical successor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalSuccessor() {
    return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
  }

  /// \return the predecessors either attached directly to this VPBlockBase or,
  /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
  /// predecessors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has predecessors and return them. If no such
  /// VPRegionBlock exists, return the (empty) predecessors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalPredecessors() {
    return getEnclosingBlockWithPredecessors()->getPredecessors();
  }

  /// \return the hierarchical predecessor of this VPBlockBase if it has a
  /// single hierarchical predecessor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalPredecessor() {
    return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
  }

  /// Set a given VPBlockBase \p Successor as the single successor of this
  /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
  /// This VPBlockBase must have no successors.
  void setOneSuccessor(VPBlockBase *Successor) {
    assert(Successors.empty() && "Setting one successor when others exist.");
    assert(Successor->getParent() == getParent() &&
           "connected blocks must have the same parent");
    appendSuccessor(Successor);
  }

  /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
  /// successors of this VPBlockBase. This VPBlockBase is not added as
  /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
  /// successors.
  void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
    assert(Successors.empty() && "Setting two successors when others exist.");
    appendSuccessor(IfTrue);
    appendSuccessor(IfFalse);
  }

  /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
  /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
  /// as successor of any VPBasicBlock in \p NewPreds.
  void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
    assert(Predecessors.empty() && "Block predecessors already set.");
    for (auto *Pred : NewPreds)
      appendPredecessor(Pred);
  }

  /// Set each VPBasicBlock in \p NewSuccs as successor of this VPBlockBase.
  /// This VPBlockBase must have no successors. This VPBlockBase is not added
  /// as predecessor of any VPBasicBlock in \p NewSuccs.
  void setSuccessors(ArrayRef<VPBlockBase *> NewSuccs) {
    assert(Successors.empty() && "Block successors already set.");
    for (auto *Succ : NewSuccs)
      appendSuccessor(Succ);
  }

  /// Remove all the predecessors of this block.
  void clearPredecessors() { Predecessors.clear(); }

  /// Remove all the successors of this block.
  void clearSuccessors() { Successors.clear(); }

  /// The method which generates the output IR that corresponds to this
  /// VPBlockBase, thereby "executing" the VPlan.
  virtual void execute(VPTransformState *State) = 0;

  /// Return the cost of the block.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;

  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
  static void deleteCFG(VPBlockBase *Entry);

  /// Return true if it is legal to hoist instructions into this block.
  bool isLegalToHoistInto() {
    // There are currently no constraints that prevent an instruction to be
    // hoisted into a VPBlockBase.
    return true;
  }

  /// Replace all operands of VPUsers in the block with \p NewValue and also
  /// replaces all uses of VPValues defined in the block with NewValue.
  virtual void dropAllReferences(VPValue *NewValue) = 0;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printAsOperand(raw_ostream &OS, bool PrintType) const {
    OS << getName();
  }

  /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
  /// with \p Indent. \p SlotTracker is used to print unnamed VPValues using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  virtual void print(raw_ostream &O, const Twine &Indent,
                     VPSlotTracker &SlotTracker) const = 0;

  /// Print plain-text dump of this VPBlockBase to \p O.
  void print(raw_ostream &O) const {
    VPSlotTracker SlotTracker(getPlan());
    print(O, "", SlotTracker);
  }

  /// Print the successors of this block to \p O, prefixing all lines with \p
  /// Indent.
  void printSuccessors(raw_ostream &O, const Twine &Indent) const;

  /// Dump this VPBlockBase to dbgs().
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif

  /// Clone the current block and its recipes without updating the operands of
  /// the cloned recipes, including all blocks in the single-entry single-exit
  /// region for VPRegionBlocks.
  virtual VPBlockBase *clone() = 0;
};
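// An illustrative sketch, not part of the API: successor and predecessor
// edges are maintained independently, so wiring a conditional branch between
// blocks takes both a setTwoSuccessors call and per-block setPredecessors
// calls; `Pred`, `Then` and `Else` are assumed VPBlockBase pointers.
//
//   Pred->setTwoSuccessors(Then, Else);
//   Then->setPredecessors({Pred});
//   Else->setPredecessors({Pred});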
/// A value that is used outside the VPlan. The operand of the user needs to be
/// added to the associated phi node. The incoming block from VPlan is
/// determined by where the VPValue is defined: if it is defined by a recipe
/// outside a region, its parent block is used, otherwise the middle block is
/// used.
class VPLiveOut : public VPUser {
  PHINode *Phi;

public:
  VPLiveOut(PHINode *Phi, VPValue *Op)
      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::LiveOut;
  }

  /// Fix the wrapped phi node. This means adding an incoming value to exit
  /// block phis from the vector loop via the middle block (values from the
  /// scalar loop already reach these phis), and updating the value to scalar
  /// header phis from the scalar preheader.
  void fixPhi(VPlan &Plan, VPTransformState &State);

  /// Returns true if the VPLiveOut uses scalars of operand \p Op.
  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  PHINode *getPhi() const { return Phi; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPLiveOut to \p O.
  void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
#endif
};

/// Struct to hold various analyses needed for cost computations.
struct VPCostContext {
  const TargetTransformInfo &TTI;
  VPTypeAnalysis Types;
  LLVMContext &LLVMCtx;
  LoopVectorizationCostModel &CM;
  SmallPtrSet<Instruction *, 8> SkipCostComputation;

  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy,
                LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM)
      : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}

  /// Return the cost for \p UI with \p VF using the legacy cost model as
  /// fallback until computing the cost of all recipes migrates to VPlan.
  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;

  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
  /// has already been pre-computed.
  bool skipCostComputation(Instruction *UI, bool IsVector) const;
};
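// An illustrative sketch of how a cost computation might consult the context,
// not part of the API; `UI`, `VF` and `Ctx` are assumed to be in scope.
//
//   if (Ctx.skipCostComputation(UI, VF.isVector()))
//     return 0; // cost already accounted for elsewhere.
//   return Ctx.getLegacyCost(UI, VF);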
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// recipes must inherit from VPSingleDefRecipe instead of inheriting from both
/// VPRecipeBase and VPValue separately.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
                     public VPDef,
                     public VPUser {
  friend VPBasicBlock;
  friend class VPBlockUtils;

  /// Each VPRecipe belongs to a single VPBasicBlock.
  VPBasicBlock *Parent = nullptr;

  /// The debug location for the recipe.
  DebugLoc DL;

public:
  VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}

  template <typename IterT>
  VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
  virtual ~VPRecipeBase() = default;

  /// Clone the current recipe.
  virtual VPRecipeBase *clone() = 0;

  /// \return the VPBasicBlock which this VPRecipe belongs to.
  VPBasicBlock *getParent() { return Parent; }
  const VPBasicBlock *getParent() const { return Parent; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRecipe, thereby "executing" the VPlan.
  virtual void execute(VPTransformState &State) = 0;

  /// Return the cost of this recipe, taking into account if the cost
  /// computation should be skipped and the ForceTargetInstructionCost flag.
  /// Also takes care of printing the cost for debugging.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  /// Insert an unlinked recipe into a basic block immediately before
  /// the specified recipe.
  void insertBefore(VPRecipeBase *InsertPos);
  /// Insert an unlinked recipe into \p BB immediately before the insertion
  /// point \p IP.
  void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);

  /// Insert an unlinked Recipe into a basic block immediately after
  /// the specified Recipe.
  void insertAfter(VPRecipeBase *InsertPos);

  /// Unlink this recipe from its current VPBasicBlock and insert it into
  /// the VPBasicBlock that MovePos lives in, right after MovePos.
  void moveAfter(VPRecipeBase *MovePos);

  /// Unlink this recipe and insert into BB before I.
  ///
  /// \pre I is a valid iterator into BB.
  void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);

  /// This method unlinks 'this' from the containing basic block, but does not
  /// delete it.
  void removeFromParent();

  /// This method unlinks 'this' from the containing basic block and deletes it.
  ///
  /// \returns an iterator pointing to the element after the erased one.
  iplist<VPRecipeBase>::iterator eraseFromParent();

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPDef *D) {
    // All VPDefs are also VPRecipeBases.
    return true;
  }

  static inline bool classof(const VPUser *U) {
    return U->getVPUserID() == VPUser::VPUserID::Recipe;
  }

  /// Returns true if the recipe may have side-effects.
  bool mayHaveSideEffects() const;

  /// Returns true for PHI-like recipes.
  bool isPhi() const {
    return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
  }

  /// Returns true if the recipe may read from memory.
  bool mayReadFromMemory() const;

  /// Returns true if the recipe may write to memory.
  bool mayWriteToMemory() const;

  /// Returns true if the recipe may read from or write to memory.
  bool mayReadOrWriteMemory() const {
    return mayReadFromMemory() || mayWriteToMemory();
  }

  /// Returns the debug location of the recipe.
  DebugLoc getDebugLoc() const { return DL; }

protected:
  /// Compute the cost of this recipe using the legacy cost model and the
  /// underlying instructions.
  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
};
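// An illustrative sketch of the insertion API above, not part of the header:
// a newly created recipe starts unlinked and is placed relative to an
// existing one, after which the move* methods can relocate it; `NewR` and
// `ExistingR` are assumed VPRecipeBase pointers.
//
//   NewR->insertBefore(ExistingR); // link NewR right before ExistingR.
//   NewR->moveAfter(ExistingR);    // later, unlink and re-insert after it.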
// Helper macro to define common classof implementations for recipes.
#define VP_CLASSOF_IMPL(VPDefID)                                               \
  static inline bool classof(const VPDef *D) {                                 \
    return D->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPValue *V) {                               \
    auto *R = V->getDefiningRecipe();                                          \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPUser *U) {                                \
    auto *R = dyn_cast<VPRecipeBase>(U);                                       \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPRecipeBase *R) {                          \
    return R->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPSingleDefRecipe *R) {                     \
    return R->getVPDefID() == VPDefID;                                         \
  }
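// An illustrative sketch, not part of the header: the macro gives each
// concrete recipe classof overloads for VPDef, VPValue, VPUser and
// VPRecipeBase, so the usual cast machinery works from any of those bases.
//
//   if (auto *Widen = dyn_cast<VPWidenRecipe>(R)) // R is a VPRecipeBase *.
//     (void)Widen->getOpcode();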
/// VPSingleDefRecipe is a base class for recipes modeling a sequence of one or
/// more output IR instructions that define a single result VPValue.
/// Note that VPRecipeBase must be inherited from before VPValue.
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}

  static inline bool classof(const VPRecipeBase *R) {
    switch (R->getVPDefID()) {
    case VPRecipeBase::VPDerivedIVSC:
    case VPRecipeBase::VPEVLBasedIVPHISC:
    case VPRecipeBase::VPExpandSCEVSC:
    case VPRecipeBase::VPInstructionSC:
    case VPRecipeBase::VPReductionEVLSC:
    case VPRecipeBase::VPReductionSC:
    case VPRecipeBase::VPReplicateSC:
    case VPRecipeBase::VPScalarIVStepsSC:
    case VPRecipeBase::VPVectorPointerSC:
    case VPRecipeBase::VPWidenCallSC:
    case VPRecipeBase::VPWidenCanonicalIVSC:
    case VPRecipeBase::VPWidenCastSC:
    case VPRecipeBase::VPWidenGEPSC:
    case VPRecipeBase::VPWidenSC:
    case VPRecipeBase::VPWidenSelectSC:
    case VPRecipeBase::VPBlendSC:
    case VPRecipeBase::VPPredInstPHISC:
    case VPRecipeBase::VPCanonicalIVPHISC:
    case VPRecipeBase::VPActiveLaneMaskPHISC:
    case VPRecipeBase::VPFirstOrderRecurrencePHISC:
    case VPRecipeBase::VPWidenPHISC:
    case VPRecipeBase::VPWidenIntOrFpInductionSC:
    case VPRecipeBase::VPWidenPointerInductionSC:
    case VPRecipeBase::VPReductionPHISC:
    case VPRecipeBase::VPScalarCastSC:
      return true;
    case VPRecipeBase::VPInterleaveSC:
    case VPRecipeBase::VPBranchOnMaskSC:
    case VPRecipeBase::VPWidenLoadEVLSC:
    case VPRecipeBase::VPWidenLoadSC:
    case VPRecipeBase::VPWidenStoreEVLSC:
    case VPRecipeBase::VPWidenStoreSC:
      // TODO: Widened stores don't define a value, but widened loads do. Split
      // the recipes to be able to make widened loads VPSingleDefRecipes.
      return false;
    }
    llvm_unreachable("Unhandled VPDefID");
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  virtual VPSingleDefRecipe *clone() override = 0;

  /// Returns the underlying instruction.
  Instruction *getUnderlyingInstr() {
    return cast<Instruction>(getUnderlyingValue());
  }
  const Instruction *getUnderlyingInstr() const {
    return cast<Instruction>(getUnderlyingValue());
  }
};

/// Class to record LLVM IR flags for a recipe along with it.
class VPRecipeWithIRFlags : public VPSingleDefRecipe {
  enum class OperationType : unsigned char {
    Cmp,
    OverflowingBinOp,
    DisjointOp,
    PossiblyExactOp,
    GEPOp,
    FPMathOp,
    NonNegOp,
    Other
  };

public:
  struct WrapFlagsTy {
    char HasNUW : 1;
    char HasNSW : 1;

    WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
  };

  struct DisjointFlagsTy {
    char IsDisjoint : 1;
    DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
  };

protected:
  struct GEPFlagsTy {
    char IsInBounds : 1;
    GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
  };

private:
  struct ExactFlagsTy {
    char IsExact : 1;
  };
  struct NonNegFlagsTy {
    char NonNeg : 1;
  };
  struct FastMathFlagsTy {
    char AllowReassoc : 1;
    char NoNaNs : 1;
    char NoInfs : 1;
    char NoSignedZeros : 1;
    char AllowReciprocal : 1;
    char AllowContract : 1;
    char ApproxFunc : 1;

    FastMathFlagsTy(const FastMathFlags &FMF);
  };

  OperationType OpType;

  union {
    CmpInst::Predicate CmpPredicate;
    WrapFlagsTy WrapFlags;
    DisjointFlagsTy DisjointFlags;
    ExactFlagsTy ExactFlags;
    GEPFlagsTy GEPFlags;
    NonNegFlagsTy NonNegFlags;
    FastMathFlagsTy FMFs;
    unsigned AllFlags;
  };

protected:
  void transferFlags(VPRecipeWithIRFlags &Other) {
    OpType = Other.OpType;
    AllFlags = Other.AllFlags;
  }

public:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL) {
    OpType = OperationType::Other;
    AllFlags = 0;
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
      : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
    if (auto *Op = dyn_cast<CmpInst>(&I)) {
      OpType = OperationType::Cmp;
      CmpPredicate = Op->getPredicate();
    } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
      OpType = OperationType::DisjointOp;
      DisjointFlags.IsDisjoint = Op->isDisjoint();
    } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
      OpType = OperationType::OverflowingBinOp;
      WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
    } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
      OpType = OperationType::PossiblyExactOp;
      ExactFlags.IsExact = Op->isExact();
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      OpType = OperationType::GEPOp;
      GEPFlags.IsInBounds = GEP->isInBounds();
    } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
      OpType = OperationType::NonNegOp;
      NonNegFlags.NonNeg = PNNI->hasNonNeg();
    } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
      OpType = OperationType::FPMathOp;
      FMFs = Op->getFastMathFlags();
    } else {
      OpType = OperationType::Other;
      AllFlags = 0;
    }
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      CmpInst::Predicate Pred, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL),
        OpType(OperationType::Cmp), CmpPredicate(Pred) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      WrapFlagsTy WrapFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL),
        OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      FastMathFlags FMFs, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
        FMFs(FMFs) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
        DisjointFlags(DisjointFlags) {}

protected:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      GEPFlagsTy GEPFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
        GEPFlags(GEPFlags) {}

public:
  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Drop all poison-generating flags.
  void dropPoisonGeneratingFlags() {
    // NOTE: This needs to be kept in-sync with
    // Instruction::dropPoisonGeneratingFlags.
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      WrapFlags.HasNUW = false;
      WrapFlags.HasNSW = false;
      break;
    case OperationType::DisjointOp:
      DisjointFlags.IsDisjoint = false;
      break;
    case OperationType::PossiblyExactOp:
      ExactFlags.IsExact = false;
      break;
    case OperationType::GEPOp:
      GEPFlags.IsInBounds = false;
      break;
    case OperationType::FPMathOp:
      FMFs.NoNaNs = false;
      FMFs.NoInfs = false;
      break;
    case OperationType::NonNegOp:
      NonNegFlags.NonNeg = false;
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  /// Set the IR flags for \p I.
  void setFlags(Instruction *I) const {
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
      I->setHasNoSignedWrap(WrapFlags.HasNSW);
      break;
    case OperationType::DisjointOp:
      cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
      break;
    case OperationType::PossiblyExactOp:
      I->setIsExact(ExactFlags.IsExact);
      break;
    case OperationType::GEPOp:
      // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan.
      cast<GetElementPtrInst>(I)->setNoWrapFlags(
          GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds()
                              : GEPNoWrapFlags::none());
      break;
    case OperationType::FPMathOp:
      I->setHasAllowReassoc(FMFs.AllowReassoc);
      I->setHasNoNaNs(FMFs.NoNaNs);
      I->setHasNoInfs(FMFs.NoInfs);
      I->setHasNoSignedZeros(FMFs.NoSignedZeros);
      I->setHasAllowReciprocal(FMFs.AllowReciprocal);
      I->setHasAllowContract(FMFs.AllowContract);
      I->setHasApproxFunc(FMFs.ApproxFunc);
      break;
    case OperationType::NonNegOp:
      I->setNonNeg(NonNegFlags.NonNeg);
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  CmpInst::Predicate getPredicate() const {
    assert(OpType == OperationType::Cmp &&
           "recipe doesn't have a compare predicate");
    return CmpPredicate;
  }

  bool isInBounds() const {
    assert(OpType == OperationType::GEPOp &&
           "recipe doesn't have an inbounds flag");
    return GEPFlags.IsInBounds;
  }

  /// Returns true if the recipe has fast-math flags.
  bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

  FastMathFlags getFastMathFlags() const;

  bool hasNoUnsignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NUW flag");
    return WrapFlags.HasNUW;
  }

  bool hasNoSignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NSW flag");
    return WrapFlags.HasNSW;
  }

  bool isDisjoint() const {
    assert(OpType == OperationType::DisjointOp &&
           "recipe doesn't have a disjoint flag");
    return DisjointFlags.IsDisjoint;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printFlags(raw_ostream &O) const;
#endif
};
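// An illustrative sketch, not part of the header: flags survive cloning via
// transferFlags() and can be queried or cleared through the accessors above;
// `R` is an assumed VPRecipeWithIRFlags pointer wrapping an overflowing
// binary operation.
//
//   if (R->hasNoUnsignedWrap() || R->hasNoSignedWrap())
//     R->dropPoisonGeneratingFlags(); // clear nuw/nsw, e.g. before hoisting.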
/// This is a concrete Recipe that models a single VPlan-level instruction.
/// While, as any Recipe, it may generate a sequence of IR instructions when
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeWithIRFlags {
  friend class VPlanSlp;

public:
  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
  enum {
    FirstOrderRecurrenceSplice =
        Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
                                      // values of a first-order recurrence.
    Not,
    SLPLoad,
    SLPStore,
    ActiveLaneMask,
    ExplicitVectorLength,
    /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
    /// The first operand is the incoming value from the predecessor in VPlan,
    /// the second operand is the incoming value for all other predecessors
    /// (which are currently not modeled in VPlan).
    ResumePhi,
    CalculateTripCountMinusVF,
    // Increment the canonical IV separately for each unrolled part.
    CanonicalIVIncrementForPart,
    BranchOnCount,
    BranchOnCond,
    ComputeReductionResult,
    // Takes the VPValue to extract from as first operand and the lane or part
    // to extract as second operand, counting from the end starting with 1 for
    // last. The second operand must be a positive constant and <= VF when
    // extracting from a vector or <= UF when extracting from an unrolled
    // scalar.
    ExtractFromEnd,
    LogicalAnd, // Non-poison propagating logical And.
    // Add an offset in bytes (second operand) to a base pointer (first
    // operand). Only generates scalar values (either for the first lane only
    // or for all lanes, depending on its uses).
    PtrAdd,
  };

private:
  typedef unsigned char OpcodeTy;
  OpcodeTy Opcode;

  /// An optional name that can be used for the generated IR instruction.
  const std::string Name;

  /// Returns true if this VPInstruction generates scalar values for all lanes.
  /// Most VPInstructions generate a single value per part, either vector or
  /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
  /// values for all lanes, stemming from an original ingredient. This method
  /// identifies the (rare) cases of VPInstructions that do so as well, without
  /// an underlying ingredient.
  bool doesGeneratePerAllLanes() const;

  /// Returns true if we can generate a scalar for the first lane only, if
  /// needed.
  bool canGenerateScalarForFirstLane() const;

  /// Utility methods serving execute(): generates a single instance of the
  /// modeled instruction for a given part. \returns the generated value for \p
  /// Part. In some cases an existing value is returned rather than a generated
  /// one.
  Value *generatePerPart(VPTransformState &State, unsigned Part);

  /// Utility methods serving execute(): generates a scalar single instance of
  /// the modeled instruction for a given lane. \returns the scalar generated
  /// value for lane \p Lane.
  Value *generatePerLane(VPTransformState &State, const VPIteration &Lane);

#if !defined(NDEBUG)
  /// Return true if the VPInstruction is a floating point math operation, i.e.
  /// has fast-math flags.
  bool isFPMathOp() const;
#endif

public:
  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DebugLoc DL = {}, const Twine &Name = "")
      : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}

  VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
                VPValue *B, DebugLoc DL = {}, const Twine &Name = "");

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
        Opcode(Opcode), Name(Name.str()) {
    assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
  }

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");

  VP_CLASSOF_IMPL(VPDef::VPInstructionSC)

  VPInstruction *clone() override {
    SmallVector<VPValue *, 2> Operands(operands());
    auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
    New->transferFlags(*this);
    return New;
  }

  unsigned getOpcode() const { return Opcode; }

  /// Generate the instruction.
  /// TODO: We currently execute only per-part unless a specific instance is
  /// provided.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPInstruction to \p O.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;

  /// Print the VPInstruction to dbgs() (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif

  /// Return true if this instruction may modify memory.
  bool mayWriteToMemory() const {
    // TODO: we can use attributes of the called function to rule out memory
    // modifications.
    return Opcode == Instruction::Store || Opcode == Instruction::Call ||
           Opcode == Instruction::Invoke || Opcode == SLPStore;
  }

  bool hasResult() const {
    // CallInst may or may not have a result, depending on the called function.
    // Conservatively assume calls have results for now.
    switch (getOpcode()) {
    case Instruction::Ret:
    case Instruction::Br:
    case Instruction::Store:
    case Instruction::Switch:
    case Instruction::IndirectBr:
    case Instruction::Resume:
    case Instruction::CatchRet:
    case Instruction::Unreachable:
    case Instruction::Fence:
    case Instruction::AtomicRMW:
    case VPInstruction::BranchOnCond:
    case VPInstruction::BranchOnCount:
      return false;
    default:
      return true;
    }
  }

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override;

  /// Returns true if the recipe only uses the first part of operand \p Op.
  bool onlyFirstPartUsed(const VPValue *Op) const override;

  /// Returns true if this VPInstruction produces a scalar value from a vector,
  /// e.g. by performing a reduction or extracting a lane.
  bool isVectorToScalar() const;

  /// Returns true if this VPInstruction's operands are single scalars and the
  /// result is also a single scalar.
  bool isSingleScalar() const;
};
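// An illustrative sketch, not part of the header: building a VPInstruction
// with wrap flags and cloning it; `A` and `B` are assumed VPValue operands.
//
//   auto *Add = new VPInstruction(
//       Instruction::Add, {A, B},
//       VPRecipeWithIRFlags::WrapFlagsTy(/*HasNUW=*/true, /*HasNSW=*/false));
//   VPInstruction *Copy = Add->clone(); // same opcode, operands and flags.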
/// VPWidenRecipe is a recipe for producing a widened instruction using the
/// opcode and operands of the recipe. This recipe covers most of the
/// traditional vectorization cases where each recipe transforms into a
/// vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags {
  unsigned Opcode;

public:
  template <typename IterT>
  VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
        Opcode(I.getOpcode()) {}

  ~VPWidenRecipe() override = default;

  VPWidenRecipe *clone() override {
    auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
    R->transferFlags(*this);
    return R;
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenSC)

  /// Produce a widened instruction using the opcode and operands of the recipe,
  /// processing State.VF elements.
  void execute(VPTransformState &State) override;

  unsigned getOpcode() const { return Opcode; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// VPWidenCastRecipe is a recipe to create vector cast instructions.
class VPWidenCastRecipe : public VPRecipeWithIRFlags {
  /// Cast instruction opcode.
  Instruction::CastOps Opcode;

  /// Result type for the cast.
  Type *ResultTy;

public:
  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
                    CastInst &UI)
      : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
        ResultTy(ResultTy) {
    assert(UI.getOpcode() == Opcode &&
           "opcode of underlying cast doesn't match");
  }

  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
      : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
        ResultTy(ResultTy) {}

  ~VPWidenCastRecipe() override = default;

  VPWidenCastRecipe *clone() override {
    if (auto *UV = getUnderlyingValue())
      return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
                                   *cast<CastInst>(UV));

    return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)

  /// Produce widened copies of the cast.
  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  Instruction::CastOps getOpcode() const { return Opcode; }

  /// Returns the result type of the cast.
  Type *getResultType() const { return ResultTy; }
};

/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
class VPScalarCastRecipe : public VPSingleDefRecipe {
  Instruction::CastOps Opcode;

  Type *ResultTy;

  Value *generate(VPTransformState &State, unsigned Part);

public:
  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode),
        ResultTy(ResultTy) {}

  ~VPScalarCastRecipe() override = default;

  VPScalarCastRecipe *clone() override {
    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy);
  }

  VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)

  void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  /// Returns the result type of the cast.
  Type *getResultType() const { return ResultTy; }

  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    // At the moment, only uniform codegen is implemented.
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }
};

/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPSingleDefRecipe {
  /// ID of the vector intrinsic to call when widening the call. If set to
  /// Intrinsic::not_intrinsic, a library call will be used instead.
  Intrinsic::ID VectorIntrinsicID;
  /// If this recipe represents a library call, Variant stores a pointer to
  /// the chosen function. There is a 1:1 mapping between a given VF and the
  /// chosen vectorized variant, so there will be a different VPlan for each
  /// VF with a valid variant.
1534 Function *Variant; 1535 1536 public: 1537 template <typename IterT> 1538 VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments, 1539 Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {}, 1540 Function *Variant = nullptr) 1541 : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL), 1542 VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) { 1543 assert( 1544 isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) && 1545 "last operand must be the called function"); 1546 } 1547 1548 ~VPWidenCallRecipe() override = default; 1549 1550 VPWidenCallRecipe *clone() override { 1551 return new VPWidenCallRecipe(getUnderlyingValue(), operands(), 1552 VectorIntrinsicID, getDebugLoc(), Variant); 1553 } 1554 1555 VP_CLASSOF_IMPL(VPDef::VPWidenCallSC) 1556 1557 /// Produce a widened version of the call instruction. 1558 void execute(VPTransformState &State) override; 1559 1560 Function *getCalledScalarFunction() const { 1561 return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); 1562 } 1563 1564 operand_range arg_operands() { 1565 return make_range(op_begin(), op_begin() + getNumOperands() - 1); 1566 } 1567 const_operand_range arg_operands() const { 1568 return make_range(op_begin(), op_begin() + getNumOperands() - 1); 1569 } 1570 1571 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1572 /// Print the recipe. 1573 void print(raw_ostream &O, const Twine &Indent, 1574 VPSlotTracker &SlotTracker) const override; 1575 #endif 1576 }; 1577 1578 /// A recipe for widening select instructions. 1579 struct VPWidenSelectRecipe : public VPSingleDefRecipe { 1580 template <typename IterT> 1581 VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) 1582 : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I, 1583 I.getDebugLoc()) {} 1584 1585 ~VPWidenSelectRecipe() override = default; 1586 1587 VPWidenSelectRecipe *clone() override { 1588 return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()), 1589 operands()); 1590 } 1591 1592 VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) 1593 1594 /// Produce a widened version of the select instruction. 1595 void execute(VPTransformState &State) override; 1596 1597 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1598 /// Print the recipe. 1599 void print(raw_ostream &O, const Twine &Indent, 1600 VPSlotTracker &SlotTracker) const override; 1601 #endif 1602 1603 VPValue *getCond() const { 1604 return getOperand(0); 1605 } 1606 1607 bool isInvariantCond() const { 1608 return getCond()->isDefinedOutsideVectorRegions(); 1609 } 1610 }; 1611 1612 /// A recipe for handling GEP instructions. 
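/// For example (an illustrative sketch), widening
///   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
/// by VF=4 with a loop-varying %iv yields a GEP producing a <4 x ptr> vector
/// of addresses; if all operands are loop-invariant, a single scalar GEP can
/// be generated and broadcast instead.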
1613 class VPWidenGEPRecipe : public VPRecipeWithIRFlags { 1614 bool isPointerLoopInvariant() const { 1615 return getOperand(0)->isDefinedOutsideVectorRegions(); 1616 } 1617 1618 bool isIndexLoopInvariant(unsigned I) const { 1619 return getOperand(I + 1)->isDefinedOutsideVectorRegions(); 1620 } 1621 1622 bool areAllOperandsInvariant() const { 1623 return all_of(operands(), [](VPValue *Op) { 1624 return Op->isDefinedOutsideVectorRegions(); 1625 }); 1626 } 1627 1628 public: 1629 template <typename IterT> 1630 VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) 1631 : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {} 1632 1633 ~VPWidenGEPRecipe() override = default; 1634 1635 VPWidenGEPRecipe *clone() override { 1636 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()), 1637 operands()); 1638 } 1639 1640 VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) 1641 1642 /// Generate the gep nodes. 1643 void execute(VPTransformState &State) override; 1644 1645 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1646 /// Print the recipe. 1647 void print(raw_ostream &O, const Twine &Indent, 1648 VPSlotTracker &SlotTracker) const override; 1649 #endif 1650 }; 1651 1652 /// A recipe to compute the pointers for widened memory accesses of IndexTy for 1653 /// all parts. If IsReverse is true, compute pointers for accessing the input in 1654 /// reverse order per part. 1655 class VPVectorPointerRecipe : public VPRecipeWithIRFlags { 1656 Type *IndexedTy; 1657 bool IsReverse; 1658 1659 public: 1660 VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse, 1661 bool IsInBounds, DebugLoc DL) 1662 : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr), 1663 GEPFlagsTy(IsInBounds), DL), 1664 IndexedTy(IndexedTy), IsReverse(IsReverse) {} 1665 1666 VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) 1667 1668 void execute(VPTransformState &State) override; 1669 1670 bool onlyFirstLaneUsed(const VPValue *Op) const override { 1671 assert(is_contained(operands(), Op) && 1672 "Op must be an operand of the recipe"); 1673 return true; 1674 } 1675 1676 VPVectorPointerRecipe *clone() override { 1677 return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse, 1678 isInBounds(), getDebugLoc()); 1679 } 1680 1681 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1682 /// Print the recipe. 1683 void print(raw_ostream &O, const Twine &Indent, 1684 VPSlotTracker &SlotTracker) const override; 1685 #endif 1686 }; 1687 1688 /// A pure virtual base class for all recipes modeling header phis, including 1689 /// phis for first order recurrences, pointer inductions and reductions. The 1690 /// start value is the first operand of the recipe and the incoming value from 1691 /// the backedge is the second operand. 1692 /// 1693 /// Inductions are modeled using the following sub-classes: 1694 /// * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop, 1695 /// starting at a specified value (zero for the main vector loop, the resume 1696 /// value for the epilogue vector loop) and stepping by 1. The induction 1697 /// controls exiting of the vector loop by comparing against the vector trip 1698 /// count. Produces a single scalar PHI for the induction value per 1699 /// iteration. 1700 /// * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and 1701 /// floating point inductions with arbitrary start and step values. Produces 1702 /// a vector PHI per-part. 
1703 /// * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding 1704 /// value of an IV with different start and step values. Produces a single 1705 /// scalar value per iteration 1706 /// * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a 1707 /// canonical or derived induction. 1708 /// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a 1709 /// pointer induction. Produces either a vector PHI per-part or scalar values 1710 /// per-lane based on the canonical induction. 1711 class VPHeaderPHIRecipe : public VPSingleDefRecipe { 1712 protected: 1713 VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, 1714 VPValue *Start = nullptr, DebugLoc DL = {}) 1715 : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>(), UnderlyingInstr, DL) { 1716 if (Start) 1717 addOperand(Start); 1718 } 1719 1720 public: 1721 ~VPHeaderPHIRecipe() override = default; 1722 1723 /// Method to support type inquiry through isa, cast, and dyn_cast. 1724 static inline bool classof(const VPRecipeBase *B) { 1725 return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC && 1726 B->getVPDefID() <= VPDef::VPLastHeaderPHISC; 1727 } 1728 static inline bool classof(const VPValue *V) { 1729 auto *B = V->getDefiningRecipe(); 1730 return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC && 1731 B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC; 1732 } 1733 1734 /// Generate the phi nodes. 1735 void execute(VPTransformState &State) override = 0; 1736 1737 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1738 /// Print the recipe. 1739 void print(raw_ostream &O, const Twine &Indent, 1740 VPSlotTracker &SlotTracker) const override = 0; 1741 #endif 1742 1743 /// Returns the start value of the phi, if one is set. 1744 VPValue *getStartValue() { 1745 return getNumOperands() == 0 ? nullptr : getOperand(0); 1746 } 1747 VPValue *getStartValue() const { 1748 return getNumOperands() == 0 ? nullptr : getOperand(0); 1749 } 1750 1751 /// Update the start value of the recipe. 1752 void setStartValue(VPValue *V) { setOperand(0, V); } 1753 1754 /// Returns the incoming value from the loop backedge. 1755 virtual VPValue *getBackedgeValue() { 1756 return getOperand(1); 1757 } 1758 1759 /// Returns the backedge value as a recipe. The backedge value is guaranteed 1760 /// to be a recipe. 1761 virtual VPRecipeBase &getBackedgeRecipe() { 1762 return *getBackedgeValue()->getDefiningRecipe(); 1763 } 1764 }; 1765 1766 /// A recipe for handling phi nodes of integer and floating-point inductions, 1767 /// producing their vector values. 
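/// For example (an illustrative sketch), with VF=4 and UF=1, an integer
/// induction with start %s and step %d produces the vector phi
///   <%s, %s + %d, %s + 2 * %d, %s + 3 * %d>
/// on the first iteration, stepping by a splat of 4 * %d on the backedge.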
1768 class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { 1769 PHINode *IV; 1770 TruncInst *Trunc; 1771 const InductionDescriptor &IndDesc; 1772 1773 public: 1774 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, 1775 const InductionDescriptor &IndDesc) 1776 : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), 1777 Trunc(nullptr), IndDesc(IndDesc) { 1778 addOperand(Step); 1779 } 1780 1781 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, 1782 const InductionDescriptor &IndDesc, 1783 TruncInst *Trunc) 1784 : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), 1785 IV(IV), Trunc(Trunc), IndDesc(IndDesc) { 1786 addOperand(Step); 1787 } 1788 1789 ~VPWidenIntOrFpInductionRecipe() override = default; 1790 1791 VPWidenIntOrFpInductionRecipe *clone() override { 1792 return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(), 1793 getStepValue(), IndDesc, Trunc); 1794 } 1795 1796 VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) 1797 1798 /// Generate the vectorized and scalarized versions of the phi node as 1799 /// needed by their users. 1800 void execute(VPTransformState &State) override; 1801 1802 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1803 /// Print the recipe. 1804 void print(raw_ostream &O, const Twine &Indent, 1805 VPSlotTracker &SlotTracker) const override; 1806 #endif 1807 1808 VPValue *getBackedgeValue() override { 1809 // TODO: All operands of base recipe must exist and be at same index in 1810 // derived recipe. 1811 llvm_unreachable( 1812 "VPWidenIntOrFpInductionRecipe generates its own backedge value"); 1813 } 1814 1815 VPRecipeBase &getBackedgeRecipe() override { 1816 // TODO: All operands of base recipe must exist and be at same index in 1817 // derived recipe. 1818 llvm_unreachable( 1819 "VPWidenIntOrFpInductionRecipe generates its own backedge value"); 1820 } 1821 1822 /// Returns the step value of the induction. 1823 VPValue *getStepValue() { return getOperand(1); } 1824 const VPValue *getStepValue() const { return getOperand(1); } 1825 1826 /// Returns the first defined value as TruncInst, if it is one or nullptr 1827 /// otherwise. 1828 TruncInst *getTruncInst() { return Trunc; } 1829 const TruncInst *getTruncInst() const { return Trunc; } 1830 1831 PHINode *getPHINode() { return IV; } 1832 1833 /// Returns the induction descriptor for the recipe. 1834 const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } 1835 1836 /// Returns true if the induction is canonical, i.e. starting at 0 and 1837 /// incremented by UF * VF (= the original IV is incremented by 1) and has the 1838 /// same type as the canonical induction. 1839 bool isCanonical() const; 1840 1841 /// Returns the scalar type of the induction. 1842 Type *getScalarType() const { 1843 return Trunc ? Trunc->getType() : IV->getType(); 1844 } 1845 }; 1846 1847 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { 1848 const InductionDescriptor &IndDesc; 1849 1850 bool IsScalarAfterVectorization; 1851 1852 public: 1853 /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p 1854 /// Start. 
1855 VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step, 1856 const InductionDescriptor &IndDesc, 1857 bool IsScalarAfterVectorization) 1858 : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi), 1859 IndDesc(IndDesc), 1860 IsScalarAfterVectorization(IsScalarAfterVectorization) { 1861 addOperand(Start); 1862 addOperand(Step); 1863 } 1864 1865 ~VPWidenPointerInductionRecipe() override = default; 1866 1867 VPWidenPointerInductionRecipe *clone() override { 1868 return new VPWidenPointerInductionRecipe( 1869 cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1), 1870 IndDesc, IsScalarAfterVectorization); 1871 } 1872 1873 VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC) 1874 1875 /// Generate vector values for the pointer induction. 1876 void execute(VPTransformState &State) override; 1877 1878 /// Returns true if only scalar values will be generated. 1879 bool onlyScalarsGenerated(bool IsScalable); 1880 1881 /// Returns the induction descriptor for the recipe. 1882 const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } 1883 1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1885 /// Print the recipe. 1886 void print(raw_ostream &O, const Twine &Indent, 1887 VPSlotTracker &SlotTracker) const override; 1888 #endif 1889 }; 1890 1891 /// A recipe for handling phis that are widened in the vector loop. 1892 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are 1893 /// managed in the recipe directly. 1894 class VPWidenPHIRecipe : public VPSingleDefRecipe { 1895 /// List of incoming blocks. Only used in the VPlan native path. 1896 SmallVector<VPBasicBlock *, 2> IncomingBlocks; 1897 1898 public: 1899 /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. 1900 VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr) 1901 : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi) { 1902 if (Start) 1903 addOperand(Start); 1904 } 1905 1906 VPWidenPHIRecipe *clone() override { 1907 llvm_unreachable("cloning not implemented yet"); 1908 } 1909 1910 ~VPWidenPHIRecipe() override = default; 1911 1912 VP_CLASSOF_IMPL(VPDef::VPWidenPHISC) 1913 1914 /// Generate the phi/select nodes. 1915 void execute(VPTransformState &State) override; 1916 1917 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1918 /// Print the recipe. 1919 void print(raw_ostream &O, const Twine &Indent, 1920 VPSlotTracker &SlotTracker) const override; 1921 #endif 1922 1923 /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. 1924 void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { 1925 addOperand(IncomingV); 1926 IncomingBlocks.push_back(IncomingBlock); 1927 } 1928 1929 /// Returns the \p I th incoming VPBasicBlock. 1930 VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } 1931 1932 /// Returns the \p I th incoming VPValue. 1933 VPValue *getIncomingValue(unsigned I) { return getOperand(I); } 1934 }; 1935 1936 /// A recipe for handling first-order recurrence phis. The start value is the 1937 /// first operand of the recipe and the incoming value from the backedge is the 1938 /// second operand. 
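/// A first-order recurrence uses a value defined in a previous iteration,
/// e.g. (an illustrative C sketch):
///   for (int i = 1; i < n; ++i)
///     b[i] = a[i] + a[i - 1]; // a[i - 1] is carried by the recurrence phi.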
1939 struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { 1940 VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start) 1941 : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {} 1942 1943 VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC) 1944 1945 static inline bool classof(const VPHeaderPHIRecipe *R) { 1946 return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC; 1947 } 1948 1949 VPFirstOrderRecurrencePHIRecipe *clone() override { 1950 return new VPFirstOrderRecurrencePHIRecipe( 1951 cast<PHINode>(getUnderlyingInstr()), *getOperand(0)); 1952 } 1953 1954 void execute(VPTransformState &State) override; 1955 1956 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1957 /// Print the recipe. 1958 void print(raw_ostream &O, const Twine &Indent, 1959 VPSlotTracker &SlotTracker) const override; 1960 #endif 1961 }; 1962 1963 /// A recipe for handling reduction phis. The start value is the first operand 1964 /// of the recipe and the incoming value from the backedge is the second 1965 /// operand. 1966 class VPReductionPHIRecipe : public VPHeaderPHIRecipe { 1967 /// Descriptor for the reduction. 1968 const RecurrenceDescriptor &RdxDesc; 1969 1970 /// The phi is part of an in-loop reduction. 1971 bool IsInLoop; 1972 1973 /// The phi is part of an ordered reduction. Requires IsInLoop to be true. 1974 bool IsOrdered; 1975 1976 public: 1977 /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p 1978 /// RdxDesc. 1979 VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, 1980 VPValue &Start, bool IsInLoop = false, 1981 bool IsOrdered = false) 1982 : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), 1983 RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { 1984 assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); 1985 } 1986 1987 ~VPReductionPHIRecipe() override = default; 1988 1989 VPReductionPHIRecipe *clone() override { 1990 auto *R = 1991 new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc, 1992 *getOperand(0), IsInLoop, IsOrdered); 1993 R->addOperand(getBackedgeValue()); 1994 return R; 1995 } 1996 1997 VP_CLASSOF_IMPL(VPDef::VPReductionPHISC) 1998 1999 static inline bool classof(const VPHeaderPHIRecipe *R) { 2000 return R->getVPDefID() == VPDef::VPReductionPHISC; 2001 } 2002 2003 /// Generate the phi/select nodes. 2004 void execute(VPTransformState &State) override; 2005 2006 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2007 /// Print the recipe. 2008 void print(raw_ostream &O, const Twine &Indent, 2009 VPSlotTracker &SlotTracker) const override; 2010 #endif 2011 2012 const RecurrenceDescriptor &getRecurrenceDescriptor() const { 2013 return RdxDesc; 2014 } 2015 2016 /// Returns true, if the phi is part of an ordered reduction. 2017 bool isOrdered() const { return IsOrdered; } 2018 2019 /// Returns true, if the phi is part of an in-loop reduction. 2020 bool isInLoop() const { return IsInLoop; } 2021 }; 2022 2023 /// A recipe for vectorizing a phi-node as a sequence of mask-based select 2024 /// instructions. 2025 class VPBlendRecipe : public VPSingleDefRecipe { 2026 public: 2027 /// The blend operation is a User of the incoming values and of their 2028 /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first 2029 /// incoming value does not have a mask associated. 
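  /// For example (illustrative), a phi with three incoming values is modeled
  /// with the five operands [I0, I1, M1, I2, M2], and the generated code
  /// conceptually computes
  ///   select M2, I2, (select M1, I1, I0)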
2030 VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
2031     : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
2032   assert((Operands.size() + 1) % 2 == 0 &&
2033          "Expected an odd number of operands");
2034 }
2035
2036 VPBlendRecipe *clone() override {
2037   SmallVector<VPValue *> Ops(operands());
2038   return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
2039 }
2040
2041 VP_CLASSOF_IMPL(VPDef::VPBlendSC)
2042
2043 /// Return the number of incoming values, taking into account that the first
2044 /// incoming value has no mask.
2045 unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
2046
2047 /// Return incoming value number \p Idx.
2048 VPValue *getIncomingValue(unsigned Idx) const {
2049   return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
2050 }
2051
2052 /// Return mask number \p Idx.
2053 VPValue *getMask(unsigned Idx) const {
2054   assert(Idx > 0 && "First index has no mask associated.");
2055   return getOperand(Idx * 2);
2056 }
2057
2058 /// Generate the phi/select nodes.
2059 void execute(VPTransformState &State) override;
2060
2061 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2062 /// Print the recipe.
2063 void print(raw_ostream &O, const Twine &Indent,
2064            VPSlotTracker &SlotTracker) const override;
2065 #endif
2066
2067 /// Returns true if the recipe only uses the first lane of operand \p Op.
2068 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2069   assert(is_contained(operands(), Op) &&
2070          "Op must be an operand of the recipe");
2071   // Recurse only through Blend recipes; the recursion must terminate at
2072   // header phis at the latest.
2073   return all_of(users(),
2074                 [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
2075 }
2076 };
2077
2078 /// VPInterleaveRecipe is a recipe for transforming an interleave group of
2079 /// loads or stores into one wide load/store and shuffles. The first operand of
2080 /// a VPInterleave recipe is the address, followed by the stored values,
2081 /// followed by an optional mask.
2082 class VPInterleaveRecipe : public VPRecipeBase {
2083   const InterleaveGroup<Instruction> *IG;
2084
2085   /// Indicates if the interleave group is in a conditional block and requires a
2086   /// mask.
2087   bool HasMask = false;
2088
2089   /// Indicates if gaps between members of the group need to be masked out or if
2090   /// unused gaps can be loaded speculatively.
2091   bool NeedsMaskForGaps = false;
2092
2093 public:
2094   VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
2095                      ArrayRef<VPValue *> StoredValues, VPValue *Mask,
2096                      bool NeedsMaskForGaps)
2097       : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
2098         NeedsMaskForGaps(NeedsMaskForGaps) {
2099     for (unsigned i = 0; i < IG->getFactor(); ++i)
2100       if (Instruction *I = IG->getMember(i)) {
2101         if (I->getType()->isVoidTy())
2102           continue;
2103         new VPValue(I, this);
2104       }
2105
2106     for (auto *SV : StoredValues)
2107       addOperand(SV);
2108     if (Mask) {
2109       HasMask = true;
2110       addOperand(Mask);
2111     }
2112   }
2113   ~VPInterleaveRecipe() override = default;
2114
2115   VPInterleaveRecipe *clone() override {
2116     return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
2117                                   NeedsMaskForGaps);
2118   }
2119
2120   VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
2121
2122   /// Return the address accessed by this recipe.
2123   VPValue *getAddr() const {
2124     return getOperand(0); // Address is the 1st, mandatory operand.
2125   }
2126
2127   /// Return the mask used by this recipe. Note that a full mask is represented
2128   /// by a nullptr.
2129   VPValue *getMask() const {
2130     // Mask is optional and therefore the last operand.
2131     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
2132   }
2133
2134   /// Return the VPValues stored by this interleave group. If it is a load
2135   /// interleave group, return an empty ArrayRef.
2136   ArrayRef<VPValue *> getStoredValues() const {
2137     // The first operand is the address, followed by the stored values, followed
2138     // by an optional mask.
2139     return ArrayRef<VPValue *>(op_begin(), getNumOperands())
2140         .slice(1, getNumStoreOperands());
2141   }
2142
2143   /// Generate the wide load or store, and shuffles.
2144   void execute(VPTransformState &State) override;
2145
2146 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2147   /// Print the recipe.
2148   void print(raw_ostream &O, const Twine &Indent,
2149              VPSlotTracker &SlotTracker) const override;
2150 #endif
2151
2152   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
2153
2154   /// Returns the number of stored operands of this interleave group. Returns 0
2155   /// for load interleave groups.
2156   unsigned getNumStoreOperands() const {
2157     return getNumOperands() - (HasMask ? 2 : 1);
2158   }
2159
2160   /// The recipe only uses the first lane of the address.
2161   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2162     assert(is_contained(operands(), Op) &&
2163            "Op must be an operand of the recipe");
2164     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
2165   }
2166
2167   Instruction *getInsertPos() const { return IG->getInsertPos(); }
2168 };
2169
2170 /// A recipe to represent in-loop reduction operations, performing a reduction
2171 /// on a vector operand into a scalar value, and adding the result to a chain.
2172 /// The Operands are {ChainOp, VecOp, [Condition]}.
2173 class VPReductionRecipe : public VPSingleDefRecipe {
2174   /// The recurrence descriptor for the reduction in question.
2175   const RecurrenceDescriptor &RdxDesc;
2176   bool IsOrdered;
2177   /// Whether the reduction is conditional.
2178   bool IsConditional = false;
2179
2180 protected:
2181   VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
2182                     Instruction *I, ArrayRef<VPValue *> Operands,
2183                     VPValue *CondOp, bool IsOrdered)
2184       : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
2185     if (CondOp) {
2186       IsConditional = true;
2187       addOperand(CondOp);
2188     }
2189   }
2190
2191 public:
2192   VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
2193                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2194                     bool IsOrdered)
2195       : VPReductionRecipe(VPDef::VPReductionSC, R, I,
2196                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2197                           IsOrdered) {}
2198
2199   ~VPReductionRecipe() override = default;
2200
2201   VPReductionRecipe *clone() override {
2202     return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
2203                                  getVecOp(), getCondOp(), IsOrdered);
2204   }
2205
2206   static inline bool classof(const VPRecipeBase *R) {
2207     return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2208            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
2209   }
2210
2211   static inline bool classof(const VPUser *U) {
2212     auto *R = dyn_cast<VPRecipeBase>(U);
2213     return R && classof(R);
2214   }
2215
2216   /// Generate the reduction in the loop.
2217   void execute(VPTransformState &State) override;
2218
2219 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2220   /// Print the recipe.
2221   void print(raw_ostream &O, const Twine &Indent,
2222              VPSlotTracker &SlotTracker) const override;
2223 #endif
2224
2225   /// Return the recurrence descriptor for the in-loop reduction.
2226   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2227     return RdxDesc;
2228   }
2229   /// Return true if the in-loop reduction is ordered.
2230   bool isOrdered() const { return IsOrdered; }
2231   /// Return true if the in-loop reduction is conditional.
2232   bool isConditional() const { return IsConditional; }
2233   /// The VPValue of the scalar Chain being accumulated.
2234   VPValue *getChainOp() const { return getOperand(0); }
2235   /// The VPValue of the vector value to be reduced.
2236   VPValue *getVecOp() const { return getOperand(1); }
2237   /// The VPValue of the condition for the block.
2238   VPValue *getCondOp() const {
2239     return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
2240   }
2241 };
2242
2243 /// A recipe to represent in-loop reduction operations with vector-predication
2244 /// intrinsics, performing a reduction on a vector operand with the explicit
2245 /// vector length (EVL) into a scalar value, and adding the result to a chain.
2246 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
2247 class VPReductionEVLRecipe : public VPReductionRecipe {
2248 public:
2249   VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
2250       : VPReductionRecipe(
2251             VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
2252             cast_or_null<Instruction>(R->getUnderlyingValue()),
2253             ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
2254             R->isOrdered()) {}
2255
2256   ~VPReductionEVLRecipe() override = default;
2257
2258   VPReductionEVLRecipe *clone() override {
2259     llvm_unreachable("cloning not implemented yet");
2260   }
2261
2262   VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2263
2264   /// Generate the reduction in the loop.
2265   void execute(VPTransformState &State) override;
2266
2267 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2268   /// Print the recipe.
2269   void print(raw_ostream &O, const Twine &Indent,
2270              VPSlotTracker &SlotTracker) const override;
2271 #endif
2272
2273   /// The VPValue of the explicit vector length.
2274   VPValue *getEVL() const { return getOperand(2); }
2275
2276   /// Returns true if the recipe only uses the first lane of operand \p Op.
2277   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2278     assert(is_contained(operands(), Op) &&
2279            "Op must be an operand of the recipe");
2280     return Op == getEVL();
2281   }
2282 };
2283
2284 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
2285 /// copies of the original scalar type, one per lane, instead of producing a
2286 /// single copy of widened type for all lanes. If the instruction is known to
2287 /// be uniform, only one copy, for lane zero, will be generated.
2288 class VPReplicateRecipe : public VPRecipeWithIRFlags {
2289   /// Indicator if only a single replica per lane is needed.
2290   bool IsUniform;
2291
2292   /// Indicator if the replicas are also predicated.
2293   bool IsPredicated;
2294
2295 public:
2296   template <typename IterT>
2297   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
2298                     bool IsUniform, VPValue *Mask = nullptr)
2299       : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
2300         IsUniform(IsUniform), IsPredicated(Mask) {
2301     if (Mask)
2302       addOperand(Mask);
2303   }
2304
2305   ~VPReplicateRecipe() override = default;
2306
2307   VPReplicateRecipe *clone() override {
2308     auto *Copy =
2309         new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
2310                               isPredicated() ? getMask() : nullptr);
2311     Copy->transferFlags(*this);
2312     return Copy;
2313   }
2314
2315   VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
2316
2317   /// Generate replicas of the desired Ingredient. Replicas will be generated
2318   /// for all parts and lanes unless a specific part and lane are specified in
2319   /// the \p State.
2320   void execute(VPTransformState &State) override;
2321
2322 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2323   /// Print the recipe.
2324   void print(raw_ostream &O, const Twine &Indent,
2325              VPSlotTracker &SlotTracker) const override;
2326 #endif
2327
2328   bool isUniform() const { return IsUniform; }
2329
2330   bool isPredicated() const { return IsPredicated; }
2331
2332   /// Returns true if the recipe only uses the first lane of operand \p Op.
2333   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2334     assert(is_contained(operands(), Op) &&
2335            "Op must be an operand of the recipe");
2336     return isUniform();
2337   }
2338
2339   /// Returns true if the recipe uses scalars of operand \p Op.
2340   bool usesScalars(const VPValue *Op) const override {
2341     assert(is_contained(operands(), Op) &&
2342            "Op must be an operand of the recipe");
2343     return true;
2344   }
2345
2346   /// Returns true if the recipe is used by a widened recipe via an intervening
2347   /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
2348   /// in a vector.
2349   bool shouldPack() const;
2350
2351   /// Return the mask of a predicated VPReplicateRecipe.
2352   VPValue *getMask() {
2353     assert(isPredicated() && "Trying to get the mask of an unpredicated recipe");
2354     return getOperand(getNumOperands() - 1);
2355   }
2356
2357   unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
2358 };
2359
2360 /// A recipe for generating conditional branches on the bits of a mask.
2361 class VPBranchOnMaskRecipe : public VPRecipeBase {
2362 public:
2363   VPBranchOnMaskRecipe(VPValue *BlockInMask)
2364       : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
2365     if (BlockInMask) // nullptr means all-one mask.
2366       addOperand(BlockInMask);
2367   }
2368
2369   VPBranchOnMaskRecipe *clone() override {
2370     return new VPBranchOnMaskRecipe(getOperand(0));
2371   }
2372
2373   VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
2374
2375   /// Generate the extraction of the appropriate bit from the block mask and the
2376   /// conditional branch.
2377   void execute(VPTransformState &State) override;
2378
2379 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2380   /// Print the recipe.
2381   void print(raw_ostream &O, const Twine &Indent,
2382              VPSlotTracker &SlotTracker) const override {
2383     O << Indent << "BRANCH-ON-MASK ";
2384     if (VPValue *Mask = getMask())
2385       Mask->printAsOperand(O, SlotTracker);
2386     else
2387       O << " All-One";
2388   }
2389 #endif
2390
2391   /// Return the mask used by this recipe. Note that a full mask is represented
2392   /// by a nullptr.
2393   VPValue *getMask() const {
2394     assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
2395     // Mask is optional.
2396     return getNumOperands() == 1 ? getOperand(0) : nullptr;
2397   }
2398
2399   /// Returns true if the recipe uses scalars of operand \p Op.
2400   bool usesScalars(const VPValue *Op) const override {
2401     assert(is_contained(operands(), Op) &&
2402            "Op must be an operand of the recipe");
2403     return true;
2404   }
2405 };
2406
2407 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
2408 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
2409 /// order to merge values that are set under such a branch and feed their uses.
2410 /// The phi nodes can be scalar or vector depending on the users of the value.
2411 /// This recipe works in concert with VPBranchOnMaskRecipe.
2412 class VPPredInstPHIRecipe : public VPSingleDefRecipe {
2413 public:
2414   /// Construct a VPPredInstPHIRecipe given \p PredV, whose value needs phi
2415   /// nodes after merging back from a Branch-on-Mask.
2416   VPPredInstPHIRecipe(VPValue *PredV)
2417       : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
2418   ~VPPredInstPHIRecipe() override = default;
2419
2420   VPPredInstPHIRecipe *clone() override {
2421     return new VPPredInstPHIRecipe(getOperand(0));
2422   }
2423
2424   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
2425
2426   /// Generates phi nodes for live-outs as needed to retain SSA form.
2427   void execute(VPTransformState &State) override;
2428
2429 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2430   /// Print the recipe.
2431   void print(raw_ostream &O, const Twine &Indent,
2432              VPSlotTracker &SlotTracker) const override;
2433 #endif
2434
2435   /// Returns true if the recipe uses scalars of operand \p Op.
2436   bool usesScalars(const VPValue *Op) const override {
2437     assert(is_contained(operands(), Op) &&
2438            "Op must be an operand of the recipe");
2439     return true;
2440   }
2441 };
2442
2443 /// A common base class for widening memory operations. An optional mask can be
2444 /// provided as the last operand.
2445 class VPWidenMemoryRecipe : public VPRecipeBase {
2446 protected:
2447   Instruction &Ingredient;
2448
2449   /// Whether the accessed addresses are consecutive.
2450   bool Consecutive;
2451
2452   /// Whether the consecutive accessed addresses are in reverse order.
2453   bool Reverse;
2454
2455   /// Whether the memory access is masked.
2456   bool IsMasked = false;
2457
2458   void setMask(VPValue *Mask) {
2459     assert(!IsMasked && "cannot re-set mask");
2460     if (!Mask)
2461       return;
2462     addOperand(Mask);
2463     IsMasked = true;
2464   }
2465
2466   VPWidenMemoryRecipe(const unsigned char SC, Instruction &I,
2467                       std::initializer_list<VPValue *> Operands,
2468                       bool Consecutive, bool Reverse, DebugLoc DL)
2469       : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
2470         Reverse(Reverse) {
2471     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
2472   }
2473
2474 public:
2475   VPWidenMemoryRecipe *clone() override {
2476     llvm_unreachable("cloning not supported");
2477   }
2478
2479   static inline bool classof(const VPRecipeBase *R) {
2480     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
2481            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
2482            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
2483            R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
2484   }
2485
2486   static inline bool classof(const VPUser *U) {
2487     auto *R = dyn_cast<VPRecipeBase>(U);
2488     return R && classof(R);
2489   }
2490
2491   /// Return whether the loaded-from / stored-to addresses are consecutive.
2492   bool isConsecutive() const { return Consecutive; }
2493
2494   /// Return whether the consecutive loaded/stored addresses are in reverse
2495   /// order.
2496   bool isReverse() const { return Reverse; }
2497
2498   /// Return the address accessed by this recipe.
2499   VPValue *getAddr() const { return getOperand(0); }
2500
2501   /// Returns true if the recipe is masked.
2502   bool isMasked() const { return IsMasked; }
2503
2504   /// Return the mask used by this recipe. Note that a full mask is represented
2505   /// by a nullptr.
2506   VPValue *getMask() const {
2507     // Mask is optional and therefore the last operand.
2508     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
2509   }
2510
2511   /// Generate the wide load/store.
2512   void execute(VPTransformState &State) override {
2513     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
2514   }
2515
2516   Instruction &getIngredient() const { return Ingredient; }
2517 };
2518
2519 /// A recipe for widening load operations, using the address to load from and an
2520 /// optional mask.
2521 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
2522   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
2523                     bool Consecutive, bool Reverse, DebugLoc DL)
2524       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
2525                             Reverse, DL),
2526         VPValue(this, &Load) {
2527     setMask(Mask);
2528   }
2529
2530   VPWidenLoadRecipe *clone() override {
2531     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
2532                                  getMask(), Consecutive, Reverse,
2533                                  getDebugLoc());
2534   }
2535
2536   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
2537
2538   /// Generate a wide load or gather.
2539   void execute(VPTransformState &State) override;
2540
2541 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2542   /// Print the recipe.
2543   void print(raw_ostream &O, const Twine &Indent,
2544              VPSlotTracker &SlotTracker) const override;
2545 #endif
2546
2547   /// Returns true if the recipe only uses the first lane of operand \p Op.
2548   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2549     assert(is_contained(operands(), Op) &&
2550            "Op must be an operand of the recipe");
2551     // Widened, consecutive load operations only demand the first lane of
2552     // their address.
2553 return Op == getAddr() && isConsecutive(); 2554 } 2555 }; 2556 2557 /// A recipe for widening load operations with vector-predication intrinsics, 2558 /// using the address to load from, the explicit vector length and an optional 2559 /// mask. 2560 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { 2561 VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask) 2562 : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(), 2563 {L->getAddr(), EVL}, L->isConsecutive(), 2564 L->isReverse(), L->getDebugLoc()), 2565 VPValue(this, &getIngredient()) { 2566 setMask(Mask); 2567 } 2568 2569 VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC) 2570 2571 /// Return the EVL operand. 2572 VPValue *getEVL() const { return getOperand(1); } 2573 2574 /// Generate the wide load or gather. 2575 void execute(VPTransformState &State) override; 2576 2577 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2578 /// Print the recipe. 2579 void print(raw_ostream &O, const Twine &Indent, 2580 VPSlotTracker &SlotTracker) const override; 2581 #endif 2582 2583 /// Returns true if the recipe only uses the first lane of operand \p Op. 2584 bool onlyFirstLaneUsed(const VPValue *Op) const override { 2585 assert(is_contained(operands(), Op) && 2586 "Op must be an operand of the recipe"); 2587 // Widened loads only demand the first lane of EVL and consecutive loads 2588 // only demand the first lane of their address. 2589 return Op == getEVL() || (Op == getAddr() && isConsecutive()); 2590 } 2591 }; 2592 2593 /// A recipe for widening store operations, using the stored value, the address 2594 /// to store to and an optional mask. 2595 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { 2596 VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, 2597 VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL) 2598 : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, 2599 Consecutive, Reverse, DL) { 2600 setMask(Mask); 2601 } 2602 2603 VPWidenStoreRecipe *clone() override { 2604 return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(), 2605 getStoredValue(), getMask(), Consecutive, 2606 Reverse, getDebugLoc()); 2607 } 2608 2609 VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); 2610 2611 /// Return the value stored by this recipe. 2612 VPValue *getStoredValue() const { return getOperand(1); } 2613 2614 /// Generate a wide store or scatter. 2615 void execute(VPTransformState &State) override; 2616 2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2618 /// Print the recipe. 2619 void print(raw_ostream &O, const Twine &Indent, 2620 VPSlotTracker &SlotTracker) const override; 2621 #endif 2622 2623 /// Returns true if the recipe only uses the first lane of operand \p Op. 2624 bool onlyFirstLaneUsed(const VPValue *Op) const override { 2625 assert(is_contained(operands(), Op) && 2626 "Op must be an operand of the recipe"); 2627 // Widened, consecutive stores only demand the first lane of their address, 2628 // unless the same operand is also stored. 2629 return Op == getAddr() && isConsecutive() && Op != getStoredValue(); 2630 } 2631 }; 2632 2633 /// A recipe for widening store operations with vector-predication intrinsics, 2634 /// using the value to store, the address to store to, the explicit vector 2635 /// length and an optional mask. 
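/// For example (an illustrative sketch), a consecutive, non-reverse store
/// widened by 4 may generate
///   call void @llvm.vp.store.v4i32.p0(<4 x i32> %val, ptr %addr,
///                                     <4 x i1> %mask, i32 %evl)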
2636 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
2637   VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
2638       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
2639                             {S->getAddr(), S->getStoredValue(), EVL},
2640                             S->isConsecutive(), S->isReverse(),
2641                             S->getDebugLoc()) {
2642     setMask(Mask);
2643   }
2644
2645   VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
2646
2647   /// Return the value stored by this recipe.
2648   VPValue *getStoredValue() const { return getOperand(1); }
2649
2650   /// Return the EVL operand.
2651   VPValue *getEVL() const { return getOperand(2); }
2652
2653   /// Generate the wide store or scatter.
2654   void execute(VPTransformState &State) override;
2655
2656 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2657   /// Print the recipe.
2658   void print(raw_ostream &O, const Twine &Indent,
2659              VPSlotTracker &SlotTracker) const override;
2660 #endif
2661
2662   /// Returns true if the recipe only uses the first lane of operand \p Op.
2663   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2664     assert(is_contained(operands(), Op) &&
2665            "Op must be an operand of the recipe");
2666     if (Op == getEVL()) {
2667       assert(getStoredValue() != Op && "unexpected store of EVL");
2668       return true;
2669     }
2670     // Widened, consecutive memory operations only demand the first lane of
2671     // their address, unless the same operand is also stored. The latter can
2672     // happen with opaque pointers.
2673     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
2674   }
2675 };
2676
2677 /// Recipe to expand a SCEV expression.
2678 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
2679   const SCEV *Expr;
2680   ScalarEvolution &SE;
2681
2682 public:
2683   VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
2684       : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {}
2685
2686   ~VPExpandSCEVRecipe() override = default;
2687
2688   VPExpandSCEVRecipe *clone() override {
2689     return new VPExpandSCEVRecipe(Expr, SE);
2690   }
2691
2692   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
2693
2694   /// Expand the SCEV expression into a runtime value.
2695   void execute(VPTransformState &State) override;
2696
2697 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2698   /// Print the recipe.
2699   void print(raw_ostream &O, const Twine &Indent,
2700              VPSlotTracker &SlotTracker) const override;
2701 #endif
2702
2703   const SCEV *getSCEV() const { return Expr; }
2704 };
2705
2706 /// Canonical scalar induction phi of the vector loop, starting at the
2707 /// specified start value (either 0 or the resume value when vectorizing the
2708 /// epilogue loop). VPWidenCanonicalIVRecipe represents the vector version of
2709 /// the canonical induction variable.
2710 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
2711 public:
2712   VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
2713       : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
2714
2715   ~VPCanonicalIVPHIRecipe() override = default;
2716
2717   VPCanonicalIVPHIRecipe *clone() override {
2718     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
2719     R->addOperand(getBackedgeValue());
2720     return R;
2721   }
2722
2723   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
2724
2725   static inline bool classof(const VPHeaderPHIRecipe *D) {
2726     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
2727   }
2728
2729   /// Generate the canonical scalar induction phi of the vector loop.
2730   void execute(VPTransformState &State) override;
2731
2732 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2733   /// Print the recipe.
2734   void print(raw_ostream &O, const Twine &Indent,
2735              VPSlotTracker &SlotTracker) const override;
2736 #endif
2737
2738   /// Returns the scalar type of the induction.
2739   Type *getScalarType() const {
2740     return getStartValue()->getLiveInIRValue()->getType();
2741   }
2742
2743   /// Returns true if the recipe only uses the first lane of operand \p Op.
2744   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2745     assert(is_contained(operands(), Op) &&
2746            "Op must be an operand of the recipe");
2747     return true;
2748   }
2749
2750   /// Returns true if the recipe only uses the first part of operand \p Op.
2751   bool onlyFirstPartUsed(const VPValue *Op) const override {
2752     assert(is_contained(operands(), Op) &&
2753            "Op must be an operand of the recipe");
2754     return true;
2755   }
2756
2757   /// Check if the induction described by \p Kind, \p Start and \p Step is
2758   /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
2759   bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
2760                    VPValue *Step) const;
2761 };
2762
2763 /// A recipe for generating the active lane mask for the vector loop that is
2764 /// used to predicate the vector operations.
2765 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2766 /// remove VPActiveLaneMaskPHIRecipe.
2767 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
2768 public:
2769   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
2770       : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
2771                           DL) {}
2772
2773   ~VPActiveLaneMaskPHIRecipe() override = default;
2774
2775   VPActiveLaneMaskPHIRecipe *clone() override {
2776     return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2777   }
2778
2779   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
2780
2781   static inline bool classof(const VPHeaderPHIRecipe *D) {
2782     return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
2783   }
2784
2785   /// Generate the active lane mask phi of the vector loop.
2786   void execute(VPTransformState &State) override;
2787
2788 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2789   /// Print the recipe.
2790   void print(raw_ostream &O, const Twine &Indent,
2791              VPSlotTracker &SlotTracker) const override;
2792 #endif
2793 };
2794
2795 /// A recipe for generating the phi node for the current index of elements,
2796 /// adjusted in accordance with the EVL value. It starts at the start value of
2797 /// the canonical induction and gets incremented by EVL in each iteration of
2798 /// the vector loop.
2799 class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
2800 public:
2801   VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
2802       : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
2803
2804   ~VPEVLBasedIVPHIRecipe() override = default;
2805
2806   VPEVLBasedIVPHIRecipe *clone() override {
2807     llvm_unreachable("cloning not implemented yet");
2808   }
2809
2810   VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
2811
2812   static inline bool classof(const VPHeaderPHIRecipe *D) {
2813     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
2814   }
2815
2816   /// Generate phi for handling IV based on EVL over iterations correctly.
2817   /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
2818   void execute(VPTransformState &State) override;
2819
2820   /// Returns true if the recipe only uses the first lane of operand \p Op.
2821   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2822     assert(is_contained(operands(), Op) &&
2823            "Op must be an operand of the recipe");
2824     return true;
2825   }
2826
2827 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2828   /// Print the recipe.
2829   void print(raw_ostream &O, const Twine &Indent,
2830              VPSlotTracker &SlotTracker) const override;
2831 #endif
2832 };
2833
2834 /// A Recipe for widening the canonical induction variable of the vector loop.
2835 class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
2836 public:
2837   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
2838       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
2839
2840   ~VPWidenCanonicalIVRecipe() override = default;
2841
2842   VPWidenCanonicalIVRecipe *clone() override {
2843     return new VPWidenCanonicalIVRecipe(
2844         cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
2845   }
2846
2847   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
2848
2849   /// Generate a canonical vector induction variable of the vector loop, with
2850   /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
2851   /// step = <VF*UF, VF*UF, ..., VF*UF>.
2852   void execute(VPTransformState &State) override;
2853
2854 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2855   /// Print the recipe.
2856   void print(raw_ostream &O, const Twine &Indent,
2857              VPSlotTracker &SlotTracker) const override;
2858 #endif
2859 };
2860
2861 /// A recipe for converting the input \p IV value to the corresponding value
2862 /// of an IV with different start and step values, using Start + IV *
2863 /// Step.
2864 class VPDerivedIVRecipe : public VPSingleDefRecipe {
2865   /// Kind of the induction.
2866   const InductionDescriptor::InductionKind Kind;
2867   /// If not nullptr, the floating point induction binary operator. Must be set
2868   /// for floating point inductions.
2869   const FPMathOperator *FPBinOp;
2870
2871 public:
2872   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
2873                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
2874       : VPDerivedIVRecipe(
2875             IndDesc.getKind(),
2876             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
2877             Start, CanonicalIV, Step) {}
2878
2879   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
2880                     const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
2881                     VPValue *Step)
2882       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
2883         FPBinOp(FPBinOp) {}
2884
2885   ~VPDerivedIVRecipe() override = default;
2886
2887   VPDerivedIVRecipe *clone() override {
2888     return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
2889                                  getStepValue());
2890   }
2891
2892   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
2893
2894   /// Generate the transformed value of the induction at offset StartValue (1.
2895   /// operand) + IV (2. operand) * StepValue (3. operand).
2896   void execute(VPTransformState &State) override;
2897
2898 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2899   /// Print the recipe.
2900   void print(raw_ostream &O, const Twine &Indent,
2901              VPSlotTracker &SlotTracker) const override;
2902 #endif
2903
2904   Type *getScalarType() const {
2905     return getStartValue()->getLiveInIRValue()->getType();
2906   }
2907
2908   VPValue *getStartValue() const { return getOperand(0); }
2909   VPValue *getStepValue() const { return getOperand(2); }
2910
2911   /// Returns true if the recipe only uses the first lane of operand \p Op.
2912   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2913     assert(is_contained(operands(), Op) &&
2914            "Op must be an operand of the recipe");
2915     return true;
2916   }
2917 };
2918
2919 /// A recipe for handling phi nodes of integer and floating-point inductions,
2920 /// producing their scalar values.
2921 class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
2922   Instruction::BinaryOps InductionOpcode;
2923
2924 public:
2925   VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
2926                         Instruction::BinaryOps Opcode, FastMathFlags FMFs)
2927       : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
2928                             ArrayRef<VPValue *>({IV, Step}), FMFs),
2929         InductionOpcode(Opcode) {}
2930
2931   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
2932                         VPValue *Step)
2933       : VPScalarIVStepsRecipe(
2934             IV, Step, IndDesc.getInductionOpcode(),
2935             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
2936                 ? IndDesc.getInductionBinOp()->getFastMathFlags()
2937                 : FastMathFlags()) {}
2938
2939   ~VPScalarIVStepsRecipe() override = default;
2940
2941   VPScalarIVStepsRecipe *clone() override {
2942     return new VPScalarIVStepsRecipe(
2943         getOperand(0), getOperand(1), InductionOpcode,
2944         hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
2945   }
2946
2947   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
2948
2949   /// Generate the scalarized versions of the phi node as needed by its users.
2950   void execute(VPTransformState &State) override;
2951
2952 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2953   /// Print the recipe.
2954   void print(raw_ostream &O, const Twine &Indent,
2955              VPSlotTracker &SlotTracker) const override;
2956 #endif
2957
2958   VPValue *getStepValue() const { return getOperand(1); }
2959
2960   /// Returns true if the recipe only uses the first lane of operand \p Op.
2961   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2962     assert(is_contained(operands(), Op) &&
2963            "Op must be an operand of the recipe");
2964     return true;
2965   }
2966 };
2967
2968 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
2969 /// holds a sequence of zero or more VPRecipes, each representing a sequence of
2970 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
2971 class VPBasicBlock : public VPBlockBase {
2972 public:
2973   using RecipeListTy = iplist<VPRecipeBase>;
2974
2975 protected:
2976   /// The VPRecipes held in the order of output instructions to generate.
2977   RecipeListTy Recipes;
2978
2979   VPBasicBlock(const unsigned char BlockSC, const Twine &Name = "")
2980       : VPBlockBase(BlockSC, Name.str()) {}
2981
2982 public:
2983   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
2984       : VPBlockBase(VPBasicBlockSC, Name.str()) {
2985     if (Recipe)
2986       appendRecipe(Recipe);
2987   }
2988
2989   ~VPBasicBlock() override {
2990     while (!Recipes.empty())
2991       Recipes.pop_back();
2992   }
2993
2994   /// Instruction iterators...
2995   using iterator = RecipeListTy::iterator;
2996   using const_iterator = RecipeListTy::const_iterator;
2997   using reverse_iterator = RecipeListTy::reverse_iterator;
2998   using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
2999
3000   //===--------------------------------------------------------------------===//
3001   /// Recipe iterator methods
3002   ///
3003   inline iterator begin() { return Recipes.begin(); }
3004   inline const_iterator begin() const { return Recipes.begin(); }
3005   inline iterator end() { return Recipes.end(); }
3006   inline const_iterator end() const { return Recipes.end(); }
3007
3008   inline reverse_iterator rbegin() { return Recipes.rbegin(); }
3009   inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
3010   inline reverse_iterator rend() { return Recipes.rend(); }
3011   inline const_reverse_iterator rend() const { return Recipes.rend(); }
3012
3013   inline size_t size() const { return Recipes.size(); }
3014   inline bool empty() const { return Recipes.empty(); }
3015   inline const VPRecipeBase &front() const { return Recipes.front(); }
3016   inline VPRecipeBase &front() { return Recipes.front(); }
3017   inline const VPRecipeBase &back() const { return Recipes.back(); }
3018   inline VPRecipeBase &back() { return Recipes.back(); }
3019
3020   /// Returns a reference to the list of recipes.
3021   RecipeListTy &getRecipeList() { return Recipes; }
3022
3023   /// Returns a pointer to a member of the recipe list.
3024   static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
3025     return &VPBasicBlock::Recipes;
3026   }
3027
3028   /// Method to support type inquiry through isa, cast, and dyn_cast.
3029   static inline bool classof(const VPBlockBase *V) {
3030     return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC ||
3031            V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3032   }
3033
3034   void insert(VPRecipeBase *Recipe, iterator InsertPt) {
3035     assert(Recipe && "No recipe to append.");
3036     assert(!Recipe->Parent && "Recipe already in VPlan");
3037     Recipe->Parent = this;
3038     Recipes.insert(InsertPt, Recipe);
3039   }
3040
3041   /// Augment the existing recipes of a VPBasicBlock with an additional
3042   /// \p Recipe as the last recipe.
3043   void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
3044
3045   /// The method which generates the output IR instructions that correspond to
3046   /// this VPBasicBlock, thereby "executing" the VPlan.
3047   void execute(VPTransformState *State) override;
3048
3049   /// Return the cost of this VPBasicBlock.
3050   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3051
3052   /// Return the position of the first non-phi node recipe in the block.
3053   iterator getFirstNonPhi();
3054
3055   /// Returns an iterator range over the PHI-like recipes in the block.
3056   iterator_range<iterator> phis() {
3057     return make_range(begin(), getFirstNonPhi());
3058   }
3059
3060   void dropAllReferences(VPValue *NewValue) override;
3061
3062   /// Split current block at \p SplitAt by inserting a new block between the
3063   /// current block and its successors and moving all recipes starting at
3064   /// SplitAt to the new block. Returns the new block.
3065   VPBasicBlock *splitAt(iterator SplitAt);
3066
3067   VPRegionBlock *getEnclosingLoopRegion();
3068
3069 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3070   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
3071   /// SlotTracker is used to print unnamed VPValues using consecutive numbers.

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
  /// SlotTracker is used to print unnamed VPValue's using consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
  using VPBlockBase::print; // Get the print(raw_ostream &O) version.
#endif

  /// If the block has multiple successors, return the branch recipe
  /// terminating the block. If there is no successor or only a single one,
  /// return nullptr.
  VPRecipeBase *getTerminator();
  const VPRecipeBase *getTerminator() const;

  /// Returns true if the block is exiting its parent region.
  bool isExiting() const;

  /// Clone the current block and its recipes, without updating the operands
  /// of the cloned recipes.
  VPBasicBlock *clone() override {
    auto *NewBlock = new VPBasicBlock(getName());
    for (VPRecipeBase &R : *this)
      NewBlock->appendRecipe(R.clone());
    return NewBlock;
  }

protected:
  /// Execute the recipes in the IR basic block \p BB.
  void executeRecipes(VPTransformState *State, BasicBlock *BB);

private:
  /// Create an IR BasicBlock to hold the output instructions generated by this
  /// VPBasicBlock, and return it. Update the CFGState accordingly.
  BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
};

/// A special type of VPBasicBlock that wraps an existing IR basic block.
/// Recipes of the block get added before the first non-phi instruction in the
/// wrapped block.
/// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
/// preheader block.
class VPIRBasicBlock : public VPBasicBlock {
  BasicBlock *IRBB;

public:
  VPIRBasicBlock(BasicBlock *IRBB)
      : VPBasicBlock(VPIRBasicBlockSC,
                     (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
        IRBB(IRBB) {}

  ~VPIRBasicBlock() override {}

  static inline bool classof(const VPBlockBase *V) {
    return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
  }

  /// The method which generates the output IR instructions that correspond to
  /// this VPBasicBlock, thereby "executing" the VPlan.
  void execute(VPTransformState *State) override;

  VPIRBasicBlock *clone() override {
    auto *NewBlock = new VPIRBasicBlock(IRBB);
    for (VPRecipeBase &R : Recipes)
      NewBlock->appendRecipe(R.clone());
    return NewBlock;
  }

  BasicBlock *getIRBasicBlock() const { return IRBB; }
};

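// Example (illustrative sketch only; `VPBB` and a recipe `R` it contains are
// assumed to exist): splitting moves the recipes starting at the split point
// into a new block, which takes over VPBB's successors.
//
//   VPBasicBlock *Tail = VPBB->splitAt(R.getIterator());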

/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
/// A VPRegionBlock may indicate that its contents are to be replicated several
/// times. This is designed to support predicated scalarization, in which a
/// scalar if-then code structure needs to be generated VF * UF times. Having
/// this replication indicator helps to keep a single model for multiple
/// candidate VFs. The actual replication takes place only once the desired VF
/// and UF have been determined.
class VPRegionBlock : public VPBlockBase {
  /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
  VPBlockBase *Entry;

  /// Hold the Single Exiting block of the SESE region modelled by the
  /// VPRegionBlock.
  VPBlockBase *Exiting;

  /// An indicator whether this region is to generate multiple replicated
  /// instances of output IR corresponding to its VPBlockBases.
  bool IsReplicator;

public:
  VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
                const std::string &Name = "", bool IsReplicator = false)
      : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
        IsReplicator(IsReplicator) {
    assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
    assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
    Entry->setParent(this);
    Exiting->setParent(this);
  }
  VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
      : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
        IsReplicator(IsReplicator) {}

  ~VPRegionBlock() override {
    if (Entry) {
      VPValue DummyValue;
      Entry->dropAllReferences(&DummyValue);
      deleteCFG(Entry);
    }
  }

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPBlockBase *V) {
    return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
  }

  const VPBlockBase *getEntry() const { return Entry; }
  VPBlockBase *getEntry() { return Entry; }

  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
  /// EntryBlock must have no predecessors.
  void setEntry(VPBlockBase *EntryBlock) {
    assert(EntryBlock->getPredecessors().empty() &&
           "Entry block cannot have predecessors.");
    Entry = EntryBlock;
    EntryBlock->setParent(this);
  }

  const VPBlockBase *getExiting() const { return Exiting; }
  VPBlockBase *getExiting() { return Exiting; }

  /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
  /// ExitingBlock must have no successors.
  void setExiting(VPBlockBase *ExitingBlock) {
    assert(ExitingBlock->getSuccessors().empty() &&
           "Exit block cannot have successors.");
    Exiting = ExitingBlock;
    ExitingBlock->setParent(this);
  }

  /// Returns the pre-header VPBasicBlock of the loop region.
  VPBasicBlock *getPreheaderVPBB() {
    assert(!isReplicator() && "should only get pre-header of loop regions");
    return getSinglePredecessor()->getExitingBasicBlock();
  }

  /// An indicator whether this region is to generate multiple replicated
  /// instances of output IR corresponding to its VPBlockBases.
  bool isReplicator() const { return IsReplicator; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRegionBlock, thereby "executing" the VPlan.
  void execute(VPTransformState *State) override;

  /// Return the cost of this region.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;

  void dropAllReferences(VPValue *NewValue) override;

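  // Example (illustrative sketch only; `Header` and `Latch` stand for blocks
  // created elsewhere and already connected to each other): an empty region
  // can be wired up after construction.
  //
  //   auto *Region = new VPRegionBlock("loop.region");
  //   Region->setEntry(Header);
  //   Region->setExiting(Latch);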

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
  /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual regions is consistent with the whole VPlan printing.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
  using VPBlockBase::print; // Get the print(raw_ostream &O) version.
#endif

  /// Clone all blocks in the single-entry single-exit region of the block and
  /// their recipes without updating the operands of the cloned recipes.
  VPRegionBlock *clone() override;
};

/// VPlan models a candidate for vectorization, encoding various decisions
/// taken to produce efficient output IR, including which branches,
/// basic-blocks and output IR instructions to generate, and their cost. VPlan
/// holds a Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an
/// Entry VPBasicBlock.
class VPlan {
  friend class VPlanPrinter;
  friend class VPSlotTracker;

  /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the
  /// preheader of the vector loop.
  VPBasicBlock *Entry;

  /// VPBasicBlock corresponding to the original preheader. Used to place
  /// VPExpandSCEV recipes for expressions used during skeleton creation and
  /// the rest of VPlan execution.
  VPBasicBlock *Preheader;

  /// Holds the VFs applicable to this VPlan.
  SmallSetVector<ElementCount, 2> VFs;

  /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
  /// any UF.
  SmallSetVector<unsigned, 2> UFs;

  /// Holds the name of the VPlan, for printing.
  std::string Name;

  /// Represents the trip count of the original loop, for folding
  /// the tail.
  VPValue *TripCount = nullptr;

  /// Represents the backedge taken count of the original loop, for folding
  /// the tail. It equals TripCount - 1.
  VPValue *BackedgeTakenCount = nullptr;

  /// Represents the vector trip count.
  VPValue VectorTripCount;

  /// Represents the loop-invariant VF * UF of the vector loop region.
  VPValue VFxUF;

  /// Holds a mapping between Values and their corresponding VPValue inside
  /// VPlan.
  Value2VPValueTy Value2VPValue;

  /// Contains all the external definitions created for this VPlan. External
  /// definitions are VPValues that hold a pointer to their underlying IR.
  SmallVector<VPValue *, 16> VPLiveInsToFree;

  /// Values used outside the plan. It contains live-outs that need fixing.
  /// Any live-out that is fixed outside VPlan needs to be removed. The
  /// remaining live-outs are fixed via VPLiveOut::fixPhi.
  MapVector<PHINode *, VPLiveOut *> LiveOuts;

  /// Mapping from SCEVs to the VPValues representing their expansions.
  /// NOTE: This mapping is temporary and will be removed once all users have
  /// been modeled in VPlan directly.
  DenseMap<const SCEV *, VPValue *> SCEVToExpansion;

public:
  /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
  /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need
  /// to be disconnected, as the bypass blocks between them are not yet modeled
  /// in VPlan.
  VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry)
      : VPlan(Preheader, Entry) {
    TripCount = TC;
  }

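  // Example (illustrative sketch only; `PH` and `VectorPH` stand for two
  // disconnected VPBasicBlocks and `TC` for a VPValue representing the
  // original trip count, all created elsewhere):
  //
  //   VPlan Plan(PH, TC, VectorPH);
  //   assert(Plan.getTripCount() == TC);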

  /// Construct a VPlan with original preheader \p Preheader and \p Entry to
  /// the plan. At the moment, \p Preheader and \p Entry need to be
  /// disconnected, as the bypass blocks between them are not yet modeled in
  /// VPlan.
  VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry)
      : Entry(Entry), Preheader(Preheader) {
    Entry->setPlan(this);
    Preheader->setPlan(this);
    assert(Preheader->getNumSuccessors() == 0 &&
           Preheader->getNumPredecessors() == 0 &&
           "preheader must be disconnected");
  }

  ~VPlan();

  /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping the
  /// original scalar pre-header) which contains SCEV expansions that need
  /// to happen before the CFG is modified; a VPBasicBlock for the vector
  /// pre-header, followed by a region for the vector loop, followed by the
  /// middle VPBasicBlock. If a check is needed to guard executing the scalar
  /// epilogue loop, it will be added to the middle block, together with
  /// VPBasicBlocks for the scalar preheader and exit blocks.
  static VPlanPtr createInitialVPlan(const SCEV *TripCount,
                                     ScalarEvolution &PSE,
                                     bool RequiresScalarEpilogueCheck,
                                     bool TailFolded, Loop *TheLoop);

  /// Prepare the plan for execution, setting up the required live-in values.
  void prepareToExecute(Value *TripCount, Value *VectorTripCount,
                        Value *CanonicalIVStartValue, VPTransformState &State);

  /// Generate the IR code for this VPlan.
  void execute(VPTransformState *State);

  /// Return the cost of this plan.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  VPBasicBlock *getEntry() { return Entry; }
  const VPBasicBlock *getEntry() const { return Entry; }

  /// The trip count of the original loop.
  VPValue *getTripCount() const {
    assert(TripCount && "trip count needs to be set before accessing it");
    return TripCount;
  }

  /// Resets the trip count for the VPlan. The caller must make sure all uses
  /// of the original trip count have been replaced.
  void resetTripCount(VPValue *NewTripCount) {
    assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
           "TripCount always must be set");
    TripCount = NewTripCount;
  }

  /// The backedge taken count of the original loop.
  VPValue *getOrCreateBackedgeTakenCount() {
    if (!BackedgeTakenCount)
      BackedgeTakenCount = new VPValue();
    return BackedgeTakenCount;
  }

  /// The vector trip count.
  VPValue &getVectorTripCount() { return VectorTripCount; }

  /// Returns VF * UF of the vector loop region.
  VPValue &getVFxUF() { return VFxUF; }

  void addVF(ElementCount VF) { VFs.insert(VF); }

  void setVF(ElementCount VF) {
    assert(hasVF(VF) && "Cannot set VF not already in plan");
    VFs.clear();
    VFs.insert(VF);
  }

  bool hasVF(ElementCount VF) { return VFs.count(VF); }
  bool hasScalableVF() {
    return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
  }

  /// Returns an iterator range over all VFs of the plan.
  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
  vectorFactors() const {
    return {VFs.begin(), VFs.end()};
  }

  bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }

  bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }

  void setUF(unsigned UF) {
    assert(hasUF(UF) && "Cannot set a UF not already in plan");
    UFs.clear();
    UFs.insert(UF);
  }

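  // Example (illustrative sketch only): a plan typically starts out valid for
  // a range of VFs and is later narrowed to the single chosen VF and UF.
  //
  //   Plan.addVF(ElementCount::getFixed(4));
  //   Plan.addVF(ElementCount::getFixed(8));
  //   ...
  //   Plan.setVF(ElementCount::getFixed(8)); // Must satisfy hasVF().
  //   Plan.setUF(2);                         // Must satisfy hasUF(2).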

  /// Return a string with the name of the plan and the applicable VFs and UFs.
  std::string getName() const;

  void setName(const Twine &newName) { Name = newName.str(); }

  /// Gets the live-in VPValue for \p V or adds a new live-in (if none exists
  /// yet) for \p V.
  VPValue *getOrAddLiveIn(Value *V) {
    assert(V && "Trying to get or add the VPValue of a null Value");
    if (!Value2VPValue.count(V)) {
      VPValue *VPV = new VPValue(V);
      VPLiveInsToFree.push_back(VPV);
      assert(VPV->isLiveIn() && "VPV must be a live-in.");
      assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
      Value2VPValue[V] = VPV;
    }

    assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
    assert(Value2VPValue[V]->isLiveIn() &&
           "Only live-ins should be in mapping");
    return Value2VPValue[V];
  }

  /// Return the live-in VPValue for \p V if there is one, or nullptr
  /// otherwise.
  VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the live-ins of this VPlan to \p O.
  void printLiveIns(raw_ostream &O) const;

  /// Print this VPlan to \p O.
  void print(raw_ostream &O) const;

  /// Print this VPlan in DOT format to \p O.
  void printDOT(raw_ostream &O) const;

  /// Dump the plan to stderr (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif

  /// Returns the VPRegionBlock of the vector loop.
  VPRegionBlock *getVectorLoopRegion() {
    return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
  }
  const VPRegionBlock *getVectorLoopRegion() const {
    return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
  }

  /// Returns the canonical induction recipe of the vector loop.
  VPCanonicalIVPHIRecipe *getCanonicalIV() {
    VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
    if (EntryVPBB->empty()) {
      // VPlan native path.
      EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
    }
    return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
  }

  void addLiveOut(PHINode *PN, VPValue *V);

  void removeLiveOut(PHINode *PN) {
    delete LiveOuts[PN];
    LiveOuts.erase(PN);
  }

  const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
    return LiveOuts;
  }

  VPValue *getSCEVExpansion(const SCEV *S) const {
    return SCEVToExpansion.lookup(S);
  }

  void addSCEVExpansion(const SCEV *S, VPValue *V) {
    assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
    SCEVToExpansion[S] = V;
  }

  /// \return The block corresponding to the original preheader.
  VPBasicBlock *getPreheader() { return Preheader; }
  const VPBasicBlock *getPreheader() const { return Preheader; }

  /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
  /// recipes to refer to the clones, and return it.
  VPlan *duplicate();
};

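// Example (illustrative sketch only; `Plan` is assumed to be a fully
// constructed VPlan): navigating and dumping a plan.
//
//   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
//   VPCanonicalIVPHIRecipe *CanIV = Plan.getCanonicalIV();
//   Plan.dump(); // Only available in asserts/dump-enabled builds.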

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
/// indented and follows the dot format.
class VPlanPrinter {
  raw_ostream &OS;
  const VPlan &Plan;
  unsigned Depth = 0;
  unsigned TabWidth = 2;
  std::string Indent;
  unsigned BID = 0;
  SmallDenseMap<const VPBlockBase *, unsigned> BlockID;

  VPSlotTracker SlotTracker;

  /// Handle indentation.
  void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }

  /// Print a given \p Block of the Plan.
  void dumpBlock(const VPBlockBase *Block);

  /// Print the information related to the CFG edges going out of a given
  /// \p Block, followed by printing the successor blocks themselves.
  void dumpEdges(const VPBlockBase *Block);

  /// Print a given \p BasicBlock, including its VPRecipes, followed by
  /// printing its successor blocks.
  void dumpBasicBlock(const VPBasicBlock *BasicBlock);

  /// Print a given \p Region of the Plan.
  void dumpRegion(const VPRegionBlock *Region);

  unsigned getOrCreateBID(const VPBlockBase *Block) {
    return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
  }

  Twine getOrCreateName(const VPBlockBase *Block);

  Twine getUID(const VPBlockBase *Block);

  /// Print the information related to a CFG edge between two VPBlockBases.
  void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
                const Twine &Label);

public:
  VPlanPrinter(raw_ostream &O, const VPlan &P)
      : OS(O), Plan(P), SlotTracker(&P) {}

  LLVM_DUMP_METHOD void dump();
};

struct VPlanIngredient {
  const Value *V;

  VPlanIngredient(const Value *V) : V(V) {}

  void print(raw_ostream &O) const;
};

inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
  I.print(OS);
  return OS;
}

inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
  Plan.print(OS);
  return OS;
}
#endif

//===----------------------------------------------------------------------===//
// VPlan Utilities
//===----------------------------------------------------------------------===//

/// Class that provides utilities for VPBlockBases in VPlan.
class VPBlockUtils {
public:
  VPBlockUtils() = delete;

  /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
  /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
  /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
  /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
  /// have neither successors nor predecessors.
  static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
    assert(NewBlock->getSuccessors().empty() &&
           NewBlock->getPredecessors().empty() &&
           "Can't insert new block with predecessors or successors.");
    NewBlock->setParent(BlockPtr->getParent());
    SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
    for (VPBlockBase *Succ : Succs) {
      disconnectBlocks(BlockPtr, Succ);
      connectBlocks(NewBlock, Succ);
    }
    connectBlocks(BlockPtr, NewBlock);
  }

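  // Example (illustrative sketch only; `A` already has successor `B`): rewire
  // the edge A -> B into A -> New -> B.
  //
  //   auto *New = new VPBasicBlock("new.block");
  //   VPBlockUtils::insertBlockAfter(New, A);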

  /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
  /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and
  /// \p BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p
  /// BlockPtr parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no
  /// successors and \p IfTrue and \p IfFalse must have neither successors nor
  /// predecessors.
  static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
                                   VPBlockBase *BlockPtr) {
    assert(IfTrue->getSuccessors().empty() &&
           "Can't insert IfTrue with successors.");
    assert(IfFalse->getSuccessors().empty() &&
           "Can't insert IfFalse with successors.");
    BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
    IfTrue->setPredecessors({BlockPtr});
    IfFalse->setPredecessors({BlockPtr});
    IfTrue->setParent(BlockPtr->getParent());
    IfFalse->setParent(BlockPtr->getParent());
  }

  /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
  /// the successors of \p From and \p From to the predecessors of \p To. Both
  /// VPBlockBases must have the same parent, which can be null. Both
  /// VPBlockBases may already be connected to other VPBlockBases.
  static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
    assert((From->getParent() == To->getParent()) &&
           "Can't connect two blocks with different parents");
    assert(From->getNumSuccessors() < 2 &&
           "Blocks can't have more than two successors.");
    From->appendSuccessor(To);
    To->appendPredecessor(From);
  }

  /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
  /// from the successors of \p From and \p From from the predecessors of \p
  /// To.
  static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
    assert(To && "Successor to disconnect is null.");
    From->removeSuccessor(To);
    To->removePredecessor(From);
  }

  /// Return an iterator range over \p Range which only includes \p BlockTy
  /// blocks. The accesses are cast to \p BlockTy.
  template <typename BlockTy, typename T>
  static auto blocksOnly(const T &Range) {
    // Create BaseTy with correct const-ness based on BlockTy.
    using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
                                      const VPBlockBase, VPBlockBase>;

    // We first need to create an iterator range over (const) BlockTy &
    // instead of (const) BlockTy * for filter_range to work properly.
    auto Mapped =
        map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
    auto Filter = make_filter_range(
        Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
    return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
      return cast<BlockTy>(&Block);
    });
  }
};

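// Example (illustrative sketch only; `Cond`, `Then` and `Else` stand for
// blocks created elsewhere, with `Cond` having no successors yet): create an
// if-then-else shape below `Cond`, then visit only VPBasicBlock successors.
//
//   VPBlockUtils::insertTwoBlocksAfter(Then, Else, Cond);
//   for (VPBasicBlock *Succ :
//        VPBlockUtils::blocksOnly<VPBasicBlock>(Cond->getSuccessors()))
//     ;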

class VPInterleavedAccessInfo {
  DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
      InterleaveGroupMap;

  /// Type for mapping of instruction-based interleave groups to VPInstruction
  /// interleave groups.
  using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
                             InterleaveGroup<VPInstruction> *>;

  /// Recursively traverse \p Region and populate VPlan based interleave groups
  /// based on \p IAI.
  void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
                   InterleavedAccessInfo &IAI);
  /// Recursively traverse \p Block and populate VPlan based interleave groups
  /// based on \p IAI.
  void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
                  InterleavedAccessInfo &IAI);

public:
  VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);

  ~VPInterleavedAccessInfo() {
    SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
    // Avoid releasing a pointer twice.
    for (auto &I : InterleaveGroupMap)
      DelSet.insert(I.second);
    for (auto *Ptr : DelSet)
      delete Ptr;
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if \p Instr does not belong to any group.
  InterleaveGroup<VPInstruction> *
  getInterleaveGroup(VPInstruction *Instr) const {
    return InterleaveGroupMap.lookup(Instr);
  }
};

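// Example (illustrative sketch only; `Plan`, `IAI` and a VPInstruction `VPI`
// are assumed to come from earlier analysis): query interleave-group
// membership.
//
//   VPInterleavedAccessInfo VPIAI(Plan, IAI);
//   if (auto *IG = VPIAI.getInterleaveGroup(VPI)) {
//     unsigned Factor = IG->getFactor(); // Number of interleaved accesses.
//   }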

/// Class that maps (parts of) an existing VPlan to trees of combined
/// VPInstructions.
class VPlanSlp {
  enum class OpMode { Failed, Load, Opcode };

  /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
  /// DenseMap keys.
  struct BundleDenseMapInfo {
    static SmallVector<VPValue *, 4> getEmptyKey() {
      return {reinterpret_cast<VPValue *>(-1)};
    }

    static SmallVector<VPValue *, 4> getTombstoneKey() {
      return {reinterpret_cast<VPValue *>(-2)};
    }

    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
                        const SmallVector<VPValue *, 4> &RHS) {
      return LHS == RHS;
    }
  };

  /// Mapping of values in the original VPlan to a combined VPInstruction.
  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
      BundleToCombined;

  VPInterleavedAccessInfo &IAI;

  /// Basic block to operate on. For now, only instructions in a single BB are
  /// considered.
  const VPBasicBlock &BB;

  /// Indicates whether we managed to combine all visited instructions or not.
  bool CompletelySLP = true;

  /// Width of the widest combined bundle in bits.
  unsigned WidestBundleBits = 0;

  using MultiNodeOpTy =
      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;

  // Input operand bundles for the current multi node. Each multi node operand
  // bundle contains values not matching the multi node's opcode. They will
  // be reordered in reorderMultiNodeOps, once we have completed building a
  // multi node.
  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;

  /// Indicates whether we are currently building a multi node.
  bool MultiNodeActive = false;

  /// Check if we can vectorize Operands together.
  bool areVectorizable(ArrayRef<VPValue *> Operands) const;

  /// Add combined instruction \p New for the bundle \p Operands.
  void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);

  /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
  VPInstruction *markFailed();

  /// Reorder operands in the multi node to maximize sequential memory access
  /// and commutative operations.
  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();

  /// Choose the best candidate to use for the lane after \p Last. The set of
  /// candidates to choose from are values with an opcode matching \p Last's
  /// or loads consecutive to \p Last.
  std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
                                       SmallPtrSetImpl<VPValue *> &Candidates,
                                       VPInterleavedAccessInfo &IAI);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print bundle \p Values to dbgs().
  void dumpBundle(ArrayRef<VPValue *> Values);
#endif

public:
  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}

  ~VPlanSlp() = default;

  /// Tries to build an SLP tree rooted at \p Operands and returns a
  /// VPInstruction combining \p Operands, if they can be combined.
  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);

  /// Return the width of the widest combined bundle in bits.
  unsigned getWidestBundleBits() const { return WidestBundleBits; }

  /// Return true if all visited instructions can be combined.
  bool isCompletelySLP() const { return CompletelySLP; }
};

namespace vputils {

/// Returns true if only the first lane of \p Def is used.
bool onlyFirstLaneUsed(const VPValue *Def);

/// Returns true if only the first part of \p Def is used.
bool onlyFirstPartUsed(const VPValue *Def);

/// Get or create a VPValue that corresponds to the expansion of \p Expr. If
/// \p Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the
/// live-in value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If
/// \p Plan's pre-header already contains a recipe expanding \p Expr, return
/// it. If not, create a new one.
VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
                                       ScalarEvolution &SE);

/// Returns true if \p VPV is uniform after vectorization.
inline bool isUniformAfterVectorization(VPValue *VPV) {
  // A value defined outside the vector region must be uniform after
  // vectorization inside a vector region.
  if (VPV->isDefinedOutsideVectorRegions())
    return true;
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  assert(Def && "Must have definition for value defined inside vector region");
  if (auto *Rep = dyn_cast<VPReplicateRecipe>(Def))
    return Rep->isUniform();
  if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
    return all_of(GEP->operands(), isUniformAfterVectorization);
  if (auto *VPI = dyn_cast<VPInstruction>(Def))
    return VPI->isSingleScalar() || VPI->isVectorToScalar();
  return false;
}

/// Return true if \p V is a header mask in \p Plan.
bool isHeaderMask(VPValue *V, VPlan &Plan);
} // end namespace vputils

} // end namespace llvm

#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H