1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 10 // stores that can be put together into vector-stores. Next, it attempts to 11 // construct vectorizable tree using the use-def chains. If a profitable tree 12 // was found, the SLP vectorizer performs vectorization on the tree. 13 // 14 // The pass is inspired by the work described in the paper: 15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 16 // 17 //===----------------------------------------------------------------------===// 18 19 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 20 #include "llvm/ADT/DenseMap.h" 21 #include "llvm/ADT/DenseSet.h" 22 #include "llvm/ADT/PriorityQueue.h" 23 #include "llvm/ADT/STLExtras.h" 24 #include "llvm/ADT/ScopeExit.h" 25 #include "llvm/ADT/SetOperations.h" 26 #include "llvm/ADT/SetVector.h" 27 #include "llvm/ADT/SmallBitVector.h" 28 #include "llvm/ADT/SmallPtrSet.h" 29 #include "llvm/ADT/SmallSet.h" 30 #include "llvm/ADT/SmallString.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/ADT/iterator.h" 33 #include "llvm/ADT/iterator_range.h" 34 #include "llvm/Analysis/AliasAnalysis.h" 35 #include "llvm/Analysis/AssumptionCache.h" 36 #include "llvm/Analysis/CodeMetrics.h" 37 #include "llvm/Analysis/ConstantFolding.h" 38 #include "llvm/Analysis/DemandedBits.h" 39 #include "llvm/Analysis/GlobalsModRef.h" 40 #include "llvm/Analysis/IVDescriptors.h" 41 #include "llvm/Analysis/LoopAccessAnalysis.h" 42 #include "llvm/Analysis/LoopInfo.h" 43 #include "llvm/Analysis/MemoryLocation.h" 44 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 45 #include "llvm/Analysis/ScalarEvolution.h" 46 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 47 #include "llvm/Analysis/TargetLibraryInfo.h" 48 #include "llvm/Analysis/TargetTransformInfo.h" 49 #include "llvm/Analysis/ValueTracking.h" 50 #include "llvm/Analysis/VectorUtils.h" 51 #include "llvm/IR/Attributes.h" 52 #include "llvm/IR/BasicBlock.h" 53 #include "llvm/IR/Constant.h" 54 #include "llvm/IR/Constants.h" 55 #include "llvm/IR/DataLayout.h" 56 #include "llvm/IR/DerivedTypes.h" 57 #include "llvm/IR/Dominators.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/IRBuilder.h" 60 #include "llvm/IR/InstrTypes.h" 61 #include "llvm/IR/Instruction.h" 62 #include "llvm/IR/Instructions.h" 63 #include "llvm/IR/IntrinsicInst.h" 64 #include "llvm/IR/Intrinsics.h" 65 #include "llvm/IR/Module.h" 66 #include "llvm/IR/Operator.h" 67 #include "llvm/IR/PatternMatch.h" 68 #include "llvm/IR/Type.h" 69 #include "llvm/IR/Use.h" 70 #include "llvm/IR/User.h" 71 #include "llvm/IR/Value.h" 72 #include "llvm/IR/ValueHandle.h" 73 #ifdef EXPENSIVE_CHECKS 74 #include "llvm/IR/Verifier.h" 75 #endif 76 #include "llvm/Pass.h" 77 #include "llvm/Support/Casting.h" 78 #include "llvm/Support/CommandLine.h" 79 #include "llvm/Support/Compiler.h" 80 #include "llvm/Support/DOTGraphTraits.h" 81 #include "llvm/Support/Debug.h" 82 #include "llvm/Support/ErrorHandling.h" 83 #include "llvm/Support/GraphWriter.h" 84 #include "llvm/Support/InstructionCost.h" 85 #include "llvm/Support/KnownBits.h" 86 #include "llvm/Support/MathExtras.h" 87 #include 
"llvm/Support/raw_ostream.h" 88 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 89 #include "llvm/Transforms/Utils/Local.h" 90 #include "llvm/Transforms/Utils/LoopUtils.h" 91 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 92 #include <algorithm> 93 #include <cassert> 94 #include <cstdint> 95 #include <iterator> 96 #include <memory> 97 #include <optional> 98 #include <set> 99 #include <string> 100 #include <tuple> 101 #include <utility> 102 103 using namespace llvm; 104 using namespace llvm::PatternMatch; 105 using namespace slpvectorizer; 106 107 #define SV_NAME "slp-vectorizer" 108 #define DEBUG_TYPE "SLP" 109 110 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 111 112 static cl::opt<bool> 113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, 114 cl::desc("Run the SLP vectorization passes")); 115 116 static cl::opt<bool> 117 SLPReVec("slp-revec", cl::init(false), cl::Hidden, 118 cl::desc("Enable vectorization for wider vector utilization")); 119 120 static cl::opt<int> 121 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 122 cl::desc("Only vectorize if you gain more than this " 123 "number ")); 124 125 static cl::opt<bool> SLPSkipEarlyProfitabilityCheck( 126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden, 127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on " 128 "heuristics and makes vectorization decision via cost modeling.")); 129 130 static cl::opt<bool> 131 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, 132 cl::desc("Attempt to vectorize horizontal reductions")); 133 134 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 136 cl::desc( 137 "Attempt to vectorize horizontal reductions feeding into a store")); 138 139 // NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run 140 // even if we match a reduction but do not vectorize in the end. 141 static cl::opt<bool> AllowHorRdxIdenityOptimization( 142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, 143 cl::desc("Allow optimization of original scalar identity operations on " 144 "matched horizontal reductions.")); 145 146 static cl::opt<int> 147 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, 148 cl::desc("Attempt to vectorize for this register size in bits")); 149 150 static cl::opt<unsigned> 151 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, 152 cl::desc("Maximum SLP vectorization factor (0=unlimited)")); 153 154 /// Limits the size of scheduling regions in a block. 155 /// It avoid long compile times for _very_ large blocks where vector 156 /// instructions are spread over a wide range. 157 /// This limit is way higher than needed by real-world functions. 
158 static cl::opt<int> 159 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, 160 cl::desc("Limit the size of the SLP scheduling region per block")); 161 162 static cl::opt<int> MinVectorRegSizeOption( 163 "slp-min-reg-size", cl::init(128), cl::Hidden, 164 cl::desc("Attempt to vectorize for this register size in bits")); 165 166 static cl::opt<unsigned> RecursionMaxDepth( 167 "slp-recursion-max-depth", cl::init(12), cl::Hidden, 168 cl::desc("Limit the recursion depth when building a vectorizable tree")); 169 170 static cl::opt<unsigned> MinTreeSize( 171 "slp-min-tree-size", cl::init(3), cl::Hidden, 172 cl::desc("Only vectorize small trees if they are fully vectorizable")); 173 174 // The maximum depth that the look-ahead score heuristic will explore. 175 // The higher this value, the higher the compilation time overhead. 176 static cl::opt<int> LookAheadMaxDepth( 177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, 178 cl::desc("The maximum look-ahead depth for operand reordering scores")); 179 180 // The maximum depth that the look-ahead score heuristic will explore 181 // when it probing among candidates for vectorization tree roots. 182 // The higher this value, the higher the compilation time overhead but unlike 183 // similar limit for operands ordering this is less frequently used, hence 184 // impact of higher value is less noticeable. 185 static cl::opt<int> RootLookAheadMaxDepth( 186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, 187 cl::desc("The maximum look-ahead depth for searching best rooting option")); 188 189 static cl::opt<unsigned> MinProfitableStridedLoads( 190 "slp-min-strided-loads", cl::init(2), cl::Hidden, 191 cl::desc("The minimum number of loads, which should be considered strided, " 192 "if the stride is > 1 or is runtime value")); 193 194 static cl::opt<unsigned> MaxProfitableLoadStride( 195 "slp-max-stride", cl::init(8), cl::Hidden, 196 cl::desc("The maximum stride, considered to be profitable.")); 197 198 static cl::opt<bool> 199 ViewSLPTree("view-slp-tree", cl::Hidden, 200 cl::desc("Display the SLP trees with Graphviz")); 201 202 static cl::opt<bool> VectorizeNonPowerOf2( 203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, 204 cl::desc("Try to vectorize with non-power-of-2 number of elements.")); 205 206 // Limit the number of alias checks. The limit is chosen so that 207 // it has no negative effect on the llvm benchmarks. 208 static const unsigned AliasedCheckLimit = 10; 209 210 // Limit of the number of uses for potentially transformed instructions/values, 211 // used in checks to avoid compile-time explode. 212 static constexpr int UsesLimit = 64; 213 214 // Another limit for the alias checks: The maximum distance between load/store 215 // instructions where alias checks are done. 216 // This limit is useful for very large basic blocks. 217 static const unsigned MaxMemDepDistance = 160; 218 219 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling 220 /// regions to be handled. 221 static const int MinScheduleRegionSize = 16; 222 223 /// Maximum allowed number of operands in the PHI nodes. 224 static const unsigned MaxPHINumOperands = 128; 225 226 /// Predicate for the element types that the SLP vectorizer supports. 227 /// 228 /// The most important thing to filter here are types which are invalid in LLVM 229 /// vectors. We also filter target specific types which have absolutely no 230 /// meaningful vectorization path such as x86_fp80 and ppc_f128. 
This just 231 /// avoids spending time checking the cost model and realizing that they will 232 /// be inevitably scalarized. 233 static bool isValidElementType(Type *Ty) { 234 // TODO: Support ScalableVectorType. 235 if (SLPReVec && isa<FixedVectorType>(Ty)) 236 Ty = Ty->getScalarType(); 237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && 238 !Ty->isPPC_FP128Ty(); 239 } 240 241 /// \returns the number of elements for Ty. 242 static unsigned getNumElements(Type *Ty) { 243 assert(!isa<ScalableVectorType>(Ty) && 244 "ScalableVectorType is not supported."); 245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) 246 return VecTy->getNumElements(); 247 return 1; 248 } 249 250 /// \returns the vector type of ScalarTy based on vectorization factor. 251 static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { 252 return FixedVectorType::get(ScalarTy->getScalarType(), 253 VF * getNumElements(ScalarTy)); 254 } 255 256 /// \returns True if the value is a constant (but not globals/constant 257 /// expressions). 258 static bool isConstant(Value *V) { 259 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V); 260 } 261 262 /// Checks if \p V is one of vector-like instructions, i.e. undef, 263 /// insertelement/extractelement with constant indices for fixed vector type or 264 /// extractvalue instruction. 265 static bool isVectorLikeInstWithConstOps(Value *V) { 266 if (!isa<InsertElementInst, ExtractElementInst>(V) && 267 !isa<ExtractValueInst, UndefValue>(V)) 268 return false; 269 auto *I = dyn_cast<Instruction>(V); 270 if (!I || isa<ExtractValueInst>(I)) 271 return true; 272 if (!isa<FixedVectorType>(I->getOperand(0)->getType())) 273 return false; 274 if (isa<ExtractElementInst>(I)) 275 return isConstant(I->getOperand(1)); 276 assert(isa<InsertElementInst>(V) && "Expected only insertelement."); 277 return isConstant(I->getOperand(2)); 278 } 279 280 /// Returns power-of-2 number of elements in a single register (part), given the 281 /// total number of elements \p Size and number of registers (parts) \p 282 /// NumParts. 283 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) { 284 return PowerOf2Ceil(divideCeil(Size, NumParts)); 285 } 286 287 /// Returns correct remaining number of elements, considering total amount \p 288 /// Size, (power-of-2 number) of elements in a single register \p PartNumElems 289 /// and current register (part) \p Part. 290 static unsigned getNumElems(unsigned Size, unsigned PartNumElems, 291 unsigned Part) { 292 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems); 293 } 294 295 #if !defined(NDEBUG) 296 /// Print a short descriptor of the instruction bundle suitable for debug output. 297 static std::string shortBundleName(ArrayRef<Value *> VL) { 298 std::string Result; 299 raw_string_ostream OS(Result); 300 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; 301 OS.flush(); 302 return Result; 303 } 304 #endif 305 306 /// \returns true if all of the instructions in \p VL are in the same block or 307 /// false otherwise. 
308 static bool allSameBlock(ArrayRef<Value *> VL) { 309 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 310 if (!I0) 311 return false; 312 if (all_of(VL, isVectorLikeInstWithConstOps)) 313 return true; 314 315 BasicBlock *BB = I0->getParent(); 316 for (int I = 1, E = VL.size(); I < E; I++) { 317 auto *II = dyn_cast<Instruction>(VL[I]); 318 if (!II) 319 return false; 320 321 if (BB != II->getParent()) 322 return false; 323 } 324 return true; 325 } 326 327 /// \returns True if all of the values in \p VL are constants (but not 328 /// globals/constant expressions). 329 static bool allConstant(ArrayRef<Value *> VL) { 330 // Constant expressions and globals can't be vectorized like normal integer/FP 331 // constants. 332 return all_of(VL, isConstant); 333 } 334 335 /// \returns True if all of the values in \p VL are identical or some of them 336 /// are UndefValue. 337 static bool isSplat(ArrayRef<Value *> VL) { 338 Value *FirstNonUndef = nullptr; 339 for (Value *V : VL) { 340 if (isa<UndefValue>(V)) 341 continue; 342 if (!FirstNonUndef) { 343 FirstNonUndef = V; 344 continue; 345 } 346 if (V != FirstNonUndef) 347 return false; 348 } 349 return FirstNonUndef != nullptr; 350 } 351 352 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 353 static bool isCommutative(Instruction *I) { 354 if (auto *Cmp = dyn_cast<CmpInst>(I)) 355 return Cmp->isCommutative(); 356 if (auto *BO = dyn_cast<BinaryOperator>(I)) 357 return BO->isCommutative() || 358 (BO->getOpcode() == Instruction::Sub && 359 !BO->hasNUsesOrMore(UsesLimit) && 360 all_of( 361 BO->uses(), 362 [](const Use &U) { 363 // Commutative, if icmp eq/ne sub, 0 364 ICmpInst::Predicate Pred; 365 if (match(U.getUser(), 366 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) && 367 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) 368 return true; 369 // Commutative, if abs(sub nsw, true) or abs(sub, false). 370 ConstantInt *Flag; 371 return match(U.getUser(), 372 m_Intrinsic<Intrinsic::abs>( 373 m_Specific(U.get()), m_ConstantInt(Flag))) && 374 (!cast<Instruction>(U.get())->hasNoSignedWrap() || 375 Flag->isOne()); 376 })) || 377 (BO->getOpcode() == Instruction::FSub && 378 !BO->hasNUsesOrMore(UsesLimit) && 379 all_of(BO->uses(), [](const Use &U) { 380 return match(U.getUser(), 381 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get()))); 382 })); 383 return I->isCommutative(); 384 } 385 386 template <typename T> 387 static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, 388 unsigned Offset) { 389 static_assert(std::is_same_v<T, InsertElementInst> || 390 std::is_same_v<T, ExtractElementInst>, 391 "unsupported T"); 392 int Index = Offset; 393 if (const auto *IE = dyn_cast<T>(Inst)) { 394 const auto *VT = dyn_cast<FixedVectorType>(IE->getType()); 395 if (!VT) 396 return std::nullopt; 397 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); 398 if (!CI) 399 return std::nullopt; 400 if (CI->getValue().uge(VT->getNumElements())) 401 return std::nullopt; 402 Index *= VT->getNumElements(); 403 Index += CI->getZExtValue(); 404 return Index; 405 } 406 return std::nullopt; 407 } 408 409 /// \returns inserting or extracting index of InsertElement, ExtractElement or 410 /// InsertValue instruction, using Offset as base offset for index. 411 /// \returns std::nullopt if the index is not an immediate. 
412 static std::optional<unsigned> getElementIndex(const Value *Inst, 413 unsigned Offset = 0) { 414 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset)) 415 return Index; 416 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset)) 417 return Index; 418 419 int Index = Offset; 420 421 const auto *IV = dyn_cast<InsertValueInst>(Inst); 422 if (!IV) 423 return std::nullopt; 424 425 Type *CurrentType = IV->getType(); 426 for (unsigned I : IV->indices()) { 427 if (const auto *ST = dyn_cast<StructType>(CurrentType)) { 428 Index *= ST->getNumElements(); 429 CurrentType = ST->getElementType(I); 430 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) { 431 Index *= AT->getNumElements(); 432 CurrentType = AT->getElementType(); 433 } else { 434 return std::nullopt; 435 } 436 Index += I; 437 } 438 return Index; 439 } 440 441 namespace { 442 /// Specifies the way the mask should be analyzed for undefs/poisonous elements 443 /// in the shuffle mask. 444 enum class UseMask { 445 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors, 446 ///< check for the mask elements for the first argument (mask 447 ///< indices are in range [0:VF)). 448 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check 449 ///< for the mask elements for the second argument (mask indices 450 ///< are in range [VF:2*VF)) 451 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for 452 ///< future shuffle elements and mark them as ones as being used 453 ///< in future. Non-undef elements are considered as unused since 454 ///< they're already marked as used in the mask. 455 }; 456 } // namespace 457 458 /// Prepares a use bitset for the given mask either for the first argument or 459 /// for the second. 460 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask, 461 UseMask MaskArg) { 462 SmallBitVector UseMask(VF, true); 463 for (auto [Idx, Value] : enumerate(Mask)) { 464 if (Value == PoisonMaskElem) { 465 if (MaskArg == UseMask::UndefsAsMask) 466 UseMask.reset(Idx); 467 continue; 468 } 469 if (MaskArg == UseMask::FirstArg && Value < VF) 470 UseMask.reset(Value); 471 else if (MaskArg == UseMask::SecondArg && Value >= VF) 472 UseMask.reset(Value - VF); 473 } 474 return UseMask; 475 } 476 477 /// Checks if the given value is actually an undefined constant vector. 478 /// Also, if the \p UseMask is not empty, tries to check if the non-masked 479 /// elements actually mask the insertelement buildvector, if any. 480 template <bool IsPoisonOnly = false> 481 static SmallBitVector isUndefVector(const Value *V, 482 const SmallBitVector &UseMask = {}) { 483 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true); 484 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>; 485 if (isa<T>(V)) 486 return Res; 487 auto *VecTy = dyn_cast<FixedVectorType>(V->getType()); 488 if (!VecTy) 489 return Res.reset(); 490 auto *C = dyn_cast<Constant>(V); 491 if (!C) { 492 if (!UseMask.empty()) { 493 const Value *Base = V; 494 while (auto *II = dyn_cast<InsertElementInst>(Base)) { 495 Base = II->getOperand(0); 496 if (isa<T>(II->getOperand(1))) 497 continue; 498 std::optional<unsigned> Idx = getElementIndex(II); 499 if (!Idx) { 500 Res.reset(); 501 return Res; 502 } 503 if (*Idx < UseMask.size() && !UseMask.test(*Idx)) 504 Res.reset(*Idx); 505 } 506 // TODO: Add analysis for shuffles here too. 
507 if (V == Base) { 508 Res.reset(); 509 } else { 510 SmallBitVector SubMask(UseMask.size(), false); 511 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask); 512 } 513 } else { 514 Res.reset(); 515 } 516 return Res; 517 } 518 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { 519 if (Constant *Elem = C->getAggregateElement(I)) 520 if (!isa<T>(Elem) && 521 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I)))) 522 Res.reset(I); 523 } 524 return Res; 525 } 526 527 /// Checks if the vector of instructions can be represented as a shuffle, like: 528 /// %x0 = extractelement <4 x i8> %x, i32 0 529 /// %x3 = extractelement <4 x i8> %x, i32 3 530 /// %y1 = extractelement <4 x i8> %y, i32 1 531 /// %y2 = extractelement <4 x i8> %y, i32 2 532 /// %x0x0 = mul i8 %x0, %x0 533 /// %x3x3 = mul i8 %x3, %x3 534 /// %y1y1 = mul i8 %y1, %y1 535 /// %y2y2 = mul i8 %y2, %y2 536 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 537 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 538 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 539 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 540 /// ret <4 x i8> %ins4 541 /// can be transformed into: 542 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, 543 /// i32 6> 544 /// %2 = mul <4 x i8> %1, %1 545 /// ret <4 x i8> %2 546 /// Mask will return the Shuffle Mask equivalent to the extracted elements. 547 /// TODO: Can we split off and reuse the shuffle mask detection from 548 /// ShuffleVectorInst/getShuffleCost? 549 static std::optional<TargetTransformInfo::ShuffleKind> 550 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { 551 const auto *It = find_if(VL, IsaPred<ExtractElementInst>); 552 if (It == VL.end()) 553 return std::nullopt; 554 unsigned Size = 555 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) { 556 auto *EI = dyn_cast<ExtractElementInst>(V); 557 if (!EI) 558 return S; 559 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); 560 if (!VTy) 561 return S; 562 return std::max(S, VTy->getNumElements()); 563 }); 564 565 Value *Vec1 = nullptr; 566 Value *Vec2 = nullptr; 567 bool HasNonUndefVec = any_of(VL, [](Value *V) { 568 auto *EE = dyn_cast<ExtractElementInst>(V); 569 if (!EE) 570 return false; 571 Value *Vec = EE->getVectorOperand(); 572 if (isa<UndefValue>(Vec)) 573 return false; 574 return isGuaranteedNotToBePoison(Vec); 575 }); 576 enum ShuffleMode { Unknown, Select, Permute }; 577 ShuffleMode CommonShuffleMode = Unknown; 578 Mask.assign(VL.size(), PoisonMaskElem); 579 for (unsigned I = 0, E = VL.size(); I < E; ++I) { 580 // Undef can be represented as an undef element in a vector. 581 if (isa<UndefValue>(VL[I])) 582 continue; 583 auto *EI = cast<ExtractElementInst>(VL[I]); 584 if (isa<ScalableVectorType>(EI->getVectorOperandType())) 585 return std::nullopt; 586 auto *Vec = EI->getVectorOperand(); 587 // We can extractelement from undef or poison vector. 588 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all()) 589 continue; 590 // All vector operands must have the same number of vector elements. 591 if (isa<UndefValue>(Vec)) { 592 Mask[I] = I; 593 } else { 594 if (isa<UndefValue>(EI->getIndexOperand())) 595 continue; 596 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); 597 if (!Idx) 598 return std::nullopt; 599 // Undefined behavior if Idx is negative or >= Size. 
600 if (Idx->getValue().uge(Size)) 601 continue; 602 unsigned IntIdx = Idx->getValue().getZExtValue(); 603 Mask[I] = IntIdx; 604 } 605 if (isUndefVector(Vec).all() && HasNonUndefVec) 606 continue; 607 // For correct shuffling we have to have at most 2 different vector operands 608 // in all extractelement instructions. 609 if (!Vec1 || Vec1 == Vec) { 610 Vec1 = Vec; 611 } else if (!Vec2 || Vec2 == Vec) { 612 Vec2 = Vec; 613 Mask[I] += Size; 614 } else { 615 return std::nullopt; 616 } 617 if (CommonShuffleMode == Permute) 618 continue; 619 // If the extract index is not the same as the operation number, it is a 620 // permutation. 621 if (Mask[I] % Size != I) { 622 CommonShuffleMode = Permute; 623 continue; 624 } 625 CommonShuffleMode = Select; 626 } 627 // If we're not crossing lanes in different vectors, consider it as blending. 628 if (CommonShuffleMode == Select && Vec2) 629 return TargetTransformInfo::SK_Select; 630 // If Vec2 was never used, we have a permutation of a single vector, otherwise 631 // we have permutation of 2 vectors. 632 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc 633 : TargetTransformInfo::SK_PermuteSingleSrc; 634 } 635 636 /// \returns True if Extract{Value,Element} instruction extracts element Idx. 637 static std::optional<unsigned> getExtractIndex(Instruction *E) { 638 unsigned Opcode = E->getOpcode(); 639 assert((Opcode == Instruction::ExtractElement || 640 Opcode == Instruction::ExtractValue) && 641 "Expected extractelement or extractvalue instruction."); 642 if (Opcode == Instruction::ExtractElement) { 643 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 644 if (!CI) 645 return std::nullopt; 646 return CI->getZExtValue(); 647 } 648 auto *EI = cast<ExtractValueInst>(E); 649 if (EI->getNumIndices() != 1) 650 return std::nullopt; 651 return *EI->idx_begin(); 652 } 653 654 namespace { 655 656 /// Main data required for vectorization of instructions. 657 struct InstructionsState { 658 /// The very first instruction in the list with the main opcode. 659 Value *OpValue = nullptr; 660 661 /// The main/alternate instruction. 662 Instruction *MainOp = nullptr; 663 Instruction *AltOp = nullptr; 664 665 /// The main/alternate opcodes for the list of instructions. 666 unsigned getOpcode() const { 667 return MainOp ? MainOp->getOpcode() : 0; 668 } 669 670 unsigned getAltOpcode() const { 671 return AltOp ? AltOp->getOpcode() : 0; 672 } 673 674 /// Some of the instructions in the list have alternate opcodes. 675 bool isAltShuffle() const { return AltOp != MainOp; } 676 677 bool isOpcodeOrAlt(Instruction *I) const { 678 unsigned CheckedOpcode = I->getOpcode(); 679 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; 680 } 681 682 InstructionsState() = delete; 683 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) 684 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} 685 }; 686 687 } // end anonymous namespace 688 689 /// Chooses the correct key for scheduling data. If \p Op has the same (or 690 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p 691 /// OpValue. 692 static Value *isOneOf(const InstructionsState &S, Value *Op) { 693 auto *I = dyn_cast<Instruction>(Op); 694 if (I && S.isOpcodeOrAlt(I)) 695 return Op; 696 return S.OpValue; 697 } 698 699 /// \returns true if \p Opcode is allowed as part of the main/alternate 700 /// instruction for SLP vectorization. 
701 /// 702 /// Example of unsupported opcode is SDIV that can potentially cause UB if the 703 /// "shuffled out" lane would result in division by zero. 704 static bool isValidForAlternation(unsigned Opcode) { 705 if (Instruction::isIntDivRem(Opcode)) 706 return false; 707 708 return true; 709 } 710 711 static InstructionsState getSameOpcode(ArrayRef<Value *> VL, 712 const TargetLibraryInfo &TLI, 713 unsigned BaseIndex = 0); 714 715 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. 716 /// compatible instructions or constants, or just some other regular values. 717 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, 718 Value *Op1, const TargetLibraryInfo &TLI) { 719 return (isConstant(BaseOp0) && isConstant(Op0)) || 720 (isConstant(BaseOp1) && isConstant(Op1)) || 721 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && 722 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || 723 BaseOp0 == Op0 || BaseOp1 == Op1 || 724 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || 725 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); 726 } 727 728 /// \returns true if a compare instruction \p CI has similar "look" and 729 /// same predicate as \p BaseCI, "as is" or with its operands and predicate 730 /// swapped, false otherwise. 731 static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, 732 const TargetLibraryInfo &TLI) { 733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && 734 "Assessing comparisons of different types?"); 735 CmpInst::Predicate BasePred = BaseCI->getPredicate(); 736 CmpInst::Predicate Pred = CI->getPredicate(); 737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred); 738 739 Value *BaseOp0 = BaseCI->getOperand(0); 740 Value *BaseOp1 = BaseCI->getOperand(1); 741 Value *Op0 = CI->getOperand(0); 742 Value *Op1 = CI->getOperand(1); 743 744 return (BasePred == Pred && 745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) || 746 (BasePred == SwappedPred && 747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI)); 748 } 749 750 /// \returns analysis of the Instructions in \p VL described in 751 /// InstructionsState, the Opcode that we suppose the whole list 752 /// could be vectorized even if its structure is diverse. 753 static InstructionsState getSameOpcode(ArrayRef<Value *> VL, 754 const TargetLibraryInfo &TLI, 755 unsigned BaseIndex) { 756 // Make sure these are all Instructions. 757 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) 758 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 759 760 bool IsCastOp = isa<CastInst>(VL[BaseIndex]); 761 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); 762 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]); 763 CmpInst::Predicate BasePred = 764 IsCmpOp ? 
cast<CmpInst>(VL[BaseIndex])->getPredicate() 765 : CmpInst::BAD_ICMP_PREDICATE; 766 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); 767 unsigned AltOpcode = Opcode; 768 unsigned AltIndex = BaseIndex; 769 770 bool SwappedPredsCompatible = [&]() { 771 if (!IsCmpOp) 772 return false; 773 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds; 774 UniquePreds.insert(BasePred); 775 UniqueNonSwappedPreds.insert(BasePred); 776 for (Value *V : VL) { 777 auto *I = dyn_cast<CmpInst>(V); 778 if (!I) 779 return false; 780 CmpInst::Predicate CurrentPred = I->getPredicate(); 781 CmpInst::Predicate SwappedCurrentPred = 782 CmpInst::getSwappedPredicate(CurrentPred); 783 UniqueNonSwappedPreds.insert(CurrentPred); 784 if (!UniquePreds.contains(CurrentPred) && 785 !UniquePreds.contains(SwappedCurrentPred)) 786 UniquePreds.insert(CurrentPred); 787 } 788 // Total number of predicates > 2, but if consider swapped predicates 789 // compatible only 2, consider swappable predicates as compatible opcodes, 790 // not alternate. 791 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2; 792 }(); 793 // Check for one alternate opcode from another BinaryOperator. 794 // TODO - generalize to support all operators (types, calls etc.). 795 auto *IBase = cast<Instruction>(VL[BaseIndex]); 796 Intrinsic::ID BaseID = 0; 797 SmallVector<VFInfo> BaseMappings; 798 if (auto *CallBase = dyn_cast<CallInst>(IBase)) { 799 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI); 800 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase); 801 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) 802 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 803 } 804 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { 805 auto *I = cast<Instruction>(VL[Cnt]); 806 unsigned InstOpcode = I->getOpcode(); 807 if (IsBinOp && isa<BinaryOperator>(I)) { 808 if (InstOpcode == Opcode || InstOpcode == AltOpcode) 809 continue; 810 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && 811 isValidForAlternation(Opcode)) { 812 AltOpcode = InstOpcode; 813 AltIndex = Cnt; 814 continue; 815 } 816 } else if (IsCastOp && isa<CastInst>(I)) { 817 Value *Op0 = IBase->getOperand(0); 818 Type *Ty0 = Op0->getType(); 819 Value *Op1 = I->getOperand(0); 820 Type *Ty1 = Op1->getType(); 821 if (Ty0 == Ty1) { 822 if (InstOpcode == Opcode || InstOpcode == AltOpcode) 823 continue; 824 if (Opcode == AltOpcode) { 825 assert(isValidForAlternation(Opcode) && 826 isValidForAlternation(InstOpcode) && 827 "Cast isn't safe for alternation, logic needs to be updated!"); 828 AltOpcode = InstOpcode; 829 AltIndex = Cnt; 830 continue; 831 } 832 } 833 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) { 834 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]); 835 Type *Ty0 = BaseInst->getOperand(0)->getType(); 836 Type *Ty1 = Inst->getOperand(0)->getType(); 837 if (Ty0 == Ty1) { 838 assert(InstOpcode == Opcode && "Expected same CmpInst opcode."); 839 // Check for compatible operands. If the corresponding operands are not 840 // compatible - need to perform alternate vectorization. 
841 CmpInst::Predicate CurrentPred = Inst->getPredicate(); 842 CmpInst::Predicate SwappedCurrentPred = 843 CmpInst::getSwappedPredicate(CurrentPred); 844 845 if ((E == 2 || SwappedPredsCompatible) && 846 (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) 847 continue; 848 849 if (isCmpSameOrSwapped(BaseInst, Inst, TLI)) 850 continue; 851 auto *AltInst = cast<CmpInst>(VL[AltIndex]); 852 if (AltIndex != BaseIndex) { 853 if (isCmpSameOrSwapped(AltInst, Inst, TLI)) 854 continue; 855 } else if (BasePred != CurrentPred) { 856 assert( 857 isValidForAlternation(InstOpcode) && 858 "CmpInst isn't safe for alternation, logic needs to be updated!"); 859 AltIndex = Cnt; 860 continue; 861 } 862 CmpInst::Predicate AltPred = AltInst->getPredicate(); 863 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || 864 AltPred == CurrentPred || AltPred == SwappedCurrentPred) 865 continue; 866 } 867 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) { 868 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { 869 if (Gep->getNumOperands() != 2 || 870 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType()) 871 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 872 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) { 873 if (!isVectorLikeInstWithConstOps(EI)) 874 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 875 } else if (auto *LI = dyn_cast<LoadInst>(I)) { 876 auto *BaseLI = cast<LoadInst>(IBase); 877 if (!LI->isSimple() || !BaseLI->isSimple()) 878 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 879 } else if (auto *Call = dyn_cast<CallInst>(I)) { 880 auto *CallBase = cast<CallInst>(IBase); 881 if (Call->getCalledFunction() != CallBase->getCalledFunction()) 882 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 883 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() || 884 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), 885 Call->op_begin() + Call->getBundleOperandsEndIndex(), 886 CallBase->op_begin() + 887 CallBase->getBundleOperandsStartIndex()))) 888 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 889 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI); 890 if (ID != BaseID) 891 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 892 if (!ID) { 893 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call); 894 if (Mappings.size() != BaseMappings.size() || 895 Mappings.front().ISA != BaseMappings.front().ISA || 896 Mappings.front().ScalarName != BaseMappings.front().ScalarName || 897 Mappings.front().VectorName != BaseMappings.front().VectorName || 898 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || 899 Mappings.front().Shape.Parameters != 900 BaseMappings.front().Shape.Parameters) 901 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 902 } 903 } 904 continue; 905 } 906 return InstructionsState(VL[BaseIndex], nullptr, nullptr); 907 } 908 909 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]), 910 cast<Instruction>(VL[AltIndex])); 911 } 912 913 /// \returns true if all of the values in \p VL have the same type or false 914 /// otherwise. 915 static bool allSameType(ArrayRef<Value *> VL) { 916 Type *Ty = VL.front()->getType(); 917 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; }); 918 } 919 920 /// \returns True if in-tree use also needs extract. This refers to 921 /// possible scalar operand in vectorized instruction. 
922 static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 923 TargetLibraryInfo *TLI) { 924 unsigned Opcode = UserInst->getOpcode(); 925 switch (Opcode) { 926 case Instruction::Load: { 927 LoadInst *LI = cast<LoadInst>(UserInst); 928 return (LI->getPointerOperand() == Scalar); 929 } 930 case Instruction::Store: { 931 StoreInst *SI = cast<StoreInst>(UserInst); 932 return (SI->getPointerOperand() == Scalar); 933 } 934 case Instruction::Call: { 935 CallInst *CI = cast<CallInst>(UserInst); 936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 937 return any_of(enumerate(CI->args()), [&](auto &&Arg) { 938 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) && 939 Arg.value().get() == Scalar; 940 }); 941 } 942 default: 943 return false; 944 } 945 } 946 947 /// \returns the AA location that is being access by the instruction. 948 static MemoryLocation getLocation(Instruction *I) { 949 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 950 return MemoryLocation::get(SI); 951 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 952 return MemoryLocation::get(LI); 953 return MemoryLocation(); 954 } 955 956 /// \returns True if the instruction is not a volatile or atomic load/store. 957 static bool isSimple(Instruction *I) { 958 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 959 return LI->isSimple(); 960 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 961 return SI->isSimple(); 962 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 963 return !MI->isVolatile(); 964 return true; 965 } 966 967 /// Shuffles \p Mask in accordance with the given \p SubMask. 968 /// \param ExtendingManyInputs Supports reshuffling of the mask with not only 969 /// one but two input vectors. 970 static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, 971 bool ExtendingManyInputs = false) { 972 if (SubMask.empty()) 973 return; 974 assert( 975 (!ExtendingManyInputs || SubMask.size() > Mask.size() || 976 // Check if input scalars were extended to match the size of other node. 977 (SubMask.size() == Mask.size() && 978 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(), 979 [](int Idx) { return Idx == PoisonMaskElem; }))) && 980 "SubMask with many inputs support must be larger than the mask."); 981 if (Mask.empty()) { 982 Mask.append(SubMask.begin(), SubMask.end()); 983 return; 984 } 985 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem); 986 int TermValue = std::min(Mask.size(), SubMask.size()); 987 for (int I = 0, E = SubMask.size(); I < E; ++I) { 988 if (SubMask[I] == PoisonMaskElem || 989 (!ExtendingManyInputs && 990 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue))) 991 continue; 992 NewMask[I] = Mask[SubMask[I]]; 993 } 994 Mask.swap(NewMask); 995 } 996 997 /// Order may have elements assigned special value (size) which is out of 998 /// bounds. Such indices only appear on places which correspond to undef values 999 /// (see canReuseExtract for details) and used in order to avoid undef values 1000 /// have effect on operands ordering. 1001 /// The first loop below simply finds all unused indices and then the next loop 1002 /// nest assigns these indices for undef values positions. 
1003 /// As an example below Order has two undef positions and they have assigned 1004 /// values 3 and 7 respectively: 1005 /// before: 6 9 5 4 9 2 1 0 1006 /// after: 6 3 5 4 7 2 1 0 1007 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) { 1008 const unsigned Sz = Order.size(); 1009 SmallBitVector UnusedIndices(Sz, /*t=*/true); 1010 SmallBitVector MaskedIndices(Sz); 1011 for (unsigned I = 0; I < Sz; ++I) { 1012 if (Order[I] < Sz) 1013 UnusedIndices.reset(Order[I]); 1014 else 1015 MaskedIndices.set(I); 1016 } 1017 if (MaskedIndices.none()) 1018 return; 1019 assert(UnusedIndices.count() == MaskedIndices.count() && 1020 "Non-synced masked/available indices."); 1021 int Idx = UnusedIndices.find_first(); 1022 int MIdx = MaskedIndices.find_first(); 1023 while (MIdx >= 0) { 1024 assert(Idx >= 0 && "Indices must be synced."); 1025 Order[MIdx] = Idx; 1026 Idx = UnusedIndices.find_next(Idx); 1027 MIdx = MaskedIndices.find_next(MIdx); 1028 } 1029 } 1030 1031 /// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1032 /// Opcode1. 1033 SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0, 1034 unsigned Opcode1) { 1035 SmallBitVector OpcodeMask(VL.size(), false); 1036 for (unsigned Lane : seq<unsigned>(VL.size())) 1037 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1) 1038 OpcodeMask.set(Lane); 1039 return OpcodeMask; 1040 } 1041 1042 namespace llvm { 1043 1044 static void inversePermutation(ArrayRef<unsigned> Indices, 1045 SmallVectorImpl<int> &Mask) { 1046 Mask.clear(); 1047 const unsigned E = Indices.size(); 1048 Mask.resize(E, PoisonMaskElem); 1049 for (unsigned I = 0; I < E; ++I) 1050 Mask[Indices[I]] = I; 1051 } 1052 1053 /// Reorders the list of scalars in accordance with the given \p Mask. 1054 static void reorderScalars(SmallVectorImpl<Value *> &Scalars, 1055 ArrayRef<int> Mask) { 1056 assert(!Mask.empty() && "Expected non-empty mask."); 1057 SmallVector<Value *> Prev(Scalars.size(), 1058 PoisonValue::get(Scalars.front()->getType())); 1059 Prev.swap(Scalars); 1060 for (unsigned I = 0, E = Prev.size(); I < E; ++I) 1061 if (Mask[I] != PoisonMaskElem) 1062 Scalars[Mask[I]] = Prev[I]; 1063 } 1064 1065 /// Checks if the provided value does not require scheduling. It does not 1066 /// require scheduling if this is not an instruction or it is an instruction 1067 /// that does not read/write memory and all operands are either not instructions 1068 /// or phi nodes or instructions from different blocks. 1069 static bool areAllOperandsNonInsts(Value *V) { 1070 auto *I = dyn_cast<Instruction>(V); 1071 if (!I) 1072 return true; 1073 return !mayHaveNonDefUseDependency(*I) && 1074 all_of(I->operands(), [I](Value *V) { 1075 auto *IO = dyn_cast<Instruction>(V); 1076 if (!IO) 1077 return true; 1078 return isa<PHINode>(IO) || IO->getParent() != I->getParent(); 1079 }); 1080 } 1081 1082 /// Checks if the provided value does not require scheduling. It does not 1083 /// require scheduling if this is not an instruction or it is an instruction 1084 /// that does not read/write memory and all users are phi nodes or instructions 1085 /// from the different blocks. 1086 static bool isUsedOutsideBlock(Value *V) { 1087 auto *I = dyn_cast<Instruction>(V); 1088 if (!I) 1089 return true; 1090 // Limits the number of uses to save compile time. 
1091 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && 1092 all_of(I->users(), [I](User *U) { 1093 auto *IU = dyn_cast<Instruction>(U); 1094 if (!IU) 1095 return true; 1096 return IU->getParent() != I->getParent() || isa<PHINode>(IU); 1097 }); 1098 } 1099 1100 /// Checks if the specified value does not require scheduling. It does not 1101 /// require scheduling if all operands and all users do not need to be scheduled 1102 /// in the current basic block. 1103 static bool doesNotNeedToBeScheduled(Value *V) { 1104 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); 1105 } 1106 1107 /// Checks if the specified array of instructions does not require scheduling. 1108 /// It is so if all either instructions have operands that do not require 1109 /// scheduling or their users do not require scheduling since they are phis or 1110 /// in other basic blocks. 1111 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) { 1112 return !VL.empty() && 1113 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); 1114 } 1115 1116 namespace slpvectorizer { 1117 1118 /// Bottom Up SLP Vectorizer. 1119 class BoUpSLP { 1120 struct TreeEntry; 1121 struct ScheduleData; 1122 class ShuffleCostEstimator; 1123 class ShuffleInstructionBuilder; 1124 1125 public: 1126 /// Tracks the state we can represent the loads in the given sequence. 1127 enum class LoadsState { 1128 Gather, 1129 Vectorize, 1130 ScatterVectorize, 1131 StridedVectorize 1132 }; 1133 1134 using ValueList = SmallVector<Value *, 8>; 1135 using InstrList = SmallVector<Instruction *, 16>; 1136 using ValueSet = SmallPtrSet<Value *, 16>; 1137 using StoreList = SmallVector<StoreInst *, 8>; 1138 using ExtraValueToDebugLocsMap = 1139 MapVector<Value *, SmallVector<Instruction *, 2>>; 1140 using OrdersType = SmallVector<unsigned, 4>; 1141 1142 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, 1143 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, 1144 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, 1145 const DataLayout *DL, OptimizationRemarkEmitter *ORE) 1146 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt), 1147 AC(AC), DB(DB), DL(DL), ORE(ORE), 1148 Builder(Se->getContext(), TargetFolder(*DL)) { 1149 CodeMetrics::collectEphemeralValues(F, AC, EphValues); 1150 // Use the vector register size specified by the target unless overridden 1151 // by a command-line option. 1152 // TODO: It would be better to limit the vectorization factor based on 1153 // data type rather than just register size. For example, x86 AVX has 1154 // 256-bit registers, but it does not support integer operations 1155 // at that width (that requires AVX2). 1156 if (MaxVectorRegSizeOption.getNumOccurrences()) 1157 MaxVecRegSize = MaxVectorRegSizeOption; 1158 else 1159 MaxVecRegSize = 1160 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 1161 .getFixedValue(); 1162 1163 if (MinVectorRegSizeOption.getNumOccurrences()) 1164 MinVecRegSize = MinVectorRegSizeOption; 1165 else 1166 MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); 1167 } 1168 1169 /// Vectorize the tree that starts with the elements in \p VL. 1170 /// Returns the vectorized root. 1171 Value *vectorizeTree(); 1172 1173 /// Vectorize the tree but with the list of externally used values \p 1174 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1175 /// generated extractvalue instructions. 
1176 /// \param ReplacedExternals containd list of replaced external values 1177 /// {scalar, replace} after emitting extractelement for external uses. 1178 Value * 1179 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, 1180 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals, 1181 Instruction *ReductionRoot = nullptr); 1182 1183 /// \returns the cost incurred by unwanted spills and fills, caused by 1184 /// holding live values over call sites. 1185 InstructionCost getSpillCost() const; 1186 1187 /// \returns the vectorization cost of the subtree that starts at \p VL. 1188 /// A negative number means that this is profitable. 1189 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt); 1190 1191 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1192 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1193 void buildTree(ArrayRef<Value *> Roots, 1194 const SmallDenseSet<Value *> &UserIgnoreLst); 1195 1196 /// Construct a vectorizable tree that starts at \p Roots. 1197 void buildTree(ArrayRef<Value *> Roots); 1198 1199 /// Returns whether the root node has in-tree uses. 1200 bool doesRootHaveInTreeUses() const { 1201 return !VectorizableTree.empty() && 1202 !VectorizableTree.front()->UserTreeIndices.empty(); 1203 } 1204 1205 /// Return the scalars of the root node. 1206 ArrayRef<Value *> getRootNodeScalars() const { 1207 assert(!VectorizableTree.empty() && "No graph to get the first node from"); 1208 return VectorizableTree.front()->Scalars; 1209 } 1210 1211 /// Checks if the root graph node can be emitted with narrower bitwidth at 1212 /// codegen and returns it signedness, if so. 1213 bool isSignedMinBitwidthRootNode() const { 1214 return MinBWs.at(VectorizableTree.front().get()).second; 1215 } 1216 1217 /// Builds external uses of the vectorized scalars, i.e. the list of 1218 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1219 /// ExternallyUsedValues contains additional list of external uses to handle 1220 /// vectorization of reductions. 1221 void 1222 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); 1223 1224 /// Transforms graph nodes to target specific representations, if profitable. 1225 void transformNodes(); 1226 1227 /// Clear the internal data structures that are created by 'buildTree'. 1228 void deleteTree() { 1229 VectorizableTree.clear(); 1230 ScalarToTreeEntry.clear(); 1231 MultiNodeScalars.clear(); 1232 MustGather.clear(); 1233 NonScheduledFirst.clear(); 1234 EntryToLastInstruction.clear(); 1235 ExternalUses.clear(); 1236 ExternalUsesAsGEPs.clear(); 1237 for (auto &Iter : BlocksSchedules) { 1238 BlockScheduling *BS = Iter.second.get(); 1239 BS->clear(); 1240 } 1241 MinBWs.clear(); 1242 ReductionBitWidth = 0; 1243 CastMaxMinBWSizes.reset(); 1244 ExtraBitWidthNodes.clear(); 1245 InstrElementSize.clear(); 1246 UserIgnoreList = nullptr; 1247 PostponedGathers.clear(); 1248 ValueToGatherNodes.clear(); 1249 } 1250 1251 unsigned getTreeSize() const { return VectorizableTree.size(); } 1252 1253 /// Perform LICM and CSE on the newly generated gather sequences. 1254 void optimizeGatherSequence(); 1255 1256 /// Checks if the specified gather tree entry \p TE can be represented as a 1257 /// shuffled vector entry + (possibly) permutation with other gathers. It 1258 /// implements the checks only for possibly ordered scalars (Loads, 1259 /// ExtractElement, ExtractValue), which can be part of the graph. 
1260 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); 1261 1262 /// Sort loads into increasing pointers offsets to allow greater clustering. 1263 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); 1264 1265 /// Gets reordering data for the given tree entry. If the entry is vectorized 1266 /// - just return ReorderIndices, otherwise check if the scalars can be 1267 /// reordered and return the most optimal order. 1268 /// \return std::nullopt if ordering is not important, empty order, if 1269 /// identity order is important, or the actual order. 1270 /// \param TopToBottom If true, include the order of vectorized stores and 1271 /// insertelement nodes, otherwise skip them. 1272 std::optional<OrdersType> getReorderingData(const TreeEntry &TE, 1273 bool TopToBottom); 1274 1275 /// Reorders the current graph to the most profitable order starting from the 1276 /// root node to the leaf nodes. The best order is chosen only from the nodes 1277 /// of the same size (vectorization factor). Smaller nodes are considered 1278 /// parts of subgraph with smaller VF and they are reordered independently. We 1279 /// can make it because we still need to extend smaller nodes to the wider VF 1280 /// and we can merge reordering shuffles with the widening shuffles. 1281 void reorderTopToBottom(); 1282 1283 /// Reorders the current graph to the most profitable order starting from 1284 /// leaves to the root. It allows to rotate small subgraphs and reduce the 1285 /// number of reshuffles if the leaf nodes use the same order. In this case we 1286 /// can merge the orders and just shuffle user node instead of shuffling its 1287 /// operands. Plus, even the leaf nodes have different orders, it allows to 1288 /// sink reordering in the graph closer to the root node and merge it later 1289 /// during analysis. 1290 void reorderBottomToTop(bool IgnoreReorder = false); 1291 1292 /// \return The vector element size in bits to use when vectorizing the 1293 /// expression tree ending at \p V. If V is a store, the size is the width of 1294 /// the stored value. Otherwise, the size is the width of the largest loaded 1295 /// value reaching V. This method is used by the vectorizer to calculate 1296 /// vectorization factors. 1297 unsigned getVectorElementSize(Value *V); 1298 1299 /// Compute the minimum type sizes required to represent the entries in a 1300 /// vectorizable tree. 1301 void computeMinimumValueSizes(); 1302 1303 // \returns maximum vector register size as set by TTI or overridden by cl::opt. 1304 unsigned getMaxVecRegSize() const { 1305 return MaxVecRegSize; 1306 } 1307 1308 // \returns minimum vector register size as set by cl::opt. 1309 unsigned getMinVecRegSize() const { 1310 return MinVecRegSize; 1311 } 1312 1313 unsigned getMinVF(unsigned Sz) const { 1314 return std::max(2U, getMinVecRegSize() / Sz); 1315 } 1316 1317 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { 1318 unsigned MaxVF = MaxVFOption.getNumOccurrences() ? 1319 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); 1320 return MaxVF ? MaxVF : UINT_MAX; 1321 } 1322 1323 /// Check if homogeneous aggregate is isomorphic to some VectorType. 1324 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like 1325 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, 1326 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. 1327 /// 1328 /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 
1329 unsigned canMapToVector(Type *T) const; 1330 1331 /// \returns True if the VectorizableTree is both tiny and not fully 1332 /// vectorizable. We do not vectorize such trees. 1333 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; 1334 1335 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values 1336 /// can be load combined in the backend. Load combining may not be allowed in 1337 /// the IR optimizer, so we do not want to alter the pattern. For example, 1338 /// partially transforming a scalar bswap() pattern into vector code is 1339 /// effectively impossible for the backend to undo. 1340 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1341 /// may not be necessary. 1342 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; 1343 1344 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values 1345 /// can be load combined in the backend. Load combining may not be allowed in 1346 /// the IR optimizer, so we do not want to alter the pattern. For example, 1347 /// partially transforming a scalar bswap() pattern into vector code is 1348 /// effectively impossible for the backend to undo. 1349 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1350 /// may not be necessary. 1351 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; 1352 1353 /// Checks if the given array of loads can be represented as a vectorized, 1354 /// scatter or just simple gather. 1355 /// \param VL list of loads. 1356 /// \param VL0 main load value. 1357 /// \param Order returned order of load instructions. 1358 /// \param PointerOps returned list of pointer operands. 1359 /// \param TryRecursiveCheck used to check if long masked gather can be 1360 /// represented as a serie of loads/insert subvector, if profitable. 1361 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, 1362 SmallVectorImpl<unsigned> &Order, 1363 SmallVectorImpl<Value *> &PointerOps, 1364 bool TryRecursiveCheck = true) const; 1365 1366 OptimizationRemarkEmitter *getORE() { return ORE; } 1367 1368 /// This structure holds any data we need about the edges being traversed 1369 /// during buildTree_rec(). We keep track of: 1370 /// (i) the user TreeEntry index, and 1371 /// (ii) the index of the edge. 1372 struct EdgeInfo { 1373 EdgeInfo() = default; 1374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) 1375 : UserTE(UserTE), EdgeIdx(EdgeIdx) {} 1376 /// The user TreeEntry. 1377 TreeEntry *UserTE = nullptr; 1378 /// The operand index of the use. 1379 unsigned EdgeIdx = UINT_MAX; 1380 #ifndef NDEBUG 1381 friend inline raw_ostream &operator<<(raw_ostream &OS, 1382 const BoUpSLP::EdgeInfo &EI) { 1383 EI.dump(OS); 1384 return OS; 1385 } 1386 /// Debug print. 1387 void dump(raw_ostream &OS) const { 1388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") 1389 << " EdgeIdx:" << EdgeIdx << "}"; 1390 } 1391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } 1392 #endif 1393 bool operator == (const EdgeInfo &Other) const { 1394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; 1395 } 1396 }; 1397 1398 /// A helper class used for scoring candidates for two consecutive lanes. 1399 class LookAheadHeuristics { 1400 const TargetLibraryInfo &TLI; 1401 const DataLayout &DL; 1402 ScalarEvolution &SE; 1403 const BoUpSLP &R; 1404 int NumLanes; // Total number of lanes (aka vectorization factor). 1405 int MaxLevel; // The maximum recursion depth for accumulating score. 
1406 1407 public: 1408 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, 1409 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, 1410 int MaxLevel) 1411 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes), 1412 MaxLevel(MaxLevel) {} 1413 1414 // The hard-coded scores listed here are not very important, though it shall 1415 // be higher for better matches to improve the resulting cost. When 1416 // computing the scores of matching one sub-tree with another, we are 1417 // basically counting the number of values that are matching. So even if all 1418 // scores are set to 1, we would still get a decent matching result. 1419 // However, sometimes we have to break ties. For example we may have to 1420 // choose between matching loads vs matching opcodes. This is what these 1421 // scores are helping us with: they provide the order of preference. Also, 1422 // this is important if the scalar is externally used or used in another 1423 // tree entry node in the different lane. 1424 1425 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1426 static const int ScoreConsecutiveLoads = 4; 1427 /// The same load multiple times. This should have a better score than 1428 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it 1429 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for 1430 /// a vector load and 1.0 for a broadcast. 1431 static const int ScoreSplatLoads = 3; 1432 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). 1433 static const int ScoreReversedLoads = 3; 1434 /// A load candidate for masked gather. 1435 static const int ScoreMaskedGatherCandidate = 1; 1436 /// ExtractElementInst from same vector and consecutive indexes. 1437 static const int ScoreConsecutiveExtracts = 4; 1438 /// ExtractElementInst from same vector and reversed indices. 1439 static const int ScoreReversedExtracts = 3; 1440 /// Constants. 1441 static const int ScoreConstants = 2; 1442 /// Instructions with the same opcode. 1443 static const int ScoreSameOpcode = 2; 1444 /// Instructions with alt opcodes (e.g, add + sub). 1445 static const int ScoreAltOpcodes = 1; 1446 /// Identical instructions (a.k.a. splat or broadcast). 1447 static const int ScoreSplat = 1; 1448 /// Matching with an undef is preferable to failing. 1449 static const int ScoreUndef = 1; 1450 /// Score for failing to find a decent match. 1451 static const int ScoreFail = 0; 1452 /// Score if all users are vectorized. 1453 static const int ScoreAllUserVectorized = 1; 1454 1455 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. 1456 /// \p U1 and \p U2 are the users of \p V1 and \p V2. 1457 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p 1458 /// MainAltOps. 1459 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, 1460 ArrayRef<Value *> MainAltOps) const { 1461 if (!isValidElementType(V1->getType()) || 1462 !isValidElementType(V2->getType())) 1463 return LookAheadHeuristics::ScoreFail; 1464 1465 if (V1 == V2) { 1466 if (isa<LoadInst>(V1)) { 1467 // Retruns true if the users of V1 and V2 won't need to be extracted. 1468 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) { 1469 // Bail out if we have too many uses to save compilation time. 
1470 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit)) 1471 return false; 1472 1473 auto AllUsersVectorized = [U1, U2, this](Value *V) { 1474 return llvm::all_of(V->users(), [U1, U2, this](Value *U) { 1475 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr; 1476 }); 1477 }; 1478 return AllUsersVectorized(V1) && AllUsersVectorized(V2); 1479 }; 1480 // A broadcast of a load can be cheaper on some targets. 1481 if (R.TTI->isLegalBroadcastLoad(V1->getType(), 1482 ElementCount::getFixed(NumLanes)) && 1483 ((int)V1->getNumUses() == NumLanes || 1484 AllUsersAreInternal(V1, V2))) 1485 return LookAheadHeuristics::ScoreSplatLoads; 1486 } 1487 return LookAheadHeuristics::ScoreSplat; 1488 } 1489 1490 auto CheckSameEntryOrFail = [&]() { 1491 if (const TreeEntry *TE1 = R.getTreeEntry(V1); 1492 TE1 && TE1 == R.getTreeEntry(V2)) 1493 return LookAheadHeuristics::ScoreSplatLoads; 1494 return LookAheadHeuristics::ScoreFail; 1495 }; 1496 1497 auto *LI1 = dyn_cast<LoadInst>(V1); 1498 auto *LI2 = dyn_cast<LoadInst>(V2); 1499 if (LI1 && LI2) { 1500 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || 1501 !LI2->isSimple()) 1502 return CheckSameEntryOrFail(); 1503 1504 std::optional<int> Dist = getPointersDiff( 1505 LI1->getType(), LI1->getPointerOperand(), LI2->getType(), 1506 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); 1507 if (!Dist || *Dist == 0) { 1508 if (getUnderlyingObject(LI1->getPointerOperand()) == 1509 getUnderlyingObject(LI2->getPointerOperand()) && 1510 R.TTI->isLegalMaskedGather( 1511 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign())) 1512 return LookAheadHeuristics::ScoreMaskedGatherCandidate; 1513 return CheckSameEntryOrFail(); 1514 } 1515 // The distance is too large - still may be profitable to use masked 1516 // loads/gathers. 1517 if (std::abs(*Dist) > NumLanes / 2) 1518 return LookAheadHeuristics::ScoreMaskedGatherCandidate; 1519 // This still will detect consecutive loads, but we might have "holes" 1520 // in some cases. It is ok for non-power-2 vectorization and may produce 1521 // better results. It should not affect current vectorization. 1522 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads 1523 : LookAheadHeuristics::ScoreReversedLoads; 1524 } 1525 1526 auto *C1 = dyn_cast<Constant>(V1); 1527 auto *C2 = dyn_cast<Constant>(V2); 1528 if (C1 && C2) 1529 return LookAheadHeuristics::ScoreConstants; 1530 1531 // Extracts from consecutive indexes of the same vector better score as 1532 // the extracts could be optimized away. 1533 Value *EV1; 1534 ConstantInt *Ex1Idx; 1535 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { 1536 // Undefs are always profitable for extractelements. 1537 // Compiler can easily combine poison and extractelement <non-poison> or 1538 // undef and extractelement <poison>. But combining undef + 1539 // extractelement <non-poison-but-may-produce-poison> requires some 1540 // extra operations. 1541 if (isa<UndefValue>(V2)) 1542 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all()) 1543 ? LookAheadHeuristics::ScoreConsecutiveExtracts 1544 : LookAheadHeuristics::ScoreSameOpcode; 1545 Value *EV2 = nullptr; 1546 ConstantInt *Ex2Idx = nullptr; 1547 if (match(V2, 1548 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), 1549 m_Undef())))) { 1550 // Undefs are always profitable for extractelements. 
1551 if (!Ex2Idx)
1552 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1554 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555 if (EV2 == EV1) {
1556 int Idx1 = Ex1Idx->getZExtValue();
1557 int Idx2 = Ex2Idx->getZExtValue();
1558 int Dist = Idx2 - Idx1;
1559 // The distance is too large - still may be profitable to use
1560 // shuffles.
1561 if (std::abs(Dist) == 0)
1562 return LookAheadHeuristics::ScoreSplat;
1563 if (std::abs(Dist) > NumLanes / 2)
1564 return LookAheadHeuristics::ScoreSameOpcode;
1565 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566 : LookAheadHeuristics::ScoreReversedExtracts;
1567 }
1568 return LookAheadHeuristics::ScoreAltOpcodes;
1569 }
1570 return CheckSameEntryOrFail();
1571 }
1572
1573 auto *I1 = dyn_cast<Instruction>(V1);
1574 auto *I2 = dyn_cast<Instruction>(V2);
1575 if (I1 && I2) {
1576 if (I1->getParent() != I2->getParent())
1577 return CheckSameEntryOrFail();
1578 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579 Ops.push_back(I1);
1580 Ops.push_back(I2);
1581 InstructionsState S = getSameOpcode(Ops, TLI);
1582 // Note: Only consider instructions with <= 2 operands to avoid
1583 // complexity explosion.
1584 if (S.getOpcode() &&
1585 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586 !S.isAltShuffle()) &&
1587 all_of(Ops, [&S](Value *V) {
1588 return cast<Instruction>(V)->getNumOperands() ==
1589 S.MainOp->getNumOperands();
1590 }))
1591 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592 : LookAheadHeuristics::ScoreSameOpcode;
1593 }
1594
1595 if (isa<UndefValue>(V2))
1596 return LookAheadHeuristics::ScoreUndef;
1597
1598 return CheckSameEntryOrFail();
1599 }
1600
1601 /// Go through the operands of \p LHS and \p RHS recursively until
1602 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1603 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604 /// of \p U1 and \p U2), except at the beginning of the recursion where
1605 /// these are set to nullptr.
1606 ///
1607 /// For example:
1608 /// \verbatim
1609 ///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
1610 ///    \  /       \  /        \  /        \  /
1611 ///     +          +           +           +
1612 ///     G1         G2          G3          G4
1613 /// \endverbatim
1614 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615 /// each level recursively, accumulating the score. It starts from matching
1616 /// the additions at level 0, then moves on to the loads (level 1). The
1617 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620 /// Please note that the order of the operands does not matter, as we
1621 /// evaluate the score of all profitable combinations of operands. In
1622 /// other words the score of G1 and G4 is the same as G1 and G2. This
1623 /// heuristic is based on ideas described in:
1624 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1625 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626 /// Luís F. W. Góes
1627 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628 Instruction *U2, int CurrLevel,
1629 ArrayRef<Value *> MainAltOps) const {
1630
1631 // Get the shallow score of V1 and V2.
1632 int ShallowScoreAtThisLevel = 1633 getShallowScore(LHS, RHS, U1, U2, MainAltOps); 1634 1635 // If reached MaxLevel, 1636 // or if V1 and V2 are not instructions, 1637 // or if they are SPLAT, 1638 // or if they are not consecutive, 1639 // or if profitable to vectorize loads or extractelements, early return 1640 // the current cost. 1641 auto *I1 = dyn_cast<Instruction>(LHS); 1642 auto *I2 = dyn_cast<Instruction>(RHS); 1643 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || 1644 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || 1645 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || 1646 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || 1647 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && 1648 ShallowScoreAtThisLevel)) 1649 return ShallowScoreAtThisLevel; 1650 assert(I1 && I2 && "Should have early exited."); 1651 1652 // Contains the I2 operand indexes that got matched with I1 operands. 1653 SmallSet<unsigned, 4> Op2Used; 1654 1655 // Recursion towards the operands of I1 and I2. We are trying all possible 1656 // operand pairs, and keeping track of the best score. 1657 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); 1658 OpIdx1 != NumOperands1; ++OpIdx1) { 1659 // Try to pair op1I with the best operand of I2. 1660 int MaxTmpScore = 0; 1661 unsigned MaxOpIdx2 = 0; 1662 bool FoundBest = false; 1663 // If I2 is commutative try all combinations. 1664 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; 1665 unsigned ToIdx = isCommutative(I2) 1666 ? I2->getNumOperands() 1667 : std::min(I2->getNumOperands(), OpIdx1 + 1); 1668 assert(FromIdx <= ToIdx && "Bad index"); 1669 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { 1670 // Skip operands already paired with OpIdx1. 1671 if (Op2Used.count(OpIdx2)) 1672 continue; 1673 // Recursively calculate the cost at each level 1674 int TmpScore = 1675 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), 1676 I1, I2, CurrLevel + 1, std::nullopt); 1677 // Look for the best score. 1678 if (TmpScore > LookAheadHeuristics::ScoreFail && 1679 TmpScore > MaxTmpScore) { 1680 MaxTmpScore = TmpScore; 1681 MaxOpIdx2 = OpIdx2; 1682 FoundBest = true; 1683 } 1684 } 1685 if (FoundBest) { 1686 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1687 Op2Used.insert(MaxOpIdx2); 1688 ShallowScoreAtThisLevel += MaxTmpScore; 1689 } 1690 } 1691 return ShallowScoreAtThisLevel; 1692 } 1693 }; 1694 /// A helper data structure to hold the operands of a vector of instructions. 1695 /// This supports a fixed vector length for all operand vectors. 1696 class VLOperands { 1697 /// For each operand we need (i) the value, and (ii) the opcode that it 1698 /// would be attached to if the expression was in a left-linearized form. 1699 /// This is required to avoid illegal operand reordering. 1700 /// For example: 1701 /// \verbatim 1702 /// 0 Op1 1703 /// |/ 1704 /// Op1 Op2 Linearized + Op2 1705 /// \ / ----------> |/ 1706 /// - - 1707 /// 1708 /// Op1 - Op2 (0 + Op1) - Op2 1709 /// \endverbatim 1710 /// 1711 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 1712 /// 1713 /// Another way to think of this is to track all the operations across the 1714 /// path from the operand all the way to the root of the tree and to 1715 /// calculate the operation that corresponds to this path. 
For example, the 1716 /// path from Op2 to the root crosses the RHS of the '-', therefore the 1717 /// corresponding operation is a '-' (which matches the one in the 1718 /// linearized tree, as shown above). 1719 /// 1720 /// For lack of a better term, we refer to this operation as Accumulated 1721 /// Path Operation (APO). 1722 struct OperandData { 1723 OperandData() = default; 1724 OperandData(Value *V, bool APO, bool IsUsed) 1725 : V(V), APO(APO), IsUsed(IsUsed) {} 1726 /// The operand value. 1727 Value *V = nullptr; 1728 /// TreeEntries only allow a single opcode, or an alternate sequence of 1729 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 1730 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 1731 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 1732 /// (e.g., Add/Mul) 1733 bool APO = false; 1734 /// Helper data for the reordering function. 1735 bool IsUsed = false; 1736 }; 1737 1738 /// During operand reordering, we are trying to select the operand at lane 1739 /// that matches best with the operand at the neighboring lane. Our 1740 /// selection is based on the type of value we are looking for. For example, 1741 /// if the neighboring lane has a load, we need to look for a load that is 1742 /// accessing a consecutive address. These strategies are summarized in the 1743 /// 'ReorderingMode' enumerator. 1744 enum class ReorderingMode { 1745 Load, ///< Matching loads to consecutive memory addresses 1746 Opcode, ///< Matching instructions based on opcode (same or alternate) 1747 Constant, ///< Matching constants 1748 Splat, ///< Matching the same instruction multiple times (broadcast) 1749 Failed, ///< We failed to create a vectorizable group 1750 }; 1751 1752 using OperandDataVec = SmallVector<OperandData, 2>; 1753 1754 /// A vector of operand vectors. 1755 SmallVector<OperandDataVec, 4> OpsVec; 1756 1757 const TargetLibraryInfo &TLI; 1758 const DataLayout &DL; 1759 ScalarEvolution &SE; 1760 const BoUpSLP &R; 1761 const Loop *L = nullptr; 1762 1763 /// \returns the operand data at \p OpIdx and \p Lane. 1764 OperandData &getData(unsigned OpIdx, unsigned Lane) { 1765 return OpsVec[OpIdx][Lane]; 1766 } 1767 1768 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 1769 const OperandData &getData(unsigned OpIdx, unsigned Lane) const { 1770 return OpsVec[OpIdx][Lane]; 1771 } 1772 1773 /// Clears the used flag for all entries. 1774 void clearUsed() { 1775 for (unsigned OpIdx = 0, NumOperands = getNumOperands(); 1776 OpIdx != NumOperands; ++OpIdx) 1777 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; 1778 ++Lane) 1779 OpsVec[OpIdx][Lane].IsUsed = false; 1780 } 1781 1782 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 1783 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { 1784 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); 1785 } 1786 1787 /// \param Lane lane of the operands under analysis. 1788 /// \param OpIdx operand index in \p Lane lane we're looking the best 1789 /// candidate for. 1790 /// \param Idx operand index of the current candidate value. 1791 /// \returns The additional score due to possible broadcasting of the 1792 /// elements in the lane. It is more profitable to have power-of-2 unique 1793 /// elements in the lane, it will be vectorized with higher probability 1794 /// after removing duplicates. Currently the SLP vectorizer supports only 1795 /// vectorization of the power-of-2 number of unique scalars. 
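/// For example (an illustrative walk-through of the formula below, not an
/// additional rule): with 4 lanes, suppose the other lanes of operand
/// \p OpIdx already hold the unique instructions {A, B}. If the current
/// value at (OpIdx, Lane) is C (3 uniques, rounded up to 4), while the
/// candidate at (Idx, Lane) is a repeat of A (still 2 uniques, already a
/// power of 2), the returned score is (4 - 3) - (2 - 2) = +1 in favor of
/// the candidate, since it keeps the number of unique scalars at a power
/// of 2.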
1796 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { 1797 Value *IdxLaneV = getData(Idx, Lane).V; 1798 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) 1799 return 0; 1800 SmallPtrSet<Value *, 4> Uniques; 1801 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { 1802 if (Ln == Lane) 1803 continue; 1804 Value *OpIdxLnV = getData(OpIdx, Ln).V; 1805 if (!isa<Instruction>(OpIdxLnV)) 1806 return 0; 1807 Uniques.insert(OpIdxLnV); 1808 } 1809 int UniquesCount = Uniques.size(); 1810 int UniquesCntWithIdxLaneV = 1811 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; 1812 Value *OpIdxLaneV = getData(OpIdx, Lane).V; 1813 int UniquesCntWithOpIdxLaneV = 1814 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; 1815 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) 1816 return 0; 1817 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - 1818 UniquesCntWithOpIdxLaneV) - 1819 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); 1820 } 1821 1822 /// \param Lane lane of the operands under analysis. 1823 /// \param OpIdx operand index in \p Lane lane we're looking the best 1824 /// candidate for. 1825 /// \param Idx operand index of the current candidate value. 1826 /// \returns The additional score for the scalar which users are all 1827 /// vectorized. 1828 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { 1829 Value *IdxLaneV = getData(Idx, Lane).V; 1830 Value *OpIdxLaneV = getData(OpIdx, Lane).V; 1831 // Do not care about number of uses for vector-like instructions 1832 // (extractelement/extractvalue with constant indices), they are extracts 1833 // themselves and already externally used. Vectorization of such 1834 // instructions does not add extra extractelement instruction, just may 1835 // remove it. 1836 if (isVectorLikeInstWithConstOps(IdxLaneV) && 1837 isVectorLikeInstWithConstOps(OpIdxLaneV)) 1838 return LookAheadHeuristics::ScoreAllUserVectorized; 1839 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV); 1840 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV)) 1841 return 0; 1842 return R.areAllUsersVectorized(IdxLaneI) 1843 ? LookAheadHeuristics::ScoreAllUserVectorized 1844 : 0; 1845 } 1846 1847 /// Score scaling factor for fully compatible instructions but with 1848 /// different number of external uses. Allows better selection of the 1849 /// instructions with less external uses. 1850 static const int ScoreScaleFactor = 10; 1851 1852 /// \Returns the look-ahead score, which tells us how much the sub-trees 1853 /// rooted at \p LHS and \p RHS match, the more they match the higher the 1854 /// score. This helps break ties in an informed way when we cannot decide on 1855 /// the order of the operands by just considering the immediate 1856 /// predecessors. 1857 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, 1858 int Lane, unsigned OpIdx, unsigned Idx, 1859 bool &IsUsed) { 1860 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(), 1861 LookAheadMaxDepth); 1862 // Keep track of the instruction stack as we recurse into the operands 1863 // during the look-ahead score exploration. 1864 int Score = 1865 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, 1866 /*CurrLevel=*/1, MainAltOps); 1867 if (Score) { 1868 int SplatScore = getSplatScore(Lane, OpIdx, Idx); 1869 if (Score <= -SplatScore) { 1870 // Set the minimum score for splat-like sequence to avoid setting 1871 // failed state. 
1872 Score = 1;
1873 } else {
1874 Score += SplatScore;
1875 // Scale score to see the difference between different operands
1876 // and similar operands but all vectorized/not all vectorized
1877 // uses. It does not affect actual selection of the best
1878 // compatible operand in general, just allows selecting the
1879 // operand with all vectorized uses.
1880 Score *= ScoreScaleFactor;
1881 Score += getExternalUseScore(Lane, OpIdx, Idx);
1882 IsUsed = true;
1883 }
1884 }
1885 return Score;
1886 }
1887
1888 /// Best defined scores per lanes between the passes. Used to choose the
1889 /// best operand (with the highest score) between the passes.
1890 /// The key - {Operand Index, Lane}.
1891 /// The value - the best score between the passes for the lane and the
1892 /// operand.
1893 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894 BestScoresPerLanes;
1895
1896 // Search all operands in Ops[*][Lane] for the one that matches best
1897 // Ops[OpIdx][LastLane] and return its operand index.
1898 // If no good match can be found, return std::nullopt.
1899 std::optional<unsigned>
1900 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901 ArrayRef<ReorderingMode> ReorderingModes,
1902 ArrayRef<Value *> MainAltOps) {
1903 unsigned NumOperands = getNumOperands();
1904
1905 // The operand of the previous lane at OpIdx.
1906 Value *OpLastLane = getData(OpIdx, LastLane).V;
1907
1908 // Our strategy mode for OpIdx.
1909 ReorderingMode RMode = ReorderingModes[OpIdx];
1910 if (RMode == ReorderingMode::Failed)
1911 return std::nullopt;
1912
1913 // The linearized opcode of the operand at OpIdx, Lane.
1914 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915
1916 // The best operand index and its score.
1917 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918 // are using the score to differentiate between the two.
1919 struct BestOpData {
1920 std::optional<unsigned> Idx;
1921 unsigned Score = 0;
1922 } BestOp;
1923 BestOp.Score =
1924 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1925 .first->second;
1926
1927 // Track if the operand must be marked as used. If the operand is set to
1928 // Score 1 explicitly (because of a non-power-of-2 number of unique
1929 // scalars), we may want to reestimate the operands again on the
1929 // following iterations.
1930 bool IsUsed = RMode == ReorderingMode::Splat ||
1931 RMode == ReorderingMode::Constant ||
1932 RMode == ReorderingMode::Load;
1933 // Iterate through all unused operands and look for the best.
1934 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935 // Get the operand at Idx and Lane.
1936 OperandData &OpData = getData(Idx, Lane);
1937 Value *Op = OpData.V;
1938 bool OpAPO = OpData.APO;
1939
1940 // Skip already selected operands.
1941 if (OpData.IsUsed)
1942 continue;
1943
1944 // Skip if we are trying to move the operand to a position with a
1945 // different opcode in the linearized tree form. This would break the
1946 // semantics.
1947 if (OpAPO != OpIdxAPO)
1948 continue;
1949
1950 // Look for an operand that matches the current mode.
1951 switch (RMode) {
1952 case ReorderingMode::Load:
1953 case ReorderingMode::Opcode: {
1954 bool LeftToRight = Lane > LastLane;
1955 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956 Value *OpRight = (LeftToRight) ?
Op : OpLastLane; 1957 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, 1958 OpIdx, Idx, IsUsed); 1959 if (Score > static_cast<int>(BestOp.Score) || 1960 (Score > 0 && Score == static_cast<int>(BestOp.Score) && 1961 Idx == OpIdx)) { 1962 BestOp.Idx = Idx; 1963 BestOp.Score = Score; 1964 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score; 1965 } 1966 break; 1967 } 1968 case ReorderingMode::Constant: 1969 if (isa<Constant>(Op) || 1970 (!BestOp.Score && L && L->isLoopInvariant(Op))) { 1971 BestOp.Idx = Idx; 1972 if (isa<Constant>(Op)) { 1973 BestOp.Score = LookAheadHeuristics::ScoreConstants; 1974 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = 1975 LookAheadHeuristics::ScoreConstants; 1976 } 1977 if (isa<UndefValue>(Op) || !isa<Constant>(Op)) 1978 IsUsed = false; 1979 } 1980 break; 1981 case ReorderingMode::Splat: 1982 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) { 1983 IsUsed = Op == OpLastLane; 1984 if (Op == OpLastLane) { 1985 BestOp.Score = LookAheadHeuristics::ScoreSplat; 1986 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = 1987 LookAheadHeuristics::ScoreSplat; 1988 } 1989 BestOp.Idx = Idx; 1990 } 1991 break; 1992 case ReorderingMode::Failed: 1993 llvm_unreachable("Not expected Failed reordering mode."); 1994 } 1995 } 1996 1997 if (BestOp.Idx) { 1998 getData(*BestOp.Idx, Lane).IsUsed = IsUsed; 1999 return BestOp.Idx; 2000 } 2001 // If we could not find a good match return std::nullopt. 2002 return std::nullopt; 2003 } 2004 2005 /// Helper for reorderOperandVecs. 2006 /// \returns the lane that we should start reordering from. This is the one 2007 /// which has the least number of operands that can freely move about or 2008 /// less profitable because it already has the most optimal set of operands. 2009 unsigned getBestLaneToStartReordering() const { 2010 unsigned Min = UINT_MAX; 2011 unsigned SameOpNumber = 0; 2012 // std::pair<unsigned, unsigned> is used to implement a simple voting 2013 // algorithm and choose the lane with the least number of operands that 2014 // can freely move about or less profitable because it already has the 2015 // most optimal set of operands. The first unsigned is a counter for 2016 // voting, the second unsigned is the counter of lanes with instructions 2017 // with same/alternate opcodes and same parent basic block. 2018 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; 2019 // Try to be closer to the original results, if we have multiple lanes 2020 // with same cost. If 2 lanes have the same cost, use the one with the 2021 // lowest index. 2022 for (int I = getNumLanes(); I > 0; --I) { 2023 unsigned Lane = I - 1; 2024 OperandsOrderData NumFreeOpsHash = 2025 getMaxNumOperandsThatCanBeReordered(Lane); 2026 // Compare the number of operands that can move and choose the one with 2027 // the least number. 2028 if (NumFreeOpsHash.NumOfAPOs < Min) { 2029 Min = NumFreeOpsHash.NumOfAPOs; 2030 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 2031 HashMap.clear(); 2032 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 2033 } else if (NumFreeOpsHash.NumOfAPOs == Min && 2034 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { 2035 // Select the most optimal lane in terms of number of operands that 2036 // should be moved around. 
2037 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; 2038 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 2039 } else if (NumFreeOpsHash.NumOfAPOs == Min && 2040 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { 2041 auto *It = HashMap.find(NumFreeOpsHash.Hash); 2042 if (It == HashMap.end()) 2043 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); 2044 else 2045 ++It->second.first; 2046 } 2047 } 2048 // Select the lane with the minimum counter. 2049 unsigned BestLane = 0; 2050 unsigned CntMin = UINT_MAX; 2051 for (const auto &Data : reverse(HashMap)) { 2052 if (Data.second.first < CntMin) { 2053 CntMin = Data.second.first; 2054 BestLane = Data.second.second; 2055 } 2056 } 2057 return BestLane; 2058 } 2059 2060 /// Data structure that helps to reorder operands. 2061 struct OperandsOrderData { 2062 /// The best number of operands with the same APOs, which can be 2063 /// reordered. 2064 unsigned NumOfAPOs = UINT_MAX; 2065 /// Number of operands with the same/alternate instruction opcode and 2066 /// parent. 2067 unsigned NumOpsWithSameOpcodeParent = 0; 2068 /// Hash for the actual operands ordering. 2069 /// Used to count operands, actually their position id and opcode 2070 /// value. It is used in the voting mechanism to find the lane with the 2071 /// least number of operands that can freely move about or less profitable 2072 /// because it already has the most optimal set of operands. Can be 2073 /// replaced with SmallVector<unsigned> instead but hash code is faster 2074 /// and requires less memory. 2075 unsigned Hash = 0; 2076 }; 2077 /// \returns the maximum number of operands that are allowed to be reordered 2078 /// for \p Lane and the number of compatible instructions(with the same 2079 /// parent/opcode). This is used as a heuristic for selecting the first lane 2080 /// to start operand reordering. 2081 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { 2082 unsigned CntTrue = 0; 2083 unsigned NumOperands = getNumOperands(); 2084 // Operands with the same APO can be reordered. We therefore need to count 2085 // how many of them we have for each APO, like this: Cnt[APO] = x. 2086 // Since we only have two APOs, namely true and false, we can avoid using 2087 // a map. Instead we can simply count the number of operands that 2088 // correspond to one of them (in this case the 'true' APO), and calculate 2089 // the other by subtracting it from the total number of operands. 2090 // Operands with the same instruction opcode and parent are more 2091 // profitable since we don't need to move them in many cases, with a high 2092 // probability such lane already can be vectorized effectively. 2093 bool AllUndefs = true; 2094 unsigned NumOpsWithSameOpcodeParent = 0; 2095 Instruction *OpcodeI = nullptr; 2096 BasicBlock *Parent = nullptr; 2097 unsigned Hash = 0; 2098 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2099 const OperandData &OpData = getData(OpIdx, Lane); 2100 if (OpData.APO) 2101 ++CntTrue; 2102 // Use Boyer-Moore majority voting for finding the majority opcode and 2103 // the number of times it occurs. 
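// Illustrative example of the vote below: for operand opcodes
// [add, add, sub, add] in the same block, the counter evolves 1, 2, 1, 2
// and 'add' survives as the majority candidate.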
2104 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2105 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2106 I->getParent() != Parent) {
2107 if (NumOpsWithSameOpcodeParent == 0) {
2108 NumOpsWithSameOpcodeParent = 1;
2109 OpcodeI = I;
2110 Parent = I->getParent();
2111 } else {
2112 --NumOpsWithSameOpcodeParent;
2113 }
2114 } else {
2115 ++NumOpsWithSameOpcodeParent;
2116 }
2117 }
2118 Hash = hash_combine(
2119 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2121 }
2122 if (AllUndefs)
2123 return {};
2124 OperandsOrderData Data;
2125 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2126 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127 Data.Hash = Hash;
2128 return Data;
2129 }
2130
2131 /// Go through the instructions in VL and append their operands.
2132 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133 assert(!VL.empty() && "Bad VL");
2134 assert((empty() || VL.size() == getNumLanes()) &&
2135 "Expected same number of lanes");
2136 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2138 constexpr unsigned IntrinsicNumOperands = 2;
2139 if (isa<IntrinsicInst>(VL[0]))
2140 NumOperands = IntrinsicNumOperands;
2141 OpsVec.resize(NumOperands);
2142 unsigned NumLanes = VL.size();
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 OpsVec[OpIdx].resize(NumLanes);
2145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147 // Our tree has just 3 nodes: the root and two operands.
2148 // It is therefore trivial to get the APO. We only need to check the
2149 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150 // RHS operand. The LHS operand of both add and sub is never attached
2151 // to an inverse operation in the linearized form, therefore its APO
2152 // is false. The RHS operand's APO is true only if VL[Lane] is an
2152 // inverse operation.
2153
2154 // Since operand reordering is performed on groups of commutative
2155 // operations or alternating sequences (e.g., +, -), we can safely
2156 // tell the inverse operations by checking commutativity.
2157 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2158 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2160 APO, false};
2161 }
2162 }
2163 }
2164
2165 /// \returns the number of operands.
2166 unsigned getNumOperands() const { return OpsVec.size(); }
2167
2168 /// \returns the number of lanes.
2169 unsigned getNumLanes() const { return OpsVec[0].size(); }
2170
2171 /// \returns the operand value at \p OpIdx and \p Lane.
2172 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173 return getData(OpIdx, Lane).V;
2174 }
2175
2176 /// \returns true if the data structure is empty.
2177 bool empty() const { return OpsVec.empty(); }
2178
2179 /// Clears the data.
2180 void clear() { OpsVec.clear(); }
2181
2182 /// \returns true if there are enough operands identical to \p Op to fill
2183 /// the whole vector (possibly mixed with constants or loop-invariant
2183 /// values).
2184 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
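/// For example (an illustrative case, roughly matching the rules in the
/// body): with 4 lanes whose corresponding operand values are {x, x, x, 7},
/// broadcasting x is considered worthwhile because two other lanes already
/// contain x and the constant 7 can be folded into a cheap permutation of
/// the broadcast.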
2185 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186 bool OpAPO = getData(OpIdx, Lane).APO;
2187 bool IsInvariant = L && L->isLoopInvariant(Op);
2188 unsigned Cnt = 0;
2189 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190 if (Ln == Lane)
2191 continue;
2192 // Set to true if we find, in lane Ln, a candidate that supports the
2192 // broadcast.
2193 bool FoundCandidate = false;
2194 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195 OperandData &Data = getData(OpI, Ln);
2196 if (Data.APO != OpAPO || Data.IsUsed)
2197 continue;
2198 Value *OpILane = getValue(OpI, Lane);
2199 bool IsConstantOp = isa<Constant>(OpILane);
2200 // Consider the broadcast candidate if:
2201 // 1. Same value is found in one of the operands.
2202 if (Data.V == Op ||
2203 // 2. The operand in the given lane is not constant but there is a
2204 // constant operand in another lane (which can be moved to the
2205 // given lane). In this case we can represent it as a simple
2206 // permutation of constant and broadcast.
2207 (!IsConstantOp &&
2208 ((Lns > 2 && isa<Constant>(Data.V)) ||
2209 // 2.1. If we have only 2 lanes, need to check that value in the
2210 // next lane does not build the same opcode sequence.
2211 (Lns == 2 &&
2212 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2213 .getOpcode() &&
2214 isa<Constant>(Data.V)))) ||
2215 // 3. The operand in the current lane is loop invariant (can be
2216 // hoisted out) and another operand is also a loop invariant
2217 // (though not a constant). In this case the whole vector can be
2218 // hoisted out.
2219 // FIXME: need to teach the cost model about this case for better
2220 // estimation.
2221 (IsInvariant && !isa<Constant>(Data.V) &&
2222 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2223 L->isLoopInvariant(Data.V))) {
2224 FoundCandidate = true;
2225 Data.IsUsed = Data.V == Op;
2226 if (Data.V == Op)
2227 ++Cnt;
2228 break;
2229 }
2230 }
2231 if (!FoundCandidate)
2232 return false;
2233 }
2234 return getNumLanes() == 2 || Cnt > 1;
2235 }
2236
2237 /// Checks if there is at least a single operand in lanes other than
2238 /// \p Lane that is compatible with the operand \p Op.
2239 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240 bool OpAPO = getData(OpIdx, Lane).APO;
2241 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242 if (Ln == Lane)
2243 continue;
2244 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2245 const OperandData &Data = getData(OpI, Ln);
2246 if (Data.APO != OpAPO || Data.IsUsed)
2247 return true;
2248 Value *OpILn = getValue(OpI, Ln);
2249 return (L && L->isLoopInvariant(OpILn)) ||
2250 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2251 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2252 }))
2253 return true;
2254 }
2255 return false;
2256 }
2257
2258 public:
2259 /// Initialize with all the operands of the instruction vector \p RootVL.
2260 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262 L(R.LI->getLoopFor(
2263 (cast<Instruction>(RootVL.front())->getParent()))) {
2264 // Append all the operands of RootVL.
2265 appendOperandsOfVL(RootVL);
2266 }
2267
2268 /// \returns a value vector with the operands across all lanes for the
2269 /// operand at \p OpIdx.
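/// For example (illustrative): for the two-lane bundle {a0 + b0, a1 + b1},
/// getVL(0) returns {a0, a1} and getVL(1) returns {b0, b1}.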
2270 ValueList getVL(unsigned OpIdx) const { 2271 ValueList OpVL(OpsVec[OpIdx].size()); 2272 assert(OpsVec[OpIdx].size() == getNumLanes() && 2273 "Expected same num of lanes across all operands"); 2274 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) 2275 OpVL[Lane] = OpsVec[OpIdx][Lane].V; 2276 return OpVL; 2277 } 2278 2279 // Performs operand reordering for 2 or more operands. 2280 // The original operands are in OrigOps[OpIdx][Lane]. 2281 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. 2282 void reorder() { 2283 unsigned NumOperands = getNumOperands(); 2284 unsigned NumLanes = getNumLanes(); 2285 // Each operand has its own mode. We are using this mode to help us select 2286 // the instructions for each lane, so that they match best with the ones 2287 // we have selected so far. 2288 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); 2289 2290 // This is a greedy single-pass algorithm. We are going over each lane 2291 // once and deciding on the best order right away with no back-tracking. 2292 // However, in order to increase its effectiveness, we start with the lane 2293 // that has operands that can move the least. For example, given the 2294 // following lanes: 2295 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd 2296 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st 2297 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd 2298 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th 2299 // we will start at Lane 1, since the operands of the subtraction cannot 2300 // be reordered. Then we will visit the rest of the lanes in a circular 2301 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. 2302 2303 // Find the first lane that we will start our search from. 2304 unsigned FirstLane = getBestLaneToStartReordering(); 2305 2306 // Initialize the modes. 2307 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { 2308 Value *OpLane0 = getValue(OpIdx, FirstLane); 2309 // Keep track if we have instructions with all the same opcode on one 2310 // side. 2311 if (isa<LoadInst>(OpLane0)) 2312 ReorderingModes[OpIdx] = ReorderingMode::Load; 2313 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) { 2314 // Check if OpLane0 should be broadcast. 2315 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) || 2316 !canBeVectorized(OpILane0, OpIdx, FirstLane)) 2317 ReorderingModes[OpIdx] = ReorderingMode::Splat; 2318 else 2319 ReorderingModes[OpIdx] = ReorderingMode::Opcode; 2320 } else if (isa<Constant>(OpLane0)) 2321 ReorderingModes[OpIdx] = ReorderingMode::Constant; 2322 else if (isa<Argument>(OpLane0)) 2323 // Our best hope is a Splat. It may save some cost in some cases. 2324 ReorderingModes[OpIdx] = ReorderingMode::Splat; 2325 else 2326 // NOTE: This should be unreachable. 2327 ReorderingModes[OpIdx] = ReorderingMode::Failed; 2328 } 2329 2330 // Check that we don't have same operands. No need to reorder if operands 2331 // are just perfect diamond or shuffled diamond match. Do not do it only 2332 // for possible broadcasts or non-power of 2 number of scalars (just for 2333 // now). 
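// Illustrative example for the check implemented by the lambda below: for
// the 4-lane bundle {A+B, B+A, C+D, D+C} every operand column draws from
// the same four values {A, B, C, D} (a power of 2 and more than 2 values),
// so reordering is skipped; for {A+B, B+A} only two unique values remain,
// which may be a broadcast opportunity, so reordering is still attempted.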
2334 auto &&SkipReordering = [this]() {
2335 SmallPtrSet<Value *, 4> UniqueValues;
2336 ArrayRef<OperandData> Op0 = OpsVec.front();
2337 for (const OperandData &Data : Op0)
2338 UniqueValues.insert(Data.V);
2339 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2340 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2341 return !UniqueValues.contains(Data.V);
2342 }))
2343 return false;
2344 }
2345 // TODO: Check if we can remove a check for non-power-2 number of
2346 // scalars after full support of non-power-2 vectorization.
2347 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2348 };
2349
2350 // If the initial strategy fails for any of the operand indexes, then we
2351 // perform reordering again in a second pass. This helps avoid assigning
2352 // high priority to the failed strategy, and should improve reordering for
2353 // the non-failed operand indexes.
2354 for (int Pass = 0; Pass != 2; ++Pass) {
2355 // Check if there is no need to reorder operands because they are a
2356 // perfect or shuffled diamond match.
2357 // Need to do it to avoid extra external use cost counting for
2358 // shuffled matches, which may cause regressions.
2359 if (SkipReordering())
2360 break;
2361 // Skip the second pass if the first pass did not fail.
2362 bool StrategyFailed = false;
2363 // Mark all operand data as free to use.
2364 clearUsed();
2365 // We keep the original operand order for the FirstLane, so reorder the
2366 // rest of the lanes. We are visiting the nodes in a circular fashion,
2367 // using FirstLane as the center point and increasing the radius
2368 // distance.
2369 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370 for (unsigned I = 0; I < NumOperands; ++I)
2371 MainAltOps[I].push_back(getData(I, FirstLane).V);
2372
2373 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374 // Visit the lane on the right and then the lane on the left.
2375 for (int Direction : {+1, -1}) {
2376 int Lane = FirstLane + Direction * Distance;
2377 if (Lane < 0 || Lane >= (int)NumLanes)
2378 continue;
2379 int LastLane = Lane - Direction;
2380 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381 "Out of bounds");
2382 // Look for a good match for each operand.
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2385 std::optional<unsigned> BestIdx = getBestOperand(
2386 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2387 // By not selecting a value, we allow the operands that follow to
2388 // select a better matching value. We will get a non-null value in
2389 // the next run of getBestOperand().
2390 if (BestIdx) {
2391 // Swap the current operand with the one returned by
2392 // getBestOperand().
2393 swap(OpIdx, *BestIdx, Lane);
2394 } else {
2395 // Enable the second pass.
2396 StrategyFailed = true;
2397 }
2398 // Try to get the alternate opcode and follow it during analysis.
2399 if (MainAltOps[OpIdx].size() != 2) {
2400 OperandData &AltOp = getData(OpIdx, Lane);
2401 InstructionsState OpS =
2402 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403 if (OpS.getOpcode() && OpS.isAltShuffle())
2404 MainAltOps[OpIdx].push_back(AltOp.V);
2405 }
2406 }
2407 }
2408 }
2409 // Skip second pass if the strategy did not fail.
2410 if (!StrategyFailed) 2411 break; 2412 } 2413 } 2414 2415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2416 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { 2417 switch (RMode) { 2418 case ReorderingMode::Load: 2419 return "Load"; 2420 case ReorderingMode::Opcode: 2421 return "Opcode"; 2422 case ReorderingMode::Constant: 2423 return "Constant"; 2424 case ReorderingMode::Splat: 2425 return "Splat"; 2426 case ReorderingMode::Failed: 2427 return "Failed"; 2428 } 2429 llvm_unreachable("Unimplemented Reordering Type"); 2430 } 2431 2432 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, 2433 raw_ostream &OS) { 2434 return OS << getModeStr(RMode); 2435 } 2436 2437 /// Debug print. 2438 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { 2439 printMode(RMode, dbgs()); 2440 } 2441 2442 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { 2443 return printMode(RMode, OS); 2444 } 2445 2446 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { 2447 const unsigned Indent = 2; 2448 unsigned Cnt = 0; 2449 for (const OperandDataVec &OpDataVec : OpsVec) { 2450 OS << "Operand " << Cnt++ << "\n"; 2451 for (const OperandData &OpData : OpDataVec) { 2452 OS.indent(Indent) << "{"; 2453 if (Value *V = OpData.V) 2454 OS << *V; 2455 else 2456 OS << "null"; 2457 OS << ", APO:" << OpData.APO << "}\n"; 2458 } 2459 OS << "\n"; 2460 } 2461 return OS; 2462 } 2463 2464 /// Debug print. 2465 LLVM_DUMP_METHOD void dump() const { print(dbgs()); } 2466 #endif 2467 }; 2468 2469 /// Evaluate each pair in \p Candidates and return index into \p Candidates 2470 /// for a pair which have highest score deemed to have best chance to form 2471 /// root of profitable tree to vectorize. Return std::nullopt if no candidate 2472 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit 2473 /// of the cost, considered to be good enough score. 2474 std::optional<int> 2475 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, 2476 int Limit = LookAheadHeuristics::ScoreFail) const { 2477 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, 2478 RootLookAheadMaxDepth); 2479 int BestScore = Limit; 2480 std::optional<int> Index; 2481 for (int I : seq<int>(0, Candidates.size())) { 2482 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, 2483 Candidates[I].second, 2484 /*U1=*/nullptr, /*U2=*/nullptr, 2485 /*Level=*/1, std::nullopt); 2486 if (Score > BestScore) { 2487 BestScore = Score; 2488 Index = I; 2489 } 2490 } 2491 return Index; 2492 } 2493 2494 /// Checks if the instruction is marked for deletion. 2495 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } 2496 2497 /// Removes an instruction from its block and eventually deletes it. 2498 /// It's like Instruction::eraseFromParent() except that the actual deletion 2499 /// is delayed until BoUpSLP is destructed. 2500 void eraseInstruction(Instruction *I) { 2501 DeletedInstructions.insert(I); 2502 } 2503 2504 /// Remove instructions from the parent function and clear the operands of \p 2505 /// DeadVals instructions, marking for deletion trivially dead operands. 
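/// For example (an illustrative scenario): when a vectorized chain of adds
/// is erased, a load feeding only one of those adds becomes trivially dead
/// and is queued for deletion as well, unless it is itself the vectorized
/// value of one of the affected tree entries.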
2506 template <typename T> 2507 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) { 2508 SmallVector<WeakTrackingVH> DeadInsts; 2509 for (T *V : DeadVals) { 2510 auto *I = cast<Instruction>(V); 2511 DeletedInstructions.insert(I); 2512 } 2513 DenseSet<Value *> Processed; 2514 for (T *V : DeadVals) { 2515 if (!V || !Processed.insert(V).second) 2516 continue; 2517 auto *I = cast<Instruction>(V); 2518 salvageDebugInfo(*I); 2519 SmallVector<const TreeEntry *> Entries; 2520 if (const TreeEntry *Entry = getTreeEntry(I)) { 2521 Entries.push_back(Entry); 2522 auto It = MultiNodeScalars.find(I); 2523 if (It != MultiNodeScalars.end()) 2524 Entries.append(It->second.begin(), It->second.end()); 2525 } 2526 for (Use &U : I->operands()) { 2527 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get()); 2528 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() && 2529 wouldInstructionBeTriviallyDead(OpI, TLI) && 2530 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) { 2531 return Entry->VectorizedValue == OpI; 2532 }))) 2533 DeadInsts.push_back(OpI); 2534 } 2535 I->dropAllReferences(); 2536 } 2537 for (T *V : DeadVals) { 2538 auto *I = cast<Instruction>(V); 2539 if (!I->getParent()) 2540 continue; 2541 assert((I->use_empty() || all_of(I->uses(), 2542 [&](Use &U) { 2543 return isDeleted( 2544 cast<Instruction>(U.getUser())); 2545 })) && 2546 "trying to erase instruction with users."); 2547 I->removeFromParent(); 2548 SE->forgetValue(I); 2549 } 2550 // Process the dead instruction list until empty. 2551 while (!DeadInsts.empty()) { 2552 Value *V = DeadInsts.pop_back_val(); 2553 Instruction *VI = cast_or_null<Instruction>(V); 2554 if (!VI || !VI->getParent()) 2555 continue; 2556 assert(isInstructionTriviallyDead(VI, TLI) && 2557 "Live instruction found in dead worklist!"); 2558 assert(VI->use_empty() && "Instructions with uses are not dead."); 2559 2560 // Don't lose the debug info while deleting the instructions. 2561 salvageDebugInfo(*VI); 2562 2563 // Null out all of the instruction's operands to see if any operand 2564 // becomes dead as we go. 2565 for (Use &OpU : VI->operands()) { 2566 Value *OpV = OpU.get(); 2567 if (!OpV) 2568 continue; 2569 OpU.set(nullptr); 2570 2571 if (!OpV->use_empty()) 2572 continue; 2573 2574 // If the operand is an instruction that became dead as we nulled out 2575 // the operand, and if it is 'trivially' dead, delete it in a future 2576 // loop iteration. 2577 if (auto *OpI = dyn_cast<Instruction>(OpV)) 2578 if (!DeletedInstructions.contains(OpI) && 2579 isInstructionTriviallyDead(OpI, TLI)) 2580 DeadInsts.push_back(OpI); 2581 } 2582 2583 VI->removeFromParent(); 2584 DeletedInstructions.insert(VI); 2585 SE->forgetValue(VI); 2586 } 2587 } 2588 2589 /// Checks if the instruction was already analyzed for being possible 2590 /// reduction root. 2591 bool isAnalyzedReductionRoot(Instruction *I) const { 2592 return AnalyzedReductionsRoots.count(I); 2593 } 2594 /// Register given instruction as already analyzed for being possible 2595 /// reduction root. 2596 void analyzedReductionRoot(Instruction *I) { 2597 AnalyzedReductionsRoots.insert(I); 2598 } 2599 /// Checks if the provided list of reduced values was checked already for 2600 /// vectorization. 2601 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const { 2602 return AnalyzedReductionVals.contains(hash_value(VL)); 2603 } 2604 /// Adds the list of reduced values to list of already checked values for the 2605 /// vectorization. 
2606 void analyzedReductionVals(ArrayRef<Value *> VL) {
2607 AnalyzedReductionVals.insert(hash_value(VL));
2608 }
2609 /// Clear the list of the analyzed reduction root instructions.
2610 void clearReductionData() {
2611 AnalyzedReductionsRoots.clear();
2612 AnalyzedReductionVals.clear();
2613 AnalyzedMinBWVals.clear();
2614 }
2615 /// Checks if the given value is gathered in one of the nodes.
2616 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2618 }
2619 /// Checks if the given value is gathered in one of the nodes.
2620 bool isGathered(const Value *V) const {
2621 return MustGather.contains(V);
2622 }
2623 /// Checks if the specified value was not scheduled.
2624 bool isNotScheduled(const Value *V) const {
2625 return NonScheduledFirst.contains(V);
2626 }
2627
2628 /// Check if the value is vectorized in the tree.
2629 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630
2631 ~BoUpSLP();
2632
2633 private:
2634 /// Determine if a node \p E can be demoted to a smaller type with a
2635 /// truncation. We collect the entries that will be demoted in ToDemote.
2636 /// \param E Node for analysis
2637 /// \param ToDemote indices of the nodes to be demoted.
2638 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639 unsigned &BitWidth,
2640 SmallVectorImpl<unsigned> &ToDemote,
2641 DenseSet<const TreeEntry *> &Visited,
2642 unsigned &MaxDepthLevel,
2643 bool &IsProfitableToDemote,
2644 bool IsTruncRoot) const;
2645
2646 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2647 /// reordering (i.e., the operands can be reordered because they have only
2647 /// one user and are reorderable).
2648 /// \param ReorderableGathers List of all gather nodes that require reordering
2650 /// (e.g., gather of extractelements or partially vectorizable loads).
2651 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652 /// reordering, subset of \p NonVectorized.
2653 bool
2654 canReorderOperands(TreeEntry *UserTE,
2655 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656 ArrayRef<TreeEntry *> ReorderableGathers,
2657 SmallVectorImpl<TreeEntry *> &GatherOps);
2658
2659 /// Checks if the given \p TE is a gather node with clustered reused scalars
2660 /// and reorders it per given \p Mask.
2661 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662
2663 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664 /// if any. If it is not vectorized (gather node), returns nullptr.
2665 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667 TreeEntry *TE = nullptr;
2668 const auto *It = find_if(VL, [&](Value *V) {
2669 TE = getTreeEntry(V);
2670 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2671 return true;
2672 auto It = MultiNodeScalars.find(V);
2673 if (It != MultiNodeScalars.end()) {
2674 for (TreeEntry *E : It->second) {
2675 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2676 TE = E;
2677 return true;
2678 }
2679 }
2680 }
2681 return false;
2682 });
2683 if (It != VL.end()) {
2684 assert(TE->isSame(VL) && "Expected same scalars.");
2685 return TE;
2686 }
2687 return nullptr;
2688 }
2689
2690 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691 /// if any. If it is not vectorized (gather node), returns nullptr.
2692 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693 unsigned OpIdx) const {
2694 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695 const_cast<TreeEntry *>(UserTE), OpIdx);
2696 }
2697
2698 /// Checks if all users of \p I are part of the vectorization tree.
2699 bool areAllUsersVectorized(
2700 Instruction *I,
2701 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702
2703 /// Return information about the vector formed for the specified index
2704 /// of a vector of (the same) instruction.
2705 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706
2707 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709
2710 /// \returns Cast context for the given graph node.
2711 TargetTransformInfo::CastContextHint
2712 getCastContextHint(const TreeEntry &TE) const;
2713
2714 /// \returns the cost of the vectorizable entry.
2715 InstructionCost getEntryCost(const TreeEntry *E,
2716 ArrayRef<Value *> VectorizedVals,
2717 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718
2719 /// This is the recursive part of buildTree.
2720 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721 const EdgeInfo &EI);
2722
2723 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726 /// returns false, setting \p CurrentOrder to either an empty vector or a
2727 /// non-identity permutation that allows reusing the extract instructions.
2728 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729 /// extract order.
2730 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731 SmallVectorImpl<unsigned> &CurrentOrder,
2732 bool ResizeAllowed = false) const;
2733
2734 /// Vectorize a single entry in the tree.
2735 /// \param PostponedPHIs true if phi node emission needs to be postponed to
2736 /// avoid issues with def-use order.
2737 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738
2739 /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
2740 /// entry \p E.
2741 /// \param PostponedPHIs true if phi node emission needs to be postponed to
2742 /// avoid issues with def-use order.
2743 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744
2745 /// Create a new vector from a list of scalar values. Produces a sequence
2746 /// which exploits values reused across lanes, and arranges the inserts
2747 /// for ease of later optimization.
2748 template <typename BVTy, typename ResTy, typename... Args>
2749 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750
2751 /// Create a new vector from a list of scalar values. Produces a sequence
2752 /// which exploits values reused across lanes, and arranges the inserts
2753 /// for ease of later optimization.
2754 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755
2756 /// Returns the instruction in the bundle, which can be used as a base point
2757 /// for scheduling. Usually it is the last instruction in the bundle, except
2758 /// for the case when all operands are external (in this case, it is the first
2759 /// instruction in the list).
2760 Instruction &getLastInstructionInBundle(const TreeEntry *E); 2761 2762 /// Tries to find extractelement instructions with constant indices from fixed 2763 /// vector type and gather such instructions into a bunch, which highly likely 2764 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 2765 /// was successful, the matched scalars are replaced by poison values in \p VL 2766 /// for future analysis. 2767 std::optional<TargetTransformInfo::ShuffleKind> 2768 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, 2769 SmallVectorImpl<int> &Mask) const; 2770 2771 /// Tries to find extractelement instructions with constant indices from fixed 2772 /// vector type and gather such instructions into a bunch, which highly likely 2773 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 2774 /// was successful, the matched scalars are replaced by poison values in \p VL 2775 /// for future analysis. 2776 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 2777 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, 2778 SmallVectorImpl<int> &Mask, 2779 unsigned NumParts) const; 2780 2781 /// Checks if the gathered \p VL can be represented as a single register 2782 /// shuffle(s) of previous tree entries. 2783 /// \param TE Tree entry checked for permutation. 2784 /// \param VL List of scalars (a subset of the TE scalar), checked for 2785 /// permutations. Must form single-register vector. 2786 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 2787 /// commands to build the mask using the original vector value, without 2788 /// relying on the potential reordering. 2789 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 2790 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 2791 std::optional<TargetTransformInfo::ShuffleKind> 2792 isGatherShuffledSingleRegisterEntry( 2793 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, 2794 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, 2795 bool ForOrder); 2796 2797 /// Checks if the gathered \p VL can be represented as multi-register 2798 /// shuffle(s) of previous tree entries. 2799 /// \param TE Tree entry checked for permutation. 2800 /// \param VL List of scalars (a subset of the TE scalar), checked for 2801 /// permutations. 2802 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 2803 /// commands to build the mask using the original vector value, without 2804 /// relying on the potential reordering. 2805 /// \returns per-register series of ShuffleKind, if gathered values can be 2806 /// represented as shuffles of previous tree entries. \p Mask is filled with 2807 /// the shuffle mask (also on per-register base). 2808 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 2809 isGatherShuffledEntry( 2810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, 2811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, 2812 unsigned NumParts, bool ForOrder = false); 2813 2814 /// \returns the scalarization cost for this list of values. Assuming that 2815 /// this subtree gets vectorized, we may need to extract the values from the 2816 /// roots. This method calculates the cost of extracting the values. 2817 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 
2818 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, 2819 Type *ScalarTy) const; 2820 2821 /// Set the Builder insert point to one after the last instruction in 2822 /// the bundle 2823 void setInsertPointAfterBundle(const TreeEntry *E); 2824 2825 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 2826 /// specified, the starting vector value is poison. 2827 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy); 2828 2829 /// \returns whether the VectorizableTree is fully vectorizable and will 2830 /// be beneficial even the tree height is tiny. 2831 bool isFullyVectorizableTinyTree(bool ForReduction) const; 2832 2833 /// Reorder commutative or alt operands to get better probability of 2834 /// generating vectorized code. 2835 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 2836 SmallVectorImpl<Value *> &Left, 2837 SmallVectorImpl<Value *> &Right, 2838 const BoUpSLP &R); 2839 2840 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 2841 /// users of \p TE and collects the stores. It returns the map from the store 2842 /// pointers to the collected stores. 2843 DenseMap<Value *, SmallVector<StoreInst *>> 2844 collectUserStores(const BoUpSLP::TreeEntry *TE) const; 2845 2846 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 2847 /// stores in \p StoresVec can form a vector instruction. If so it returns 2848 /// true and populates \p ReorderIndices with the shuffle indices of the 2849 /// stores when compared to the sorted vector. 2850 bool canFormVector(ArrayRef<StoreInst *> StoresVec, 2851 OrdersType &ReorderIndices) const; 2852 2853 /// Iterates through the users of \p TE, looking for scalar stores that can be 2854 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 2855 /// their order and builds an order index vector for each store bundle. It 2856 /// returns all these order vectors found. 2857 /// We run this after the tree has formed, otherwise we may come across user 2858 /// instructions that are not yet in the tree. 2859 SmallVector<OrdersType, 1> 2860 findExternalStoreUsersReorderIndices(TreeEntry *TE) const; 2861 2862 struct TreeEntry { 2863 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; 2864 TreeEntry(VecTreeTy &Container) : Container(Container) {} 2865 2866 /// \returns Common mask for reorder indices and reused scalars. 2867 SmallVector<int> getCommonMask() const { 2868 SmallVector<int> Mask; 2869 inversePermutation(ReorderIndices, Mask); 2870 ::addMask(Mask, ReuseShuffleIndices); 2871 return Mask; 2872 } 2873 2874 /// \returns true if the scalars in VL are equal to this entry. 2875 bool isSame(ArrayRef<Value *> VL) const { 2876 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { 2877 if (Mask.size() != VL.size() && VL.size() == Scalars.size()) 2878 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 2879 return VL.size() == Mask.size() && 2880 std::equal(VL.begin(), VL.end(), Mask.begin(), 2881 [Scalars](Value *V, int Idx) { 2882 return (isa<UndefValue>(V) && 2883 Idx == PoisonMaskElem) || 2884 (Idx != PoisonMaskElem && V == Scalars[Idx]); 2885 }); 2886 }; 2887 if (!ReorderIndices.empty()) { 2888 // TODO: implement matching if the nodes are just reordered, still can 2889 // treat the vector as the same if the list of scalars matches VL 2890 // directly, without reordering. 
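// Illustrative walk-through (hypothetical values, assuming inversePermutation
// sets Mask[ReorderIndices[I]] = I): for Scalars = {a, b, c, d} and
// ReorderIndices = {1, 2, 3, 0}, the computed Mask is {3, 0, 1, 2}, so IsSame
// accepts VL = {d, a, b, c}, i.e. each VL[I] must match Scalars[Mask[I]] (or
// be undef where the mask element is PoisonMaskElem).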
2891 SmallVector<int> Mask; 2892 inversePermutation(ReorderIndices, Mask); 2893 if (VL.size() == Scalars.size()) 2894 return IsSame(Scalars, Mask); 2895 if (VL.size() == ReuseShuffleIndices.size()) { 2896 ::addMask(Mask, ReuseShuffleIndices); 2897 return IsSame(Scalars, Mask); 2898 } 2899 return false; 2900 } 2901 return IsSame(Scalars, ReuseShuffleIndices); 2902 } 2903 2904 bool isOperandGatherNode(const EdgeInfo &UserEI) const { 2905 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && 2906 UserTreeIndices.front().UserTE == UserEI.UserTE; 2907 } 2908 2909 /// \returns true if current entry has same operands as \p TE. 2910 bool hasEqualOperands(const TreeEntry &TE) const { 2911 if (TE.getNumOperands() != getNumOperands()) 2912 return false; 2913 SmallBitVector Used(getNumOperands()); 2914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { 2915 unsigned PrevCount = Used.count(); 2916 for (unsigned K = 0; K < E; ++K) { 2917 if (Used.test(K)) 2918 continue; 2919 if (getOperand(K) == TE.getOperand(I)) { 2920 Used.set(K); 2921 break; 2922 } 2923 } 2924 // Check if we actually found the matching operand. 2925 if (PrevCount == Used.count()) 2926 return false; 2927 } 2928 return true; 2929 } 2930 2931 /// \return Final vectorization factor for the node. Defined by the total 2932 /// number of vectorized scalars, including those, used several times in the 2933 /// entry and counted in the \a ReuseShuffleIndices, if any. 2934 unsigned getVectorFactor() const { 2935 if (!ReuseShuffleIndices.empty()) 2936 return ReuseShuffleIndices.size(); 2937 return Scalars.size(); 2938 }; 2939 2940 /// Checks if the current node is a gather node. 2941 bool isGather() const {return State == NeedToGather; } 2942 2943 /// A vector of scalars. 2944 ValueList Scalars; 2945 2946 /// The Scalars are vectorized into this value. It is initialized to Null. 2947 WeakTrackingVH VectorizedValue = nullptr; 2948 2949 /// New vector phi instructions emitted for the vectorized phi nodes. 2950 PHINode *PHI = nullptr; 2951 2952 /// Do we need to gather this sequence or vectorize it 2953 /// (either with vector instruction or with scatter/gather 2954 /// intrinsics for store/load)? 2955 enum EntryState { 2956 Vectorize, 2957 ScatterVectorize, 2958 StridedVectorize, 2959 NeedToGather 2960 }; 2961 EntryState State; 2962 2963 /// Does this sequence require some shuffling? 2964 SmallVector<int, 4> ReuseShuffleIndices; 2965 2966 /// Does this entry require reordering? 2967 SmallVector<unsigned, 4> ReorderIndices; 2968 2969 /// Points back to the VectorizableTree. 2970 /// 2971 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 2972 /// to be a pointer and needs to be able to initialize the child iterator. 2973 /// Thus we need a reference back to the container to translate the indices 2974 /// to entries. 2975 VecTreeTy &Container; 2976 2977 /// The TreeEntry index containing the user of this entry. We can actually 2978 /// have multiple users so the data structure is not truly a tree. 2979 SmallVector<EdgeInfo, 1> UserTreeIndices; 2980 2981 /// The index of this treeEntry in VectorizableTree. 2982 int Idx = -1; 2983 2984 private: 2985 /// The operands of each instruction in each lane Operands[op_index][lane]. 2986 /// Note: This helps avoid the replication of the code that performs the 2987 /// reordering of operands during buildTree_rec() and vectorizeTree(). 2988 SmallVector<ValueList, 2> Operands; 2989 2990 /// The main/alternate instruction. 
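/// For illustration: in an alternate-opcode bundle such as
/// {add, sub, add, sub}, one of the two opcodes is recorded as the main
/// instruction and the other as the alternate one (see isAltShuffle() and
/// buildAltOpShuffleMask()).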
2991 Instruction *MainOp = nullptr; 2992 Instruction *AltOp = nullptr; 2993 2994 public: 2995 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 2996 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { 2997 if (Operands.size() < OpIdx + 1) 2998 Operands.resize(OpIdx + 1); 2999 assert(Operands[OpIdx].empty() && "Already resized?"); 3000 assert(OpVL.size() <= Scalars.size() && 3001 "Number of operands is greater than the number of scalars."); 3002 Operands[OpIdx].resize(OpVL.size()); 3003 copy(OpVL, Operands[OpIdx].begin()); 3004 } 3005 3006 /// Set the operands of this bundle in their original order. 3007 void setOperandsInOrder() { 3008 assert(Operands.empty() && "Already initialized?"); 3009 auto *I0 = cast<Instruction>(Scalars[0]); 3010 Operands.resize(I0->getNumOperands()); 3011 unsigned NumLanes = Scalars.size(); 3012 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); 3013 OpIdx != NumOperands; ++OpIdx) { 3014 Operands[OpIdx].resize(NumLanes); 3015 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3016 auto *I = cast<Instruction>(Scalars[Lane]); 3017 assert(I->getNumOperands() == NumOperands && 3018 "Expected same number of operands"); 3019 Operands[OpIdx][Lane] = I->getOperand(OpIdx); 3020 } 3021 } 3022 } 3023 3024 /// Reorders operands of the node to the given mask \p Mask. 3025 void reorderOperands(ArrayRef<int> Mask) { 3026 for (ValueList &Operand : Operands) 3027 reorderScalars(Operand, Mask); 3028 } 3029 3030 /// \returns the \p OpIdx operand of this TreeEntry. 3031 ValueList &getOperand(unsigned OpIdx) { 3032 assert(OpIdx < Operands.size() && "Off bounds"); 3033 return Operands[OpIdx]; 3034 } 3035 3036 /// \returns the \p OpIdx operand of this TreeEntry. 3037 ArrayRef<Value *> getOperand(unsigned OpIdx) const { 3038 assert(OpIdx < Operands.size() && "Off bounds"); 3039 return Operands[OpIdx]; 3040 } 3041 3042 /// \returns the number of operands. 3043 unsigned getNumOperands() const { return Operands.size(); } 3044 3045 /// \return the single \p OpIdx operand. 3046 Value *getSingleOperand(unsigned OpIdx) const { 3047 assert(OpIdx < Operands.size() && "Off bounds"); 3048 assert(!Operands[OpIdx].empty() && "No operand available"); 3049 return Operands[OpIdx][0]; 3050 } 3051 3052 /// Some of the instructions in the list have alternate opcodes. 3053 bool isAltShuffle() const { return MainOp != AltOp; } 3054 3055 bool isOpcodeOrAlt(Instruction *I) const { 3056 unsigned CheckedOpcode = I->getOpcode(); 3057 return (getOpcode() == CheckedOpcode || 3058 getAltOpcode() == CheckedOpcode); 3059 } 3060 3061 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3062 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3063 /// \p OpValue. 3064 Value *isOneOf(Value *Op) const { 3065 auto *I = dyn_cast<Instruction>(Op); 3066 if (I && isOpcodeOrAlt(I)) 3067 return Op; 3068 return MainOp; 3069 } 3070 3071 void setOperations(const InstructionsState &S) { 3072 MainOp = S.MainOp; 3073 AltOp = S.AltOp; 3074 } 3075 3076 Instruction *getMainOp() const { 3077 return MainOp; 3078 } 3079 3080 Instruction *getAltOp() const { 3081 return AltOp; 3082 } 3083 3084 /// The main/alternate opcodes for the list of instructions. 3085 unsigned getOpcode() const { 3086 return MainOp ? MainOp->getOpcode() : 0; 3087 } 3088 3089 unsigned getAltOpcode() const { 3090 return AltOp ? AltOp->getOpcode() : 0; 3091 } 3092 3093 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3094 /// V within vector of Scalars. 
Otherwise, try to remap on its reuse index. 3095 int findLaneForValue(Value *V) const { 3096 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); 3097 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 3098 if (!ReorderIndices.empty()) 3099 FoundLane = ReorderIndices[FoundLane]; 3100 assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); 3101 if (!ReuseShuffleIndices.empty()) { 3102 FoundLane = std::distance(ReuseShuffleIndices.begin(), 3103 find(ReuseShuffleIndices, FoundLane)); 3104 } 3105 return FoundLane; 3106 } 3107 3108 /// Build a shuffle mask for graph entry which represents a merge of main 3109 /// and alternate operations. 3110 void 3111 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, 3112 SmallVectorImpl<int> &Mask, 3113 SmallVectorImpl<Value *> *OpScalars = nullptr, 3114 SmallVectorImpl<Value *> *AltScalars = nullptr) const; 3115 3116 /// Return true if this is a non-power-of-2 node. 3117 bool isNonPowOf2Vec() const { 3118 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size()); 3119 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && 3120 "Reshuffling not supported with non-power-of-2 vectors yet."); 3121 return IsNonPowerOf2; 3122 } 3123 3124 #ifndef NDEBUG 3125 /// Debug printer. 3126 LLVM_DUMP_METHOD void dump() const { 3127 dbgs() << Idx << ".\n"; 3128 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { 3129 dbgs() << "Operand " << OpI << ":\n"; 3130 for (const Value *V : Operands[OpI]) 3131 dbgs().indent(2) << *V << "\n"; 3132 } 3133 dbgs() << "Scalars: \n"; 3134 for (Value *V : Scalars) 3135 dbgs().indent(2) << *V << "\n"; 3136 dbgs() << "State: "; 3137 switch (State) { 3138 case Vectorize: 3139 dbgs() << "Vectorize\n"; 3140 break; 3141 case ScatterVectorize: 3142 dbgs() << "ScatterVectorize\n"; 3143 break; 3144 case StridedVectorize: 3145 dbgs() << "StridedVectorize\n"; 3146 break; 3147 case NeedToGather: 3148 dbgs() << "NeedToGather\n"; 3149 break; 3150 } 3151 dbgs() << "MainOp: "; 3152 if (MainOp) 3153 dbgs() << *MainOp << "\n"; 3154 else 3155 dbgs() << "NULL\n"; 3156 dbgs() << "AltOp: "; 3157 if (AltOp) 3158 dbgs() << *AltOp << "\n"; 3159 else 3160 dbgs() << "NULL\n"; 3161 dbgs() << "VectorizedValue: "; 3162 if (VectorizedValue) 3163 dbgs() << *VectorizedValue << "\n"; 3164 else 3165 dbgs() << "NULL\n"; 3166 dbgs() << "ReuseShuffleIndices: "; 3167 if (ReuseShuffleIndices.empty()) 3168 dbgs() << "Empty"; 3169 else 3170 for (int ReuseIdx : ReuseShuffleIndices) 3171 dbgs() << ReuseIdx << ", "; 3172 dbgs() << "\n"; 3173 dbgs() << "ReorderIndices: "; 3174 for (unsigned ReorderIdx : ReorderIndices) 3175 dbgs() << ReorderIdx << ", "; 3176 dbgs() << "\n"; 3177 dbgs() << "UserTreeIndices: "; 3178 for (const auto &EInfo : UserTreeIndices) 3179 dbgs() << EInfo << ", "; 3180 dbgs() << "\n"; 3181 } 3182 #endif 3183 }; 3184 3185 #ifndef NDEBUG 3186 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, 3187 InstructionCost VecCost, InstructionCost ScalarCost, 3188 StringRef Banner) const { 3189 dbgs() << "SLP: " << Banner << ":\n"; 3190 E->dump(); 3191 dbgs() << "SLP: Costs:\n"; 3192 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; 3193 dbgs() << "SLP: VectorCost = " << VecCost << "\n"; 3194 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; 3195 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3196 << ReuseShuffleCost + VecCost - ScalarCost << "\n"; 3197 } 3198 #endif 3199 3200 /// Create a new VectorizableTree entry. 
3201 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 3202 std::optional<ScheduleData *> Bundle, 3203 const InstructionsState &S, 3204 const EdgeInfo &UserTreeIdx, 3205 ArrayRef<int> ReuseShuffleIndices = std::nullopt, 3206 ArrayRef<unsigned> ReorderIndices = std::nullopt) { 3207 TreeEntry::EntryState EntryState = 3208 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; 3209 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, 3210 ReuseShuffleIndices, ReorderIndices); 3211 } 3212 3213 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, 3214 TreeEntry::EntryState EntryState, 3215 std::optional<ScheduleData *> Bundle, 3216 const InstructionsState &S, 3217 const EdgeInfo &UserTreeIdx, 3218 ArrayRef<int> ReuseShuffleIndices = std::nullopt, 3219 ArrayRef<unsigned> ReorderIndices = std::nullopt) { 3220 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || 3221 (Bundle && EntryState != TreeEntry::NeedToGather)) && 3222 "Need to vectorize gather entry?"); 3223 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); 3224 TreeEntry *Last = VectorizableTree.back().get(); 3225 Last->Idx = VectorizableTree.size() - 1; 3226 Last->State = EntryState; 3227 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), 3228 ReuseShuffleIndices.end()); 3229 if (ReorderIndices.empty()) { 3230 Last->Scalars.assign(VL.begin(), VL.end()); 3231 Last->setOperations(S); 3232 } else { 3233 // Reorder scalars and build final mask. 3234 Last->Scalars.assign(VL.size(), nullptr); 3235 transform(ReorderIndices, Last->Scalars.begin(), 3236 [VL](unsigned Idx) -> Value * { 3237 if (Idx >= VL.size()) 3238 return UndefValue::get(VL.front()->getType()); 3239 return VL[Idx]; 3240 }); 3241 InstructionsState S = getSameOpcode(Last->Scalars, *TLI); 3242 Last->setOperations(S); 3243 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); 3244 } 3245 if (!Last->isGather()) { 3246 for (Value *V : VL) { 3247 const TreeEntry *TE = getTreeEntry(V); 3248 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && 3249 "Scalar already in tree!"); 3250 if (TE) { 3251 if (TE != Last) 3252 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last); 3253 continue; 3254 } 3255 ScalarToTreeEntry[V] = Last; 3256 } 3257 // Update the scheduler bundle to point to this TreeEntry. 3258 ScheduleData *BundleMember = *Bundle; 3259 assert((BundleMember || isa<PHINode>(S.MainOp) || 3260 isVectorLikeInstWithConstOps(S.MainOp) || 3261 doesNotNeedToSchedule(VL)) && 3262 "Bundle and VL out of sync"); 3263 if (BundleMember) { 3264 for (Value *V : VL) { 3265 if (doesNotNeedToBeScheduled(V)) 3266 continue; 3267 if (!BundleMember) 3268 continue; 3269 BundleMember->TE = Last; 3270 BundleMember = BundleMember->NextInBundle; 3271 } 3272 } 3273 assert(!BundleMember && "Bundle and VL out of sync"); 3274 } else { 3275 // Build a map for gathered scalars to the nodes where they are used. 
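// For illustration (hypothetical values): when gathering a bundle such as
// {%a, 7, %b}, the constant is skipped below and ValueToGatherNodes is
// extended with %a -> {Last} and %b -> {Last}, where Last is this gather
// node.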
3276 bool AllConstsOrCasts = true; 3277 for (Value *V : VL) 3278 if (!isConstant(V)) { 3279 auto *I = dyn_cast<CastInst>(V); 3280 AllConstsOrCasts &= I && I->getType()->isIntegerTy(); 3281 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); 3282 } 3283 if (AllConstsOrCasts) 3284 CastMaxMinBWSizes = 3285 std::make_pair(std::numeric_limits<unsigned>::max(), 1); 3286 MustGather.insert(VL.begin(), VL.end()); 3287 } 3288 3289 if (UserTreeIdx.UserTE) { 3290 Last->UserTreeIndices.push_back(UserTreeIdx); 3291 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && 3292 "Reordering isn't implemented for non-power-of-2 nodes yet"); 3293 } 3294 return Last; 3295 } 3296 3297 /// -- Vectorization State -- 3298 /// Holds all of the tree entries. 3299 TreeEntry::VecTreeTy VectorizableTree; 3300 3301 #ifndef NDEBUG 3302 /// Debug printer. 3303 LLVM_DUMP_METHOD void dumpVectorizableTree() const { 3304 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { 3305 VectorizableTree[Id]->dump(); 3306 dbgs() << "\n"; 3307 } 3308 } 3309 #endif 3310 3311 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } 3312 3313 const TreeEntry *getTreeEntry(Value *V) const { 3314 return ScalarToTreeEntry.lookup(V); 3315 } 3316 3317 /// Check that the operand node of alternate node does not generate 3318 /// buildvector sequence. If it is, then probably not worth it to build 3319 /// alternate shuffle, if number of buildvector operands + alternate 3320 /// instruction > than the number of buildvector instructions. 3321 /// \param S the instructions state of the analyzed values. 3322 /// \param VL list of the instructions with alternate opcodes. 3323 bool areAltOperandsProfitable(const InstructionsState &S, 3324 ArrayRef<Value *> VL) const; 3325 3326 /// Checks if the specified list of the instructions/values can be vectorized 3327 /// and fills required data before actual scheduling of the instructions. 3328 TreeEntry::EntryState getScalarsVectorizationState( 3329 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, 3330 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const; 3331 3332 /// Maps a specific scalar to its tree entry. 3333 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry; 3334 3335 /// List of scalars, used in several vectorize nodes, and the list of the 3336 /// nodes. 3337 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars; 3338 3339 /// Maps a value to the proposed vectorizable size. 3340 SmallDenseMap<Value *, unsigned> InstrElementSize; 3341 3342 /// A list of scalars that we found that we need to keep as scalars. 3343 ValueSet MustGather; 3344 3345 /// A set of first non-schedulable values. 3346 ValueSet NonScheduledFirst; 3347 3348 /// A map between the vectorized entries and the last instructions in the 3349 /// bundles. The bundles are built in use order, not in the def order of the 3350 /// instructions. So, we cannot rely directly on the last instruction in the 3351 /// bundle being the last instruction in the program order during 3352 /// vectorization process since the basic blocks are affected, need to 3353 /// pre-gather them before. 3354 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction; 3355 3356 /// List of gather nodes, depending on other gather/vector nodes, which should 3357 /// be emitted after the vector instruction emission process to correctly 3358 /// handle order of the vector instructions and shuffles. 
3359 SetVector<const TreeEntry *> PostponedGathers;
3360
3361 using ValueToGatherNodesMap =
3362 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363 ValueToGatherNodesMap ValueToGatherNodes;
3364
3365 /// This POD struct describes one external user in the vectorized tree.
3366 struct ExternalUser {
3367 ExternalUser(Value *S, llvm::User *U, int L)
3368 : Scalar(S), User(U), Lane(L) {}
3369
3370 // Which scalar in our function.
3371 Value *Scalar;
3372
3373 // The user that uses the scalar.
3374 llvm::User *User;
3375
3376 // Which lane the scalar belongs to.
3377 int Lane;
3378 };
3379 using UserList = SmallVector<ExternalUser, 16>;
3380
3381 /// Checks if two instructions may access the same memory.
3382 ///
3383 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384 /// is invariant in the calling loop.
3385 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386 Instruction *Inst2) {
3387 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388 return true;
3389 // First check if the result is already in the cache.
3390 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391 auto It = AliasCache.find(Key);
3392 if (It != AliasCache.end())
3393 return It->second;
3394 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395 // Store the result in the cache for both orderings of the pair.
3396 AliasCache.try_emplace(Key, Aliased);
3397 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398 return Aliased;
3399 }
3400
3401 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402
3403 /// Cache for alias results.
3404 /// TODO: consider moving this to the AliasAnalysis itself.
3405 DenseMap<AliasCacheKey, bool> AliasCache;
3406
3407 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3408 // globally through SLP because we don't perform any action which
3409 // invalidates capture results.
3410 BatchAAResults BatchAA;
3411
3412 /// Temporary store for deleted instructions. Instructions will be deleted
3413 /// eventually when the BoUpSLP is destructed. The deferral is required to
3414 /// ensure that there are no incorrect collisions in the AliasCache, which
3415 /// can happen if a new instruction is allocated at the same address as a
3416 /// previously deleted instruction.
3417 DenseSet<Instruction *> DeletedInstructions;
3418
3419 /// Set of the instructions already analyzed for reductions.
3420 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421
3422 /// Set of hashes for the lists of reduction values already analyzed.
3423 DenseSet<size_t> AnalyzedReductionVals;
3424
3425 /// Values already analyzed for minimal bitwidth and found to be
3426 /// non-profitable.
3427 DenseSet<Value *> AnalyzedMinBWVals;
3428
3429 /// A list of values that need to be extracted out of the tree.
3430 /// This list holds pairs of (Internal Scalar : External User). External User
3431 /// can be nullptr, which means that this Internal Scalar will be used later,
3432 /// after vectorization.
3433 UserList ExternalUses;
3434
3435 /// A list of GEPs which can be replaced by scalar GEPs instead of
3436 /// extractelement instructions.
3437 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438
3439 /// Values used only by @llvm.assume calls.
3440 SmallPtrSet<const Value *, 32> EphValues;
3441
3442 /// Holds all of the instructions that we gathered, shuffle instructions and
3443 /// extractelements.
3444 SetVector<Instruction *> GatherShuffleExtractSeq;
3445
3446 /// A list of blocks that we are going to CSE.
3447 DenseSet<BasicBlock *> CSEBlocks;
3448
3449 /// Contains all scheduling relevant data for an instruction.
3450 /// A ScheduleData either represents a single instruction or a member of an
3451 /// instruction bundle (= a group of instructions which is combined into a
3452 /// vector instruction).
3453 struct ScheduleData {
3454 // The initial value for the dependency counters. It means that the
3455 // dependencies have not been calculated yet.
3456 enum { InvalidDeps = -1 };
3457
3458 ScheduleData() = default;
3459
3460 void init(int BlockSchedulingRegionID, Value *OpVal) {
3461 FirstInBundle = this;
3462 NextInBundle = nullptr;
3463 NextLoadStore = nullptr;
3464 IsScheduled = false;
3465 SchedulingRegionID = BlockSchedulingRegionID;
3466 clearDependencies();
3467 OpValue = OpVal;
3468 TE = nullptr;
3469 }
3470
3471 /// Verify basic self-consistency properties.
3472 void verify() {
3473 if (hasValidDependencies()) {
3474 assert(UnscheduledDeps <= Dependencies && "invariant");
3475 } else {
3476 assert(UnscheduledDeps == Dependencies && "invariant");
3477 }
3478
3479 if (IsScheduled) {
3480 assert(isSchedulingEntity() &&
3481 "unexpected scheduled state");
3482 for (const ScheduleData *BundleMember = this; BundleMember;
3483 BundleMember = BundleMember->NextInBundle) {
3484 assert(BundleMember->hasValidDependencies() &&
3485 BundleMember->UnscheduledDeps == 0 &&
3486 "unexpected scheduled state");
3487 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488 "only bundle is marked scheduled");
3489 }
3490 }
3491
3492 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493 "all bundle members must be in same basic block");
3494 }
3495
3496 /// Returns true if the dependency information has been calculated.
3497 /// Note that dependency validity can vary between instructions within
3498 /// a single bundle.
3499 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500
3501 /// Returns true for single instructions and for bundle representatives
3502 /// (= the head of a bundle).
3503 bool isSchedulingEntity() const { return FirstInBundle == this; }
3504
3505 /// Returns true if it represents an instruction bundle and not only a
3506 /// single instruction.
3507 bool isPartOfBundle() const {
3508 return NextInBundle != nullptr || FirstInBundle != this || TE;
3509 }
3510
3511 /// Returns true if it is ready for scheduling, i.e. it has no more
3512 /// unscheduled dependent instructions/bundles.
3513 bool isReady() const {
3514 assert(isSchedulingEntity() &&
3515 "can't consider non-scheduling entity for ready list");
3516 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517 }
3518
3519 /// Modifies the number of unscheduled dependencies for this instruction,
3520 /// and returns the number of remaining dependencies for the containing
3521 /// bundle.
3522 int incrementUnscheduledDeps(int Incr) {
3523 assert(hasValidDependencies() &&
3524 "increment of unscheduled deps would be meaningless");
3525 UnscheduledDeps += Incr;
3526 return FirstInBundle->unscheduledDepsInBundle();
3527 }
3528
3529 /// Sets the number of unscheduled dependencies to the number of
3530 /// dependencies.
3531 void resetUnscheduledDeps() {
3532 UnscheduledDeps = Dependencies;
3533 }
3534
3535 /// Clears all dependency information.
3536 void clearDependencies() { 3537 Dependencies = InvalidDeps; 3538 resetUnscheduledDeps(); 3539 MemoryDependencies.clear(); 3540 ControlDependencies.clear(); 3541 } 3542 3543 int unscheduledDepsInBundle() const { 3544 assert(isSchedulingEntity() && "only meaningful on the bundle"); 3545 int Sum = 0; 3546 for (const ScheduleData *BundleMember = this; BundleMember; 3547 BundleMember = BundleMember->NextInBundle) { 3548 if (BundleMember->UnscheduledDeps == InvalidDeps) 3549 return InvalidDeps; 3550 Sum += BundleMember->UnscheduledDeps; 3551 } 3552 return Sum; 3553 } 3554 3555 void dump(raw_ostream &os) const { 3556 if (!isSchedulingEntity()) { 3557 os << "/ " << *Inst; 3558 } else if (NextInBundle) { 3559 os << '[' << *Inst; 3560 ScheduleData *SD = NextInBundle; 3561 while (SD) { 3562 os << ';' << *SD->Inst; 3563 SD = SD->NextInBundle; 3564 } 3565 os << ']'; 3566 } else { 3567 os << *Inst; 3568 } 3569 } 3570 3571 Instruction *Inst = nullptr; 3572 3573 /// Opcode of the current instruction in the schedule data. 3574 Value *OpValue = nullptr; 3575 3576 /// The TreeEntry that this instruction corresponds to. 3577 TreeEntry *TE = nullptr; 3578 3579 /// Points to the head in an instruction bundle (and always to this for 3580 /// single instructions). 3581 ScheduleData *FirstInBundle = nullptr; 3582 3583 /// Single linked list of all instructions in a bundle. Null if it is a 3584 /// single instruction. 3585 ScheduleData *NextInBundle = nullptr; 3586 3587 /// Single linked list of all memory instructions (e.g. load, store, call) 3588 /// in the block - until the end of the scheduling region. 3589 ScheduleData *NextLoadStore = nullptr; 3590 3591 /// The dependent memory instructions. 3592 /// This list is derived on demand in calculateDependencies(). 3593 SmallVector<ScheduleData *, 4> MemoryDependencies; 3594 3595 /// List of instructions which this instruction could be control dependent 3596 /// on. Allowing such nodes to be scheduled below this one could introduce 3597 /// a runtime fault which didn't exist in the original program. 3598 /// ex: this is a load or udiv following a readonly call which inf loops 3599 SmallVector<ScheduleData *, 4> ControlDependencies; 3600 3601 /// This ScheduleData is in the current scheduling region if this matches 3602 /// the current SchedulingRegionID of BlockScheduling. 3603 int SchedulingRegionID = 0; 3604 3605 /// Used for getting a "good" final ordering of instructions. 3606 int SchedulingPriority = 0; 3607 3608 /// The number of dependencies. Constitutes of the number of users of the 3609 /// instruction plus the number of dependent memory instructions (if any). 3610 /// This value is calculated on demand. 3611 /// If InvalidDeps, the number of dependencies is not calculated yet. 3612 int Dependencies = InvalidDeps; 3613 3614 /// The number of dependencies minus the number of dependencies of scheduled 3615 /// instructions. As soon as this is zero, the instruction/bundle gets ready 3616 /// for scheduling. 3617 /// Note that this is negative as long as Dependencies is not calculated. 3618 int UnscheduledDeps = InvalidDeps; 3619 3620 /// True if this instruction is scheduled (or considered as scheduled in the 3621 /// dry-run). 
3622 bool IsScheduled = false;
3623 };
3624
3625 #ifndef NDEBUG
3626 friend inline raw_ostream &operator<<(raw_ostream &os,
3627 const BoUpSLP::ScheduleData &SD) {
3628 SD.dump(os);
3629 return os;
3630 }
3631 #endif
3632
3633 friend struct GraphTraits<BoUpSLP *>;
3634 friend struct DOTGraphTraits<BoUpSLP *>;
3635
3636 /// Contains all scheduling data for a basic block.
3637 /// It does not schedule instructions that are not memory read/write
3638 /// instructions and whose operands are either constants, arguments, phis, or
3639 /// instructions from other blocks, or whose users are phis or live in other
3640 /// blocks. The resulting vector instructions can be placed at the
3641 /// beginning of the basic block without scheduling (if the operands do not
3642 /// need to be scheduled) or at the end of the block (if the users are outside
3643 /// of the block). This saves some compile time and memory used by the
3644 /// compiler.
3645 /// ScheduleData is assigned to each instruction between the boundaries of
3646 /// the tree entry, even to those that are not part of the graph. It is
3647 /// required to correctly follow the dependencies between the instructions and
3648 /// to schedule them correctly. ScheduleData is not allocated for
3649 /// instructions that do not require scheduling, such as phis, nodes with
3650 /// extractelements/insertelements only, or nodes whose instructions have
3651 /// uses/operands outside of the block.
3652 struct BlockScheduling {
3653 BlockScheduling(BasicBlock *BB)
3654 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655
3656 void clear() {
3657 ReadyInsts.clear();
3658 ScheduleStart = nullptr;
3659 ScheduleEnd = nullptr;
3660 FirstLoadStoreInRegion = nullptr;
3661 LastLoadStoreInRegion = nullptr;
3662 RegionHasStackSave = false;
3663
3664 // Reduce the maximum schedule region size by the size of the
3665 // previous scheduling run.
3666 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669 ScheduleRegionSize = 0;
3670
3671 // Make a new scheduling region, i.e. all existing ScheduleData is not
3672 // in the new region yet.
3673 ++SchedulingRegionID;
3674 }
3675
3676 ScheduleData *getScheduleData(Instruction *I) {
3677 if (BB != I->getParent())
3678 // Avoid the lookup if the instruction can't possibly be in the map.
3679 return nullptr;
3680 ScheduleData *SD = ScheduleDataMap.lookup(I);
3681 if (SD && isInSchedulingRegion(SD))
3682 return SD;
3683 return nullptr;
3684 }
3685
3686 ScheduleData *getScheduleData(Value *V) {
3687 if (auto *I = dyn_cast<Instruction>(V))
3688 return getScheduleData(I);
3689 return nullptr;
3690 }
3691
3692 ScheduleData *getScheduleData(Value *V, Value *Key) {
3693 if (V == Key)
3694 return getScheduleData(V);
3695 auto I = ExtraScheduleDataMap.find(V);
3696 if (I != ExtraScheduleDataMap.end()) {
3697 ScheduleData *SD = I->second.lookup(Key);
3698 if (SD && isInSchedulingRegion(SD))
3699 return SD;
3700 }
3701 return nullptr;
3702 }
3703
3704 bool isInSchedulingRegion(ScheduleData *SD) const {
3705 return SD->SchedulingRegionID == SchedulingRegionID;
3706 }
3707
3708 /// Marks an instruction as scheduled and puts all dependent ready
3709 /// instructions into the ready-list.
3710 template <typename ReadyListType> 3711 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 3712 SD->IsScheduled = true; 3713 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 3714 3715 for (ScheduleData *BundleMember = SD; BundleMember; 3716 BundleMember = BundleMember->NextInBundle) { 3717 if (BundleMember->Inst != BundleMember->OpValue) 3718 continue; 3719 3720 // Handle the def-use chain dependencies. 3721 3722 // Decrement the unscheduled counter and insert to ready list if ready. 3723 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { 3724 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { 3725 if (OpDef && OpDef->hasValidDependencies() && 3726 OpDef->incrementUnscheduledDeps(-1) == 0) { 3727 // There are no more unscheduled dependencies after 3728 // decrementing, so we can put the dependent instruction 3729 // into the ready list. 3730 ScheduleData *DepBundle = OpDef->FirstInBundle; 3731 assert(!DepBundle->IsScheduled && 3732 "already scheduled bundle gets ready"); 3733 ReadyList.insert(DepBundle); 3734 LLVM_DEBUG(dbgs() 3735 << "SLP: gets ready (def): " << *DepBundle << "\n"); 3736 } 3737 }); 3738 }; 3739 3740 // If BundleMember is a vector bundle, its operands may have been 3741 // reordered during buildTree(). We therefore need to get its operands 3742 // through the TreeEntry. 3743 if (TreeEntry *TE = BundleMember->TE) { 3744 // Need to search for the lane since the tree entry can be reordered. 3745 int Lane = std::distance(TE->Scalars.begin(), 3746 find(TE->Scalars, BundleMember->Inst)); 3747 assert(Lane >= 0 && "Lane not set"); 3748 3749 // Since vectorization tree is being built recursively this assertion 3750 // ensures that the tree entry has all operands set before reaching 3751 // this code. Couple of exceptions known at the moment are extracts 3752 // where their second (immediate) operand is not added. Since 3753 // immediates do not affect scheduler behavior this is considered 3754 // okay. 3755 auto *In = BundleMember->Inst; 3756 assert( 3757 In && 3758 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) || 3759 In->getNumOperands() == TE->getNumOperands()) && 3760 "Missed TreeEntry operands?"); 3761 (void)In; // fake use to avoid build failure when assertions disabled 3762 3763 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); 3764 OpIdx != NumOperands; ++OpIdx) 3765 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) 3766 DecrUnsched(I); 3767 } else { 3768 // If BundleMember is a stand-alone instruction, no operand reordering 3769 // has taken place, so we directly access its operands. 3770 for (Use &U : BundleMember->Inst->operands()) 3771 if (auto *I = dyn_cast<Instruction>(U.get())) 3772 DecrUnsched(I); 3773 } 3774 // Handle the memory dependencies. 3775 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 3776 if (MemoryDepSD->hasValidDependencies() && 3777 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 3778 // There are no more unscheduled dependencies after decrementing, 3779 // so we can put the dependent instruction into the ready list. 3780 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 3781 assert(!DepBundle->IsScheduled && 3782 "already scheduled bundle gets ready"); 3783 ReadyList.insert(DepBundle); 3784 LLVM_DEBUG(dbgs() 3785 << "SLP: gets ready (mem): " << *DepBundle << "\n"); 3786 } 3787 } 3788 // Handle the control dependencies. 
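// The handling mirrors the def-use and memory cases above: decrement the
// unscheduled-dependency counter and, once a dependent bundle has no
// remaining unscheduled dependencies, move it to the ready list. A typical
// control dependence (see the ControlDependencies member) is a load or udiv
// that must not be reordered above a readonly call that may never return.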
3789 for (ScheduleData *DepSD : BundleMember->ControlDependencies) { 3790 if (DepSD->incrementUnscheduledDeps(-1) == 0) { 3791 // There are no more unscheduled dependencies after decrementing, 3792 // so we can put the dependent instruction into the ready list. 3793 ScheduleData *DepBundle = DepSD->FirstInBundle; 3794 assert(!DepBundle->IsScheduled && 3795 "already scheduled bundle gets ready"); 3796 ReadyList.insert(DepBundle); 3797 LLVM_DEBUG(dbgs() 3798 << "SLP: gets ready (ctl): " << *DepBundle << "\n"); 3799 } 3800 } 3801 } 3802 } 3803 3804 /// Verify basic self consistency properties of the data structure. 3805 void verify() { 3806 if (!ScheduleStart) 3807 return; 3808 3809 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && 3810 ScheduleStart->comesBefore(ScheduleEnd) && 3811 "Not a valid scheduling region?"); 3812 3813 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 3814 auto *SD = getScheduleData(I); 3815 if (!SD) 3816 continue; 3817 assert(isInSchedulingRegion(SD) && 3818 "primary schedule data not in window?"); 3819 assert(isInSchedulingRegion(SD->FirstInBundle) && 3820 "entire bundle in window!"); 3821 (void)SD; 3822 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); 3823 } 3824 3825 for (auto *SD : ReadyInsts) { 3826 assert(SD->isSchedulingEntity() && SD->isReady() && 3827 "item in ready list not ready?"); 3828 (void)SD; 3829 } 3830 } 3831 3832 void doForAllOpcodes(Value *V, 3833 function_ref<void(ScheduleData *SD)> Action) { 3834 if (ScheduleData *SD = getScheduleData(V)) 3835 Action(SD); 3836 auto I = ExtraScheduleDataMap.find(V); 3837 if (I != ExtraScheduleDataMap.end()) 3838 for (auto &P : I->second) 3839 if (isInSchedulingRegion(P.second)) 3840 Action(P.second); 3841 } 3842 3843 /// Put all instructions into the ReadyList which are ready for scheduling. 3844 template <typename ReadyListType> 3845 void initialFillReadyList(ReadyListType &ReadyList) { 3846 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 3847 doForAllOpcodes(I, [&](ScheduleData *SD) { 3848 if (SD->isSchedulingEntity() && SD->hasValidDependencies() && 3849 SD->isReady()) { 3850 ReadyList.insert(SD); 3851 LLVM_DEBUG(dbgs() 3852 << "SLP: initially in ready list: " << *SD << "\n"); 3853 } 3854 }); 3855 } 3856 } 3857 3858 /// Build a bundle from the ScheduleData nodes corresponding to the 3859 /// scalar instruction for each lane. 3860 ScheduleData *buildBundle(ArrayRef<Value *> VL); 3861 3862 /// Checks if a bundle of instructions can be scheduled, i.e. has no 3863 /// cyclic dependencies. This is only a dry-run, no instructions are 3864 /// actually moved at this stage. 3865 /// \returns the scheduling bundle. The returned Optional value is not 3866 /// std::nullopt if \p VL is allowed to be scheduled. 3867 std::optional<ScheduleData *> 3868 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 3869 const InstructionsState &S); 3870 3871 /// Un-bundles a group of instructions. 3872 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); 3873 3874 /// Allocates schedule data chunk. 3875 ScheduleData *allocateScheduleDataChunks(); 3876 3877 /// Extends the scheduling region so that V is inside the region. 3878 /// \returns true if the region size is within the limit. 3879 bool extendSchedulingRegion(Value *V, const InstructionsState &S); 3880 3881 /// Initialize the ScheduleData structures for new instructions in the 3882 /// scheduling region. 
3883 void initScheduleData(Instruction *FromI, Instruction *ToI, 3884 ScheduleData *PrevLoadStore, 3885 ScheduleData *NextLoadStore); 3886 3887 /// Updates the dependency information of a bundle and of all instructions/ 3888 /// bundles which depend on the original bundle. 3889 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 3890 BoUpSLP *SLP); 3891 3892 /// Sets all instruction in the scheduling region to un-scheduled. 3893 void resetSchedule(); 3894 3895 BasicBlock *BB; 3896 3897 /// Simple memory allocation for ScheduleData. 3898 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 3899 3900 /// The size of a ScheduleData array in ScheduleDataChunks. 3901 int ChunkSize; 3902 3903 /// The allocator position in the current chunk, which is the last entry 3904 /// of ScheduleDataChunks. 3905 int ChunkPos; 3906 3907 /// Attaches ScheduleData to Instruction. 3908 /// Note that the mapping survives during all vectorization iterations, i.e. 3909 /// ScheduleData structures are recycled. 3910 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap; 3911 3912 /// Attaches ScheduleData to Instruction with the leading key. 3913 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> 3914 ExtraScheduleDataMap; 3915 3916 /// The ready-list for scheduling (only used for the dry-run). 3917 SetVector<ScheduleData *> ReadyInsts; 3918 3919 /// The first instruction of the scheduling region. 3920 Instruction *ScheduleStart = nullptr; 3921 3922 /// The first instruction _after_ the scheduling region. 3923 Instruction *ScheduleEnd = nullptr; 3924 3925 /// The first memory accessing instruction in the scheduling region 3926 /// (can be null). 3927 ScheduleData *FirstLoadStoreInRegion = nullptr; 3928 3929 /// The last memory accessing instruction in the scheduling region 3930 /// (can be null). 3931 ScheduleData *LastLoadStoreInRegion = nullptr; 3932 3933 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling 3934 /// region? Used to optimize the dependence calculation for the 3935 /// common case where there isn't. 3936 bool RegionHasStackSave = false; 3937 3938 /// The current size of the scheduling region. 3939 int ScheduleRegionSize = 0; 3940 3941 /// The maximum size allowed for the scheduling region. 3942 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; 3943 3944 /// The ID of the scheduling region. For a new vectorization iteration this 3945 /// is incremented which "removes" all ScheduleData from the region. 3946 /// Make sure that the initial SchedulingRegionID is greater than the 3947 /// initial SchedulingRegionID in ScheduleData (which is 0). 3948 int SchedulingRegionID = 1; 3949 }; 3950 3951 /// Attaches the BlockScheduling structures to basic blocks. 3952 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 3953 3954 /// Performs the "real" scheduling. Done before vectorization is actually 3955 /// performed in a basic block. 3956 void scheduleBlock(BlockScheduling *BS); 3957 3958 /// List of users to ignore during scheduling and that don't need extracting. 3959 const SmallDenseSet<Value *> *UserIgnoreList = nullptr; 3960 3961 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of 3962 /// sorted SmallVectors of unsigned. 
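/// A typical (illustrative) use is a map that counts how often a candidate
/// order occurs, e.g.:
/// \code
///   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
/// \endcode
/// The empty/tombstone keys below use the reserved values ~1U and ~2U, which
/// are not expected to occur in real orders.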
3963 struct OrdersTypeDenseMapInfo { 3964 static OrdersType getEmptyKey() { 3965 OrdersType V; 3966 V.push_back(~1U); 3967 return V; 3968 } 3969 3970 static OrdersType getTombstoneKey() { 3971 OrdersType V; 3972 V.push_back(~2U); 3973 return V; 3974 } 3975 3976 static unsigned getHashValue(const OrdersType &V) { 3977 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); 3978 } 3979 3980 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { 3981 return LHS == RHS; 3982 } 3983 }; 3984 3985 // Analysis and block reference. 3986 Function *F; 3987 ScalarEvolution *SE; 3988 TargetTransformInfo *TTI; 3989 TargetLibraryInfo *TLI; 3990 LoopInfo *LI; 3991 DominatorTree *DT; 3992 AssumptionCache *AC; 3993 DemandedBits *DB; 3994 const DataLayout *DL; 3995 OptimizationRemarkEmitter *ORE; 3996 3997 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. 3998 unsigned MinVecRegSize; // Set by cl::opt (default: 128). 3999 4000 /// Instruction builder to construct the vectorized tree. 4001 IRBuilder<TargetFolder> Builder; 4002 4003 /// A map of scalar integer values to the smallest bit width with which they 4004 /// can legally be represented. The values map to (width, signed) pairs, 4005 /// where "width" indicates the minimum bit width and "signed" is True if the 4006 /// value must be signed-extended, rather than zero-extended, back to its 4007 /// original width. 4008 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; 4009 4010 /// Final size of the reduced vector, if the current graph represents the 4011 /// input for the reduction and it was possible to narrow the size of the 4012 /// reduction. 4013 unsigned ReductionBitWidth = 0; 4014 4015 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of 4016 /// type sizes, used in the tree. 4017 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes; 4018 4019 /// Indices of the vectorized nodes, which supposed to be the roots of the new 4020 /// bitwidth analysis attempt, like trunc, IToFP or ICmp. 4021 DenseSet<unsigned> ExtraBitWidthNodes; 4022 }; 4023 4024 } // end namespace slpvectorizer 4025 4026 template <> struct GraphTraits<BoUpSLP *> { 4027 using TreeEntry = BoUpSLP::TreeEntry; 4028 4029 /// NodeRef has to be a pointer per the GraphWriter. 4030 using NodeRef = TreeEntry *; 4031 4032 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; 4033 4034 /// Add the VectorizableTree to the index iterator to be able to return 4035 /// TreeEntry pointers. 4036 struct ChildIteratorType 4037 : public iterator_adaptor_base< 4038 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { 4039 ContainerTy &VectorizableTree; 4040 4041 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, 4042 ContainerTy &VT) 4043 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} 4044 4045 NodeRef operator*() { return I->UserTE; } 4046 }; 4047 4048 static NodeRef getEntryNode(BoUpSLP &R) { 4049 return R.VectorizableTree[0].get(); 4050 } 4051 4052 static ChildIteratorType child_begin(NodeRef N) { 4053 return {N->UserTreeIndices.begin(), N->Container}; 4054 } 4055 4056 static ChildIteratorType child_end(NodeRef N) { 4057 return {N->UserTreeIndices.end(), N->Container}; 4058 } 4059 4060 /// For the node iterator we just need to turn the TreeEntry iterator into a 4061 /// TreeEntry* iterator so that it dereferences to NodeRef. 
4062 class nodes_iterator { 4063 using ItTy = ContainerTy::iterator; 4064 ItTy It; 4065 4066 public: 4067 nodes_iterator(const ItTy &It2) : It(It2) {} 4068 NodeRef operator*() { return It->get(); } 4069 nodes_iterator operator++() { 4070 ++It; 4071 return *this; 4072 } 4073 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } 4074 }; 4075 4076 static nodes_iterator nodes_begin(BoUpSLP *R) { 4077 return nodes_iterator(R->VectorizableTree.begin()); 4078 } 4079 4080 static nodes_iterator nodes_end(BoUpSLP *R) { 4081 return nodes_iterator(R->VectorizableTree.end()); 4082 } 4083 4084 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } 4085 }; 4086 4087 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { 4088 using TreeEntry = BoUpSLP::TreeEntry; 4089 4090 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 4091 4092 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { 4093 std::string Str; 4094 raw_string_ostream OS(Str); 4095 OS << Entry->Idx << ".\n"; 4096 if (isSplat(Entry->Scalars)) 4097 OS << "<splat> "; 4098 for (auto *V : Entry->Scalars) { 4099 OS << *V; 4100 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { 4101 return EU.Scalar == V; 4102 })) 4103 OS << " <extract>"; 4104 OS << "\n"; 4105 } 4106 return Str; 4107 } 4108 4109 static std::string getNodeAttributes(const TreeEntry *Entry, 4110 const BoUpSLP *) { 4111 if (Entry->isGather()) 4112 return "color=red"; 4113 if (Entry->State == TreeEntry::ScatterVectorize || 4114 Entry->State == TreeEntry::StridedVectorize) 4115 return "color=blue"; 4116 return ""; 4117 } 4118 }; 4119 4120 } // end namespace llvm 4121 4122 BoUpSLP::~BoUpSLP() { 4123 SmallVector<WeakTrackingVH> DeadInsts; 4124 for (auto *I : DeletedInstructions) { 4125 if (!I->getParent()) { 4126 // Temporarily insert instruction back to erase them from parent and 4127 // memory later. 4128 if (isa<PHINode>(I)) 4129 // Phi nodes must be the very first instructions in the block. 4130 I->insertBefore(F->getEntryBlock(), 4131 F->getEntryBlock().getFirstNonPHIIt()); 4132 else 4133 I->insertBefore(F->getEntryBlock().getTerminator()); 4134 continue; 4135 } 4136 for (Use &U : I->operands()) { 4137 auto *Op = dyn_cast<Instruction>(U.get()); 4138 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && 4139 wouldInstructionBeTriviallyDead(Op, TLI)) 4140 DeadInsts.emplace_back(Op); 4141 } 4142 I->dropAllReferences(); 4143 } 4144 for (auto *I : DeletedInstructions) { 4145 assert(I->use_empty() && 4146 "trying to erase instruction with users."); 4147 I->eraseFromParent(); 4148 } 4149 4150 // Cleanup any dead scalar code feeding the vectorized instructions 4151 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); 4152 4153 #ifdef EXPENSIVE_CHECKS 4154 // If we could guarantee that this call is not extremely slow, we could 4155 // remove the ifdef limitation (see PR47712). 4156 assert(!verifyFunction(*F, &dbgs())); 4157 #endif 4158 } 4159 4160 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses 4161 /// contains original mask for the scalars reused in the node. Procedure 4162 /// transform this mask in accordance with the given \p Mask. 
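/// For example, with \p Reuses = {0, 0, 1, 1} and \p Mask = {2, 3, 0, 1},
/// each old element Reuses[I] is moved to position Mask[I], producing
/// Reuses = {1, 1, 0, 0}.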
4163 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { 4164 assert(!Mask.empty() && Reuses.size() == Mask.size() && 4165 "Expected non-empty mask."); 4166 SmallVector<int> Prev(Reuses.begin(), Reuses.end()); 4167 Prev.swap(Reuses); 4168 for (unsigned I = 0, E = Prev.size(); I < E; ++I) 4169 if (Mask[I] != PoisonMaskElem) 4170 Reuses[Mask[I]] = Prev[I]; 4171 } 4172 4173 /// Reorders the given \p Order according to the given \p Mask. \p Order - is 4174 /// the original order of the scalars. Procedure transforms the provided order 4175 /// in accordance with the given \p Mask. If the resulting \p Order is just an 4176 /// identity order, \p Order is cleared. 4177 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask, 4178 bool BottomOrder = false) { 4179 assert(!Mask.empty() && "Expected non-empty mask."); 4180 unsigned Sz = Mask.size(); 4181 if (BottomOrder) { 4182 SmallVector<unsigned> PrevOrder; 4183 if (Order.empty()) { 4184 PrevOrder.resize(Sz); 4185 std::iota(PrevOrder.begin(), PrevOrder.end(), 0); 4186 } else { 4187 PrevOrder.swap(Order); 4188 } 4189 Order.assign(Sz, Sz); 4190 for (unsigned I = 0; I < Sz; ++I) 4191 if (Mask[I] != PoisonMaskElem) 4192 Order[I] = PrevOrder[Mask[I]]; 4193 if (all_of(enumerate(Order), [&](const auto &Data) { 4194 return Data.value() == Sz || Data.index() == Data.value(); 4195 })) { 4196 Order.clear(); 4197 return; 4198 } 4199 fixupOrderingIndices(Order); 4200 return; 4201 } 4202 SmallVector<int> MaskOrder; 4203 if (Order.empty()) { 4204 MaskOrder.resize(Sz); 4205 std::iota(MaskOrder.begin(), MaskOrder.end(), 0); 4206 } else { 4207 inversePermutation(Order, MaskOrder); 4208 } 4209 reorderReuses(MaskOrder, Mask); 4210 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) { 4211 Order.clear(); 4212 return; 4213 } 4214 Order.assign(Sz, Sz); 4215 for (unsigned I = 0; I < Sz; ++I) 4216 if (MaskOrder[I] != PoisonMaskElem) 4217 Order[MaskOrder[I]] = I; 4218 fixupOrderingIndices(Order); 4219 } 4220 4221 std::optional<BoUpSLP::OrdersType> 4222 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { 4223 assert(TE.isGather() && "Expected gather node only."); 4224 // Try to find subvector extract/insert patterns and reorder only such 4225 // patterns. 4226 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); 4227 Type *ScalarTy = GatheredScalars.front()->getType(); 4228 int NumScalars = GatheredScalars.size(); 4229 if (!isValidElementType(ScalarTy)) 4230 return std::nullopt; 4231 auto *VecTy = getWidenedType(ScalarTy, NumScalars); 4232 int NumParts = TTI->getNumberOfParts(VecTy); 4233 if (NumParts == 0 || NumParts >= NumScalars) 4234 NumParts = 1; 4235 SmallVector<int> ExtractMask; 4236 SmallVector<int> Mask; 4237 SmallVector<SmallVector<const TreeEntry *>> Entries; 4238 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles = 4239 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); 4240 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles = 4241 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts, 4242 /*ForOrder=*/true); 4243 // No shuffled operands - ignore. 
4244 if (GatherShuffles.empty() && ExtractShuffles.empty()) 4245 return std::nullopt; 4246 OrdersType CurrentOrder(NumScalars, NumScalars); 4247 if (GatherShuffles.size() == 1 && 4248 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && 4249 Entries.front().front()->isSame(TE.Scalars)) { 4250 // Perfect match in the graph, will reuse the previously vectorized 4251 // node. Cost is 0. 4252 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); 4253 return CurrentOrder; 4254 } 4255 auto IsSplatMask = [](ArrayRef<int> Mask) { 4256 int SingleElt = PoisonMaskElem; 4257 return all_of(Mask, [&](int I) { 4258 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) 4259 SingleElt = I; 4260 return I == PoisonMaskElem || I == SingleElt; 4261 }); 4262 }; 4263 // Exclusive broadcast mask - ignore. 4264 if ((ExtractShuffles.empty() && IsSplatMask(Mask) && 4265 (Entries.size() != 1 || 4266 Entries.front().front()->ReorderIndices.empty())) || 4267 (GatherShuffles.empty() && IsSplatMask(ExtractMask))) 4268 return std::nullopt; 4269 SmallBitVector ShuffledSubMasks(NumParts); 4270 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder, 4271 ArrayRef<int> Mask, int PartSz, int NumParts, 4272 function_ref<unsigned(unsigned)> GetVF) { 4273 for (int I : seq<int>(0, NumParts)) { 4274 if (ShuffledSubMasks.test(I)) 4275 continue; 4276 const int VF = GetVF(I); 4277 if (VF == 0) 4278 continue; 4279 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I); 4280 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit); 4281 // Shuffle of at least 2 vectors - ignore. 4282 if (any_of(Slice, [&](int I) { return I != NumScalars; })) { 4283 std::fill(Slice.begin(), Slice.end(), NumScalars); 4284 ShuffledSubMasks.set(I); 4285 continue; 4286 } 4287 // Try to include as much elements from the mask as possible. 4288 int FirstMin = INT_MAX; 4289 int SecondVecFound = false; 4290 for (int K : seq<int>(Limit)) { 4291 int Idx = Mask[I * PartSz + K]; 4292 if (Idx == PoisonMaskElem) { 4293 Value *V = GatheredScalars[I * PartSz + K]; 4294 if (isConstant(V) && !isa<PoisonValue>(V)) { 4295 SecondVecFound = true; 4296 break; 4297 } 4298 continue; 4299 } 4300 if (Idx < VF) { 4301 if (FirstMin > Idx) 4302 FirstMin = Idx; 4303 } else { 4304 SecondVecFound = true; 4305 break; 4306 } 4307 } 4308 FirstMin = (FirstMin / PartSz) * PartSz; 4309 // Shuffle of at least 2 vectors - ignore. 4310 if (SecondVecFound) { 4311 std::fill(Slice.begin(), Slice.end(), NumScalars); 4312 ShuffledSubMasks.set(I); 4313 continue; 4314 } 4315 for (int K : seq<int>(Limit)) { 4316 int Idx = Mask[I * PartSz + K]; 4317 if (Idx == PoisonMaskElem) 4318 continue; 4319 Idx -= FirstMin; 4320 if (Idx >= PartSz) { 4321 SecondVecFound = true; 4322 break; 4323 } 4324 if (CurrentOrder[I * PartSz + Idx] > 4325 static_cast<unsigned>(I * PartSz + K) && 4326 CurrentOrder[I * PartSz + Idx] != 4327 static_cast<unsigned>(I * PartSz + Idx)) 4328 CurrentOrder[I * PartSz + Idx] = I * PartSz + K; 4329 } 4330 // Shuffle of at least 2 vectors - ignore. 
4331 if (SecondVecFound) { 4332 std::fill(Slice.begin(), Slice.end(), NumScalars); 4333 ShuffledSubMasks.set(I); 4334 continue; 4335 } 4336 } 4337 }; 4338 int PartSz = getPartNumElems(NumScalars, NumParts); 4339 if (!ExtractShuffles.empty()) 4340 TransformMaskToOrder( 4341 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { 4342 if (!ExtractShuffles[I]) 4343 return 0U; 4344 unsigned VF = 0; 4345 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I); 4346 for (unsigned Idx : seq<unsigned>(Sz)) { 4347 int K = I * PartSz + Idx; 4348 if (ExtractMask[K] == PoisonMaskElem) 4349 continue; 4350 if (!TE.ReuseShuffleIndices.empty()) 4351 K = TE.ReuseShuffleIndices[K]; 4352 if (!TE.ReorderIndices.empty()) 4353 K = std::distance(TE.ReorderIndices.begin(), 4354 find(TE.ReorderIndices, K)); 4355 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]); 4356 if (!EI) 4357 continue; 4358 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType()) 4359 ->getElementCount() 4360 .getKnownMinValue()); 4361 } 4362 return VF; 4363 }); 4364 // Check special corner case - single shuffle of the same entry. 4365 if (GatherShuffles.size() == 1 && NumParts != 1) { 4366 if (ShuffledSubMasks.any()) 4367 return std::nullopt; 4368 PartSz = NumScalars; 4369 NumParts = 1; 4370 } 4371 if (!Entries.empty()) 4372 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { 4373 if (!GatherShuffles[I]) 4374 return 0U; 4375 return std::max(Entries[I].front()->getVectorFactor(), 4376 Entries[I].back()->getVectorFactor()); 4377 }); 4378 int NumUndefs = 4379 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); 4380 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) 4381 return std::nullopt; 4382 return std::move(CurrentOrder); 4383 } 4384 4385 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, 4386 const TargetLibraryInfo &TLI, 4387 bool CompareOpcodes = true) { 4388 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) 4389 return false; 4390 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); 4391 if (!GEP1) 4392 return false; 4393 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); 4394 if (!GEP2) 4395 return false; 4396 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 && 4397 ((isConstant(GEP1->getOperand(1)) && 4398 isConstant(GEP2->getOperand(1))) || 4399 !CompareOpcodes || 4400 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) 4401 .getOpcode()); 4402 } 4403 4404 /// Calculates minimal alignment as a common alignment. 4405 template <typename T> 4406 static Align computeCommonAlignment(ArrayRef<Value *> VL) { 4407 Align CommonAlignment = cast<T>(VL.front())->getAlign(); 4408 for (Value *V : VL.drop_front()) 4409 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign()); 4410 return CommonAlignment; 4411 } 4412 4413 /// Check if \p Order represents reverse order. 4414 static bool isReverseOrder(ArrayRef<unsigned> Order) { 4415 unsigned Sz = Order.size(); 4416 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) { 4417 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value(); 4418 }); 4419 } 4420 4421 /// Checks if the provided list of pointers \p Pointers represents the strided 4422 /// pointers for type ElemTy. If they are not, std::nullopt is returned. 4423 /// Otherwise, if \p Inst is not specified, just initialized optional value is 4424 /// returned to show that the pointers represent strided pointers. 
If \p Inst 4425 /// specified, the runtime stride is materialized before the given \p Inst. 4426 /// \returns std::nullopt if the pointers are not pointers with the runtime 4427 /// stride, nullptr or actual stride value, otherwise. 4428 static std::optional<Value *> 4429 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, 4430 const DataLayout &DL, ScalarEvolution &SE, 4431 SmallVectorImpl<unsigned> &SortedIndices, 4432 Instruction *Inst = nullptr) { 4433 SmallVector<const SCEV *> SCEVs; 4434 const SCEV *PtrSCEVLowest = nullptr; 4435 const SCEV *PtrSCEVHighest = nullptr; 4436 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest 4437 // addresses). 4438 for (Value *Ptr : PointerOps) { 4439 const SCEV *PtrSCEV = SE.getSCEV(Ptr); 4440 if (!PtrSCEV) 4441 return std::nullopt; 4442 SCEVs.push_back(PtrSCEV); 4443 if (!PtrSCEVLowest && !PtrSCEVHighest) { 4444 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV; 4445 continue; 4446 } 4447 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); 4448 if (isa<SCEVCouldNotCompute>(Diff)) 4449 return std::nullopt; 4450 if (Diff->isNonConstantNegative()) { 4451 PtrSCEVLowest = PtrSCEV; 4452 continue; 4453 } 4454 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV); 4455 if (isa<SCEVCouldNotCompute>(Diff1)) 4456 return std::nullopt; 4457 if (Diff1->isNonConstantNegative()) { 4458 PtrSCEVHighest = PtrSCEV; 4459 continue; 4460 } 4461 } 4462 // Dist = PtrSCEVHighest - PtrSCEVLowest; 4463 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest); 4464 if (isa<SCEVCouldNotCompute>(Dist)) 4465 return std::nullopt; 4466 int Size = DL.getTypeStoreSize(ElemTy); 4467 auto TryGetStride = [&](const SCEV *Dist, 4468 const SCEV *Multiplier) -> const SCEV * { 4469 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) { 4470 if (M->getOperand(0) == Multiplier) 4471 return M->getOperand(1); 4472 if (M->getOperand(1) == Multiplier) 4473 return M->getOperand(0); 4474 return nullptr; 4475 } 4476 if (Multiplier == Dist) 4477 return SE.getConstant(Dist->getType(), 1); 4478 return SE.getUDivExactExpr(Dist, Multiplier); 4479 }; 4480 // Stride_in_elements = Dist / element_size * (num_elems - 1). 4481 const SCEV *Stride = nullptr; 4482 if (Size != 1 || SCEVs.size() > 2) { 4483 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1)); 4484 Stride = TryGetStride(Dist, Sz); 4485 if (!Stride) 4486 return std::nullopt; 4487 } 4488 if (!Stride || isa<SCEVConstant>(Stride)) 4489 return std::nullopt; 4490 // Iterate through all pointers and check if all distances are 4491 // unique multiple of Stride. 4492 using DistOrdPair = std::pair<int64_t, int>; 4493 auto Compare = llvm::less_first(); 4494 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); 4495 int Cnt = 0; 4496 bool IsConsecutive = true; 4497 for (const SCEV *PtrSCEV : SCEVs) { 4498 unsigned Dist = 0; 4499 if (PtrSCEV != PtrSCEVLowest) { 4500 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); 4501 const SCEV *Coeff = TryGetStride(Diff, Stride); 4502 if (!Coeff) 4503 return std::nullopt; 4504 const auto *SC = dyn_cast<SCEVConstant>(Coeff); 4505 if (!SC || isa<SCEVCouldNotCompute>(SC)) 4506 return std::nullopt; 4507 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, 4508 SE.getMulExpr(Stride, SC))) 4509 ->isZero()) 4510 return std::nullopt; 4511 Dist = SC->getAPInt().getZExtValue(); 4512 } 4513 // If the strides are not the same or repeated, we can't vectorize. 
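    // Dist is the constant C such that Ptr == PtrSCEVLowest + Stride * C.
    // Since Stride was derived above as the stride in elements, C should be
    // the pointer's element index times the element size, so the checks below
    // require C to be a whole multiple of Size with the resulting index
    // smaller than the number of pointers.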
4514 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) 4515 return std::nullopt; 4516 auto Res = Offsets.emplace(Dist, Cnt); 4517 if (!Res.second) 4518 return std::nullopt; 4519 // Consecutive order if the inserted element is the last one. 4520 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); 4521 ++Cnt; 4522 } 4523 if (Offsets.size() != SCEVs.size()) 4524 return std::nullopt; 4525 SortedIndices.clear(); 4526 if (!IsConsecutive) { 4527 // Fill SortedIndices array only if it is non-consecutive. 4528 SortedIndices.resize(PointerOps.size()); 4529 Cnt = 0; 4530 for (const std::pair<int64_t, int> &Pair : Offsets) { 4531 SortedIndices[Cnt] = Pair.second; 4532 ++Cnt; 4533 } 4534 } 4535 if (!Inst) 4536 return nullptr; 4537 SCEVExpander Expander(SE, DL, "strided-load-vec"); 4538 return Expander.expandCodeFor(Stride, Stride->getType(), Inst); 4539 } 4540 4541 static std::pair<InstructionCost, InstructionCost> 4542 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, 4543 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, 4544 Type *ScalarTy, VectorType *VecTy); 4545 4546 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( 4547 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, 4548 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const { 4549 // Check that a vectorized load would load the same memory as a scalar 4550 // load. For example, we don't want to vectorize loads that are smaller 4551 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 4552 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 4553 // from such a struct, we read/write packed bits disagreeing with the 4554 // unvectorized version. 4555 Type *ScalarTy = VL0->getType(); 4556 4557 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) 4558 return LoadsState::Gather; 4559 4560 // Make sure all loads in the bundle are simple - we can't vectorize 4561 // atomic or volatile loads. 4562 PointerOps.clear(); 4563 const unsigned Sz = VL.size(); 4564 PointerOps.resize(Sz); 4565 auto *POIter = PointerOps.begin(); 4566 for (Value *V : VL) { 4567 auto *L = cast<LoadInst>(V); 4568 if (!L->isSimple()) 4569 return LoadsState::Gather; 4570 *POIter = L->getPointerOperand(); 4571 ++POIter; 4572 } 4573 4574 Order.clear(); 4575 auto *VecTy = getWidenedType(ScalarTy, Sz); 4576 // Check the order of pointer operands or that all pointers are the same. 4577 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); 4578 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 
4579 if (!Order.empty() && !isPowerOf2_32(VL.size())) { 4580 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " 4581 "supported with VectorizeNonPowerOf2"); 4582 return LoadsState::Gather; 4583 } 4584 4585 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); 4586 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) && 4587 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && 4588 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) 4589 return LoadsState::StridedVectorize; 4590 if (IsSorted || all_of(PointerOps, [&](Value *P) { 4591 return arePointersCompatible(P, PointerOps.front(), *TLI); 4592 })) { 4593 if (IsSorted) { 4594 Value *Ptr0; 4595 Value *PtrN; 4596 if (Order.empty()) { 4597 Ptr0 = PointerOps.front(); 4598 PtrN = PointerOps.back(); 4599 } else { 4600 Ptr0 = PointerOps[Order.front()]; 4601 PtrN = PointerOps[Order.back()]; 4602 } 4603 std::optional<int> Diff = 4604 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); 4605 // Check that the sorted loads are consecutive. 4606 if (static_cast<unsigned>(*Diff) == Sz - 1) 4607 return LoadsState::Vectorize; 4608 // Simple check if not a strided access - clear order. 4609 bool IsPossibleStrided = *Diff % (Sz - 1) == 0; 4610 // Try to generate strided load node if: 4611 // 1. Target with strided load support is detected. 4612 // 2. The number of loads is greater than MinProfitableStridedLoads, 4613 // or the potential stride <= MaxProfitableLoadStride and the 4614 // potential stride is power-of-2 (to avoid perf regressions for the very 4615 // small number of loads) and max distance > number of loads, or potential 4616 // stride is -1. 4617 // 3. The loads are ordered, or number of unordered loads <= 4618 // MaxProfitableUnorderedLoads, or loads are in reversed order. 4619 // (this check is to avoid extra costs for very expensive shuffles). 4620 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads || 4621 (static_cast<unsigned>(std::abs(*Diff)) <= 4622 MaxProfitableLoadStride * Sz && 4623 isPowerOf2_32(std::abs(*Diff)))) && 4624 static_cast<unsigned>(std::abs(*Diff)) > Sz) || 4625 *Diff == -(static_cast<int>(Sz) - 1))) { 4626 int Stride = *Diff / static_cast<int>(Sz - 1); 4627 if (*Diff == Stride * static_cast<int>(Sz - 1)) { 4628 Align Alignment = 4629 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) 4630 ->getAlign(); 4631 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) { 4632 // Iterate through all pointers and check if all distances are 4633 // unique multiple of Dist. 4634 SmallSet<int, 4> Dists; 4635 for (Value *Ptr : PointerOps) { 4636 int Dist = 0; 4637 if (Ptr == PtrN) 4638 Dist = *Diff; 4639 else if (Ptr != Ptr0) 4640 Dist = 4641 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); 4642 // If the strides are not the same or repeated, we can't 4643 // vectorize. 
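              // Together with the final size check, this means the Sz
              // distances must be exactly the multiples
              // 0, Stride, 2 * Stride, ..., (Sz - 1) * Stride, each seen
              // exactly once.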
4644 if (((Dist / Stride) * Stride) != Dist || 4645 !Dists.insert(Dist).second) 4646 break; 4647 } 4648 if (Dists.size() == Sz) 4649 return LoadsState::StridedVectorize; 4650 } 4651 } 4652 } 4653 } 4654 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) { 4655 unsigned Sz = DL->getTypeSizeInBits(ScalarTy); 4656 unsigned MinVF = getMinVF(Sz); 4657 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF); 4658 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF); 4659 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) { 4660 unsigned VectorizedCnt = 0; 4661 SmallVector<LoadsState> States; 4662 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; 4663 Cnt += VF, ++VectorizedCnt) { 4664 ArrayRef<Value *> Slice = VL.slice(Cnt, VF); 4665 SmallVector<unsigned> Order; 4666 SmallVector<Value *> PointerOps; 4667 LoadsState LS = 4668 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, 4669 /*TryRecursiveCheck=*/false); 4670 // Check that the sorted loads are consecutive. 4671 if (LS == LoadsState::Gather) 4672 break; 4673 // If need the reorder - consider as high-cost masked gather for now. 4674 if ((LS == LoadsState::Vectorize || 4675 LS == LoadsState::StridedVectorize) && 4676 !Order.empty() && !isReverseOrder(Order)) 4677 LS = LoadsState::ScatterVectorize; 4678 States.push_back(LS); 4679 } 4680 // Can be vectorized later as a serie of loads/insertelements. 4681 if (VectorizedCnt == VL.size() / VF) { 4682 // Compare masked gather cost and loads + insersubvector costs. 4683 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4684 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( 4685 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr, 4686 CostKind, ScalarTy, VecTy); 4687 InstructionCost MaskedGatherCost = 4688 TTI.getGatherScatterOpCost( 4689 Instruction::Load, VecTy, 4690 cast<LoadInst>(VL0)->getPointerOperand(), 4691 /*VariableMask=*/false, CommonAlignment, CostKind) + 4692 VectorGEPCost - ScalarGEPCost; 4693 InstructionCost VecLdCost = 0; 4694 auto *SubVecTy = getWidenedType(ScalarTy, VF); 4695 for (auto [I, LS] : enumerate(States)) { 4696 auto *LI0 = cast<LoadInst>(VL[I * VF]); 4697 switch (LS) { 4698 case LoadsState::Vectorize: { 4699 auto [ScalarGEPCost, VectorGEPCost] = 4700 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), 4701 LI0->getPointerOperand(), Instruction::Load, 4702 CostKind, ScalarTy, SubVecTy); 4703 VecLdCost += TTI.getMemoryOpCost( 4704 Instruction::Load, SubVecTy, LI0->getAlign(), 4705 LI0->getPointerAddressSpace(), CostKind, 4706 TTI::OperandValueInfo()) + 4707 VectorGEPCost - ScalarGEPCost; 4708 break; 4709 } 4710 case LoadsState::StridedVectorize: { 4711 auto [ScalarGEPCost, VectorGEPCost] = 4712 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), 4713 LI0->getPointerOperand(), Instruction::Load, 4714 CostKind, ScalarTy, SubVecTy); 4715 VecLdCost += 4716 TTI.getStridedMemoryOpCost( 4717 Instruction::Load, SubVecTy, LI0->getPointerOperand(), 4718 /*VariableMask=*/false, CommonAlignment, CostKind) + 4719 VectorGEPCost - ScalarGEPCost; 4720 break; 4721 } 4722 case LoadsState::ScatterVectorize: { 4723 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( 4724 TTI, ArrayRef(PointerOps).slice(I * VF, VF), 4725 LI0->getPointerOperand(), Instruction::GetElementPtr, 4726 CostKind, ScalarTy, SubVecTy); 4727 VecLdCost += 4728 TTI.getGatherScatterOpCost( 4729 Instruction::Load, SubVecTy, LI0->getPointerOperand(), 4730 /*VariableMask=*/false, CommonAlignment, CostKind) + 4731 VectorGEPCost - ScalarGEPCost; 4732 
break; 4733 } 4734 case LoadsState::Gather: 4735 llvm_unreachable( 4736 "Expected only consecutive, strided or masked gather loads."); 4737 } 4738 SmallVector<int> ShuffleMask(VL.size()); 4739 for (int Idx : seq<int>(0, VL.size())) 4740 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; 4741 VecLdCost += 4742 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask, 4743 CostKind, I * VF, SubVecTy); 4744 } 4745 // If masked gather cost is higher - better to vectorize, so 4746 // consider it as a gather node. It will be better estimated 4747 // later. 4748 if (MaskedGatherCost >= VecLdCost) 4749 return true; 4750 } 4751 } 4752 return false; 4753 }; 4754 // TODO: need to improve analysis of the pointers, if not all of them are 4755 // GEPs or have > 2 operands, we end up with a gather node, which just 4756 // increases the cost. 4757 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent()); 4758 bool ProfitableGatherPointers = 4759 L && Sz > 2 && 4760 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) { 4761 return L->isLoopInvariant(V); 4762 })) <= Sz / 2; 4763 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { 4764 auto *GEP = dyn_cast<GetElementPtrInst>(P); 4765 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || 4766 (GEP && GEP->getNumOperands() == 2 && 4767 isa<Constant, Instruction>(GEP->getOperand(1))); 4768 })) { 4769 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); 4770 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) && 4771 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { 4772 // Check if potential masked gather can be represented as series 4773 // of loads + insertsubvectors. 4774 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) { 4775 // If masked gather cost is higher - better to vectorize, so 4776 // consider it as a gather node. It will be better estimated 4777 // later. 4778 return LoadsState::Gather; 4779 } 4780 return LoadsState::ScatterVectorize; 4781 } 4782 } 4783 } 4784 4785 return LoadsState::Gather; 4786 } 4787 4788 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, 4789 const DataLayout &DL, ScalarEvolution &SE, 4790 SmallVectorImpl<unsigned> &SortedIndices) { 4791 assert(llvm::all_of( 4792 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && 4793 "Expected list of pointer operands."); 4794 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 4795 // Ptr into, sort and return the sorted indices with values next to one 4796 // another. 4797 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases; 4798 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); 4799 4800 unsigned Cnt = 1; 4801 for (Value *Ptr : VL.drop_front()) { 4802 bool Found = any_of(Bases, [&](auto &Base) { 4803 std::optional<int> Diff = 4804 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, 4805 /*StrictCheck=*/true); 4806 if (!Diff) 4807 return false; 4808 4809 Base.second.emplace_back(Ptr, *Diff, Cnt++); 4810 return true; 4811 }); 4812 4813 if (!Found) { 4814 // If we haven't found enough to usefully cluster, return early. 4815 if (Bases.size() > VL.size() / 2 - 1) 4816 return false; 4817 4818 // Not found already - add a new Base 4819 Bases[Ptr].emplace_back(Ptr, 0, Cnt++); 4820 } 4821 } 4822 4823 // For each of the bases sort the pointers by Offset and check if any of the 4824 // base become consecutively allocated. 
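  // For example, for the pointers A + 1, B, A, B + 1 (with unit-element
  // offsets) two bases are formed; after sorting, their offsets are {-1, 0}
  // and {0, 1}, both consecutive, and the returned indices 2, 0, 1, 3 give
  // the order A, A + 1, B, B + 1.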
4825 bool AnyConsecutive = false; 4826 for (auto &Base : Bases) { 4827 auto &Vec = Base.second; 4828 if (Vec.size() > 1) { 4829 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X, 4830 const std::tuple<Value *, int, unsigned> &Y) { 4831 return std::get<1>(X) < std::get<1>(Y); 4832 }); 4833 int InitialOffset = std::get<1>(Vec[0]); 4834 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) { 4835 return std::get<1>(P.value()) == int(P.index()) + InitialOffset; 4836 }); 4837 } 4838 } 4839 4840 // Fill SortedIndices array only if it looks worth-while to sort the ptrs. 4841 SortedIndices.clear(); 4842 if (!AnyConsecutive) 4843 return false; 4844 4845 for (auto &Base : Bases) { 4846 for (auto &T : Base.second) 4847 SortedIndices.push_back(std::get<2>(T)); 4848 } 4849 4850 assert(SortedIndices.size() == VL.size() && 4851 "Expected SortedIndices to be the size of VL"); 4852 return true; 4853 } 4854 4855 std::optional<BoUpSLP::OrdersType> 4856 BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { 4857 assert(TE.isGather() && "Expected gather node only."); 4858 Type *ScalarTy = TE.Scalars[0]->getType(); 4859 4860 SmallVector<Value *> Ptrs; 4861 Ptrs.reserve(TE.Scalars.size()); 4862 for (Value *V : TE.Scalars) { 4863 auto *L = dyn_cast<LoadInst>(V); 4864 if (!L || !L->isSimple()) 4865 return std::nullopt; 4866 Ptrs.push_back(L->getPointerOperand()); 4867 } 4868 4869 BoUpSLP::OrdersType Order; 4870 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) 4871 return std::move(Order); 4872 return std::nullopt; 4873 } 4874 4875 /// Check if two insertelement instructions are from the same buildvector. 4876 static bool areTwoInsertFromSameBuildVector( 4877 InsertElementInst *VU, InsertElementInst *V, 4878 function_ref<Value *(InsertElementInst *)> GetBaseOperand) { 4879 // Instructions must be from the same basic blocks. 4880 if (VU->getParent() != V->getParent()) 4881 return false; 4882 // Checks if 2 insertelements are from the same buildvector. 4883 if (VU->getType() != V->getType()) 4884 return false; 4885 // Multiple used inserts are separate nodes. 4886 if (!VU->hasOneUse() && !V->hasOneUse()) 4887 return false; 4888 auto *IE1 = VU; 4889 auto *IE2 = V; 4890 std::optional<unsigned> Idx1 = getElementIndex(IE1); 4891 std::optional<unsigned> Idx2 = getElementIndex(IE2); 4892 if (Idx1 == std::nullopt || Idx2 == std::nullopt) 4893 return false; 4894 // Go through the vector operand of insertelement instructions trying to find 4895 // either VU as the original vector for IE2 or V as the original vector for 4896 // IE1. 
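  // E.g. for %v0 = insertelement poison, %a, 0 and
  // %v1 = insertelement %v0, %b, 1, walking the vector operand of %v1
  // reaches %v0, so the two inserts are part of the same buildvector
  // sequence.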
4897 SmallBitVector ReusedIdx( 4898 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue()); 4899 bool IsReusedIdx = false; 4900 do { 4901 if (IE2 == VU && !IE1) 4902 return VU->hasOneUse(); 4903 if (IE1 == V && !IE2) 4904 return V->hasOneUse(); 4905 if (IE1 && IE1 != V) { 4906 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2); 4907 IsReusedIdx |= ReusedIdx.test(Idx1); 4908 ReusedIdx.set(Idx1); 4909 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) 4910 IE1 = nullptr; 4911 else 4912 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1)); 4913 } 4914 if (IE2 && IE2 != VU) { 4915 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1); 4916 IsReusedIdx |= ReusedIdx.test(Idx2); 4917 ReusedIdx.set(Idx2); 4918 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) 4919 IE2 = nullptr; 4920 else 4921 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2)); 4922 } 4923 } while (!IsReusedIdx && (IE1 || IE2)); 4924 return false; 4925 } 4926 4927 std::optional<BoUpSLP::OrdersType> 4928 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { 4929 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. 4930 if (TE.isNonPowOf2Vec()) 4931 return std::nullopt; 4932 4933 // No need to reorder if need to shuffle reuses, still need to shuffle the 4934 // node. 4935 if (!TE.ReuseShuffleIndices.empty()) { 4936 if (isSplat(TE.Scalars)) 4937 return std::nullopt; 4938 // Check if reuse shuffle indices can be improved by reordering. 4939 // For this, check that reuse mask is "clustered", i.e. each scalar values 4940 // is used once in each submask of size <number_of_scalars>. 4941 // Example: 4 scalar values. 4942 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 4943 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 4944 // element 3 is used twice in the second submask. 
4945 unsigned Sz = TE.Scalars.size(); 4946 if (TE.isGather()) { 4947 if (std::optional<OrdersType> CurrentOrder = 4948 findReusedOrderedScalars(TE)) { 4949 SmallVector<int> Mask; 4950 fixupOrderingIndices(*CurrentOrder); 4951 inversePermutation(*CurrentOrder, Mask); 4952 ::addMask(Mask, TE.ReuseShuffleIndices); 4953 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); 4954 unsigned Sz = TE.Scalars.size(); 4955 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { 4956 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz))) 4957 if (Idx != PoisonMaskElem) 4958 Res[Idx + K * Sz] = I + K * Sz; 4959 } 4960 return std::move(Res); 4961 } 4962 } 4963 if (Sz == 2 && TE.getVectorFactor() == 4 && 4964 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(), 4965 2 * TE.getVectorFactor())) == 1) 4966 return std::nullopt; 4967 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, 4968 Sz)) { 4969 SmallVector<int> ReorderMask(Sz, PoisonMaskElem); 4970 if (TE.ReorderIndices.empty()) 4971 std::iota(ReorderMask.begin(), ReorderMask.end(), 0); 4972 else 4973 inversePermutation(TE.ReorderIndices, ReorderMask); 4974 ::addMask(ReorderMask, TE.ReuseShuffleIndices); 4975 unsigned VF = ReorderMask.size(); 4976 OrdersType ResOrder(VF, VF); 4977 unsigned NumParts = divideCeil(VF, Sz); 4978 SmallBitVector UsedVals(NumParts); 4979 for (unsigned I = 0; I < VF; I += Sz) { 4980 int Val = PoisonMaskElem; 4981 unsigned UndefCnt = 0; 4982 unsigned Limit = std::min(Sz, VF - I); 4983 if (any_of(ArrayRef(ReorderMask).slice(I, Limit), 4984 [&](int Idx) { 4985 if (Val == PoisonMaskElem && Idx != PoisonMaskElem) 4986 Val = Idx; 4987 if (Idx == PoisonMaskElem) 4988 ++UndefCnt; 4989 return Idx != PoisonMaskElem && Idx != Val; 4990 }) || 4991 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) || 4992 UndefCnt > Sz / 2) 4993 return std::nullopt; 4994 UsedVals.set(Val); 4995 for (unsigned K = 0; K < NumParts; ++K) 4996 ResOrder[Val + Sz * K] = I + K; 4997 } 4998 return std::move(ResOrder); 4999 } 5000 unsigned VF = TE.getVectorFactor(); 5001 // Try build correct order for extractelement instructions. 5002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), 5003 TE.ReuseShuffleIndices.end()); 5004 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() && 5005 all_of(TE.Scalars, [Sz](Value *V) { 5006 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V)); 5007 return Idx && *Idx < Sz; 5008 })) { 5009 SmallVector<int> ReorderMask(Sz, PoisonMaskElem); 5010 if (TE.ReorderIndices.empty()) 5011 std::iota(ReorderMask.begin(), ReorderMask.end(), 0); 5012 else 5013 inversePermutation(TE.ReorderIndices, ReorderMask); 5014 for (unsigned I = 0; I < VF; ++I) { 5015 int &Idx = ReusedMask[I]; 5016 if (Idx == PoisonMaskElem) 5017 continue; 5018 Value *V = TE.Scalars[ReorderMask[Idx]]; 5019 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V)); 5020 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI)); 5021 } 5022 } 5023 // Build the order of the VF size, need to reorder reuses shuffles, they are 5024 // always of VF size. 
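    // E.g. with Sz == 4 scalars reused up to a vector factor of 8, the loop
    // below processes the reuse mask as two submasks of 4 elements and fills
    // the matching 4-wide slice of ResOrder for each, offset by K.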
5025 OrdersType ResOrder(VF); 5026 std::iota(ResOrder.begin(), ResOrder.end(), 0); 5027 auto *It = ResOrder.begin(); 5028 for (unsigned K = 0; K < VF; K += Sz) { 5029 OrdersType CurrentOrder(TE.ReorderIndices); 5030 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)}; 5031 if (SubMask.front() == PoisonMaskElem) 5032 std::iota(SubMask.begin(), SubMask.end(), 0); 5033 reorderOrder(CurrentOrder, SubMask); 5034 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); 5035 std::advance(It, Sz); 5036 } 5037 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) { 5038 return Data.index() == Data.value(); 5039 })) 5040 return std::nullopt; // No need to reorder. 5041 return std::move(ResOrder); 5042 } 5043 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && 5044 any_of(TE.UserTreeIndices, 5045 [](const EdgeInfo &EI) { 5046 return !Instruction::isBinaryOp(EI.UserTE->getOpcode()); 5047 }) && 5048 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) 5049 return std::nullopt; 5050 if ((TE.State == TreeEntry::Vectorize || 5051 TE.State == TreeEntry::StridedVectorize) && 5052 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || 5053 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && 5054 !TE.isAltShuffle()) 5055 return TE.ReorderIndices; 5056 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { 5057 auto PHICompare = [&](unsigned I1, unsigned I2) { 5058 Value *V1 = TE.Scalars[I1]; 5059 Value *V2 = TE.Scalars[I2]; 5060 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0)) 5061 return false; 5062 if (V1->getNumUses() < V2->getNumUses()) 5063 return true; 5064 if (V1->getNumUses() > V2->getNumUses()) 5065 return false; 5066 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin()); 5067 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin()); 5068 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1)) 5069 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) { 5070 if (!areTwoInsertFromSameBuildVector( 5071 IE1, IE2, 5072 [](InsertElementInst *II) { return II->getOperand(0); })) 5073 return I1 < I2; 5074 return getElementIndex(IE1) < getElementIndex(IE2); 5075 } 5076 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1)) 5077 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) { 5078 if (EE1->getOperand(0) != EE2->getOperand(0)) 5079 return I1 < I2; 5080 return getElementIndex(EE1) < getElementIndex(EE2); 5081 } 5082 return I1 < I2; 5083 }; 5084 auto IsIdentityOrder = [](const OrdersType &Order) { 5085 for (unsigned Idx : seq<unsigned>(0, Order.size())) 5086 if (Idx != Order[Idx]) 5087 return false; 5088 return true; 5089 }; 5090 if (!TE.ReorderIndices.empty()) 5091 return TE.ReorderIndices; 5092 DenseMap<unsigned, unsigned> PhiToId; 5093 SmallVector<unsigned> Phis(TE.Scalars.size()); 5094 std::iota(Phis.begin(), Phis.end(), 0); 5095 OrdersType ResOrder(TE.Scalars.size()); 5096 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) 5097 PhiToId[Id] = Id; 5098 stable_sort(Phis, PHICompare); 5099 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) 5100 ResOrder[Id] = PhiToId[Phis[Id]]; 5101 if (IsIdentityOrder(ResOrder)) 5102 return std::nullopt; // No need to reorder. 5103 return std::move(ResOrder); 5104 } 5105 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { 5106 // TODO: add analysis of other gather nodes with extractelement 5107 // instructions and other values/instructions, not only undefs. 
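    // E.g. a gather of extractelements taken from a single vector %v at
    // indices 2, 3, 0, 1 can be emitted as one shuffle of %v; canReuseExtract
    // below recovers the corresponding order instead of keeping the gather.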
5108 if ((TE.getOpcode() == Instruction::ExtractElement || 5109 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) && 5110 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) && 5111 all_of(TE.Scalars, [](Value *V) { 5112 auto *EE = dyn_cast<ExtractElementInst>(V); 5113 return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); 5114 })) { 5115 // Check that gather of extractelements can be represented as 5116 // just a shuffle of a single vector. 5117 OrdersType CurrentOrder; 5118 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, 5119 /*ResizeAllowed=*/true); 5120 if (Reuse || !CurrentOrder.empty()) 5121 return std::move(CurrentOrder); 5122 } 5123 // If the gather node is <undef, v, .., poison> and 5124 // insertelement poison, v, 0 [+ permute] 5125 // is cheaper than 5126 // insertelement poison, v, n - try to reorder. 5127 // If rotating the whole graph, exclude the permute cost, the whole graph 5128 // might be transformed. 5129 int Sz = TE.Scalars.size(); 5130 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) && 5131 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) { 5132 const auto *It = 5133 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); }); 5134 if (It == TE.Scalars.begin()) 5135 return OrdersType(); 5136 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz); 5137 if (It != TE.Scalars.end()) { 5138 OrdersType Order(Sz, Sz); 5139 unsigned Idx = std::distance(TE.Scalars.begin(), It); 5140 Order[Idx] = 0; 5141 fixupOrderingIndices(Order); 5142 SmallVector<int> Mask; 5143 inversePermutation(Order, Mask); 5144 InstructionCost PermuteCost = 5145 TopToBottom 5146 ? 0 5147 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask); 5148 InstructionCost InsertFirstCost = TTI->getVectorInstrCost( 5149 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0, 5150 PoisonValue::get(Ty), *It); 5151 InstructionCost InsertIdxCost = TTI->getVectorInstrCost( 5152 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx, 5153 PoisonValue::get(Ty), *It); 5154 if (InsertFirstCost + PermuteCost < InsertIdxCost) { 5155 OrdersType Order(Sz, Sz); 5156 Order[Idx] = 0; 5157 return std::move(Order); 5158 } 5159 } 5160 } 5161 if (isSplat(TE.Scalars)) 5162 return std::nullopt; 5163 if (TE.Scalars.size() >= 4) 5164 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) 5165 return Order; 5166 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) 5167 return CurrentOrder; 5168 } 5169 return std::nullopt; 5170 } 5171 5172 /// Checks if the given mask is a "clustered" mask with the same clusters of 5173 /// size \p Sz, which are not identity submasks. 5174 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, 5175 unsigned Sz) { 5176 ArrayRef<int> FirstCluster = Mask.slice(0, Sz); 5177 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz)) 5178 return false; 5179 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { 5180 ArrayRef<int> Cluster = Mask.slice(I, Sz); 5181 if (Cluster != FirstCluster) 5182 return false; 5183 } 5184 return true; 5185 } 5186 5187 void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const { 5188 // Reorder reuses mask. 5189 reorderReuses(TE.ReuseShuffleIndices, Mask); 5190 const unsigned Sz = TE.Scalars.size(); 5191 // For vectorized and non-clustered reused no need to do anything else. 
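  // Otherwise, for a gather node whose reuse mask is one repeated
  // non-identity cluster, the code below reorders the scalars according to
  // the first cluster of the combined mask and resets every Sz-wide submask
  // of the reuse mask to the identity 0..Sz-1.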
5192 if (!TE.isGather() || 5193 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, 5194 Sz) || 5195 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) 5196 return; 5197 SmallVector<int> NewMask; 5198 inversePermutation(TE.ReorderIndices, NewMask); 5199 addMask(NewMask, TE.ReuseShuffleIndices); 5200 // Clear reorder since it is going to be applied to the new mask. 5201 TE.ReorderIndices.clear(); 5202 // Try to improve gathered nodes with clustered reuses, if possible. 5203 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz); 5204 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end()); 5205 inversePermutation(NewOrder, NewMask); 5206 reorderScalars(TE.Scalars, NewMask); 5207 // Fill the reuses mask with the identity submasks. 5208 for (auto *It = TE.ReuseShuffleIndices.begin(), 5209 *End = TE.ReuseShuffleIndices.end(); 5210 It != End; std::advance(It, Sz)) 5211 std::iota(It, std::next(It, Sz), 0); 5212 } 5213 5214 static void combineOrders(MutableArrayRef<unsigned> Order, 5215 ArrayRef<unsigned> SecondaryOrder) { 5216 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && 5217 "Expected same size of orders"); 5218 unsigned Sz = Order.size(); 5219 SmallBitVector UsedIndices(Sz); 5220 for (unsigned Idx : seq<unsigned>(0, Sz)) { 5221 if (Order[Idx] != Sz) 5222 UsedIndices.set(Order[Idx]); 5223 } 5224 if (SecondaryOrder.empty()) { 5225 for (unsigned Idx : seq<unsigned>(0, Sz)) 5226 if (Order[Idx] == Sz && !UsedIndices.test(Idx)) 5227 Order[Idx] = Idx; 5228 } else { 5229 for (unsigned Idx : seq<unsigned>(0, Sz)) 5230 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz && 5231 !UsedIndices.test(SecondaryOrder[Idx])) 5232 Order[Idx] = SecondaryOrder[Idx]; 5233 } 5234 } 5235 5236 void BoUpSLP::reorderTopToBottom() { 5237 // Maps VF to the graph nodes. 5238 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; 5239 // ExtractElement gather nodes which can be vectorized and need to handle 5240 // their ordering. 5241 DenseMap<const TreeEntry *, OrdersType> GathersToOrders; 5242 5243 // Phi nodes can have preferred ordering based on their result users 5244 DenseMap<const TreeEntry *, OrdersType> PhisToOrders; 5245 5246 // AltShuffles can also have a preferred ordering that leads to fewer 5247 // instructions, e.g., the addsub instruction in x86. 5248 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders; 5249 5250 // Maps a TreeEntry to the reorder indices of external users. 5251 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>> 5252 ExternalUserReorderMap; 5253 // Find all reorderable nodes with the given VF. 5254 // Currently the are vectorized stores,loads,extracts + some gathering of 5255 // extracts. 5256 for_each(VectorizableTree, [&, &TTIRef = *TTI]( 5257 const std::unique_ptr<TreeEntry> &TE) { 5258 // Look for external users that will probably be vectorized. 5259 SmallVector<OrdersType, 1> ExternalUserReorderIndices = 5260 findExternalStoreUsersReorderIndices(TE.get()); 5261 if (!ExternalUserReorderIndices.empty()) { 5262 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); 5263 ExternalUserReorderMap.try_emplace(TE.get(), 5264 std::move(ExternalUserReorderIndices)); 5265 } 5266 5267 // Patterns like [fadd,fsub] can be combined into a single instruction in 5268 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need 5269 // to take into account their order when looking for the most used order. 
5270 if (TE->isAltShuffle()) { 5271 VectorType *VecTy = 5272 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); 5273 unsigned Opcode0 = TE->getOpcode(); 5274 unsigned Opcode1 = TE->getAltOpcode(); 5275 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1)); 5276 // If this pattern is supported by the target then we consider the order. 5277 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { 5278 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); 5279 AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); 5280 } 5281 // TODO: Check the reverse order too. 5282 } 5283 5284 if (std::optional<OrdersType> CurrentOrder = 5285 getReorderingData(*TE, /*TopToBottom=*/true)) { 5286 // Do not include ordering for nodes used in the alt opcode vectorization, 5287 // better to reorder them during bottom-to-top stage. If follow the order 5288 // here, it causes reordering of the whole graph though actually it is 5289 // profitable just to reorder the subgraph that starts from the alternate 5290 // opcode vectorization node. Such nodes already end-up with the shuffle 5291 // instruction and it is just enough to change this shuffle rather than 5292 // rotate the scalars for the whole graph. 5293 unsigned Cnt = 0; 5294 const TreeEntry *UserTE = TE.get(); 5295 while (UserTE && Cnt < RecursionMaxDepth) { 5296 if (UserTE->UserTreeIndices.size() != 1) 5297 break; 5298 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { 5299 return EI.UserTE->State == TreeEntry::Vectorize && 5300 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; 5301 })) 5302 return; 5303 UserTE = UserTE->UserTreeIndices.back().UserTE; 5304 ++Cnt; 5305 } 5306 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); 5307 if (!(TE->State == TreeEntry::Vectorize || 5308 TE->State == TreeEntry::StridedVectorize) || 5309 !TE->ReuseShuffleIndices.empty()) 5310 GathersToOrders.try_emplace(TE.get(), *CurrentOrder); 5311 if (TE->State == TreeEntry::Vectorize && 5312 TE->getOpcode() == Instruction::PHI) 5313 PhisToOrders.try_emplace(TE.get(), *CurrentOrder); 5314 } 5315 }); 5316 5317 // Reorder the graph nodes according to their vectorization factor. 5318 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; 5319 VF /= 2) { 5320 auto It = VFToOrderedEntries.find(VF); 5321 if (It == VFToOrderedEntries.end()) 5322 continue; 5323 // Try to find the most profitable order. We just are looking for the most 5324 // used order and reorder scalar elements in the nodes according to this 5325 // mostly used order. 5326 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); 5327 // All operands are reordered and used only in this node - propagate the 5328 // most used order to the user node. 5329 MapVector<OrdersType, unsigned, 5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> 5331 OrdersUses; 5332 SmallPtrSet<const TreeEntry *, 4> VisitedOps; 5333 for (const TreeEntry *OpTE : OrderedEntries) { 5334 // No need to reorder this nodes, still need to extend and to use shuffle, 5335 // just need to merge reordering shuffle and the reuse shuffle. 5336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) 5337 continue; 5338 // Count number of orders uses. 
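      // The effective order of OpTE is taken from the most specific source
      // available: the gather/reuse order computed above, the alt-shuffle
      // order, the PHI order, or the node's own ReorderIndices.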
5339 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, 5340 &PhisToOrders]() -> const OrdersType & { 5341 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) { 5342 auto It = GathersToOrders.find(OpTE); 5343 if (It != GathersToOrders.end()) 5344 return It->second; 5345 } 5346 if (OpTE->isAltShuffle()) { 5347 auto It = AltShufflesToOrders.find(OpTE); 5348 if (It != AltShufflesToOrders.end()) 5349 return It->second; 5350 } 5351 if (OpTE->State == TreeEntry::Vectorize && 5352 OpTE->getOpcode() == Instruction::PHI) { 5353 auto It = PhisToOrders.find(OpTE); 5354 if (It != PhisToOrders.end()) 5355 return It->second; 5356 } 5357 return OpTE->ReorderIndices; 5358 }(); 5359 // First consider the order of the external scalar users. 5360 auto It = ExternalUserReorderMap.find(OpTE); 5361 if (It != ExternalUserReorderMap.end()) { 5362 const auto &ExternalUserReorderIndices = It->second; 5363 // If the OpTE vector factor != number of scalars - use natural order, 5364 // it is an attempt to reorder node with reused scalars but with 5365 // external uses. 5366 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { 5367 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += 5368 ExternalUserReorderIndices.size(); 5369 } else { 5370 for (const OrdersType &ExtOrder : ExternalUserReorderIndices) 5371 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; 5372 } 5373 // No other useful reorder data in this entry. 5374 if (Order.empty()) 5375 continue; 5376 } 5377 // Stores actually store the mask, not the order, need to invert. 5378 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && 5379 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 5380 SmallVector<int> Mask; 5381 inversePermutation(Order, Mask); 5382 unsigned E = Order.size(); 5383 OrdersType CurrentOrder(E, E); 5384 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 5385 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); 5386 }); 5387 fixupOrderingIndices(CurrentOrder); 5388 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; 5389 } else { 5390 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; 5391 } 5392 } 5393 if (OrdersUses.empty()) 5394 continue; 5395 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) { 5396 const unsigned Sz = Order.size(); 5397 for (unsigned Idx : seq<unsigned>(0, Sz)) 5398 if (Idx != Order[Idx] && Order[Idx] != Sz) 5399 return false; 5400 return true; 5401 }; 5402 // Choose the most used order. 5403 unsigned IdentityCnt = 0; 5404 unsigned FilledIdentityCnt = 0; 5405 OrdersType IdentityOrder(VF, VF); 5406 for (auto &Pair : OrdersUses) { 5407 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { 5408 if (!Pair.first.empty()) 5409 FilledIdentityCnt += Pair.second; 5410 IdentityCnt += Pair.second; 5411 combineOrders(IdentityOrder, Pair.first); 5412 } 5413 } 5414 MutableArrayRef<unsigned> BestOrder = IdentityOrder; 5415 unsigned Cnt = IdentityCnt; 5416 for (auto &Pair : OrdersUses) { 5417 // Prefer identity order. But, if filled identity found (non-empty order) 5418 // with same number of uses, as the new candidate order, we can choose 5419 // this candidate order. 
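      // That is, a tying non-identity candidate may replace the identity best
      // order only when all identity votes came from explicitly filled
      // (non-empty) identity orders.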
5420 if (Cnt < Pair.second || 5421 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && 5422 Cnt == Pair.second && !BestOrder.empty() && 5423 IsIdentityOrder(BestOrder))) { 5424 combineOrders(Pair.first, BestOrder); 5425 BestOrder = Pair.first; 5426 Cnt = Pair.second; 5427 } else { 5428 combineOrders(BestOrder, Pair.first); 5429 } 5430 } 5431 // Set order of the user node. 5432 if (IsIdentityOrder(BestOrder)) 5433 continue; 5434 fixupOrderingIndices(BestOrder); 5435 SmallVector<int> Mask; 5436 inversePermutation(BestOrder, Mask); 5437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); 5438 unsigned E = BestOrder.size(); 5439 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 5440 return I < E ? static_cast<int>(I) : PoisonMaskElem; 5441 }); 5442 // Do an actual reordering, if profitable. 5443 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 5444 // Just do the reordering for the nodes with the given VF. 5445 if (TE->Scalars.size() != VF) { 5446 if (TE->ReuseShuffleIndices.size() == VF) { 5447 // Need to reorder the reuses masks of the operands with smaller VF to 5448 // be able to find the match between the graph nodes and scalar 5449 // operands of the given node during vectorization/cost estimation. 5450 assert(all_of(TE->UserTreeIndices, 5451 [VF, &TE](const EdgeInfo &EI) { 5452 return EI.UserTE->Scalars.size() == VF || 5453 EI.UserTE->Scalars.size() == 5454 TE->Scalars.size(); 5455 }) && 5456 "All users must be of VF size."); 5457 // Update ordering of the operands with the smaller VF than the given 5458 // one. 5459 reorderNodeWithReuses(*TE, Mask); 5460 } 5461 continue; 5462 } 5463 if ((TE->State == TreeEntry::Vectorize || 5464 TE->State == TreeEntry::StridedVectorize) && 5465 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, 5466 InsertElementInst>(TE->getMainOp()) && 5467 !TE->isAltShuffle()) { 5468 // Build correct orders for extract{element,value}, loads and 5469 // stores. 5470 reorderOrder(TE->ReorderIndices, Mask); 5471 if (isa<InsertElementInst, StoreInst>(TE->getMainOp())) 5472 TE->reorderOperands(Mask); 5473 } else { 5474 // Reorder the node and its operands. 5475 TE->reorderOperands(Mask); 5476 assert(TE->ReorderIndices.empty() && 5477 "Expected empty reorder sequence."); 5478 reorderScalars(TE->Scalars, Mask); 5479 } 5480 if (!TE->ReuseShuffleIndices.empty()) { 5481 // Apply reversed order to keep the original ordering of the reused 5482 // elements to avoid extra reorder indices shuffling. 5483 OrdersType CurrentOrder; 5484 reorderOrder(CurrentOrder, MaskOrder); 5485 SmallVector<int> NewReuses; 5486 inversePermutation(CurrentOrder, NewReuses); 5487 addMask(NewReuses, TE->ReuseShuffleIndices); 5488 TE->ReuseShuffleIndices.swap(NewReuses); 5489 } 5490 } 5491 } 5492 } 5493 5494 bool BoUpSLP::canReorderOperands( 5495 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, 5496 ArrayRef<TreeEntry *> ReorderableGathers, 5497 SmallVectorImpl<TreeEntry *> &GatherOps) { 5498 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 
5499 if (UserTE->isNonPowOf2Vec()) 5500 return false; 5501 5502 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { 5503 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { 5504 return OpData.first == I && 5505 (OpData.second->State == TreeEntry::Vectorize || 5506 OpData.second->State == TreeEntry::StridedVectorize); 5507 })) 5508 continue; 5509 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { 5510 // Do not reorder if operand node is used by many user nodes. 5511 if (any_of(TE->UserTreeIndices, 5512 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) 5513 return false; 5514 // Add the node to the list of the ordered nodes with the identity 5515 // order. 5516 Edges.emplace_back(I, TE); 5517 // Add ScatterVectorize nodes to the list of operands, where just 5518 // reordering of the scalars is required. Similar to the gathers, so 5519 // simply add to the list of gathered ops. 5520 // If there are reused scalars, process this node as a regular vectorize 5521 // node, just reorder reuses mask. 5522 if (TE->State != TreeEntry::Vectorize && 5523 TE->State != TreeEntry::StridedVectorize && 5524 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) 5525 GatherOps.push_back(TE); 5526 continue; 5527 } 5528 TreeEntry *Gather = nullptr; 5529 if (count_if(ReorderableGathers, 5530 [&Gather, UserTE, I](TreeEntry *TE) { 5531 assert(TE->State != TreeEntry::Vectorize && 5532 TE->State != TreeEntry::StridedVectorize && 5533 "Only non-vectorized nodes are expected."); 5534 if (any_of(TE->UserTreeIndices, 5535 [UserTE, I](const EdgeInfo &EI) { 5536 return EI.UserTE == UserTE && EI.EdgeIdx == I; 5537 })) { 5538 assert(TE->isSame(UserTE->getOperand(I)) && 5539 "Operand entry does not match operands."); 5540 Gather = TE; 5541 return true; 5542 } 5543 return false; 5544 }) > 1 && 5545 !allConstant(UserTE->getOperand(I))) 5546 return false; 5547 if (Gather) 5548 GatherOps.push_back(Gather); 5549 } 5550 return true; 5551 } 5552 5553 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { 5554 SetVector<TreeEntry *> OrderedEntries; 5555 DenseSet<const TreeEntry *> GathersToOrders; 5556 // Find all reorderable leaf nodes with the given VF. 5557 // Currently the are vectorized loads,extracts without alternate operands + 5558 // some gathering of extracts. 5559 SmallVector<TreeEntry *> NonVectorized; 5560 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 5561 if (TE->State != TreeEntry::Vectorize && 5562 TE->State != TreeEntry::StridedVectorize) 5563 NonVectorized.push_back(TE.get()); 5564 if (std::optional<OrdersType> CurrentOrder = 5565 getReorderingData(*TE, /*TopToBottom=*/false)) { 5566 OrderedEntries.insert(TE.get()); 5567 if (!(TE->State == TreeEntry::Vectorize || 5568 TE->State == TreeEntry::StridedVectorize) || 5569 !TE->ReuseShuffleIndices.empty()) 5570 GathersToOrders.insert(TE.get()); 5571 } 5572 } 5573 5574 // 1. Propagate order to the graph nodes, which use only reordered nodes. 5575 // I.e., if the node has operands, that are reordered, try to make at least 5576 // one operand order in the natural order and reorder others + reorder the 5577 // user node itself. 5578 SmallPtrSet<const TreeEntry *, 4> Visited; 5579 while (!OrderedEntries.empty()) { 5580 // 1. Filter out only reordered nodes. 5581 // 2. If the entry has multiple uses - skip it and jump to the next node. 
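    // 3. Otherwise record the entry in the Users map, keyed by its single
    //    user node, so all operands of that user can be reordered together.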
5582 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; 5583 SmallVector<TreeEntry *> Filtered; 5584 for (TreeEntry *TE : OrderedEntries) { 5585 if (!(TE->State == TreeEntry::Vectorize || 5586 TE->State == TreeEntry::StridedVectorize || 5587 (TE->isGather() && GathersToOrders.contains(TE))) || 5588 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || 5589 !all_of(drop_begin(TE->UserTreeIndices), 5590 [TE](const EdgeInfo &EI) { 5591 return EI.UserTE == TE->UserTreeIndices.front().UserTE; 5592 }) || 5593 !Visited.insert(TE).second) { 5594 Filtered.push_back(TE); 5595 continue; 5596 } 5597 // Build a map between user nodes and their operands order to speedup 5598 // search. The graph currently does not provide this dependency directly. 5599 for (EdgeInfo &EI : TE->UserTreeIndices) { 5600 TreeEntry *UserTE = EI.UserTE; 5601 auto It = Users.find(UserTE); 5602 if (It == Users.end()) 5603 It = Users.insert({UserTE, {}}).first; 5604 It->second.emplace_back(EI.EdgeIdx, TE); 5605 } 5606 } 5607 // Erase filtered entries. 5608 for (TreeEntry *TE : Filtered) 5609 OrderedEntries.remove(TE); 5610 SmallVector< 5611 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>> 5612 UsersVec(Users.begin(), Users.end()); 5613 sort(UsersVec, [](const auto &Data1, const auto &Data2) { 5614 return Data1.first->Idx > Data2.first->Idx; 5615 }); 5616 for (auto &Data : UsersVec) { 5617 // Check that operands are used only in the User node. 5618 SmallVector<TreeEntry *> GatherOps; 5619 if (!canReorderOperands(Data.first, Data.second, NonVectorized, 5620 GatherOps)) { 5621 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) 5622 OrderedEntries.remove(Op.second); 5623 continue; 5624 } 5625 // All operands are reordered and used only in this node - propagate the 5626 // most used order to the user node. 5627 MapVector<OrdersType, unsigned, 5628 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> 5629 OrdersUses; 5630 // Do the analysis for each tree entry only once, otherwise the order of 5631 // the same node my be considered several times, though might be not 5632 // profitable. 5633 SmallPtrSet<const TreeEntry *, 4> VisitedOps; 5634 SmallPtrSet<const TreeEntry *, 4> VisitedUsers; 5635 for (const auto &Op : Data.second) { 5636 TreeEntry *OpTE = Op.second; 5637 if (!VisitedOps.insert(OpTE).second) 5638 continue; 5639 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) 5640 continue; 5641 const auto Order = [&]() -> const OrdersType { 5642 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) 5643 return getReorderingData(*OpTE, /*TopToBottom=*/false) 5644 .value_or(OrdersType(1)); 5645 return OpTE->ReorderIndices; 5646 }(); 5647 // The order is partially ordered, skip it in favor of fully non-ordered 5648 // orders. 5649 if (Order.size() == 1) 5650 continue; 5651 unsigned NumOps = count_if( 5652 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) { 5653 return P.second == OpTE; 5654 }); 5655 // Stores actually store the mask, not the order, need to invert. 5656 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && 5657 OpTE->getOpcode() == Instruction::Store && !Order.empty()) { 5658 SmallVector<int> Mask; 5659 inversePermutation(Order, Mask); 5660 unsigned E = Order.size(); 5661 OrdersType CurrentOrder(E, E); 5662 transform(Mask, CurrentOrder.begin(), [E](int Idx) { 5663 return Idx == PoisonMaskElem ? 
E : static_cast<unsigned>(Idx); 5664 }); 5665 fixupOrderingIndices(CurrentOrder); 5666 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += 5667 NumOps; 5668 } else { 5669 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; 5670 } 5671 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); 5672 const auto AllowsReordering = [&](const TreeEntry *TE) { 5673 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 5674 if (TE->isNonPowOf2Vec()) 5675 return false; 5676 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || 5677 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || 5678 (IgnoreReorder && TE->Idx == 0)) 5679 return true; 5680 if (TE->isGather()) { 5681 if (GathersToOrders.contains(TE)) 5682 return !getReorderingData(*TE, /*TopToBottom=*/false) 5683 .value_or(OrdersType(1)) 5684 .empty(); 5685 return true; 5686 } 5687 return false; 5688 }; 5689 for (const EdgeInfo &EI : OpTE->UserTreeIndices) { 5690 TreeEntry *UserTE = EI.UserTE; 5691 if (!VisitedUsers.insert(UserTE).second) 5692 continue; 5693 // May reorder user node if it requires reordering, has reused 5694 // scalars, is an alternate op vectorize node or its op nodes require 5695 // reordering. 5696 if (AllowsReordering(UserTE)) 5697 continue; 5698 // Check if users allow reordering. 5699 // Currently look up just 1 level of operands to avoid increase of 5700 // the compile time. 5701 // Profitable to reorder if definitely more operands allow 5702 // reordering rather than those with natural order. 5703 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE]; 5704 if (static_cast<unsigned>(count_if( 5705 Ops, [UserTE, &AllowsReordering]( 5706 const std::pair<unsigned, TreeEntry *> &Op) { 5707 return AllowsReordering(Op.second) && 5708 all_of(Op.second->UserTreeIndices, 5709 [UserTE](const EdgeInfo &EI) { 5710 return EI.UserTE == UserTE; 5711 }); 5712 })) <= Ops.size() / 2) 5713 ++Res.first->second; 5714 } 5715 } 5716 if (OrdersUses.empty()) { 5717 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) 5718 OrderedEntries.remove(Op.second); 5719 continue; 5720 } 5721 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) { 5722 const unsigned Sz = Order.size(); 5723 for (unsigned Idx : seq<unsigned>(0, Sz)) 5724 if (Idx != Order[Idx] && Order[Idx] != Sz) 5725 return false; 5726 return true; 5727 }; 5728 // Choose the most used order. 5729 unsigned IdentityCnt = 0; 5730 unsigned VF = Data.second.front().second->getVectorFactor(); 5731 OrdersType IdentityOrder(VF, VF); 5732 for (auto &Pair : OrdersUses) { 5733 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { 5734 IdentityCnt += Pair.second; 5735 combineOrders(IdentityOrder, Pair.first); 5736 } 5737 } 5738 MutableArrayRef<unsigned> BestOrder = IdentityOrder; 5739 unsigned Cnt = IdentityCnt; 5740 for (auto &Pair : OrdersUses) { 5741 // Prefer identity order. But, if filled identity found (non-empty 5742 // order) with same number of uses, as the new candidate order, we can 5743 // choose this candidate order. 5744 if (Cnt < Pair.second) { 5745 combineOrders(Pair.first, BestOrder); 5746 BestOrder = Pair.first; 5747 Cnt = Pair.second; 5748 } else { 5749 combineOrders(BestOrder, Pair.first); 5750 } 5751 } 5752 // Set order of the user node. 
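      // An identity best order means no reordering helps here - drop the
      // operands from the worklist and move on.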
5753 if (IsIdentityOrder(BestOrder)) { 5754 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) 5755 OrderedEntries.remove(Op.second); 5756 continue; 5757 } 5758 fixupOrderingIndices(BestOrder); 5759 // Erase operands from OrderedEntries list and adjust their orders. 5760 VisitedOps.clear(); 5761 SmallVector<int> Mask; 5762 inversePermutation(BestOrder, Mask); 5763 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); 5764 unsigned E = BestOrder.size(); 5765 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { 5766 return I < E ? static_cast<int>(I) : PoisonMaskElem; 5767 }); 5768 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { 5769 TreeEntry *TE = Op.second; 5770 OrderedEntries.remove(TE); 5771 if (!VisitedOps.insert(TE).second) 5772 continue; 5773 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { 5774 reorderNodeWithReuses(*TE, Mask); 5775 continue; 5776 } 5777 // Gathers are processed separately. 5778 if (TE->State != TreeEntry::Vectorize && 5779 TE->State != TreeEntry::StridedVectorize && 5780 (TE->State != TreeEntry::ScatterVectorize || 5781 TE->ReorderIndices.empty())) 5782 continue; 5783 assert((BestOrder.size() == TE->ReorderIndices.size() || 5784 TE->ReorderIndices.empty()) && 5785 "Non-matching sizes of user/operand entries."); 5786 reorderOrder(TE->ReorderIndices, Mask); 5787 if (IgnoreReorder && TE == VectorizableTree.front().get()) 5788 IgnoreReorder = false; 5789 } 5790 // For gathers just need to reorder its scalars. 5791 for (TreeEntry *Gather : GatherOps) { 5792 assert(Gather->ReorderIndices.empty() && 5793 "Unexpected reordering of gathers."); 5794 if (!Gather->ReuseShuffleIndices.empty()) { 5795 // Just reorder reuses indices. 5796 reorderReuses(Gather->ReuseShuffleIndices, Mask); 5797 continue; 5798 } 5799 reorderScalars(Gather->Scalars, Mask); 5800 OrderedEntries.remove(Gather); 5801 } 5802 // Reorder operands of the user node and set the ordering for the user 5803 // node itself. 5804 if (Data.first->State != TreeEntry::Vectorize || 5805 !isa<ExtractElementInst, ExtractValueInst, LoadInst>( 5806 Data.first->getMainOp()) || 5807 Data.first->isAltShuffle()) 5808 Data.first->reorderOperands(Mask); 5809 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) || 5810 Data.first->isAltShuffle() || 5811 Data.first->State == TreeEntry::StridedVectorize) { 5812 reorderScalars(Data.first->Scalars, Mask); 5813 reorderOrder(Data.first->ReorderIndices, MaskOrder, 5814 /*BottomOrder=*/true); 5815 if (Data.first->ReuseShuffleIndices.empty() && 5816 !Data.first->ReorderIndices.empty() && 5817 !Data.first->isAltShuffle()) { 5818 // Insert user node to the list to try to sink reordering deeper in 5819 // the graph. 5820 OrderedEntries.insert(Data.first); 5821 } 5822 } else { 5823 reorderOrder(Data.first->ReorderIndices, Mask); 5824 } 5825 } 5826 } 5827 // If the reordering is unnecessary, just remove the reorder. 5828 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && 5829 VectorizableTree.front()->ReuseShuffleIndices.empty()) 5830 VectorizableTree.front()->ReorderIndices.clear(); 5831 } 5832 5833 void BoUpSLP::buildExternalUses( 5834 const ExtraValueToDebugLocsMap &ExternallyUsedValues) { 5835 DenseMap<Value *, unsigned> ScalarToExtUses; 5836 // Collect the values that we need to extract from the tree. 5837 for (auto &TEPtr : VectorizableTree) { 5838 TreeEntry *Entry = TEPtr.get(); 5839 5840 // No need to handle users of gathered values. 
5841 if (Entry->isGather()) 5842 continue; 5843 5844 // For each lane: 5845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 5846 Value *Scalar = Entry->Scalars[Lane]; 5847 if (!isa<Instruction>(Scalar)) 5848 continue; 5849 // All uses must be replaced already? No need to do it again. 5850 auto It = ScalarToExtUses.find(Scalar); 5851 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User) 5852 continue; 5853 5854 // Check if the scalar is externally used as an extra arg. 5855 const auto *ExtI = ExternallyUsedValues.find(Scalar); 5856 if (ExtI != ExternallyUsedValues.end()) { 5857 int FoundLane = Entry->findLaneForValue(Scalar); 5858 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " 5859 << FoundLane << " from " << *Scalar << ".\n"); 5860 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); 5861 ExternalUses.emplace_back(Scalar, nullptr, FoundLane); 5862 continue; 5863 } 5864 for (User *U : Scalar->users()) { 5865 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 5866 5867 Instruction *UserInst = dyn_cast<Instruction>(U); 5868 if (!UserInst || isDeleted(UserInst)) 5869 continue; 5870 5871 // Ignore users in the user ignore list. 5872 if (UserIgnoreList && UserIgnoreList->contains(UserInst)) 5873 continue; 5874 5875 // Skip in-tree scalars that become vectors 5876 if (TreeEntry *UseEntry = getTreeEntry(U)) { 5877 // Some in-tree scalars will remain as scalar in vectorized 5878 // instructions. If that is the case, the one in FoundLane will 5879 // be used. 5880 if (UseEntry->State == TreeEntry::ScatterVectorize || 5881 !doesInTreeUserNeedToExtract( 5882 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) { 5883 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 5884 << ".\n"); 5885 assert(!UseEntry->isGather() && "Bad state"); 5886 continue; 5887 } 5888 U = nullptr; 5889 if (It != ScalarToExtUses.end()) { 5890 ExternalUses[It->second].User = nullptr; 5891 break; 5892 } 5893 } 5894 5895 if (U && Scalar->hasNUsesOrMore(UsesLimit)) 5896 U = nullptr; 5897 int FoundLane = Entry->findLaneForValue(Scalar); 5898 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst 5899 << " from lane " << FoundLane << " from " << *Scalar 5900 << ".\n"); 5901 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first; 5902 ExternalUses.emplace_back(Scalar, U, FoundLane); 5903 if (!U) 5904 break; 5905 } 5906 } 5907 } 5908 } 5909 5910 DenseMap<Value *, SmallVector<StoreInst *>> 5911 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { 5912 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap; 5913 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) { 5914 Value *V = TE->Scalars[Lane]; 5915 // To save compilation time we don't visit if we have too many users. 5916 if (V->hasNUsesOrMore(UsesLimit)) 5917 break; 5918 5919 // Collect stores per pointer object. 5920 for (User *U : V->users()) { 5921 auto *SI = dyn_cast<StoreInst>(U); 5922 if (SI == nullptr || !SI->isSimple() || 5923 !isValidElementType(SI->getValueOperand()->getType())) 5924 continue; 5925 // Skip entry if already 5926 if (getTreeEntry(U)) 5927 continue; 5928 5929 Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); 5930 auto &StoresVec = PtrToStoresMap[Ptr]; 5931 // For now just keep one store per pointer object per lane. 5932 // TODO: Extend this to support multiple stores per pointer per lane 5933 if (StoresVec.size() > Lane) 5934 continue; 5935 // Skip if in different BBs. 
5936 if (!StoresVec.empty() && 5937 SI->getParent() != StoresVec.back()->getParent()) 5938 continue; 5939 // Make sure that the stores are of the same type. 5940 if (!StoresVec.empty() && 5941 SI->getValueOperand()->getType() != 5942 StoresVec.back()->getValueOperand()->getType()) 5943 continue; 5944 StoresVec.push_back(SI); 5945 } 5946 } 5947 return PtrToStoresMap; 5948 } 5949 5950 bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, 5951 OrdersType &ReorderIndices) const { 5952 // We check whether the stores in StoreVec can form a vector by sorting them 5953 // and checking whether they are consecutive. 5954 5955 // To avoid calling getPointersDiff() while sorting we create a vector of 5956 // pairs {store, offset from first} and sort this instead. 5957 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size()); 5958 StoreInst *S0 = StoresVec[0]; 5959 StoreOffsetVec[0] = {S0, 0}; 5960 Type *S0Ty = S0->getValueOperand()->getType(); 5961 Value *S0Ptr = S0->getPointerOperand(); 5962 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) { 5963 StoreInst *SI = StoresVec[Idx]; 5964 std::optional<int> Diff = 5965 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), 5966 SI->getPointerOperand(), *DL, *SE, 5967 /*StrictCheck=*/true); 5968 // We failed to compare the pointers so just abandon this StoresVec. 5969 if (!Diff) 5970 return false; 5971 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; 5972 } 5973 5974 // Sort the vector based on the pointers. We create a copy because we may 5975 // need the original later for calculating the reorder (shuffle) indices. 5976 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1, 5977 const std::pair<StoreInst *, int> &Pair2) { 5978 int Offset1 = Pair1.second; 5979 int Offset2 = Pair2.second; 5980 return Offset1 < Offset2; 5981 }); 5982 5983 // Check if the stores are consecutive by checking if their difference is 1. 5984 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size())) 5985 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1) 5986 return false; 5987 5988 // Calculate the shuffle indices according to their offset against the sorted 5989 // StoreOffsetVec. 5990 ReorderIndices.reserve(StoresVec.size()); 5991 for (StoreInst *SI : StoresVec) { 5992 unsigned Idx = find_if(StoreOffsetVec, 5993 [SI](const std::pair<StoreInst *, int> &Pair) { 5994 return Pair.first == SI; 5995 }) - 5996 StoreOffsetVec.begin(); 5997 ReorderIndices.push_back(Idx); 5998 } 5999 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in 6000 // reorderTopToBottom() and reorderBottomToTop(), so we are following the 6001 // same convention here. 
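// For instance, lanes storing to A+1, A+0, A+3 and A+2 get offsets
// {0, -1, 2, 1} from the first store; they sort into a consecutive run and
// ReorderIndices becomes {1, 0, 3, 2} (kept), while lanes already in address
// order produce {0, 1, 2, 3}, which is cleared below.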
6002 auto IsIdentityOrder = [](const OrdersType &Order) { 6003 for (unsigned Idx : seq<unsigned>(0, Order.size())) 6004 if (Idx != Order[Idx]) 6005 return false; 6006 return true; 6007 }; 6008 if (IsIdentityOrder(ReorderIndices)) 6009 ReorderIndices.clear(); 6010 6011 return true; 6012 } 6013 6014 #ifndef NDEBUG 6015 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { 6016 for (unsigned Idx : Order) 6017 dbgs() << Idx << ", "; 6018 dbgs() << "\n"; 6019 } 6020 #endif 6021 6022 SmallVector<BoUpSLP::OrdersType, 1> 6023 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { 6024 unsigned NumLanes = TE->Scalars.size(); 6025 6026 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap = 6027 collectUserStores(TE); 6028 6029 // Holds the reorder indices for each candidate store vector that is a user of 6030 // the current TreeEntry. 6031 SmallVector<OrdersType, 1> ExternalReorderIndices; 6032 6033 // Now inspect the stores collected per pointer and look for vectorization 6034 // candidates. For each candidate calculate the reorder index vector and push 6035 // it into `ExternalReorderIndices` 6036 for (const auto &Pair : PtrToStoresMap) { 6037 auto &StoresVec = Pair.second; 6038 // If we have fewer than NumLanes stores, then we can't form a vector. 6039 if (StoresVec.size() != NumLanes) 6040 continue; 6041 6042 // If the stores are not consecutive then abandon this StoresVec. 6043 OrdersType ReorderIndices; 6044 if (!canFormVector(StoresVec, ReorderIndices)) 6045 continue; 6046 6047 // We now know that the scalars in StoresVec can form a vector instruction, 6048 // so set the reorder indices. 6049 ExternalReorderIndices.push_back(ReorderIndices); 6050 } 6051 return ExternalReorderIndices; 6052 } 6053 6054 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 6055 const SmallDenseSet<Value *> &UserIgnoreLst) { 6056 deleteTree(); 6057 UserIgnoreList = &UserIgnoreLst; 6058 if (!allSameType(Roots)) 6059 return; 6060 buildTree_rec(Roots, 0, EdgeInfo()); 6061 } 6062 6063 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { 6064 deleteTree(); 6065 if (!allSameType(Roots)) 6066 return; 6067 buildTree_rec(Roots, 0, EdgeInfo()); 6068 } 6069 6070 /// \return true if the specified list of values has only one instruction that 6071 /// requires scheduling, false otherwise. 6072 #ifndef NDEBUG 6073 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { 6074 Value *NeedsScheduling = nullptr; 6075 for (Value *V : VL) { 6076 if (doesNotNeedToBeScheduled(V)) 6077 continue; 6078 if (!NeedsScheduling) { 6079 NeedsScheduling = V; 6080 continue; 6081 } 6082 return false; 6083 } 6084 return NeedsScheduling; 6085 } 6086 #endif 6087 6088 /// Generates key/subkey pair for the given value to provide effective sorting 6089 /// of the values and better detection of the vectorizable values sequences. The 6090 /// keys/subkeys can be used for better sorting of the values themselves (keys) 6091 /// and in values subgroups (subkeys). 6092 static std::pair<size_t, size_t> generateKeySubkey( 6093 Value *V, const TargetLibraryInfo *TLI, 6094 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, 6095 bool AllowAlternate) { 6096 hash_code Key = hash_value(V->getValueID() + 2); 6097 hash_code SubKey = hash_value(0); 6098 // Sort the loads by the distance between the pointers. 
6099 if (auto *LI = dyn_cast<LoadInst>(V)) { 6100 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key); 6101 if (LI->isSimple()) 6102 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); 6103 else 6104 Key = SubKey = hash_value(LI); 6105 } else if (isVectorLikeInstWithConstOps(V)) { 6106 // Sort extracts by the vector operands. 6107 if (isa<ExtractElementInst, UndefValue>(V)) 6108 Key = hash_value(Value::UndefValueVal + 1); 6109 if (auto *EI = dyn_cast<ExtractElementInst>(V)) { 6110 if (!isUndefVector(EI->getVectorOperand()).all() && 6111 !isa<UndefValue>(EI->getIndexOperand())) 6112 SubKey = hash_value(EI->getVectorOperand()); 6113 } 6114 } else if (auto *I = dyn_cast<Instruction>(V)) { 6115 // Sort other instructions just by the opcodes except for CMPInst. 6116 // For CMP also sort by the predicate kind. 6117 if ((isa<BinaryOperator, CastInst>(I)) && 6118 isValidForAlternation(I->getOpcode())) { 6119 if (AllowAlternate) 6120 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); 6121 else 6122 Key = hash_combine(hash_value(I->getOpcode()), Key); 6123 SubKey = hash_combine( 6124 hash_value(I->getOpcode()), hash_value(I->getType()), 6125 hash_value(isa<BinaryOperator>(I) 6126 ? I->getType() 6127 : cast<CastInst>(I)->getOperand(0)->getType())); 6128 // For casts, look through the only operand to improve compile time. 6129 if (isa<CastInst>(I)) { 6130 std::pair<size_t, size_t> OpVals = 6131 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator, 6132 /*AllowAlternate=*/true); 6133 Key = hash_combine(OpVals.first, Key); 6134 SubKey = hash_combine(OpVals.first, SubKey); 6135 } 6136 } else if (auto *CI = dyn_cast<CmpInst>(I)) { 6137 CmpInst::Predicate Pred = CI->getPredicate(); 6138 if (CI->isCommutative()) 6139 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); 6140 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); 6141 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), 6142 hash_value(SwapPred), 6143 hash_value(CI->getOperand(0)->getType())); 6144 } else if (auto *Call = dyn_cast<CallInst>(I)) { 6145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); 6146 if (isTriviallyVectorizable(ID)) { 6147 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); 6148 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) { 6149 SubKey = hash_combine(hash_value(I->getOpcode()), 6150 hash_value(Call->getCalledFunction())); 6151 } else { 6152 Key = hash_combine(hash_value(Call), Key); 6153 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); 6154 } 6155 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) 6156 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), 6157 hash_value(Op.Tag), SubKey); 6158 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { 6159 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) 6160 SubKey = hash_value(Gep->getPointerOperand()); 6161 else 6162 SubKey = hash_value(Gep); 6163 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && 6164 !isa<ConstantInt>(I->getOperand(1))) { 6165 // Do not try to vectorize instructions with potentially high cost. 6166 SubKey = hash_value(I); 6167 } else { 6168 SubKey = hash_value(I->getOpcode()); 6169 } 6170 Key = hash_combine(hash_value(I->getParent()), Key); 6171 } 6172 return std::make_pair(Key, SubKey); 6173 } 6174 6175 /// Checks if the specified instruction \p I is an alternate operation for 6176 /// the given \p MainOp and \p AltOp instructions. 
6177 static bool isAlternateInstruction(const Instruction *I,
6178 const Instruction *MainOp,
6179 const Instruction *AltOp,
6180 const TargetLibraryInfo &TLI);
6181
6182 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183 ArrayRef<Value *> VL) const {
6184 unsigned Opcode0 = S.getOpcode();
6185 unsigned Opcode1 = S.getAltOpcode();
6186 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187 // If this pattern is supported by the target then consider it profitable.
6188 if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6189 Opcode0, Opcode1, OpcodeMask))
6190 return true;
6191 SmallVector<ValueList> Operands;
6192 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6193 Operands.emplace_back();
6194 // Prepare the operand vector.
6195 for (Value *V : VL)
6196 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6197 }
6198 if (Operands.size() == 2) {
6199 // Try to find the best operand candidates.
6200 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6201 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6203 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6204 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6205 std::optional<int> Res = findBestRootPair(Candidates);
6206 switch (Res.value_or(0)) {
6207 case 0:
6208 break;
6209 case 1:
6210 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6211 break;
6212 case 2:
6213 std::swap(Operands[0][I], Operands[1][I]);
6214 break;
6215 default:
6216 llvm_unreachable("Unexpected index.");
6217 }
6218 }
6219 }
6220 DenseSet<unsigned> UniqueOpcodes;
6221 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222 unsigned NonInstCnt = 0;
6223 // Estimate the number of instructions required for the vectorized node and
6224 // for the buildvector node.
6225 unsigned UndefCnt = 0;
6226 // Count the number of extra shuffles required for vector nodes.
6227 unsigned ExtraShuffleInsts = 0;
6228 // Check that the operands do not contain the same values and create either a
6229 // perfect diamond match or a shuffled match.
6230 if (Operands.size() == 2) {
6231 // Do not count the same operands twice.
6232 if (Operands.front() == Operands.back()) {
6233 Operands.erase(Operands.begin());
6234 } else if (!allConstant(Operands.front()) &&
6235 all_of(Operands.front(), [&](Value *V) {
6236 return is_contained(Operands.back(), V);
6237 })) {
6238 Operands.erase(Operands.begin());
6239 ++ExtraShuffleInsts;
6240 }
6241 }
6242 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6243 // Vectorize the node if:
6244 // 1. At least a single operand is constant or splat.
6245 // 2. Operands have many loop invariants (the instructions are not loop
6246 // invariants).
6247 // 3. At least a single unique operand is supposed to be vectorized.
6248 return none_of(Operands,
6249 [&](ArrayRef<Value *> Op) {
6250 if (allConstant(Op) ||
6251 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6252 getSameOpcode(Op, *TLI).MainOp))
6253 return false;
6254 DenseMap<Value *, unsigned> Uniques;
6255 for (Value *V : Op) {
6256 if (isa<Constant, ExtractElementInst>(V) ||
6257 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258 if (isa<UndefValue>(V))
6259 ++UndefCnt;
6260 continue;
6261 }
6262 auto Res = Uniques.try_emplace(V, 0);
6263 // Found the first duplicate - need to add a shuffle.
6264 if (!Res.second && Res.first->second == 1) 6265 ++ExtraShuffleInsts; 6266 ++Res.first->getSecond(); 6267 if (auto *I = dyn_cast<Instruction>(V)) 6268 UniqueOpcodes.insert(I->getOpcode()); 6269 else if (Res.second) 6270 ++NonInstCnt; 6271 } 6272 return none_of(Uniques, [&](const auto &P) { 6273 return P.first->hasNUsesOrMore(P.second + 1) && 6274 none_of(P.first->users(), [&](User *U) { 6275 return getTreeEntry(U) || Uniques.contains(U); 6276 }); 6277 }); 6278 }) || 6279 // Do not vectorize node, if estimated number of vector instructions is 6280 // more than estimated number of buildvector instructions. Number of 6281 // vector operands is number of vector instructions + number of vector 6282 // instructions for operands (buildvectors). Number of buildvector 6283 // instructions is just number_of_operands * number_of_scalars. 6284 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() && 6285 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + 6286 NumAltInsts) < S.MainOp->getNumOperands() * VL.size()); 6287 } 6288 6289 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( 6290 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE, 6291 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const { 6292 assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); 6293 6294 unsigned ShuffleOrOp = 6295 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); 6296 auto *VL0 = cast<Instruction>(S.OpValue); 6297 switch (ShuffleOrOp) { 6298 case Instruction::PHI: { 6299 // Too many operands - gather, most probably won't be vectorized. 6300 if (VL0->getNumOperands() > MaxPHINumOperands) 6301 return TreeEntry::NeedToGather; 6302 // Check for terminator values (e.g. invoke). 6303 for (Value *V : VL) 6304 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) { 6305 Instruction *Term = dyn_cast<Instruction>(Incoming); 6306 if (Term && Term->isTerminator()) { 6307 LLVM_DEBUG(dbgs() 6308 << "SLP: Need to swizzle PHINodes (terminator use).\n"); 6309 return TreeEntry::NeedToGather; 6310 } 6311 } 6312 6313 return TreeEntry::Vectorize; 6314 } 6315 case Instruction::ExtractValue: 6316 case Instruction::ExtractElement: { 6317 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); 6318 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. 6319 if (!isPowerOf2_32(VL.size())) 6320 return TreeEntry::NeedToGather; 6321 if (Reuse || !CurrentOrder.empty()) 6322 return TreeEntry::Vectorize; 6323 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); 6324 return TreeEntry::NeedToGather; 6325 } 6326 case Instruction::InsertElement: { 6327 // Check that we have a buildvector and not a shuffle of 2 or more 6328 // different vectors. 6329 ValueSet SourceVectors; 6330 for (Value *V : VL) { 6331 SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); 6332 assert(getElementIndex(V) != std::nullopt && 6333 "Non-constant or undef index?"); 6334 } 6335 6336 if (count_if(VL, [&SourceVectors](Value *V) { 6337 return !SourceVectors.contains(V); 6338 }) >= 2) { 6339 // Found 2nd source vector - cancel. 6340 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " 6341 "different source vectors.\n"); 6342 return TreeEntry::NeedToGather; 6343 } 6344 6345 return TreeEntry::Vectorize; 6346 } 6347 case Instruction::Load: { 6348 // Check that a vectorized load would load the same memory as a scalar 6349 // load. For example, we don't want to vectorize loads that are smaller 6350 // than 8-bit. 
Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 6351 // treats loading/storing it as an i8 struct. If we vectorize loads/stores 6352 // from such a struct, we read/write packed bits disagreeing with the 6353 // unvectorized version. 6354 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { 6355 case LoadsState::Vectorize: 6356 return TreeEntry::Vectorize; 6357 case LoadsState::ScatterVectorize: 6358 return TreeEntry::ScatterVectorize; 6359 case LoadsState::StridedVectorize: 6360 return TreeEntry::StridedVectorize; 6361 case LoadsState::Gather: 6362 #ifndef NDEBUG 6363 Type *ScalarTy = VL0->getType(); 6364 if (DL->getTypeSizeInBits(ScalarTy) != 6365 DL->getTypeAllocSizeInBits(ScalarTy)) 6366 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); 6367 else if (any_of(VL, 6368 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); })) 6369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 6370 else 6371 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 6372 #endif // NDEBUG 6373 return TreeEntry::NeedToGather; 6374 } 6375 llvm_unreachable("Unexpected state of loads"); 6376 } 6377 case Instruction::ZExt: 6378 case Instruction::SExt: 6379 case Instruction::FPToUI: 6380 case Instruction::FPToSI: 6381 case Instruction::FPExt: 6382 case Instruction::PtrToInt: 6383 case Instruction::IntToPtr: 6384 case Instruction::SIToFP: 6385 case Instruction::UIToFP: 6386 case Instruction::Trunc: 6387 case Instruction::FPTrunc: 6388 case Instruction::BitCast: { 6389 Type *SrcTy = VL0->getOperand(0)->getType(); 6390 for (Value *V : VL) { 6391 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); 6392 if (Ty != SrcTy || !isValidElementType(Ty)) { 6393 LLVM_DEBUG( 6394 dbgs() << "SLP: Gathering casts with different src types.\n"); 6395 return TreeEntry::NeedToGather; 6396 } 6397 } 6398 return TreeEntry::Vectorize; 6399 } 6400 case Instruction::ICmp: 6401 case Instruction::FCmp: { 6402 // Check that all of the compares have the same predicate. 6403 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 6404 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); 6405 Type *ComparedTy = VL0->getOperand(0)->getType(); 6406 for (Value *V : VL) { 6407 CmpInst *Cmp = cast<CmpInst>(V); 6408 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || 6409 Cmp->getOperand(0)->getType() != ComparedTy) { 6410 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 6411 return TreeEntry::NeedToGather; 6412 } 6413 } 6414 return TreeEntry::Vectorize; 6415 } 6416 case Instruction::Select: 6417 case Instruction::FNeg: 6418 case Instruction::Add: 6419 case Instruction::FAdd: 6420 case Instruction::Sub: 6421 case Instruction::FSub: 6422 case Instruction::Mul: 6423 case Instruction::FMul: 6424 case Instruction::UDiv: 6425 case Instruction::SDiv: 6426 case Instruction::FDiv: 6427 case Instruction::URem: 6428 case Instruction::SRem: 6429 case Instruction::FRem: 6430 case Instruction::Shl: 6431 case Instruction::LShr: 6432 case Instruction::AShr: 6433 case Instruction::And: 6434 case Instruction::Or: 6435 case Instruction::Xor: 6436 return TreeEntry::Vectorize; 6437 case Instruction::GetElementPtr: { 6438 // We don't combine GEPs with complicated (nested) indexing. 
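// E.g. a GEP such as `getelementptr %struct.S, ptr %p, i64 0, i32 1` carries
// two indices (three operands) and is rejected by the operand-count check
// below.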
6439 for (Value *V : VL) { 6440 auto *I = dyn_cast<GetElementPtrInst>(V); 6441 if (!I) 6442 continue; 6443 if (I->getNumOperands() != 2) { 6444 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 6445 return TreeEntry::NeedToGather; 6446 } 6447 } 6448 6449 // We can't combine several GEPs into one vector if they operate on 6450 // different types. 6451 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType(); 6452 for (Value *V : VL) { 6453 auto *GEP = dyn_cast<GEPOperator>(V); 6454 if (!GEP) 6455 continue; 6456 Type *CurTy = GEP->getSourceElementType(); 6457 if (Ty0 != CurTy) { 6458 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 6459 return TreeEntry::NeedToGather; 6460 } 6461 } 6462 6463 // We don't combine GEPs with non-constant indexes. 6464 Type *Ty1 = VL0->getOperand(1)->getType(); 6465 for (Value *V : VL) { 6466 auto *I = dyn_cast<GetElementPtrInst>(V); 6467 if (!I) 6468 continue; 6469 auto *Op = I->getOperand(1); 6470 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || 6471 (Op->getType() != Ty1 && 6472 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) || 6473 Op->getType()->getScalarSizeInBits() > 6474 DL->getIndexSizeInBits( 6475 V->getType()->getPointerAddressSpace())))) { 6476 LLVM_DEBUG( 6477 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 6478 return TreeEntry::NeedToGather; 6479 } 6480 } 6481 6482 return TreeEntry::Vectorize; 6483 } 6484 case Instruction::Store: { 6485 // Check if the stores are consecutive or if we need to swizzle them. 6486 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); 6487 // Avoid types that are padded when being allocated as scalars, while 6488 // being packed together in a vector (such as i1). 6489 if (DL->getTypeSizeInBits(ScalarTy) != 6490 DL->getTypeAllocSizeInBits(ScalarTy)) { 6491 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n"); 6492 return TreeEntry::NeedToGather; 6493 } 6494 // Make sure all stores in the bundle are simple - we can't vectorize 6495 // atomic or volatile stores. 6496 for (Value *V : VL) { 6497 auto *SI = cast<StoreInst>(V); 6498 if (!SI->isSimple()) { 6499 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); 6500 return TreeEntry::NeedToGather; 6501 } 6502 PointerOps.push_back(SI->getPointerOperand()); 6503 } 6504 6505 // Check the order of pointer operands. 6506 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { 6507 Value *Ptr0; 6508 Value *PtrN; 6509 if (CurrentOrder.empty()) { 6510 Ptr0 = PointerOps.front(); 6511 PtrN = PointerOps.back(); 6512 } else { 6513 Ptr0 = PointerOps[CurrentOrder.front()]; 6514 PtrN = PointerOps[CurrentOrder.back()]; 6515 } 6516 std::optional<int> Dist = 6517 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); 6518 // Check that the sorted pointer operands are consecutive. 6519 if (static_cast<unsigned>(*Dist) == VL.size() - 1) 6520 return TreeEntry::Vectorize; 6521 } 6522 6523 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 6524 return TreeEntry::NeedToGather; 6525 } 6526 case Instruction::Call: { 6527 // Check if the calls are all to the same vectorizable intrinsic or 6528 // library function. 
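// E.g. every lane calls llvm.fabs.f32, or every lane calls a function for
// which VFDatabase provides the same vector mapping.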
6529 CallInst *CI = cast<CallInst>(VL0); 6530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6531 6532 VFShape Shape = VFShape::get( 6533 CI->getFunctionType(), 6534 ElementCount::getFixed(static_cast<unsigned int>(VL.size())), 6535 false /*HasGlobalPred*/); 6536 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 6537 6538 if (!VecFunc && !isTriviallyVectorizable(ID)) { 6539 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 6540 return TreeEntry::NeedToGather; 6541 } 6542 Function *F = CI->getCalledFunction(); 6543 unsigned NumArgs = CI->arg_size(); 6544 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr); 6545 for (unsigned J = 0; J != NumArgs; ++J) 6546 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) 6547 ScalarArgs[J] = CI->getArgOperand(J); 6548 for (Value *V : VL) { 6549 CallInst *CI2 = dyn_cast<CallInst>(V); 6550 if (!CI2 || CI2->getCalledFunction() != F || 6551 getVectorIntrinsicIDForCall(CI2, TLI) != ID || 6552 (VecFunc && 6553 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || 6554 !CI->hasIdenticalOperandBundleSchema(*CI2)) { 6555 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V 6556 << "\n"); 6557 return TreeEntry::NeedToGather; 6558 } 6559 // Some intrinsics have scalar arguments and should be same in order for 6560 // them to be vectorized. 6561 for (unsigned J = 0; J != NumArgs; ++J) { 6562 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) { 6563 Value *A1J = CI2->getArgOperand(J); 6564 if (ScalarArgs[J] != A1J) { 6565 LLVM_DEBUG(dbgs() 6566 << "SLP: mismatched arguments in call:" << *CI 6567 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n"); 6568 return TreeEntry::NeedToGather; 6569 } 6570 } 6571 } 6572 // Verify that the bundle operands are identical between the two calls. 6573 if (CI->hasOperandBundles() && 6574 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), 6575 CI->op_begin() + CI->getBundleOperandsEndIndex(), 6576 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { 6577 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI 6578 << "!=" << *V << '\n'); 6579 return TreeEntry::NeedToGather; 6580 } 6581 } 6582 6583 return TreeEntry::Vectorize; 6584 } 6585 case Instruction::ShuffleVector: { 6586 // If this is not an alternate sequence of opcode like add-sub 6587 // then do not vectorize this instruction. 6588 if (!S.isAltShuffle()) { 6589 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 6590 return TreeEntry::NeedToGather; 6591 } 6592 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) { 6593 LLVM_DEBUG( 6594 dbgs() 6595 << "SLP: ShuffleVector not vectorized, operands are buildvector and " 6596 "the whole alt sequence is not profitable.\n"); 6597 return TreeEntry::NeedToGather; 6598 } 6599 6600 return TreeEntry::Vectorize; 6601 } 6602 default: 6603 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 6604 return TreeEntry::NeedToGather; 6605 } 6606 } 6607 6608 namespace { 6609 /// Allows to correctly handle operands of the phi nodes based on the \p Main 6610 /// PHINode order of incoming basic blocks/values. 
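/// Sibling phis may list their incoming blocks in a different order, so the
/// operand vectors built here are always indexed by \p Main's incoming-block
/// order (using getIncomingValueForBlock for the mismatched ones).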
6611 class PHIHandler { 6612 DominatorTree &DT; 6613 PHINode *Main = nullptr; 6614 SmallVector<Value *> Phis; 6615 SmallVector<SmallVector<Value *>> Operands; 6616 6617 public: 6618 PHIHandler() = delete; 6619 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis) 6620 : DT(DT), Main(Main), Phis(Phis), 6621 Operands(Main->getNumIncomingValues(), 6622 SmallVector<Value *>(Phis.size(), nullptr)) {} 6623 void buildOperands() { 6624 constexpr unsigned FastLimit = 4; 6625 if (Main->getNumIncomingValues() <= FastLimit) { 6626 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) { 6627 BasicBlock *InBB = Main->getIncomingBlock(I); 6628 if (!DT.isReachableFromEntry(InBB)) { 6629 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType())); 6630 continue; 6631 } 6632 // Prepare the operand vector. 6633 for (auto [Idx, V] : enumerate(Phis)) { 6634 auto *P = cast<PHINode>(V); 6635 if (P->getIncomingBlock(I) == InBB) 6636 Operands[I][Idx] = P->getIncomingValue(I); 6637 else 6638 Operands[I][Idx] = P->getIncomingValueForBlock(InBB); 6639 } 6640 } 6641 return; 6642 } 6643 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks; 6644 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) { 6645 BasicBlock *InBB = Main->getIncomingBlock(I); 6646 if (!DT.isReachableFromEntry(InBB)) { 6647 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType())); 6648 continue; 6649 } 6650 Blocks.try_emplace(InBB).first->second.push_back(I); 6651 } 6652 for (auto [Idx, V] : enumerate(Phis)) { 6653 auto *P = cast<PHINode>(V); 6654 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) { 6655 BasicBlock *InBB = P->getIncomingBlock(I); 6656 if (InBB == Main->getIncomingBlock(I)) { 6657 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx])) 6658 continue; 6659 Operands[I][Idx] = P->getIncomingValue(I); 6660 continue; 6661 } 6662 auto It = Blocks.find(InBB); 6663 if (It == Blocks.end()) 6664 continue; 6665 Operands[It->second.front()][Idx] = P->getIncomingValue(I); 6666 } 6667 } 6668 for (const auto &P : Blocks) { 6669 if (P.getSecond().size() <= 1) 6670 continue; 6671 unsigned BasicI = P.getSecond().front(); 6672 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) { 6673 assert(all_of(enumerate(Operands[I]), 6674 [&](const auto &Data) { 6675 return !Data.value() || 6676 Data.value() == Operands[BasicI][Data.index()]; 6677 }) && 6678 "Expected empty operands list."); 6679 Operands[I] = Operands[BasicI]; 6680 } 6681 } 6682 } 6683 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; } 6684 }; 6685 } // namespace 6686 6687 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, 6688 const EdgeInfo &UserTreeIdx) { 6689 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); 6690 6691 SmallVector<int> ReuseShuffleIndices; 6692 SmallVector<Value *> UniqueValues; 6693 SmallVector<Value *> NonUniqueValueVL; 6694 auto TryToFindDuplicates = [&](const InstructionsState &S, 6695 bool DoNotFail = false) { 6696 // Check that every instruction appears once in this bundle. 6697 DenseMap<Value *, unsigned> UniquePositions(VL.size()); 6698 for (Value *V : VL) { 6699 if (isConstant(V)) { 6700 ReuseShuffleIndices.emplace_back( 6701 isa<UndefValue>(V) ? 
PoisonMaskElem : UniqueValues.size()); 6702 UniqueValues.emplace_back(V); 6703 continue; 6704 } 6705 auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); 6706 ReuseShuffleIndices.emplace_back(Res.first->second); 6707 if (Res.second) 6708 UniqueValues.emplace_back(V); 6709 } 6710 size_t NumUniqueScalarValues = UniqueValues.size(); 6711 if (NumUniqueScalarValues == VL.size()) { 6712 ReuseShuffleIndices.clear(); 6713 } else { 6714 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 6715 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { 6716 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " 6717 "for nodes with padding.\n"); 6718 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6719 return false; 6720 } 6721 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); 6722 if (NumUniqueScalarValues <= 1 || 6723 (UniquePositions.size() == 1 && all_of(UniqueValues, 6724 [](Value *V) { 6725 return isa<UndefValue>(V) || 6726 !isConstant(V); 6727 })) || 6728 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) { 6729 if (DoNotFail && UniquePositions.size() > 1 && 6730 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && 6731 all_of(UniqueValues, [=](Value *V) { 6732 return isa<ExtractElementInst>(V) || 6733 areAllUsersVectorized(cast<Instruction>(V), 6734 UserIgnoreList); 6735 })) { 6736 unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); 6737 if (PWSz == VL.size()) { 6738 ReuseShuffleIndices.clear(); 6739 } else { 6740 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); 6741 NonUniqueValueVL.append(PWSz - UniqueValues.size(), 6742 UniqueValues.back()); 6743 VL = NonUniqueValueVL; 6744 } 6745 return true; 6746 } 6747 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 6748 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6749 return false; 6750 } 6751 VL = UniqueValues; 6752 } 6753 return true; 6754 }; 6755 6756 InstructionsState S = getSameOpcode(VL, *TLI); 6757 6758 // Don't vectorize ephemeral values. 6759 if (!EphValues.empty()) { 6760 for (Value *V : VL) { 6761 if (EphValues.count(V)) { 6762 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 6763 << ") is ephemeral.\n"); 6764 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6765 return; 6766 } 6767 } 6768 } 6769 6770 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 6771 // a load), in which case peek through to include it in the tree, without 6772 // ballooning over-budget. 
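// E.g. a bundle of 4 or more single-use zext(load)/sext(load) chains, all
// using the same cast opcode, is still peeled one more level so the loads
// themselves can be vectorized.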
6773 if (Depth >= RecursionMaxDepth && 6774 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp && 6775 VL.size() >= 4 && 6776 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { 6777 return match(I, 6778 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && 6779 cast<Instruction>(I)->getOpcode() == 6780 cast<Instruction>(S.MainOp)->getOpcode(); 6781 })))) { 6782 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 6783 if (TryToFindDuplicates(S)) 6784 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6785 ReuseShuffleIndices); 6786 return; 6787 } 6788 6789 // Don't handle scalable vectors 6790 if (S.getOpcode() == Instruction::ExtractElement && 6791 isa<ScalableVectorType>( 6792 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { 6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); 6794 if (TryToFindDuplicates(S)) 6795 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6796 ReuseShuffleIndices); 6797 return; 6798 } 6799 6800 // Don't handle vectors. 6801 if (!SLPReVec && S.OpValue->getType()->isVectorTy() && 6802 !isa<InsertElementInst>(S.OpValue)) { 6803 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 6804 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6805 return; 6806 } 6807 6808 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) 6809 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) { 6810 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 6811 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6812 return; 6813 } 6814 6815 // If all of the operands are identical or constant we have a simple solution. 6816 // If we deal with insert/extract instructions, they all must have constant 6817 // indices, otherwise we should gather them, not try to vectorize. 6818 // If alternate op node with 2 elements with gathered operands - do not 6819 // vectorize. 6820 auto &&NotProfitableForVectorization = [&S, this, 6821 Depth](ArrayRef<Value *> VL) { 6822 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) 6823 return false; 6824 if (VectorizableTree.size() < MinTreeSize) 6825 return false; 6826 if (Depth >= RecursionMaxDepth - 1) 6827 return true; 6828 // Check if all operands are extracts, part of vector node or can build a 6829 // regular vectorize node. 
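// The counts gathered below track, for each scalar in the bundle, how many
// of its operands are themselves instructions (or vector-like constants).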
6830 SmallVector<unsigned, 2> InstsCount(VL.size(), 0); 6831 for (Value *V : VL) { 6832 auto *I = cast<Instruction>(V); 6833 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) { 6834 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op); 6835 })); 6836 } 6837 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp); 6838 if ((IsCommutative && 6839 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) || 6840 (!IsCommutative && 6841 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; }))) 6842 return true; 6843 assert(VL.size() == 2 && "Expected only 2 alternate op instructions."); 6844 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; 6845 auto *I1 = cast<Instruction>(VL.front()); 6846 auto *I2 = cast<Instruction>(VL.back()); 6847 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) 6848 Candidates.emplace_back().emplace_back(I1->getOperand(Op), 6849 I2->getOperand(Op)); 6850 if (static_cast<unsigned>(count_if( 6851 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { 6852 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); 6853 })) >= S.MainOp->getNumOperands() / 2) 6854 return false; 6855 if (S.MainOp->getNumOperands() > 2) 6856 return true; 6857 if (IsCommutative) { 6858 // Check permuted operands. 6859 Candidates.clear(); 6860 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op) 6861 Candidates.emplace_back().emplace_back(I1->getOperand(Op), 6862 I2->getOperand((Op + 1) % E)); 6863 if (any_of( 6864 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) { 6865 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat); 6866 })) 6867 return false; 6868 } 6869 return true; 6870 }; 6871 SmallVector<unsigned> SortedIndices; 6872 BasicBlock *BB = nullptr; 6873 bool IsScatterVectorizeUserTE = 6874 UserTreeIdx.UserTE && 6875 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; 6876 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL); 6877 bool AreScatterAllGEPSameBlock = 6878 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() && 6879 VL.size() > 2 && 6880 all_of(VL, 6881 [&BB](Value *V) { 6882 auto *I = dyn_cast<GetElementPtrInst>(V); 6883 if (!I) 6884 return doesNotNeedToBeScheduled(V); 6885 if (!BB) 6886 BB = I->getParent(); 6887 return BB == I->getParent() && I->getNumOperands() == 2; 6888 }) && 6889 BB && 6890 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, 6891 SortedIndices)); 6892 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; 6893 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) || 6894 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( 6895 S.OpValue) && 6896 !all_of(VL, isVectorLikeInstWithConstOps)) || 6897 NotProfitableForVectorization(VL)) { 6898 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); 6899 if (TryToFindDuplicates(S)) 6900 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6901 ReuseShuffleIndices); 6902 return; 6903 } 6904 6905 // We now know that this is a vector of instructions of the same type from 6906 // the same block. 6907 6908 // Check if this is a duplicate of another entry. 
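// If the whole bundle matches an existing entry it is reused (a "perfect
// diamond merge"): the user just records an extra edge instead of building a
// second, identical node.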
6909 if (TreeEntry *E = getTreeEntry(S.OpValue)) { 6910 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); 6911 if (!E->isSame(VL)) { 6912 auto It = MultiNodeScalars.find(S.OpValue); 6913 if (It != MultiNodeScalars.end()) { 6914 auto *TEIt = find_if(It->getSecond(), 6915 [&](TreeEntry *ME) { return ME->isSame(VL); }); 6916 if (TEIt != It->getSecond().end()) 6917 E = *TEIt; 6918 else 6919 E = nullptr; 6920 } else { 6921 E = nullptr; 6922 } 6923 } 6924 if (!E) { 6925 if (!doesNotNeedToBeScheduled(S.OpValue)) { 6926 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 6927 if (TryToFindDuplicates(S)) 6928 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6929 ReuseShuffleIndices); 6930 return; 6931 } 6932 } else { 6933 // Record the reuse of the tree node. FIXME, currently this is only used 6934 // to properly draw the graph rather than for the actual vectorization. 6935 E->UserTreeIndices.push_back(UserTreeIdx); 6936 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue 6937 << ".\n"); 6938 return; 6939 } 6940 } 6941 6942 // Check that none of the instructions in the bundle are already in the tree. 6943 for (Value *V : VL) { 6944 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) || 6945 doesNotNeedToBeScheduled(V)) 6946 continue; 6947 if (getTreeEntry(V)) { 6948 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V 6949 << ") is already in tree.\n"); 6950 if (TryToFindDuplicates(S)) 6951 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6952 ReuseShuffleIndices); 6953 return; 6954 } 6955 } 6956 6957 // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 6958 if (UserIgnoreList && !UserIgnoreList->empty()) { 6959 for (Value *V : VL) { 6960 if (UserIgnoreList && UserIgnoreList->contains(V)) { 6961 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); 6962 if (TryToFindDuplicates(S)) 6963 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 6964 ReuseShuffleIndices); 6965 return; 6966 } 6967 } 6968 } 6969 6970 // Special processing for sorted pointers for ScatterVectorize node with 6971 // constant indeces only. 6972 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) { 6973 assert(S.OpValue->getType()->isPointerTy() && 6974 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 && 6975 "Expected pointers only."); 6976 // Reset S to make it GetElementPtr kind of node. 6977 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>); 6978 assert(It != VL.end() && "Expected at least one GEP."); 6979 S = getSameOpcode(*It, *TLI); 6980 } 6981 6982 // Check that all of the users of the scalars that we want to vectorize are 6983 // schedulable. 6984 auto *VL0 = cast<Instruction>(S.OpValue); 6985 BB = VL0->getParent(); 6986 6987 if (!DT->isReachableFromEntry(BB)) { 6988 // Don't go into unreachable blocks. They may contain instructions with 6989 // dependency cycles which confuse the final scheduling. 6990 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 6991 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 6992 return; 6993 } 6994 6995 // Don't go into catchswitch blocks, which can happen with PHIs. 6996 // Such blocks can only have PHIs and the catchswitch. There is no 6997 // place to insert a shuffle if we need to, so just avoid that issue. 
6998 if (isa<CatchSwitchInst>(BB->getTerminator())) { 6999 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); 7000 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); 7001 return; 7002 } 7003 7004 // Check that every instruction appears once in this bundle. 7005 if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) 7006 return; 7007 7008 // Perform specific checks for each particular instruction kind. 7009 OrdersType CurrentOrder; 7010 SmallVector<Value *> PointerOps; 7011 TreeEntry::EntryState State = getScalarsVectorizationState( 7012 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); 7013 if (State == TreeEntry::NeedToGather) { 7014 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 7015 ReuseShuffleIndices); 7016 return; 7017 } 7018 7019 auto &BSRef = BlocksSchedules[BB]; 7020 if (!BSRef) 7021 BSRef = std::make_unique<BlockScheduling>(BB); 7022 7023 BlockScheduling &BS = *BSRef; 7024 7025 std::optional<ScheduleData *> Bundle = 7026 BS.tryScheduleBundle(UniqueValues, this, S); 7027 #ifdef EXPENSIVE_CHECKS 7028 // Make sure we didn't break any internal invariants 7029 BS.verify(); 7030 #endif 7031 if (!Bundle) { 7032 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 7033 assert((!BS.getScheduleData(VL0) || 7034 !BS.getScheduleData(VL0)->isPartOfBundle()) && 7035 "tryScheduleBundle should cancelScheduling on failure"); 7036 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, 7037 ReuseShuffleIndices); 7038 NonScheduledFirst.insert(VL.front()); 7039 return; 7040 } 7041 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 7042 7043 unsigned ShuffleOrOp = S.isAltShuffle() ? 7044 (unsigned) Instruction::ShuffleVector : S.getOpcode(); 7045 switch (ShuffleOrOp) { 7046 case Instruction::PHI: { 7047 auto *PH = cast<PHINode>(VL0); 7048 7049 TreeEntry *TE = 7050 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); 7051 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 7052 7053 // Keeps the reordered operands to avoid code duplication. 7054 PHIHandler Handler(*DT, PH, VL); 7055 Handler.buildOperands(); 7056 for (unsigned I : seq<unsigned>(0, PH->getNumOperands())) 7057 TE->setOperand(I, Handler.getOperands(I)); 7058 for (unsigned I : seq<unsigned>(0, PH->getNumOperands())) 7059 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I}); 7060 return; 7061 } 7062 case Instruction::ExtractValue: 7063 case Instruction::ExtractElement: { 7064 if (CurrentOrder.empty()) { 7065 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); 7066 } else { 7067 LLVM_DEBUG({ 7068 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " 7069 "with order"; 7070 for (unsigned Idx : CurrentOrder) 7071 dbgs() << " " << Idx; 7072 dbgs() << "\n"; 7073 }); 7074 fixupOrderingIndices(CurrentOrder); 7075 } 7076 // Insert new order with initial value 0, if it does not exist, 7077 // otherwise return the iterator to the existing one. 7078 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7079 ReuseShuffleIndices, CurrentOrder); 7080 // This is a special case, as it does not gather, but at the same time 7081 // we are not extending buildTree_rec() towards the operands. 
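// Operand 0 is set below to the first extract's source vector/aggregate,
// replicated once per lane.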
7082 ValueList Op0;
7083 Op0.assign(VL.size(), VL0->getOperand(0));
7084 VectorizableTree.back()->setOperand(0, Op0);
7085 return;
7086 }
7087 case Instruction::InsertElement: {
7088 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089
7090 auto OrdCompare = [](const std::pair<int, int> &P1,
7091 const std::pair<int, int> &P2) {
7092 return P1.first > P2.first;
7093 };
7094 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095 decltype(OrdCompare)>
7096 Indices(OrdCompare);
7097 for (int I = 0, E = VL.size(); I < E; ++I) {
7098 unsigned Idx = *getElementIndex(VL[I]);
7099 Indices.emplace(Idx, I);
7100 }
7101 OrdersType CurrentOrder(VL.size(), VL.size());
7102 bool IsIdentity = true;
7103 for (int I = 0, E = VL.size(); I < E; ++I) {
7104 CurrentOrder[Indices.top().second] = I;
7105 IsIdentity &= Indices.top().second == I;
7106 Indices.pop();
7107 }
7108 if (IsIdentity)
7109 CurrentOrder.clear();
7110 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111 std::nullopt, CurrentOrder);
7112 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113
7114 TE->setOperandsInOrder();
7115 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7116 return;
7117 }
7118 case Instruction::Load: {
7119 // Check that a vectorized load would load the same memory as a scalar
7120 // load. For example, we don't want to vectorize loads that are smaller
7121 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7122 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7123 // from such a struct, we read/write packed bits disagreeing with the
7124 // unvectorized version.
7125 TreeEntry *TE = nullptr;
7126 fixupOrderingIndices(CurrentOrder);
7127 switch (State) {
7128 case TreeEntry::Vectorize:
7129 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130 ReuseShuffleIndices, CurrentOrder);
7131 if (CurrentOrder.empty())
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133 else
7134 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135 TE->setOperandsInOrder();
7136 break;
7137 case TreeEntry::StridedVectorize:
7138 // Vectorizing non-consecutive loads with strided load instructions.
7139 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7140 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7141 TE->setOperandsInOrder();
7142 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143 break;
7144 case TreeEntry::ScatterVectorize:
7145 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, 7147 UserTreeIdx, ReuseShuffleIndices); 7148 TE->setOperandsInOrder(); 7149 buildTree_rec(PointerOps, Depth + 1, {TE, 0}); 7150 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); 7151 break; 7152 case TreeEntry::NeedToGather: 7153 llvm_unreachable("Unexpected loads state."); 7154 } 7155 return; 7156 } 7157 case Instruction::ZExt: 7158 case Instruction::SExt: 7159 case Instruction::FPToUI: 7160 case Instruction::FPToSI: 7161 case Instruction::FPExt: 7162 case Instruction::PtrToInt: 7163 case Instruction::IntToPtr: 7164 case Instruction::SIToFP: 7165 case Instruction::UIToFP: 7166 case Instruction::Trunc: 7167 case Instruction::FPTrunc: 7168 case Instruction::BitCast: { 7169 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or( 7170 std::make_pair(std::numeric_limits<unsigned>::min(), 7171 std::numeric_limits<unsigned>::max())); 7172 if (ShuffleOrOp == Instruction::ZExt || 7173 ShuffleOrOp == Instruction::SExt) { 7174 CastMaxMinBWSizes = std::make_pair( 7175 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), 7176 PrevMaxBW), 7177 std::min<unsigned>( 7178 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), 7179 PrevMinBW)); 7180 } else if (ShuffleOrOp == Instruction::Trunc) { 7181 CastMaxMinBWSizes = std::make_pair( 7182 std::max<unsigned>( 7183 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), 7184 PrevMaxBW), 7185 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()), 7186 PrevMinBW)); 7187 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); 7188 } else if (ShuffleOrOp == Instruction::SIToFP || 7189 ShuffleOrOp == Instruction::UIToFP) { 7190 unsigned NumSignBits = 7191 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); 7192 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) { 7193 APInt Mask = DB->getDemandedBits(OpI); 7194 NumSignBits = std::max(NumSignBits, Mask.countl_zero()); 7195 } 7196 if (NumSignBits * 2 >= 7197 DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) 7198 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); 7199 } 7200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7201 ReuseShuffleIndices); 7202 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 7203 7204 TE->setOperandsInOrder(); 7205 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) 7206 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 7207 return; 7208 } 7209 case Instruction::ICmp: 7210 case Instruction::FCmp: { 7211 // Check that all of the compares have the same predicate. 7212 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 7213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7214 ReuseShuffleIndices); 7215 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 7216 7217 ValueList Left, Right; 7218 if (cast<CmpInst>(VL0)->isCommutative()) { 7219 // Commutative predicate - collect + sort operands of the instructions 7220 // so that each side is more likely to have the same opcode. 7221 assert(P0 == CmpInst::getSwappedPredicate(P0) && 7222 "Commutative Predicate mismatch"); 7223 reorderInputsAccordingToOpcode(VL, Left, Right, *this); 7224 } else { 7225 // Collect operands - commute if it uses the swapped predicate. 
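// E.g. for the bundle {icmp slt a, b; icmp sgt c, d} with P0 == slt, the
// second compare is commuted so that Left = {a, d} and Right = {b, c}.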
7226 for (Value *V : VL) { 7227 auto *Cmp = cast<CmpInst>(V); 7228 Value *LHS = Cmp->getOperand(0); 7229 Value *RHS = Cmp->getOperand(1); 7230 if (Cmp->getPredicate() != P0) 7231 std::swap(LHS, RHS); 7232 Left.push_back(LHS); 7233 Right.push_back(RHS); 7234 } 7235 } 7236 TE->setOperand(0, Left); 7237 TE->setOperand(1, Right); 7238 buildTree_rec(Left, Depth + 1, {TE, 0}); 7239 buildTree_rec(Right, Depth + 1, {TE, 1}); 7240 if (ShuffleOrOp == Instruction::ICmp) { 7241 unsigned NumSignBits0 = 7242 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); 7243 if (NumSignBits0 * 2 >= 7244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) 7245 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx); 7246 unsigned NumSignBits1 = 7247 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT); 7248 if (NumSignBits1 * 2 >= 7249 DL->getTypeSizeInBits(VL0->getOperand(1)->getType())) 7250 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx); 7251 } 7252 return; 7253 } 7254 case Instruction::Select: 7255 case Instruction::FNeg: 7256 case Instruction::Add: 7257 case Instruction::FAdd: 7258 case Instruction::Sub: 7259 case Instruction::FSub: 7260 case Instruction::Mul: 7261 case Instruction::FMul: 7262 case Instruction::UDiv: 7263 case Instruction::SDiv: 7264 case Instruction::FDiv: 7265 case Instruction::URem: 7266 case Instruction::SRem: 7267 case Instruction::FRem: 7268 case Instruction::Shl: 7269 case Instruction::LShr: 7270 case Instruction::AShr: 7271 case Instruction::And: 7272 case Instruction::Or: 7273 case Instruction::Xor: { 7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7275 ReuseShuffleIndices); 7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); 7277 7278 // Sort operands of the instructions so that each side is more likely to 7279 // have the same opcode. 7280 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) { 7281 ValueList Left, Right; 7282 reorderInputsAccordingToOpcode(VL, Left, Right, *this); 7283 TE->setOperand(0, Left); 7284 TE->setOperand(1, Right); 7285 buildTree_rec(Left, Depth + 1, {TE, 0}); 7286 buildTree_rec(Right, Depth + 1, {TE, 1}); 7287 return; 7288 } 7289 7290 TE->setOperandsInOrder(); 7291 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) 7292 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 7293 return; 7294 } 7295 case Instruction::GetElementPtr: { 7296 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7297 ReuseShuffleIndices); 7298 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 7299 SmallVector<ValueList, 2> Operands(2); 7300 // Prepare the operand vector for pointer operands. 7301 for (Value *V : VL) { 7302 auto *GEP = dyn_cast<GetElementPtrInst>(V); 7303 if (!GEP) { 7304 Operands.front().push_back(V); 7305 continue; 7306 } 7307 Operands.front().push_back(GEP->getPointerOperand()); 7308 } 7309 TE->setOperand(0, Operands.front()); 7310 // Need to cast all indices to the same type before vectorization to 7311 // avoid crash. 7312 // Required to be able to find correct matches between different gather 7313 // nodes and reuse the vectorized values rather than trying to gather them 7314 // again. 7315 int IndexIdx = 1; 7316 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType(); 7317 Type *Ty = all_of(VL, 7318 [VL0Ty, IndexIdx](Value *V) { 7319 auto *GEP = dyn_cast<GetElementPtrInst>(V); 7320 if (!GEP) 7321 return true; 7322 return VL0Ty == GEP->getOperand(IndexIdx)->getType(); 7323 }) 7324 ? 
VL0Ty 7325 : DL->getIndexType(cast<GetElementPtrInst>(VL0) 7326 ->getPointerOperandType() 7327 ->getScalarType()); 7328 // Prepare the operand vector. 7329 for (Value *V : VL) { 7330 auto *I = dyn_cast<GetElementPtrInst>(V); 7331 if (!I) { 7332 Operands.back().push_back( 7333 ConstantInt::get(Ty, 0, /*isSigned=*/false)); 7334 continue; 7335 } 7336 auto *Op = I->getOperand(IndexIdx); 7337 auto *CI = dyn_cast<ConstantInt>(Op); 7338 if (!CI) 7339 Operands.back().push_back(Op); 7340 else 7341 Operands.back().push_back(ConstantFoldIntegerCast( 7342 CI, Ty, CI->getValue().isSignBitSet(), *DL)); 7343 } 7344 TE->setOperand(IndexIdx, Operands.back()); 7345 7346 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) 7347 buildTree_rec(Operands[I], Depth + 1, {TE, I}); 7348 return; 7349 } 7350 case Instruction::Store: { 7351 bool Consecutive = CurrentOrder.empty(); 7352 if (!Consecutive) 7353 fixupOrderingIndices(CurrentOrder); 7354 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7355 ReuseShuffleIndices, CurrentOrder); 7356 TE->setOperandsInOrder(); 7357 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0}); 7358 if (Consecutive) 7359 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 7360 else 7361 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); 7362 return; 7363 } 7364 case Instruction::Call: { 7365 // Check if the calls are all to the same vectorizable intrinsic or 7366 // library function. 7367 CallInst *CI = cast<CallInst>(VL0); 7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7369 7370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7371 ReuseShuffleIndices); 7372 // Sort operands of the instructions so that each side is more likely to 7373 // have the same opcode. 7374 if (isCommutative(VL0)) { 7375 ValueList Left, Right; 7376 reorderInputsAccordingToOpcode(VL, Left, Right, *this); 7377 TE->setOperand(0, Left); 7378 TE->setOperand(1, Right); 7379 SmallVector<ValueList> Operands; 7380 for (unsigned I : seq<unsigned>(2, CI->arg_size())) { 7381 Operands.emplace_back(); 7382 if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) 7383 continue; 7384 for (Value *V : VL) { 7385 auto *CI2 = cast<CallInst>(V); 7386 Operands.back().push_back(CI2->getArgOperand(I)); 7387 } 7388 TE->setOperand(I, Operands.back()); 7389 } 7390 buildTree_rec(Left, Depth + 1, {TE, 0}); 7391 buildTree_rec(Right, Depth + 1, {TE, 1}); 7392 for (unsigned I : seq<unsigned>(2, CI->arg_size())) { 7393 if (Operands[I - 2].empty()) 7394 continue; 7395 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I}); 7396 } 7397 return; 7398 } 7399 TE->setOperandsInOrder(); 7400 for (unsigned I : seq<unsigned>(0, CI->arg_size())) { 7401 // For scalar operands no need to create an entry since no need to 7402 // vectorize it. 7403 if (isVectorIntrinsicWithScalarOpAtArg(ID, I)) 7404 continue; 7405 ValueList Operands; 7406 // Prepare the operand vector. 7407 for (Value *V : VL) { 7408 auto *CI2 = cast<CallInst>(V); 7409 Operands.push_back(CI2->getArgOperand(I)); 7410 } 7411 buildTree_rec(Operands, Depth + 1, {TE, I}); 7412 } 7413 return; 7414 } 7415 case Instruction::ShuffleVector: { 7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, 7417 ReuseShuffleIndices); 7418 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 7419 7420 // Reorder operands if reordering would enable vectorization. 
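// Illustrative example: for an alternating {fadd, fsub} bundle the operands
// are regrouped (commuting individual lanes where legal) so that each
// operand bundle is more likely to share a single opcode and vectorize; the
// alternating results are later blended with one shufflevector.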
7421 auto *CI = dyn_cast<CmpInst>(VL0); 7422 if (isa<BinaryOperator>(VL0) || CI) { 7423 ValueList Left, Right; 7424 if (!CI || all_of(VL, [](Value *V) { 7425 return cast<CmpInst>(V)->isCommutative(); 7426 })) { 7427 reorderInputsAccordingToOpcode(VL, Left, Right, *this); 7428 } else { 7429 auto *MainCI = cast<CmpInst>(S.MainOp); 7430 auto *AltCI = cast<CmpInst>(S.AltOp); 7431 CmpInst::Predicate MainP = MainCI->getPredicate(); 7432 CmpInst::Predicate AltP = AltCI->getPredicate(); 7433 assert(MainP != AltP && 7434 "Expected different main/alternate predicates."); 7435 // Collect operands - commute if it uses the swapped predicate or 7436 // alternate operation. 7437 for (Value *V : VL) { 7438 auto *Cmp = cast<CmpInst>(V); 7439 Value *LHS = Cmp->getOperand(0); 7440 Value *RHS = Cmp->getOperand(1); 7441 7442 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { 7443 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 7444 std::swap(LHS, RHS); 7445 } else { 7446 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) 7447 std::swap(LHS, RHS); 7448 } 7449 Left.push_back(LHS); 7450 Right.push_back(RHS); 7451 } 7452 } 7453 TE->setOperand(0, Left); 7454 TE->setOperand(1, Right); 7455 buildTree_rec(Left, Depth + 1, {TE, 0}); 7456 buildTree_rec(Right, Depth + 1, {TE, 1}); 7457 return; 7458 } 7459 7460 TE->setOperandsInOrder(); 7461 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) 7462 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I}); 7463 return; 7464 } 7465 default: 7466 break; 7467 } 7468 llvm_unreachable("Unexpected vectorization of the instructions."); 7469 } 7470 7471 unsigned BoUpSLP::canMapToVector(Type *T) const { 7472 unsigned N = 1; 7473 Type *EltTy = T; 7474 7475 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) { 7476 if (auto *ST = dyn_cast<StructType>(EltTy)) { 7477 // Check that struct is homogeneous. 7478 for (const auto *Ty : ST->elements()) 7479 if (Ty != *ST->element_begin()) 7480 return 0; 7481 N *= ST->getNumElements(); 7482 EltTy = *ST->element_begin(); 7483 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { 7484 N *= AT->getNumElements(); 7485 EltTy = AT->getElementType(); 7486 } else { 7487 auto *VT = cast<FixedVectorType>(EltTy); 7488 N *= VT->getNumElements(); 7489 EltTy = VT->getElementType(); 7490 } 7491 } 7492 7493 if (!isValidElementType(EltTy)) 7494 return 0; 7495 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N)); 7496 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || 7497 VTSize != DL->getTypeStoreSizeInBits(T)) 7498 return 0; 7499 return N; 7500 } 7501 7502 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, 7503 SmallVectorImpl<unsigned> &CurrentOrder, 7504 bool ResizeAllowed) const { 7505 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>); 7506 assert(It != VL.end() && "Expected at least one extract instruction."); 7507 auto *E0 = cast<Instruction>(*It); 7508 assert( 7509 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) && 7510 "Invalid opcode"); 7511 // Check if all of the extracts come from the same vector and from the 7512 // correct offset. 7513 Value *Vec = E0->getOperand(0); 7514 7515 CurrentOrder.clear(); 7516 7517 // We have to extract from a vector/aggregate with the same number of elements. 7518 unsigned NElts; 7519 if (E0->getOpcode() == Instruction::ExtractValue) { 7520 NElts = canMapToVector(Vec->getType()); 7521 if (!NElts) 7522 return false; 7523 // Check if load can be rewritten as load of vector. 
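// Illustrative example: extractvalue uses of a load of a homogeneous
// aggregate such as {i32, i32, i32, i32} may be served by one <4 x i32>
// vector load instead, provided the load is simple and its use count
// matches the bundle (checked below).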
7524 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7525 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7526 return false;
7527 } else {
7528 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7529 }
7530
7531 unsigned E = VL.size();
7532 if (!ResizeAllowed && NElts != E)
7533 return false;
7534 SmallVector<int> Indices(E, PoisonMaskElem);
7535 unsigned MinIdx = NElts, MaxIdx = 0;
7536 for (auto [I, V] : enumerate(VL)) {
7537 auto *Inst = dyn_cast<Instruction>(V);
7538 if (!Inst)
7539 continue;
7540 if (Inst->getOperand(0) != Vec)
7541 return false;
7542 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7543 if (isa<UndefValue>(EE->getIndexOperand()))
7544 continue;
7545 std::optional<unsigned> Idx = getExtractIndex(Inst);
7546 if (!Idx)
7547 return false;
7548 const unsigned ExtIdx = *Idx;
7549 if (ExtIdx >= NElts)
7550 continue;
7551 Indices[I] = ExtIdx;
7552 if (MinIdx > ExtIdx)
7553 MinIdx = ExtIdx;
7554 if (MaxIdx < ExtIdx)
7555 MaxIdx = ExtIdx;
7556 }
7557 if (MaxIdx - MinIdx + 1 > E)
7558 return false;
7559 if (MaxIdx + 1 <= E)
7560 MinIdx = 0;
7561
7562 // Check that all of the indices extract from the correct offset.
7563 bool ShouldKeepOrder = true;
7564 // Assign to all items the initial value E so we can check if the extract
7565 // instruction index was used already.
7566 // Also, later we can check that all the indices are used and we have a
7567 // consecutive access in the extract instructions, by checking that no
7568 // element of CurrentOrder still has value E.
7569 CurrentOrder.assign(E, E);
7570 for (unsigned I = 0; I < E; ++I) {
7571 if (Indices[I] == PoisonMaskElem)
7572 continue;
7573 const unsigned ExtIdx = Indices[I] - MinIdx;
7574 if (CurrentOrder[ExtIdx] != E) {
7575 CurrentOrder.clear();
7576 return false;
7577 }
7578 ShouldKeepOrder &= ExtIdx == I;
7579 CurrentOrder[ExtIdx] = I;
7580 }
7581 if (ShouldKeepOrder)
7582 CurrentOrder.clear();
7583
7584 return ShouldKeepOrder;
7585 }
7586
7587 bool BoUpSLP::areAllUsersVectorized(
7588 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7590 all_of(I->users(), [this](User *U) {
7591 return ScalarToTreeEntry.contains(U) ||
7592 isVectorLikeInstWithConstOps(U) ||
7593 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7594 });
7595 }
7596
7597 static std::pair<InstructionCost, InstructionCost>
7598 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600 ArrayRef<Type *> ArgTys) {
7601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602
7603 // Calculate the cost of the scalar and vector calls.
7604 FastMathFlags FMF;
7605 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7606 FMF = FPCI->getFastMathFlags();
7607 SmallVector<const Value *> Arguments(CI->args());
7608 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609 dyn_cast<IntrinsicInst>(CI));
7610 auto IntrinsicCost =
7611 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7612
7613 auto Shape = VFShape::get(CI->getFunctionType(),
7614 ElementCount::getFixed(VecTy->getNumElements()),
7615 false /*HasGlobalPred*/);
7616 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617 auto LibCost = IntrinsicCost;
7618 if (!CI->isNoBuiltin() && VecFunc) {
7619 // Calculate the cost of the vector library call.
7620 // If the corresponding vector call is cheaper, return its cost.
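// Illustrative example: four scalar calls to sinf may map to a vector
// library routine found via VFDatabase; both the intrinsic and the
// library-call costs are reported so the caller can pick the cheaper
// lowering.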
7621 LibCost = 7622 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput); 7623 } 7624 return {IntrinsicCost, LibCost}; 7625 } 7626 7627 void BoUpSLP::TreeEntry::buildAltOpShuffleMask( 7628 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, 7629 SmallVectorImpl<Value *> *OpScalars, 7630 SmallVectorImpl<Value *> *AltScalars) const { 7631 unsigned Sz = Scalars.size(); 7632 Mask.assign(Sz, PoisonMaskElem); 7633 SmallVector<int> OrderMask; 7634 if (!ReorderIndices.empty()) 7635 inversePermutation(ReorderIndices, OrderMask); 7636 for (unsigned I = 0; I < Sz; ++I) { 7637 unsigned Idx = I; 7638 if (!ReorderIndices.empty()) 7639 Idx = OrderMask[I]; 7640 auto *OpInst = cast<Instruction>(Scalars[Idx]); 7641 if (IsAltOp(OpInst)) { 7642 Mask[I] = Sz + Idx; 7643 if (AltScalars) 7644 AltScalars->push_back(OpInst); 7645 } else { 7646 Mask[I] = Idx; 7647 if (OpScalars) 7648 OpScalars->push_back(OpInst); 7649 } 7650 } 7651 if (!ReuseShuffleIndices.empty()) { 7652 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); 7653 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) { 7654 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; 7655 }); 7656 Mask.swap(NewMask); 7657 } 7658 } 7659 7660 static bool isAlternateInstruction(const Instruction *I, 7661 const Instruction *MainOp, 7662 const Instruction *AltOp, 7663 const TargetLibraryInfo &TLI) { 7664 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) { 7665 auto *AltCI = cast<CmpInst>(AltOp); 7666 CmpInst::Predicate MainP = MainCI->getPredicate(); 7667 CmpInst::Predicate AltP = AltCI->getPredicate(); 7668 assert(MainP != AltP && "Expected different main/alternate predicates."); 7669 auto *CI = cast<CmpInst>(I); 7670 if (isCmpSameOrSwapped(MainCI, CI, TLI)) 7671 return false; 7672 if (isCmpSameOrSwapped(AltCI, CI, TLI)) 7673 return true; 7674 CmpInst::Predicate P = CI->getPredicate(); 7675 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P); 7676 7677 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && 7678 "CmpInst expected to match either main or alternate predicate or " 7679 "their swap."); 7680 (void)AltP; 7681 return MainP != P && MainP != SwappedP; 7682 } 7683 return I->getOpcode() == AltOp->getOpcode(); 7684 } 7685 7686 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { 7687 assert(!Ops.empty()); 7688 const auto *Op0 = Ops.front(); 7689 7690 const bool IsConstant = all_of(Ops, [](Value *V) { 7691 // TODO: We should allow undef elements here 7692 return isConstant(V) && !isa<UndefValue>(V); 7693 }); 7694 const bool IsUniform = all_of(Ops, [=](Value *V) { 7695 // TODO: We should allow undef elements here 7696 return V == Op0; 7697 }); 7698 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) { 7699 // TODO: We should allow undef elements here 7700 if (auto *CI = dyn_cast<ConstantInt>(V)) 7701 return CI->getValue().isPowerOf2(); 7702 return false; 7703 }); 7704 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) { 7705 // TODO: We should allow undef elements here 7706 if (auto *CI = dyn_cast<ConstantInt>(V)) 7707 return CI->getValue().isNegatedPowerOf2(); 7708 return false; 7709 }); 7710 7711 TTI::OperandValueKind VK = TTI::OK_AnyValue; 7712 if (IsConstant && IsUniform) 7713 VK = TTI::OK_UniformConstantValue; 7714 else if (IsConstant) 7715 VK = TTI::OK_NonUniformConstantValue; 7716 else if (IsUniform) 7717 VK = TTI::OK_UniformValue; 7718 7719 TTI::OperandValueProperties VP = TTI::OP_None; 7720 VP = IsPowerOfTwo ? 
TTI::OP_PowerOf2 : VP;
7721 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722
7723 return {VK, VP};
7724 }
7725
7726 namespace {
7727 /// The base class for shuffle instruction emission and shuffle cost estimation.
7728 class BaseShuffleAnalysis {
7729 protected:
7730 /// Checks if the mask is an identity mask.
7731 /// \param IsStrict if it is true, the function returns false if the mask size
7732 /// does not match the vector size.
7733 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734 bool IsStrict) {
7735 int Limit = Mask.size();
7736 int VF = VecTy->getNumElements();
7737 int Index = -1;
7738 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7739 return true;
7740 if (!IsStrict) {
7741 // Consider extract subvector starting from index 0.
7742 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7743 Index == 0)
7744 return true;
7745 // All VF-size submasks are identity (e.g.
7746 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7748 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7749 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7750 ShuffleVectorInst::isIdentityMask(Slice, VF);
7751 }))
7752 return true;
7753 }
7754 return false;
7755 }
7756
7757 /// Tries to combine 2 different masks into a single one.
7758 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759 /// change the size of the vector, \p LocalVF is the original size of the
7760 /// shuffled vector.
7761 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762 ArrayRef<int> ExtMask) {
7763 unsigned VF = Mask.size();
7764 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766 if (ExtMask[I] == PoisonMaskElem)
7767 continue;
7768 int MaskedIdx = Mask[ExtMask[I] % VF];
7769 NewMask[I] =
7770 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771 }
7772 Mask.swap(NewMask);
7773 }
7774
7775 /// Looks through shuffles trying to reduce the final number of shuffles in the
7776 /// code. The function looks through the previously emitted shuffle
7777 /// instructions and properly marks indices in the mask as undef.
7778 /// For example, given the code
7779 /// \code
7780 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782 /// \endcode
7783 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7784 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7785 /// <0, 1, 2, 3> for the shuffle.
7786 /// If the 2 operands are of different sizes, the smaller one will be resized and
7787 /// the mask recalculated properly.
7788 /// For example, given the code
7789 /// \code
7790 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792 /// \endcode
7793 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7794 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7795 /// <0, 1, 2, 3> for the shuffle.
7796 /// So, it tries to transform permutations into a simple vector merge, if
7797 /// possible.
7798 /// \param V The input vector which must be shuffled using the given \p Mask.
7799 /// If a better candidate is found, \p V is set to this best candidate
7800 /// vector.
7801 /// \param Mask The input mask for the shuffle.
If the best candidate is found
7802 /// during the look-through-shuffles attempt, it is updated accordingly.
7803 /// \param SinglePermute true if the shuffle operation is originally a
7804 /// single-value-permutation. In this case the look-through-shuffles procedure
7805 /// may look for resizing shuffles as the best candidates.
7806 /// \return true if the shuffle results in the non-resizing identity shuffle
7807 /// (and thus can be ignored), false otherwise.
7808 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809 bool SinglePermute) {
7810 Value *Op = V;
7811 ShuffleVectorInst *IdentityOp = nullptr;
7812 SmallVector<int> IdentityMask;
7813 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7814 // Exit if not a fixed vector type or a size-changing shuffle.
7815 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7816 if (!SVTy)
7817 break;
7818 // Remember the identity or broadcast mask, if it is not a resizing
7819 // shuffle. If no better candidates are found, this Op and Mask will be
7820 // used in the final shuffle.
7821 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7822 if (!IdentityOp || !SinglePermute ||
7823 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7824 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7825 IdentityMask.size()))) {
7826 IdentityOp = SV;
7827 // Store the current mask in IdentityMask so that we do not lose this
7828 // info later if IdentityOp is selected as the best candidate for the
7829 // permutation.
7830 IdentityMask.assign(Mask);
7831 }
7832 }
7833 // Remember the broadcast mask. If no better candidates are found, this Op
7834 // and Mask will be used in the final shuffle.
7835 // Zero splat can be used as identity too, since it might be used with
7836 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7838 // expensive, and the analysis finds out that the source vector is just a
7839 // broadcast, this original mask can be transformed to the identity mask <0,
7840 // 1, 2, 3>.
7841 // \code
7842 // %0 = shuffle %v, poison, zeroinitializer
7843 // %res = shuffle %0, poison, <3, 1, 2, 0>
7844 // \endcode
7845 // may be transformed to
7846 // \code
7847 // %0 = shuffle %v, poison, zeroinitializer
7848 // %res = shuffle %0, poison, <0, 1, 2, 3>
7849 // \endcode
7850 if (SV->isZeroEltSplat()) {
7851 IdentityOp = SV;
7852 IdentityMask.assign(Mask);
7853 }
7854 int LocalVF = Mask.size();
7855 if (auto *SVOpTy =
7856 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7857 LocalVF = SVOpTy->getNumElements();
7858 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859 for (auto [Idx, I] : enumerate(Mask)) {
7860 if (I == PoisonMaskElem ||
7861 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862 continue;
7863 ExtMask[Idx] = SV->getMaskValue(I);
7864 }
7865 bool IsOp1Undef =
7866 isUndefVector(SV->getOperand(0),
7867 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7868 .all();
7869 bool IsOp2Undef =
7870 isUndefVector(SV->getOperand(1),
7871 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7872 .all();
7873 if (!IsOp1Undef && !IsOp2Undef) {
7874 // Update mask and mark undef elems.
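// Illustrative example: if SV's own mask is <0, poison, 2, poison>, any
// element of Mask that selects lane 1 or 3 of SV is known to be poison and
// is cleared here before we stop peeking further.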
7875 for (int &I : Mask) { 7876 if (I == PoisonMaskElem) 7877 continue; 7878 if (SV->getMaskValue(I % SV->getShuffleMask().size()) == 7879 PoisonMaskElem) 7880 I = PoisonMaskElem; 7881 } 7882 break; 7883 } 7884 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(), 7885 SV->getShuffleMask().end()); 7886 combineMasks(LocalVF, ShuffleMask, Mask); 7887 Mask.swap(ShuffleMask); 7888 if (IsOp2Undef) 7889 Op = SV->getOperand(0); 7890 else 7891 Op = SV->getOperand(1); 7892 } 7893 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType()); 7894 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) || 7895 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) { 7896 if (IdentityOp) { 7897 V = IdentityOp; 7898 assert(Mask.size() == IdentityMask.size() && 7899 "Expected masks of same sizes."); 7900 // Clear known poison elements. 7901 for (auto [I, Idx] : enumerate(Mask)) 7902 if (Idx == PoisonMaskElem) 7903 IdentityMask[I] = PoisonMaskElem; 7904 Mask.swap(IdentityMask); 7905 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V); 7906 return SinglePermute && 7907 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()), 7908 /*IsStrict=*/true) || 7909 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && 7910 Shuffle->isZeroEltSplat() && 7911 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))); 7912 } 7913 V = Op; 7914 return false; 7915 } 7916 V = Op; 7917 return true; 7918 } 7919 7920 /// Smart shuffle instruction emission, walks through shuffles trees and 7921 /// tries to find the best matching vector for the actual shuffle 7922 /// instruction. 7923 template <typename T, typename ShuffleBuilderTy> 7924 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, 7925 ShuffleBuilderTy &Builder) { 7926 assert(V1 && "Expected at least one vector value."); 7927 if (V2) 7928 Builder.resizeToMatch(V1, V2); 7929 int VF = Mask.size(); 7930 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) 7931 VF = FTy->getNumElements(); 7932 if (V2 && 7933 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) { 7934 // Peek through shuffles. 7935 Value *Op1 = V1; 7936 Value *Op2 = V2; 7937 int VF = 7938 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 7939 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 7940 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 7941 for (int I = 0, E = Mask.size(); I < E; ++I) { 7942 if (Mask[I] < VF) 7943 CombinedMask1[I] = Mask[I]; 7944 else 7945 CombinedMask2[I] = Mask[I] - VF; 7946 } 7947 Value *PrevOp1; 7948 Value *PrevOp2; 7949 do { 7950 PrevOp1 = Op1; 7951 PrevOp2 = Op2; 7952 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false); 7953 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false); 7954 // Check if we have 2 resizing shuffles - need to peek through operands 7955 // again. 
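// Illustrative example: if both Op1 and Op2 are widening shuffles of
// same-typed narrower sources (with unused second operands), the combined
// masks are folded onto those narrow sources below so the final two-source
// shuffle reads from the original vectors directly.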
7956 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1)) 7957 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) { 7958 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem); 7959 for (auto [Idx, I] : enumerate(CombinedMask1)) { 7960 if (I == PoisonMaskElem) 7961 continue; 7962 ExtMask1[Idx] = SV1->getMaskValue(I); 7963 } 7964 SmallBitVector UseMask1 = buildUseMask( 7965 cast<FixedVectorType>(SV1->getOperand(1)->getType()) 7966 ->getNumElements(), 7967 ExtMask1, UseMask::SecondArg); 7968 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem); 7969 for (auto [Idx, I] : enumerate(CombinedMask2)) { 7970 if (I == PoisonMaskElem) 7971 continue; 7972 ExtMask2[Idx] = SV2->getMaskValue(I); 7973 } 7974 SmallBitVector UseMask2 = buildUseMask( 7975 cast<FixedVectorType>(SV2->getOperand(1)->getType()) 7976 ->getNumElements(), 7977 ExtMask2, UseMask::SecondArg); 7978 if (SV1->getOperand(0)->getType() == 7979 SV2->getOperand(0)->getType() && 7980 SV1->getOperand(0)->getType() != SV1->getType() && 7981 isUndefVector(SV1->getOperand(1), UseMask1).all() && 7982 isUndefVector(SV2->getOperand(1), UseMask2).all()) { 7983 Op1 = SV1->getOperand(0); 7984 Op2 = SV2->getOperand(0); 7985 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(), 7986 SV1->getShuffleMask().end()); 7987 int LocalVF = ShuffleMask1.size(); 7988 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType())) 7989 LocalVF = FTy->getNumElements(); 7990 combineMasks(LocalVF, ShuffleMask1, CombinedMask1); 7991 CombinedMask1.swap(ShuffleMask1); 7992 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(), 7993 SV2->getShuffleMask().end()); 7994 LocalVF = ShuffleMask2.size(); 7995 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType())) 7996 LocalVF = FTy->getNumElements(); 7997 combineMasks(LocalVF, ShuffleMask2, CombinedMask2); 7998 CombinedMask2.swap(ShuffleMask2); 7999 } 8000 } 8001 } while (PrevOp1 != Op1 || PrevOp2 != Op2); 8002 Builder.resizeToMatch(Op1, Op2); 8003 VF = std::max(cast<VectorType>(Op1->getType()) 8004 ->getElementCount() 8005 .getKnownMinValue(), 8006 cast<VectorType>(Op2->getType()) 8007 ->getElementCount() 8008 .getKnownMinValue()); 8009 for (int I = 0, E = Mask.size(); I < E; ++I) { 8010 if (CombinedMask2[I] != PoisonMaskElem) { 8011 assert(CombinedMask1[I] == PoisonMaskElem && 8012 "Expected undefined mask element"); 8013 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); 8014 } 8015 } 8016 if (Op1 == Op2 && 8017 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) || 8018 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) && 8019 isa<ShuffleVectorInst>(Op1) && 8020 cast<ShuffleVectorInst>(Op1)->getShuffleMask() == 8021 ArrayRef(CombinedMask1)))) 8022 return Builder.createIdentity(Op1); 8023 return Builder.createShuffleVector( 8024 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, 8025 CombinedMask1); 8026 } 8027 if (isa<PoisonValue>(V1)) 8028 return Builder.createPoison( 8029 cast<VectorType>(V1->getType())->getElementType(), Mask.size()); 8030 SmallVector<int> NewMask(Mask.begin(), Mask.end()); 8031 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); 8032 assert(V1 && "Expected non-null value after looking through shuffles."); 8033 8034 if (!IsIdentity) 8035 return Builder.createShuffleVector(V1, NewMask); 8036 return Builder.createIdentity(V1); 8037 } 8038 }; 8039 } // namespace 8040 8041 /// Returns the cost of the shuffle instructions with the given \p Kind, vector 8042 /// type \p Tp and optional \p Mask. 
Adds SLP-specific cost estimation for insert
8043 /// subvector pattern.
8044 static InstructionCost
8045 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048 int Index = 0, VectorType *SubTp = nullptr,
8049 ArrayRef<const Value *> Args = std::nullopt) {
8050 if (Kind != TTI::SK_PermuteTwoSrc)
8051 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053 int NumSubElts;
8054 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055 Mask, NumSrcElts, NumSubElts, Index)) {
8056 if (Index + NumSubElts > NumSrcElts &&
8057 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058 return TTI.getShuffleCost(
8059 TTI::SK_InsertSubvector,
8060 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8061 TTI::TCK_RecipThroughput, Index, Tp);
8062 }
8063 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064 }
8065
8066 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
8067 static std::pair<InstructionCost, InstructionCost>
8068 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070 Type *ScalarTy, VectorType *VecTy) {
8071 InstructionCost ScalarCost = 0;
8072 InstructionCost VecCost = 0;
8073 // Here we differentiate two cases: (1) when Ptrs represent a regular
8074 // vectorization tree node (as they are pointer arguments of scattered
8075 // loads) or (2) when Ptrs are the arguments of loads or stores being
8076 // vectorized as a plain wide unit-stride load/store since all the
8077 // loads/stores are known to be from/to adjacent locations.
8078 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079 // Case 2: estimate the pointer-related costs when vectorizing to
8080 // a wide load/store.
8081 // Scalar cost is estimated as a set of pointers with known relationship
8082 // between them.
8083 // For vector code we will use BasePtr as argument for the wide load/store
8084 // but we also need to account for all the instructions which are going to
8085 // stay in vectorized code due to uses outside of these scalar
8086 // loads/stores.
8087 ScalarCost = TTI.getPointersChainCost(
8088 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8089 CostKind);
8090
8091 SmallVector<const Value *> PtrsRetainedInVecCode;
8092 for (Value *V : Ptrs) {
8093 if (V == BasePtr) {
8094 PtrsRetainedInVecCode.push_back(V);
8095 continue;
8096 }
8097 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8098 // For simplicity, assume Ptr stays in vectorized code if it's not a
8099 // GEP instruction. We don't care since its cost is considered free.
8100 // TODO: We should check for any uses outside of the vectorizable tree
8101 // rather than just a single use.
8102 if (!Ptr || !Ptr->hasOneUse())
8103 PtrsRetainedInVecCode.push_back(V);
8104 }
8105
8106 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107 // If all pointers stay in vectorized code then we don't have
8108 // any savings on that.
8109 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8110 }
8111 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8112 TTI::PointersChainInfo::getKnownStride(),
8113 VecTy, CostKind);
8114 } else {
8115 // Case 1: Ptrs are the arguments of loads that we are going to transform
8116 // into a masked gather load intrinsic.
8117 // All the scalar GEPs will be removed as a result of vectorization.
8118 // For any external uses of some lanes, extractelement instructions will
8119 // be generated (whose cost is estimated separately).
8120 TTI::PointersChainInfo PtrsInfo =
8121 all_of(Ptrs,
8122 [](const Value *V) {
8123 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8124 return Ptr && !Ptr->hasAllConstantIndices();
8125 })
8126 ? TTI::PointersChainInfo::getUnknownStride()
8127 : TTI::PointersChainInfo::getKnownStride();
8128
8129 ScalarCost =
8130 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8131 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8132 if (!BaseGEP) {
8133 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8134 if (It != Ptrs.end())
8135 BaseGEP = cast<GEPOperator>(*It);
8136 }
8137 if (BaseGEP) {
8138 SmallVector<const Value *> Indices(BaseGEP->indices());
8139 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8140 BaseGEP->getPointerOperand(), Indices, VecTy,
8141 CostKind);
8142 }
8143 }
8144
8145 return std::make_pair(ScalarCost, VecCost);
8146 }
8147
8148 void BoUpSLP::transformNodes() {
8149 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151 TreeEntry &E = *TE;
8152 switch (E.getOpcode()) {
8153 case Instruction::Load: {
8154 // No need to reorder masked gather loads, just reorder the scalar
8155 // operands.
8156 if (E.State != TreeEntry::Vectorize)
8157 break;
8158 Type *ScalarTy = E.getMainOp()->getType();
8159 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8160 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8161 // Check if it is profitable to represent a consecutive load + reverse as a
8162 // strided load with stride -1.
8163 if (isReverseOrder(E.ReorderIndices) &&
8164 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8165 SmallVector<int> Mask;
8166 inversePermutation(E.ReorderIndices, Mask);
8167 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8168 InstructionCost OriginalVecCost =
8169 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8170 BaseLI->getPointerAddressSpace(), CostKind,
8171 TTI::OperandValueInfo()) +
8172 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8173 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8175 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8176 if (StridedCost < OriginalVecCost)
8177 // Strided load is more profitable than consecutive load + reverse -
8178 // transform the node to strided load.
8179 E.State = TreeEntry::StridedVectorize;
8180 }
8181 break;
8182 }
8183 case Instruction::Store: {
8184 Type *ScalarTy =
8185 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8186 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8187 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8188 // Check if it is profitable to represent a consecutive store + reverse as a
8189 // strided store with stride -1.
8190 if (isReverseOrder(E.ReorderIndices) &&
8191 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8192 SmallVector<int> Mask;
8193 inversePermutation(E.ReorderIndices, Mask);
8194 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8195 InstructionCost OriginalVecCost =
8196 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8197 BaseSI->getPointerAddressSpace(), CostKind,
8198 TTI::OperandValueInfo()) +
8199 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8200 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8202 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8203 if (StridedCost < OriginalVecCost)
8204 // Strided store is more profitable than consecutive store + reverse -
8205 // transform the node to strided store.
8206 E.State = TreeEntry::StridedVectorize;
8207 }
8208 break;
8209 }
8210 default:
8211 break;
8212 }
8213 }
8214 }
8215
8216 /// Merges shuffle masks and emits the final shuffle instruction, if required. It
8217 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
8218 /// where the actual shuffle instruction is generated only if it is actually
8219 /// required. Otherwise, the shuffle instruction emission is delayed till the
8220 /// end of the process, to reduce the number of emitted instructions and further
8221 /// analysis/transformations.
8222 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223 bool IsFinalized = false;
8224 SmallVector<int> CommonMask;
8225 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226 Type *ScalarTy = nullptr;
8227 const TargetTransformInfo &TTI;
8228 InstructionCost Cost = 0;
8229 SmallDenseSet<Value *> VectorizedVals;
8230 BoUpSLP &R;
8231 SmallPtrSetImpl<Value *> &CheckedExtracts;
8232 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233 /// While set, we are still trying to estimate the cost for the same nodes and
8234 /// can delay the actual cost estimation (virtual shuffle instruction emission).
8235 /// This may help to better estimate the cost if the same nodes must be permuted
8236 /// and allows moving most of the long shuffle cost estimation to TTI.
8237 bool SameNodesEstimated = true;
8238
8239 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240 if (Ty->getScalarType()->isPointerTy()) {
8241 Constant *Res = ConstantExpr::getIntToPtr(
8242 ConstantInt::getAllOnesValue(
8243 IntegerType::get(Ty->getContext(),
8244 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8245 Ty->getScalarType());
8246 if (auto *VTy = dyn_cast<VectorType>(Ty))
8247 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8248 return Res;
8249 }
8250 return Constant::getAllOnesValue(Ty);
8251 }
8252
8253 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8255 return TTI::TCC_Free;
8256 auto *VecTy = getWidenedType(ScalarTy, VL.size());
8257 InstructionCost GatherCost = 0;
8258 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259 // Improve the gather cost for a gather of loads, if we can group some of the
8260 // loads into vector loads.
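// Illustrative example: for 8 gathered scalars whose first 4 lanes are
// loads from consecutive addresses, one wide <4 x ...> load plus gathering
// the remaining lanes is costed and preferred over the plain 8-lane gather
// whenever it is cheaper.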
8261 InstructionsState S = getSameOpcode(VL, *R.TLI); 8262 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); 8263 unsigned MinVF = R.getMinVF(2 * Sz); 8264 if (VL.size() > 2 && 8265 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || 8266 (InVectors.empty() && 8267 any_of(seq<unsigned>(0, VL.size() / MinVF), 8268 [&](unsigned Idx) { 8269 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF); 8270 InstructionsState S = getSameOpcode(SubVL, *R.TLI); 8271 return S.getOpcode() == Instruction::Load && 8272 !S.isAltShuffle(); 8273 }))) && 8274 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && 8275 !isSplat(Gathers)) { 8276 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); 8277 SetVector<Value *> VectorizedLoads; 8278 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts; 8279 SmallVector<unsigned> ScatterVectorized; 8280 unsigned StartIdx = 0; 8281 unsigned VF = VL.size() / 2; 8282 for (; VF >= MinVF; VF /= 2) { 8283 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; 8284 Cnt += VF) { 8285 ArrayRef<Value *> Slice = VL.slice(Cnt, VF); 8286 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { 8287 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); 8288 if (SliceS.getOpcode() != Instruction::Load || 8289 SliceS.isAltShuffle()) 8290 continue; 8291 } 8292 if (!VectorizedLoads.count(Slice.front()) && 8293 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { 8294 SmallVector<Value *> PointerOps; 8295 OrdersType CurrentOrder; 8296 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(), 8297 CurrentOrder, PointerOps); 8298 switch (LS) { 8299 case LoadsState::Vectorize: 8300 case LoadsState::ScatterVectorize: 8301 case LoadsState::StridedVectorize: 8302 // Mark the vectorized loads so that we don't vectorize them 8303 // again. 8304 // TODO: better handling of loads with reorders. 8305 if (((LS == LoadsState::Vectorize || 8306 LS == LoadsState::StridedVectorize) && 8307 CurrentOrder.empty()) || 8308 (LS == LoadsState::StridedVectorize && 8309 isReverseOrder(CurrentOrder))) 8310 VectorizedStarts.emplace_back(Cnt, LS); 8311 else 8312 ScatterVectorized.push_back(Cnt); 8313 VectorizedLoads.insert(Slice.begin(), Slice.end()); 8314 // If we vectorized initial block, no need to try to vectorize 8315 // it again. 8316 if (Cnt == StartIdx) 8317 StartIdx += VF; 8318 break; 8319 case LoadsState::Gather: 8320 break; 8321 } 8322 } 8323 } 8324 // Check if the whole array was vectorized already - exit. 8325 if (StartIdx >= VL.size()) 8326 break; 8327 // Found vectorizable parts - exit. 8328 if (!VectorizedLoads.empty()) 8329 break; 8330 } 8331 if (!VectorizedLoads.empty()) { 8332 unsigned NumParts = TTI.getNumberOfParts(VecTy); 8333 bool NeedInsertSubvectorAnalysis = 8334 !NumParts || (VL.size() / VF) > NumParts; 8335 // Get the cost for gathered loads. 8336 for (unsigned I = 0, End = VL.size(); I < End; I += VF) { 8337 if (VectorizedLoads.contains(VL[I])) 8338 continue; 8339 GatherCost += 8340 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); 8341 } 8342 // Exclude potentially vectorized loads from list of gathered 8343 // scalars. 8344 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); 8345 // The cost for vectorized loads. 
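// Rough accounting sketch: the wide/strided/gather load costs plus the GEP
// cost delta (and any subvector-insert shuffles) are added to GatherCost,
// the summed scalar-load costs are subtracted, and the result is later
// clamped by BaseCost so it never exceeds a plain gather of all scalars.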
8346 InstructionCost ScalarsCost = 0; 8347 for (Value *V : VectorizedLoads) { 8348 auto *LI = cast<LoadInst>(V); 8349 ScalarsCost += 8350 TTI.getMemoryOpCost(Instruction::Load, LI->getType(), 8351 LI->getAlign(), LI->getPointerAddressSpace(), 8352 CostKind, TTI::OperandValueInfo(), LI); 8353 } 8354 auto *LoadTy = getWidenedType(VL.front()->getType(), VF); 8355 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) { 8356 auto *LI = cast<LoadInst>(VL[P.first]); 8357 Align Alignment = LI->getAlign(); 8358 GatherCost += 8359 P.second == LoadsState::Vectorize 8360 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, 8361 LI->getPointerAddressSpace(), CostKind, 8362 TTI::OperandValueInfo(), LI) 8363 : TTI.getStridedMemoryOpCost( 8364 Instruction::Load, LoadTy, LI->getPointerOperand(), 8365 /*VariableMask=*/false, Alignment, CostKind, LI); 8366 // Estimate GEP cost. 8367 SmallVector<Value *> PointerOps(VF); 8368 for (auto [I, V] : enumerate(VL.slice(P.first, VF))) 8369 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand(); 8370 auto [ScalarGEPCost, VectorGEPCost] = 8371 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), 8372 Instruction::Load, CostKind, LI->getType(), LoadTy); 8373 GatherCost += VectorGEPCost - ScalarGEPCost; 8374 } 8375 for (unsigned P : ScatterVectorized) { 8376 auto *LI0 = cast<LoadInst>(VL[P]); 8377 ArrayRef<Value *> Slice = VL.slice(P, VF); 8378 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice); 8379 GatherCost += TTI.getGatherScatterOpCost( 8380 Instruction::Load, LoadTy, LI0->getPointerOperand(), 8381 /*VariableMask=*/false, CommonAlignment, CostKind, LI0); 8382 // Estimate GEP cost. 8383 SmallVector<Value *> PointerOps(VF); 8384 for (auto [I, V] : enumerate(Slice)) 8385 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand(); 8386 OrdersType Order; 8387 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, 8388 Order)) { 8389 // TODO: improve checks if GEPs can be vectorized. 8390 Value *Ptr0 = PointerOps.front(); 8391 Type *ScalarTy = Ptr0->getType(); 8392 auto *VecTy = getWidenedType(ScalarTy, VF); 8393 auto [ScalarGEPCost, VectorGEPCost] = 8394 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, 8395 CostKind, ScalarTy, VecTy); 8396 GatherCost += VectorGEPCost - ScalarGEPCost; 8397 if (!Order.empty()) { 8398 SmallVector<int> Mask; 8399 inversePermutation(Order, Mask); 8400 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, 8401 VecTy, Mask, CostKind); 8402 } 8403 } else { 8404 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, 8405 PointerOps.front()->getType()); 8406 } 8407 } 8408 if (NeedInsertSubvectorAnalysis) { 8409 // Add the cost for the subvectors insert. 8410 SmallVector<int> ShuffleMask(VL.size()); 8411 for (unsigned I = VF, E = VL.size(); I < E; I += VF) { 8412 for (unsigned Idx : seq<unsigned>(0, E)) 8413 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx; 8414 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, 8415 ShuffleMask, CostKind, I, LoadTy); 8416 } 8417 } 8418 GatherCost -= ScalarsCost; 8419 } 8420 GatherCost = std::min(BaseCost, GatherCost); 8421 } else if (!Root && isSplat(VL)) { 8422 // Found the broadcasting of the single scalar, calculate the cost as 8423 // the broadcast. 8424 const auto *It = find_if_not(VL, IsaPred<UndefValue>); 8425 assert(It != VL.end() && "Expected at least one non-undef value."); 8426 // Add broadcast for non-identity shuffle only. 
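// Illustrative example: <%x, undef, undef, undef> needs only a single
// insertelement, while <%x, %x, %x, %x> is costed as an insertelement plus
// an SK_Broadcast shuffle.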
8427 bool NeedShuffle = 8428 count(VL, *It) > 1 && 8429 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>)); 8430 if (!NeedShuffle) 8431 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, 8432 CostKind, std::distance(VL.begin(), It), 8433 PoisonValue::get(VecTy), *It); 8434 8435 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem); 8436 transform(VL, ShuffleMask.begin(), [](Value *V) { 8437 return isa<PoisonValue>(V) ? PoisonMaskElem : 0; 8438 }); 8439 InstructionCost InsertCost = 8440 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0, 8441 PoisonValue::get(VecTy), *It); 8442 return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, 8443 VecTy, ShuffleMask, CostKind, 8444 /*Index=*/0, /*SubTp=*/nullptr, 8445 /*Args=*/*It); 8446 } 8447 return GatherCost + 8448 (all_of(Gathers, IsaPred<UndefValue>) 8449 ? TTI::TCC_Free 8450 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers), 8451 ScalarTy)); 8452 }; 8453 8454 /// Compute the cost of creating a vector containing the extracted values from 8455 /// \p VL. 8456 InstructionCost 8457 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, 8458 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 8459 unsigned NumParts) { 8460 assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); 8461 unsigned NumElts = 8462 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { 8463 auto *EE = dyn_cast<ExtractElementInst>(V); 8464 if (!EE) 8465 return Sz; 8466 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType()); 8467 if (!VecTy) 8468 return Sz; 8469 return std::max(Sz, VecTy->getNumElements()); 8470 }); 8471 // FIXME: this must be moved to TTI for better estimation. 8472 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts); 8473 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask, 8474 SmallVectorImpl<unsigned> &Indices) 8475 -> std::optional<TTI::ShuffleKind> { 8476 if (NumElts <= EltsPerVector) 8477 return std::nullopt; 8478 int OffsetReg0 = 8479 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX, 8480 [](int S, int I) { 8481 if (I == PoisonMaskElem) 8482 return S; 8483 return std::min(S, I); 8484 }), 8485 EltsPerVector); 8486 int OffsetReg1 = OffsetReg0; 8487 DenseSet<int> RegIndices; 8488 // Check that if trying to permute same single/2 input vectors. 8489 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; 8490 int FirstRegId = -1; 8491 Indices.assign(1, OffsetReg0); 8492 for (auto [Pos, I] : enumerate(Mask)) { 8493 if (I == PoisonMaskElem) 8494 continue; 8495 int Idx = I - OffsetReg0; 8496 int RegId = 8497 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector; 8498 if (FirstRegId < 0) 8499 FirstRegId = RegId; 8500 RegIndices.insert(RegId); 8501 if (RegIndices.size() > 2) 8502 return std::nullopt; 8503 if (RegIndices.size() == 2) { 8504 ShuffleKind = TTI::SK_PermuteTwoSrc; 8505 if (Indices.size() == 1) { 8506 OffsetReg1 = alignDown( 8507 std::accumulate( 8508 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX, 8509 [&](int S, int I) { 8510 if (I == PoisonMaskElem) 8511 return S; 8512 int RegId = ((I - OffsetReg0) / NumElts) * NumParts + 8513 ((I - OffsetReg0) % NumElts) / EltsPerVector; 8514 if (RegId == FirstRegId) 8515 return S; 8516 return std::min(S, I); 8517 }), 8518 EltsPerVector); 8519 Indices.push_back(OffsetReg1 % NumElts); 8520 } 8521 Idx = I - OffsetReg1; 8522 } 8523 I = (Idx % NumElts) % EltsPerVector + 8524 (RegId == FirstRegId ? 
0 : EltsPerVector); 8525 } 8526 return ShuffleKind; 8527 }; 8528 InstructionCost Cost = 0; 8529 8530 // Process extracts in blocks of EltsPerVector to check if the source vector 8531 // operand can be re-used directly. If not, add the cost of creating a 8532 // shuffle to extract the values into a vector register. 8533 for (unsigned Part : seq<unsigned>(NumParts)) { 8534 if (!ShuffleKinds[Part]) 8535 continue; 8536 ArrayRef<int> MaskSlice = Mask.slice( 8537 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part)); 8538 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); 8539 copy(MaskSlice, SubMask.begin()); 8540 SmallVector<unsigned, 2> Indices; 8541 std::optional<TTI::ShuffleKind> RegShuffleKind = 8542 CheckPerRegistersShuffle(SubMask, Indices); 8543 if (!RegShuffleKind) { 8544 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || 8545 !ShuffleVectorInst::isIdentityMask( 8546 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size()))) 8547 Cost += 8548 ::getShuffleCost(TTI, *ShuffleKinds[Part], 8549 getWidenedType(ScalarTy, NumElts), MaskSlice); 8550 continue; 8551 } 8552 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || 8553 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { 8554 Cost += 8555 ::getShuffleCost(TTI, *RegShuffleKind, 8556 getWidenedType(ScalarTy, EltsPerVector), SubMask); 8557 } 8558 for (unsigned Idx : Indices) { 8559 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) && 8560 "SK_ExtractSubvector index out of range"); 8561 Cost += ::getShuffleCost( 8562 TTI, TTI::SK_ExtractSubvector, 8563 getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)), 8564 std::nullopt, CostKind, Idx, 8565 getWidenedType(ScalarTy, EltsPerVector)); 8566 } 8567 // Second attempt to check, if just a permute is better estimated than 8568 // subvector extract. 8569 SubMask.assign(NumElts, PoisonMaskElem); 8570 copy(MaskSlice, SubMask.begin()); 8571 InstructionCost OriginalCost = ::getShuffleCost( 8572 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask); 8573 if (OriginalCost < Cost) 8574 Cost = OriginalCost; 8575 } 8576 return Cost; 8577 } 8578 /// Transforms mask \p CommonMask per given \p Mask to make proper set after 8579 /// shuffle emission. 8580 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, 8581 ArrayRef<int> Mask) { 8582 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 8583 if (Mask[Idx] != PoisonMaskElem) 8584 CommonMask[Idx] = Idx; 8585 } 8586 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 8587 /// mask \p Mask, register number \p Part, that includes \p SliceSize 8588 /// elements. 8589 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, 8590 ArrayRef<int> Mask, unsigned Part, 8591 unsigned SliceSize) { 8592 if (SameNodesEstimated) { 8593 // Delay the cost estimation if the same nodes are reshuffling. 8594 // If we already requested the cost of reshuffling of E1 and E2 before, no 8595 // need to estimate another cost with the sub-Mask, instead include this 8596 // sub-Mask into the CommonMask to estimate it later and avoid double cost 8597 // estimation. 
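// Illustrative example: if parts 0 and 1 of the overall mask both permute
// the same {E1, E2} pair, their sub-masks are accumulated into CommonMask
// and a single shuffle cost is queried for the combined mask later instead
// of one query per part.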
8598 if ((InVectors.size() == 2 && 8599 InVectors.front().get<const TreeEntry *>() == &E1 && 8600 InVectors.back().get<const TreeEntry *>() == E2) || 8601 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) { 8602 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part); 8603 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), 8604 [](int Idx) { return Idx == PoisonMaskElem; }) && 8605 "Expected all poisoned elements."); 8606 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit); 8607 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); 8608 return; 8609 } 8610 // Found non-matching nodes - need to estimate the cost for the matched 8611 // and transform mask. 8612 Cost += createShuffle(InVectors.front(), 8613 InVectors.size() == 1 ? nullptr : InVectors.back(), 8614 CommonMask); 8615 transformMaskAfterShuffle(CommonMask, CommonMask); 8616 } 8617 SameNodesEstimated = false; 8618 if (!E2 && InVectors.size() == 1) { 8619 unsigned VF = E1.getVectorFactor(); 8620 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) { 8621 VF = std::max(VF, 8622 cast<FixedVectorType>(V1->getType())->getNumElements()); 8623 } else { 8624 const auto *E = InVectors.front().get<const TreeEntry *>(); 8625 VF = std::max(VF, E->getVectorFactor()); 8626 } 8627 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 8628 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 8629 CommonMask[Idx] = Mask[Idx] + VF; 8630 Cost += createShuffle(InVectors.front(), &E1, CommonMask); 8631 transformMaskAfterShuffle(CommonMask, CommonMask); 8632 } else { 8633 Cost += createShuffle(&E1, E2, Mask); 8634 transformMaskAfterShuffle(CommonMask, Mask); 8635 } 8636 } 8637 8638 class ShuffleCostBuilder { 8639 const TargetTransformInfo &TTI; 8640 8641 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { 8642 int Index = -1; 8643 return Mask.empty() || 8644 (VF == Mask.size() && 8645 ShuffleVectorInst::isIdentityMask(Mask, VF)) || 8646 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && 8647 Index == 0); 8648 } 8649 8650 public: 8651 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} 8652 ~ShuffleCostBuilder() = default; 8653 InstructionCost createShuffleVector(Value *V1, Value *, 8654 ArrayRef<int> Mask) const { 8655 // Empty mask or identity mask are free. 8656 unsigned VF = 8657 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 8658 if (isEmptyOrIdentity(Mask, VF)) 8659 return TTI::TCC_Free; 8660 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, 8661 cast<VectorType>(V1->getType()), Mask); 8662 } 8663 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { 8664 // Empty mask or identity mask are free. 8665 unsigned VF = 8666 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue(); 8667 if (isEmptyOrIdentity(Mask, VF)) 8668 return TTI::TCC_Free; 8669 return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, 8670 cast<VectorType>(V1->getType()), Mask); 8671 } 8672 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } 8673 InstructionCost createPoison(Type *Ty, unsigned VF) const { 8674 return TTI::TCC_Free; 8675 } 8676 void resizeToMatch(Value *&, Value *&) const {} 8677 }; 8678 8679 /// Smart shuffle instruction emission, walks through shuffles trees and 8680 /// tries to find the best matching vector for the actual shuffle 8681 /// instruction. 
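/// For cost estimation the participating tree entries are modeled with
/// placeholder constant vectors of the common width (the null/all-ones
/// values created below); only the shuffle and cast costs matter here, not
/// the concrete values.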
8682 InstructionCost 8683 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1, 8684 const PointerUnion<Value *, const TreeEntry *> &P2, 8685 ArrayRef<int> Mask) { 8686 ShuffleCostBuilder Builder(TTI); 8687 SmallVector<int> CommonMask(Mask.begin(), Mask.end()); 8688 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); 8689 unsigned CommonVF = Mask.size(); 8690 InstructionCost ExtraCost = 0; 8691 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E, 8692 unsigned VF) -> InstructionCost { 8693 if (E.isGather() && allConstant(E.Scalars)) 8694 return TTI::TCC_Free; 8695 Type *EScalarTy = E.Scalars.front()->getType(); 8696 bool IsSigned = true; 8697 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) { 8698 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first); 8699 IsSigned = It->second.second; 8700 } 8701 if (EScalarTy != ScalarTy) { 8702 unsigned CastOpcode = Instruction::Trunc; 8703 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); 8704 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); 8705 if (DstSz > SrcSz) 8706 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; 8707 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF), 8708 getWidenedType(EScalarTy, VF), 8709 TTI::CastContextHint::None, CostKind); 8710 } 8711 return TTI::TCC_Free; 8712 }; 8713 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost { 8714 if (isa<Constant>(V)) 8715 return TTI::TCC_Free; 8716 auto *VecTy = cast<VectorType>(V->getType()); 8717 Type *EScalarTy = VecTy->getElementType(); 8718 if (EScalarTy != ScalarTy) { 8719 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL)); 8720 unsigned CastOpcode = Instruction::Trunc; 8721 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); 8722 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); 8723 if (DstSz > SrcSz) 8724 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; 8725 return TTI.getCastInstrCost( 8726 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()), 8727 VecTy, TTI::CastContextHint::None, CostKind); 8728 } 8729 return TTI::TCC_Free; 8730 }; 8731 if (!V1 && !V2 && !P2.isNull()) { 8732 // Shuffle 2 entry nodes. 8733 const TreeEntry *E = P1.get<const TreeEntry *>(); 8734 unsigned VF = E->getVectorFactor(); 8735 const TreeEntry *E2 = P2.get<const TreeEntry *>(); 8736 CommonVF = std::max(VF, E2->getVectorFactor()); 8737 assert(all_of(Mask, 8738 [=](int Idx) { 8739 return Idx < 2 * static_cast<int>(CommonVF); 8740 }) && 8741 "All elements in mask must be less than 2 * CommonVF."); 8742 if (E->Scalars.size() == E2->Scalars.size()) { 8743 SmallVector<int> EMask = E->getCommonMask(); 8744 SmallVector<int> E2Mask = E2->getCommonMask(); 8745 if (!EMask.empty() || !E2Mask.empty()) { 8746 for (int &Idx : CommonMask) { 8747 if (Idx == PoisonMaskElem) 8748 continue; 8749 if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) 8750 Idx = EMask[Idx]; 8751 else if (Idx >= static_cast<int>(CommonVF)) 8752 Idx = (E2Mask.empty() ? 
Idx - CommonVF : E2Mask[Idx - CommonVF]) + 8753 E->Scalars.size(); 8754 } 8755 } 8756 CommonVF = E->Scalars.size(); 8757 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) + 8758 GetNodeMinBWAffectedCost(*E2, CommonVF); 8759 } else { 8760 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) + 8761 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor()); 8762 } 8763 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8764 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 8765 } else if (!V1 && P2.isNull()) { 8766 // Shuffle single entry node. 8767 const TreeEntry *E = P1.get<const TreeEntry *>(); 8768 unsigned VF = E->getVectorFactor(); 8769 CommonVF = VF; 8770 assert( 8771 all_of(Mask, 8772 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && 8773 "All elements in mask must be less than CommonVF."); 8774 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { 8775 SmallVector<int> EMask = E->getCommonMask(); 8776 assert(!EMask.empty() && "Expected non-empty common mask."); 8777 for (int &Idx : CommonMask) { 8778 if (Idx != PoisonMaskElem) 8779 Idx = EMask[Idx]; 8780 } 8781 CommonVF = E->Scalars.size(); 8782 } 8783 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); 8784 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8785 // Not identity/broadcast? Try to see if the original vector is better. 8786 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && 8787 CommonVF == CommonMask.size() && 8788 any_of(enumerate(CommonMask), 8789 [](const auto &&P) { 8790 return P.value() != PoisonMaskElem && 8791 static_cast<unsigned>(P.value()) != P.index(); 8792 }) && 8793 any_of(CommonMask, 8794 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { 8795 SmallVector<int> ReorderMask; 8796 inversePermutation(E->ReorderIndices, ReorderMask); 8797 ::addMask(CommonMask, ReorderMask); 8798 } 8799 } else if (V1 && P2.isNull()) { 8800 // Shuffle single vector. 8801 ExtraCost += GetValueMinBWAffectedCost(V1); 8802 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements(); 8803 assert( 8804 all_of(Mask, 8805 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && 8806 "All elements in mask must be less than CommonVF."); 8807 } else if (V1 && !V2) { 8808 // Shuffle vector and tree node. 8809 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 8810 const TreeEntry *E2 = P2.get<const TreeEntry *>(); 8811 CommonVF = std::max(VF, E2->getVectorFactor()); 8812 assert(all_of(Mask, 8813 [=](int Idx) { 8814 return Idx < 2 * static_cast<int>(CommonVF); 8815 }) && 8816 "All elements in mask must be less than 2 * CommonVF."); 8817 if (E2->Scalars.size() == VF && VF != CommonVF) { 8818 SmallVector<int> E2Mask = E2->getCommonMask(); 8819 assert(!E2Mask.empty() && "Expected non-empty common mask."); 8820 for (int &Idx : CommonMask) { 8821 if (Idx == PoisonMaskElem) 8822 continue; 8823 if (Idx >= static_cast<int>(CommonVF)) 8824 Idx = E2Mask[Idx - CommonVF] + VF; 8825 } 8826 CommonVF = VF; 8827 } 8828 ExtraCost += GetValueMinBWAffectedCost(V1); 8829 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8830 ExtraCost += GetNodeMinBWAffectedCost( 8831 *E2, std::min(CommonVF, E2->getVectorFactor())); 8832 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 8833 } else if (!V1 && V2) { 8834 // Shuffle vector and tree node. 
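      // P1 is expected to be a tree entry here and V2 an already materialized
      // vector; as in the other branches, placeholder constants of the common
      // width stand in for the operands once the mask has been rebased.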
8835 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements(); 8836 const TreeEntry *E1 = P1.get<const TreeEntry *>(); 8837 CommonVF = std::max(VF, E1->getVectorFactor()); 8838 assert(all_of(Mask, 8839 [=](int Idx) { 8840 return Idx < 2 * static_cast<int>(CommonVF); 8841 }) && 8842 "All elements in mask must be less than 2 * CommonVF."); 8843 if (E1->Scalars.size() == VF && VF != CommonVF) { 8844 SmallVector<int> E1Mask = E1->getCommonMask(); 8845 assert(!E1Mask.empty() && "Expected non-empty common mask."); 8846 for (int &Idx : CommonMask) { 8847 if (Idx == PoisonMaskElem) 8848 continue; 8849 if (Idx >= static_cast<int>(CommonVF)) 8850 Idx = E1Mask[Idx - CommonVF] + VF; 8851 else 8852 Idx = E1Mask[Idx]; 8853 } 8854 CommonVF = VF; 8855 } 8856 ExtraCost += GetNodeMinBWAffectedCost( 8857 *E1, std::min(CommonVF, E1->getVectorFactor())); 8858 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8859 ExtraCost += GetValueMinBWAffectedCost(V2); 8860 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 8861 } else { 8862 assert(V1 && V2 && "Expected both vectors."); 8863 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 8864 CommonVF = 8865 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements()); 8866 assert(all_of(Mask, 8867 [=](int Idx) { 8868 return Idx < 2 * static_cast<int>(CommonVF); 8869 }) && 8870 "All elements in mask must be less than 2 * CommonVF."); 8871 ExtraCost += 8872 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2); 8873 if (V1->getType() != V2->getType()) { 8874 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8875 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 8876 } else { 8877 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy) 8878 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); 8879 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy) 8880 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF)); 8881 } 8882 } 8883 InVectors.front() = 8884 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size())); 8885 if (InVectors.size() == 2) 8886 InVectors.pop_back(); 8887 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>( 8888 V1, V2, CommonMask, Builder); 8889 } 8890 8891 public: 8892 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, 8893 ArrayRef<Value *> VectorizedVals, BoUpSLP &R, 8894 SmallPtrSetImpl<Value *> &CheckedExtracts) 8895 : ScalarTy(ScalarTy), TTI(TTI), 8896 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), 8897 CheckedExtracts(CheckedExtracts) {} 8898 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, 8899 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 8900 unsigned NumParts, bool &UseVecBaseAsInput) { 8901 UseVecBaseAsInput = false; 8902 if (Mask.empty()) 8903 return nullptr; 8904 Value *VecBase = nullptr; 8905 ArrayRef<Value *> VL = E->Scalars; 8906 // If the resulting type is scalarized, do not adjust the cost. 8907 if (NumParts == VL.size()) 8908 return nullptr; 8909 // Check if it can be considered reused if same extractelements were 8910 // vectorized already. 
8911 bool PrevNodeFound = any_of( 8912 ArrayRef(R.VectorizableTree).take_front(E->Idx), 8913 [&](const std::unique_ptr<TreeEntry> &TE) { 8914 return ((!TE->isAltShuffle() && 8915 TE->getOpcode() == Instruction::ExtractElement) || 8916 TE->isGather()) && 8917 all_of(enumerate(TE->Scalars), [&](auto &&Data) { 8918 return VL.size() > Data.index() && 8919 (Mask[Data.index()] == PoisonMaskElem || 8920 isa<UndefValue>(VL[Data.index()]) || 8921 Data.value() == VL[Data.index()]); 8922 }); 8923 }); 8924 SmallPtrSet<Value *, 4> UniqueBases; 8925 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 8926 for (unsigned Part : seq<unsigned>(NumParts)) { 8927 unsigned Limit = getNumElems(VL.size(), SliceSize, Part); 8928 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit); 8929 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) { 8930 // Ignore non-extractelement scalars. 8931 if (isa<UndefValue>(V) || 8932 (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) 8933 continue; 8934 // If all users of instruction are going to be vectorized and this 8935 // instruction itself is not going to be vectorized, consider this 8936 // instruction as dead and remove its cost from the final cost of the 8937 // vectorized tree. 8938 // Also, avoid adjusting the cost for extractelements with multiple uses 8939 // in different graph entries. 8940 auto *EE = cast<ExtractElementInst>(V); 8941 VecBase = EE->getVectorOperand(); 8942 UniqueBases.insert(VecBase); 8943 const TreeEntry *VE = R.getTreeEntry(V); 8944 if (!CheckedExtracts.insert(V).second || 8945 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) || 8946 any_of(EE->users(), 8947 [&](User *U) { 8948 return isa<GetElementPtrInst>(U) && 8949 !R.areAllUsersVectorized(cast<Instruction>(U), 8950 &VectorizedVals); 8951 }) || 8952 (VE && VE != E)) 8953 continue; 8954 std::optional<unsigned> EEIdx = getExtractIndex(EE); 8955 if (!EEIdx) 8956 continue; 8957 unsigned Idx = *EEIdx; 8958 // Take credit for instruction that will become dead. 8959 if (EE->hasOneUse() || !PrevNodeFound) { 8960 Instruction *Ext = EE->user_back(); 8961 if (isa<SExtInst, ZExtInst>(Ext) && 8962 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) { 8963 // Use getExtractWithExtendCost() to calculate the cost of 8964 // extractelement/ext pair. 8965 Cost -= 8966 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), 8967 EE->getVectorOperandType(), Idx); 8968 // Add back the cost of s|zext which is subtracted separately. 8969 Cost += TTI.getCastInstrCost( 8970 Ext->getOpcode(), Ext->getType(), EE->getType(), 8971 TTI::getCastContextHint(Ext), CostKind, Ext); 8972 continue; 8973 } 8974 } 8975 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), 8976 CostKind, Idx); 8977 } 8978 } 8979 // Check that gather of extractelements can be represented as just a 8980 // shuffle of a single/two vectors the scalars are extracted from. 8981 // Found the bunch of extractelement instructions that must be gathered 8982 // into a vector and can be represented as a permutation elements in a 8983 // single input vector or of 2 input vectors. 8984 // Done for reused if same extractelements were vectorized already. 
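    // Illustrative example (hypothetical IR, not from a test case):
    //   %e0 = extractelement <4 x i32> %v, i32 0
    //   %e1 = extractelement <4 x i32> %v, i32 2
    // gathered as <%e0, %e1> is modeled as a single-source shuffle of %v with
    // mask <0, 2> rather than as scalar extracts plus an insertelement chain.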
8985 if (!PrevNodeFound) 8986 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); 8987 InVectors.assign(1, E); 8988 CommonMask.assign(Mask.begin(), Mask.end()); 8989 transformMaskAfterShuffle(CommonMask, CommonMask); 8990 SameNodesEstimated = false; 8991 if (NumParts != 1 && UniqueBases.size() != 1) { 8992 UseVecBaseAsInput = true; 8993 VecBase = 8994 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size())); 8995 } 8996 return VecBase; 8997 } 8998 /// Checks if the specified entry \p E needs to be delayed because of its 8999 /// dependency nodes. 9000 std::optional<InstructionCost> 9001 needToDelay(const TreeEntry *, 9002 ArrayRef<SmallVector<const TreeEntry *>>) const { 9003 // No need to delay the cost estimation during analysis. 9004 return std::nullopt; 9005 } 9006 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { 9007 if (&E1 == &E2) { 9008 assert(all_of(Mask, 9009 [&](int Idx) { 9010 return Idx < static_cast<int>(E1.getVectorFactor()); 9011 }) && 9012 "Expected single vector shuffle mask."); 9013 add(E1, Mask); 9014 return; 9015 } 9016 if (InVectors.empty()) { 9017 CommonMask.assign(Mask.begin(), Mask.end()); 9018 InVectors.assign({&E1, &E2}); 9019 return; 9020 } 9021 assert(!CommonMask.empty() && "Expected non-empty common mask."); 9022 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); 9023 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); 9024 if (NumParts == 0 || NumParts >= Mask.size()) 9025 NumParts = 1; 9026 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); 9027 const auto *It = 9028 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); 9029 unsigned Part = std::distance(Mask.begin(), It) / SliceSize; 9030 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); 9031 } 9032 void add(const TreeEntry &E1, ArrayRef<int> Mask) { 9033 if (InVectors.empty()) { 9034 CommonMask.assign(Mask.begin(), Mask.end()); 9035 InVectors.assign(1, &E1); 9036 return; 9037 } 9038 assert(!CommonMask.empty() && "Expected non-empty common mask."); 9039 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); 9040 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); 9041 if (NumParts == 0 || NumParts >= Mask.size()) 9042 NumParts = 1; 9043 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); 9044 const auto *It = 9045 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); 9046 unsigned Part = std::distance(Mask.begin(), It) / SliceSize; 9047 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); 9048 if (!SameNodesEstimated && InVectors.size() == 1) 9049 InVectors.emplace_back(&E1); 9050 } 9051 /// Adds 2 input vectors and the mask for their shuffling. 9052 void add(Value *V1, Value *V2, ArrayRef<int> Mask) { 9053 // May come only for shuffling of 2 vectors with extractelements, already 9054 // handled in adjustExtracts. 9055 assert(InVectors.size() == 1 && 9056 all_of(enumerate(CommonMask), 9057 [&](auto P) { 9058 if (P.value() == PoisonMaskElem) 9059 return Mask[P.index()] == PoisonMaskElem; 9060 auto *EI = 9061 cast<ExtractElementInst>(InVectors.front() 9062 .get<const TreeEntry *>() 9063 ->Scalars[P.index()]); 9064 return EI->getVectorOperand() == V1 || 9065 EI->getVectorOperand() == V2; 9066 }) && 9067 "Expected extractelement vectors."); 9068 } 9069 /// Adds another one input vector and the mask for the shuffling. 
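  /// If some inputs are already registered, the incoming mask indices are
  /// rebased by the accumulated vector factor so that CommonMask keeps
  /// addressing all inputs consistently.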
9070 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { 9071 if (InVectors.empty()) { 9072 assert(CommonMask.empty() && !ForExtracts && 9073 "Expected empty input mask/vectors."); 9074 CommonMask.assign(Mask.begin(), Mask.end()); 9075 InVectors.assign(1, V1); 9076 return; 9077 } 9078 if (ForExtracts) { 9079 // No need to add vectors here, already handled them in adjustExtracts. 9080 assert(InVectors.size() == 1 && 9081 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() && 9082 all_of(enumerate(CommonMask), 9083 [&](auto P) { 9084 Value *Scalar = InVectors.front() 9085 .get<const TreeEntry *>() 9086 ->Scalars[P.index()]; 9087 if (P.value() == PoisonMaskElem) 9088 return P.value() == Mask[P.index()] || 9089 isa<UndefValue>(Scalar); 9090 if (isa<Constant>(V1)) 9091 return true; 9092 auto *EI = cast<ExtractElementInst>(Scalar); 9093 return EI->getVectorOperand() == V1; 9094 }) && 9095 "Expected only tree entry for extractelement vectors."); 9096 return; 9097 } 9098 assert(!InVectors.empty() && !CommonMask.empty() && 9099 "Expected only tree entries from extracts/reused buildvectors."); 9100 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 9101 if (InVectors.size() == 2) { 9102 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask); 9103 transformMaskAfterShuffle(CommonMask, CommonMask); 9104 VF = std::max<unsigned>(VF, CommonMask.size()); 9105 } else if (const auto *InTE = 9106 InVectors.front().dyn_cast<const TreeEntry *>()) { 9107 VF = std::max(VF, InTE->getVectorFactor()); 9108 } else { 9109 VF = std::max( 9110 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType()) 9111 ->getNumElements()); 9112 } 9113 InVectors.push_back(V1); 9114 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 9115 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 9116 CommonMask[Idx] = Mask[Idx] + VF; 9117 } 9118 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, 9119 Value *Root = nullptr) { 9120 Cost += getBuildVectorCost(VL, Root); 9121 if (!Root) { 9122 // FIXME: Need to find a way to avoid use of getNullValue here. 9123 SmallVector<Constant *> Vals; 9124 unsigned VF = VL.size(); 9125 if (MaskVF != 0) 9126 VF = std::min(VF, MaskVF); 9127 for (Value *V : VL.take_front(VF)) { 9128 if (isa<UndefValue>(V)) { 9129 Vals.push_back(cast<Constant>(V)); 9130 continue; 9131 } 9132 Vals.push_back(Constant::getNullValue(V->getType())); 9133 } 9134 return ConstantVector::get(Vals); 9135 } 9136 return ConstantVector::getSplat( 9137 ElementCount::getFixed( 9138 cast<FixedVectorType>(Root->getType())->getNumElements()), 9139 getAllOnesValue(*R.DL, ScalarTy)); 9140 } 9141 InstructionCost createFreeze(InstructionCost Cost) { return Cost; } 9142 /// Finalize emission of the shuffles. 
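  /// Returns the accumulated cost, plus the cost of a final shuffle of the
  /// remaining inputs when the combined mask (CommonMask extended by
  /// \p ExtMask) is non-trivial.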
9143 InstructionCost 9144 finalize(ArrayRef<int> ExtMask, unsigned VF = 0, 9145 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { 9146 IsFinalized = true; 9147 if (Action) { 9148 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); 9149 if (InVectors.size() == 2) 9150 Cost += createShuffle(Vec, InVectors.back(), CommonMask); 9151 else 9152 Cost += createShuffle(Vec, nullptr, CommonMask); 9153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 9154 if (CommonMask[Idx] != PoisonMaskElem) 9155 CommonMask[Idx] = Idx; 9156 assert(VF > 0 && 9157 "Expected vector length for the final value before action."); 9158 Value *V = Vec.get<Value *>(); 9159 Action(V, CommonMask); 9160 InVectors.front() = V; 9161 } 9162 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); 9163 if (CommonMask.empty()) { 9164 assert(InVectors.size() == 1 && "Expected only one vector with no mask"); 9165 return Cost; 9166 } 9167 return Cost + 9168 createShuffle(InVectors.front(), 9169 InVectors.size() == 2 ? InVectors.back() : nullptr, 9170 CommonMask); 9171 } 9172 9173 ~ShuffleCostEstimator() { 9174 assert((IsFinalized || CommonMask.empty()) && 9175 "Shuffle construction must be finalized."); 9176 } 9177 }; 9178 9179 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, 9180 unsigned Idx) const { 9181 Value *Op = E->getOperand(Idx).front(); 9182 if (const TreeEntry *TE = getTreeEntry(Op)) { 9183 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { 9184 return EI.EdgeIdx == Idx && EI.UserTE == E; 9185 }) != TE->UserTreeIndices.end()) 9186 return TE; 9187 auto MIt = MultiNodeScalars.find(Op); 9188 if (MIt != MultiNodeScalars.end()) { 9189 for (const TreeEntry *TE : MIt->second) { 9190 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { 9191 return EI.EdgeIdx == Idx && EI.UserTE == E; 9192 }) != TE->UserTreeIndices.end()) 9193 return TE; 9194 } 9195 } 9196 } 9197 const auto *It = 9198 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { 9199 return TE->isGather() && 9200 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { 9201 return EI.EdgeIdx == Idx && EI.UserTE == E; 9202 }) != TE->UserTreeIndices.end(); 9203 }); 9204 assert(It != VectorizableTree.end() && "Expected vectorizable entry."); 9205 return It->get(); 9206 } 9207 9208 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { 9209 if (TE.State == TreeEntry::ScatterVectorize || 9210 TE.State == TreeEntry::StridedVectorize) 9211 return TTI::CastContextHint::GatherScatter; 9212 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && 9213 !TE.isAltShuffle()) { 9214 if (TE.ReorderIndices.empty()) 9215 return TTI::CastContextHint::Normal; 9216 SmallVector<int> Mask; 9217 inversePermutation(TE.ReorderIndices, Mask); 9218 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) 9219 return TTI::CastContextHint::Reversed; 9220 } 9221 return TTI::CastContextHint::None; 9222 } 9223 9224 /// Builds the arguments types vector for the given call instruction with the 9225 /// given \p ID for the specified vector factor. 
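/// For example (illustrative), a call to llvm.powi with VF == 4 keeps the
/// scalar i32 exponent operand as-is and widens the floating-point operand to
/// a 4-element vector, since the exponent is a scalar operand of the
/// intrinsic. When \p MinBW is non-zero, the vector operands are built with
/// the narrowed integer element type instead.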
9226 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI, 9227 const Intrinsic::ID ID, 9228 const unsigned VF, 9229 unsigned MinBW) { 9230 SmallVector<Type *> ArgTys; 9231 for (auto [Idx, Arg] : enumerate(CI->args())) { 9232 if (ID != Intrinsic::not_intrinsic) { 9233 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) { 9234 ArgTys.push_back(Arg->getType()); 9235 continue; 9236 } 9237 if (MinBW > 0) { 9238 ArgTys.push_back( 9239 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF)); 9240 continue; 9241 } 9242 } 9243 ArgTys.push_back(getWidenedType(Arg->getType(), VF)); 9244 } 9245 return ArgTys; 9246 } 9247 9248 InstructionCost 9249 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, 9250 SmallPtrSetImpl<Value *> &CheckedExtracts) { 9251 ArrayRef<Value *> VL = E->Scalars; 9252 9253 Type *ScalarTy = VL[0]->getType(); 9254 if (!E->isGather()) { 9255 if (auto *SI = dyn_cast<StoreInst>(VL[0])) 9256 ScalarTy = SI->getValueOperand()->getType(); 9257 else if (auto *CI = dyn_cast<CmpInst>(VL[0])) 9258 ScalarTy = CI->getOperand(0)->getType(); 9259 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) 9260 ScalarTy = IE->getOperand(1)->getType(); 9261 } 9262 if (!isValidElementType(ScalarTy)) 9263 return InstructionCost::getInvalid(); 9264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 9265 9266 // If we have computed a smaller type for the expression, update VecTy so 9267 // that the costs will be accurate. 9268 auto It = MinBWs.find(E); 9269 Type *OrigScalarTy = ScalarTy; 9270 if (It != MinBWs.end()) 9271 ScalarTy = IntegerType::get(F->getContext(), It->second.first); 9272 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 9273 unsigned EntryVF = E->getVectorFactor(); 9274 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF); 9275 9276 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); 9277 if (E->isGather()) { 9278 if (allConstant(VL)) 9279 return 0; 9280 if (isa<InsertElementInst>(VL[0])) 9281 return InstructionCost::getInvalid(); 9282 return processBuildVector<ShuffleCostEstimator, InstructionCost>( 9283 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts); 9284 } 9285 InstructionCost CommonCost = 0; 9286 SmallVector<int> Mask; 9287 bool IsReverseOrder = isReverseOrder(E->ReorderIndices); 9288 if (!E->ReorderIndices.empty() && 9289 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) { 9290 SmallVector<int> NewMask; 9291 if (E->getOpcode() == Instruction::Store) { 9292 // For stores the order is actually a mask. 9293 NewMask.resize(E->ReorderIndices.size()); 9294 copy(E->ReorderIndices, NewMask.begin()); 9295 } else { 9296 inversePermutation(E->ReorderIndices, NewMask); 9297 } 9298 ::addMask(Mask, NewMask); 9299 } 9300 if (NeedToShuffleReuses) 9301 ::addMask(Mask, E->ReuseShuffleIndices); 9302 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) 9303 CommonCost = 9304 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); 9305 assert((E->State == TreeEntry::Vectorize || 9306 E->State == TreeEntry::ScatterVectorize || 9307 E->State == TreeEntry::StridedVectorize) && 9308 "Unhandled state"); 9309 assert(E->getOpcode() && 9310 ((allSameType(VL) && allSameBlock(VL)) || 9311 (E->getOpcode() == Instruction::GetElementPtr && 9312 E->getMainOp()->getType()->isPointerTy())) && 9313 "Invalid VL"); 9314 Instruction *VL0 = E->getMainOp(); 9315 unsigned ShuffleOrOp = 9316 E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); 9317 SetVector<Value *> UniqueValues(VL.begin(), VL.end()); 9318 const unsigned Sz = UniqueValues.size(); 9319 SmallBitVector UsedScalars(Sz, false); 9320 for (unsigned I = 0; I < Sz; ++I) { 9321 if (getTreeEntry(UniqueValues[I]) == E) 9322 continue; 9323 UsedScalars.set(I); 9324 } 9325 auto GetCastContextHint = [&](Value *V) { 9326 if (const TreeEntry *OpTE = getTreeEntry(V)) 9327 return getCastContextHint(*OpTE); 9328 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); 9329 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) 9330 return TTI::CastContextHint::GatherScatter; 9331 return TTI::CastContextHint::None; 9332 }; 9333 auto GetCostDiff = 9334 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, 9335 function_ref<InstructionCost(InstructionCost)> VectorCost) { 9336 // Calculate the cost of this instruction. 9337 InstructionCost ScalarCost = 0; 9338 if (isa<CastInst, CallInst>(VL0)) { 9339 // For some of the instructions no need to calculate cost for each 9340 // particular instruction, we can use the cost of the single 9341 // instruction x total number of scalar instructions. 9342 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); 9343 } else { 9344 for (unsigned I = 0; I < Sz; ++I) { 9345 if (UsedScalars.test(I)) 9346 continue; 9347 ScalarCost += ScalarEltCost(I); 9348 } 9349 } 9350 9351 InstructionCost VecCost = VectorCost(CommonCost); 9352 // Check if the current node must be resized, if the parent node is not 9353 // resized. 9354 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { 9355 const EdgeInfo &EI = E->UserTreeIndices.front(); 9356 if ((EI.UserTE->getOpcode() != Instruction::Select || 9357 EI.EdgeIdx != 0) && 9358 It != MinBWs.end()) { 9359 auto UserBWIt = MinBWs.find(EI.UserTE); 9360 Type *UserScalarTy = 9361 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType(); 9362 if (UserBWIt != MinBWs.end()) 9363 UserScalarTy = IntegerType::get(ScalarTy->getContext(), 9364 UserBWIt->second.first); 9365 if (ScalarTy != UserScalarTy) { 9366 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 9367 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy); 9368 unsigned VecOpcode; 9369 auto *UserVecTy = 9370 getWidenedType(UserScalarTy, E->getVectorFactor()); 9371 if (BWSz > SrcBWSz) 9372 VecOpcode = Instruction::Trunc; 9373 else 9374 VecOpcode = 9375 It->second.second ? Instruction::SExt : Instruction::ZExt; 9376 TTI::CastContextHint CCH = GetCastContextHint(VL0); 9377 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH, 9378 CostKind); 9379 } 9380 } 9381 } 9382 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, 9383 ScalarCost, "Calculated costs for Tree")); 9384 return VecCost - ScalarCost; 9385 }; 9386 // Calculate cost difference from vectorizing set of GEPs. 9387 // Negative value means vectorizing is profitable. 
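  // For instance (illustrative numbers): if four scalar GEPs cost 4 and the
  // single vectorized GEP over the common base costs 1, the lambda below
  // returns -3 and that saving is credited to the vectorized tree.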
9388 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) { 9389 assert((E->State == TreeEntry::Vectorize || 9390 E->State == TreeEntry::StridedVectorize) && 9391 "Entry state expected to be Vectorize or StridedVectorize here."); 9392 InstructionCost ScalarCost = 0; 9393 InstructionCost VecCost = 0; 9394 std::tie(ScalarCost, VecCost) = getGEPCosts( 9395 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy); 9396 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, 9397 "Calculated GEPs cost for Tree")); 9398 9399 return VecCost - ScalarCost; 9400 }; 9401 9402 switch (ShuffleOrOp) { 9403 case Instruction::PHI: { 9404 // Count reused scalars. 9405 InstructionCost ScalarCost = 0; 9406 SmallPtrSet<const TreeEntry *, 4> CountedOps; 9407 for (Value *V : UniqueValues) { 9408 auto *PHI = dyn_cast<PHINode>(V); 9409 if (!PHI) 9410 continue; 9411 9412 ValueList Operands(PHI->getNumIncomingValues(), nullptr); 9413 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { 9414 Value *Op = PHI->getIncomingValue(I); 9415 Operands[I] = Op; 9416 } 9417 if (const TreeEntry *OpTE = getTreeEntry(Operands.front())) 9418 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second) 9419 if (!OpTE->ReuseShuffleIndices.empty()) 9420 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - 9421 OpTE->Scalars.size()); 9422 } 9423 9424 return CommonCost - ScalarCost; 9425 } 9426 case Instruction::ExtractValue: 9427 case Instruction::ExtractElement: { 9428 auto GetScalarCost = [&](unsigned Idx) { 9429 auto *I = cast<Instruction>(UniqueValues[Idx]); 9430 VectorType *SrcVecTy; 9431 if (ShuffleOrOp == Instruction::ExtractElement) { 9432 auto *EE = cast<ExtractElementInst>(I); 9433 SrcVecTy = EE->getVectorOperandType(); 9434 } else { 9435 auto *EV = cast<ExtractValueInst>(I); 9436 Type *AggregateTy = EV->getAggregateOperand()->getType(); 9437 unsigned NumElts; 9438 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy)) 9439 NumElts = ATy->getNumElements(); 9440 else 9441 NumElts = AggregateTy->getStructNumElements(); 9442 SrcVecTy = getWidenedType(OrigScalarTy, NumElts); 9443 } 9444 if (I->hasOneUse()) { 9445 Instruction *Ext = I->user_back(); 9446 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 9447 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) { 9448 // Use getExtractWithExtendCost() to calculate the cost of 9449 // extractelement/ext pair. 9450 InstructionCost Cost = TTI->getExtractWithExtendCost( 9451 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I)); 9452 // Subtract the cost of s|zext which is subtracted separately. 
9453 Cost -= TTI->getCastInstrCost( 9454 Ext->getOpcode(), Ext->getType(), I->getType(), 9455 TTI::getCastContextHint(Ext), CostKind, Ext); 9456 return Cost; 9457 } 9458 } 9459 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, 9460 CostKind, *getExtractIndex(I)); 9461 }; 9462 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; }; 9463 return GetCostDiff(GetScalarCost, GetVectorCost); 9464 } 9465 case Instruction::InsertElement: { 9466 assert(E->ReuseShuffleIndices.empty() && 9467 "Unique insertelements only are expected."); 9468 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); 9469 unsigned const NumElts = SrcVecTy->getNumElements(); 9470 unsigned const NumScalars = VL.size(); 9471 9472 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); 9473 9474 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 9475 unsigned OffsetBeg = *getElementIndex(VL.front()); 9476 unsigned OffsetEnd = OffsetBeg; 9477 InsertMask[OffsetBeg] = 0; 9478 for (auto [I, V] : enumerate(VL.drop_front())) { 9479 unsigned Idx = *getElementIndex(V); 9480 if (OffsetBeg > Idx) 9481 OffsetBeg = Idx; 9482 else if (OffsetEnd < Idx) 9483 OffsetEnd = Idx; 9484 InsertMask[Idx] = I + 1; 9485 } 9486 unsigned VecScalarsSz = PowerOf2Ceil(NumElts); 9487 if (NumOfParts > 0) 9488 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); 9489 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * 9490 VecScalarsSz; 9491 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); 9492 unsigned InsertVecSz = std::min<unsigned>( 9493 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), 9494 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); 9495 bool IsWholeSubvector = 9496 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); 9497 // Check if we can safely insert a subvector. If it is not possible, just 9498 // generate a whole-sized vector and shuffle the source vector and the new 9499 // subvector. 9500 if (OffsetBeg + InsertVecSz > VecSz) { 9501 // Align OffsetBeg to generate correct mask. 9502 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); 9503 InsertVecSz = VecSz; 9504 } 9505 9506 APInt DemandedElts = APInt::getZero(NumElts); 9507 // TODO: Add support for Instruction::InsertValue. 9508 SmallVector<int> Mask; 9509 if (!E->ReorderIndices.empty()) { 9510 inversePermutation(E->ReorderIndices, Mask); 9511 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem); 9512 } else { 9513 Mask.assign(VecSz, PoisonMaskElem); 9514 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); 9515 } 9516 bool IsIdentity = true; 9517 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem); 9518 Mask.swap(PrevMask); 9519 for (unsigned I = 0; I < NumScalars; ++I) { 9520 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]); 9521 DemandedElts.setBit(InsertIdx); 9522 IsIdentity &= InsertIdx - OffsetBeg == I; 9523 Mask[InsertIdx - OffsetBeg] = I; 9524 } 9525 assert(Offset < NumElts && "Failed to find vector index offset"); 9526 9527 InstructionCost Cost = 0; 9528 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, 9529 /*Insert*/ true, /*Extract*/ false, 9530 CostKind); 9531 9532 // First cost - resize to actual vector size if not identity shuffle or 9533 // need to shift the vector. 9534 // Do not calculate the cost if the actual size is the register size and 9535 // we can merge this shuffle with the following SK_Select. 
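    // Worked example (hypothetical values): inserting 4 scalars at indices
    // 2..5 of an <8 x T> destination with a single register part gives
    // OffsetBeg == 2, InsertVecSz == 4 and VecSz == 8, so the scalars are
    // modeled as a <4 x T> subvector that is then inserted into the wider
    // source vector at offset 2.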
9536 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz); 9537 if (!IsIdentity) 9538 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, 9539 InsertVecTy, Mask); 9540 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 9541 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 9542 })); 9543 // Second cost - permutation with subvector, if some elements are from the 9544 // initial vector or inserting a subvector. 9545 // TODO: Implement the analysis of the FirstInsert->getOperand(0) 9546 // subvector of ActualVecTy. 9547 SmallBitVector InMask = 9548 isUndefVector(FirstInsert->getOperand(0), 9549 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask)); 9550 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { 9551 if (InsertVecSz != VecSz) { 9552 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz); 9553 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, 9554 std::nullopt, CostKind, OffsetBeg - Offset, 9555 InsertVecTy); 9556 } else { 9557 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) 9558 Mask[I] = InMask.test(I) ? PoisonMaskElem : I; 9559 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; 9560 I <= End; ++I) 9561 if (Mask[I] != PoisonMaskElem) 9562 Mask[I] = I + VecSz; 9563 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) 9564 Mask[I] = 9565 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I; 9566 Cost += 9567 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); 9568 } 9569 } 9570 return Cost; 9571 } 9572 case Instruction::ZExt: 9573 case Instruction::SExt: 9574 case Instruction::FPToUI: 9575 case Instruction::FPToSI: 9576 case Instruction::FPExt: 9577 case Instruction::PtrToInt: 9578 case Instruction::IntToPtr: 9579 case Instruction::SIToFP: 9580 case Instruction::UIToFP: 9581 case Instruction::Trunc: 9582 case Instruction::FPTrunc: 9583 case Instruction::BitCast: { 9584 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 9585 Type *SrcScalarTy = VL0->getOperand(0)->getType(); 9586 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size()); 9587 unsigned Opcode = ShuffleOrOp; 9588 unsigned VecOpcode = Opcode; 9589 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && 9590 (SrcIt != MinBWs.end() || It != MinBWs.end())) { 9591 // Check if the values are candidates to demote. 9592 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); 9593 if (SrcIt != MinBWs.end()) { 9594 SrcBWSz = SrcIt->second.first; 9595 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz); 9596 SrcVecTy = getWidenedType(SrcScalarTy, VL.size()); 9597 } 9598 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 9599 if (BWSz == SrcBWSz) { 9600 VecOpcode = Instruction::BitCast; 9601 } else if (BWSz < SrcBWSz) { 9602 VecOpcode = Instruction::Trunc; 9603 } else if (It != MinBWs.end()) { 9604 assert(BWSz > SrcBWSz && "Invalid cast!"); 9605 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; 9606 } else if (SrcIt != MinBWs.end()) { 9607 assert(BWSz > SrcBWSz && "Invalid cast!"); 9608 VecOpcode = 9609 SrcIt->second.second ? 
Instruction::SExt : Instruction::ZExt; 9610 } 9611 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && 9612 !SrcIt->second.second) { 9613 VecOpcode = Instruction::UIToFP; 9614 } 9615 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { 9616 auto *VI = cast<Instruction>(UniqueValues[Idx]); 9617 return TTI->getCastInstrCost(Opcode, VL0->getType(), 9618 VL0->getOperand(0)->getType(), 9619 TTI::getCastContextHint(VI), CostKind, VI); 9620 }; 9621 auto GetVectorCost = [=](InstructionCost CommonCost) { 9622 // Do not count cost here if minimum bitwidth is in effect and it is just 9623 // a bitcast (here it is just a noop). 9624 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) 9625 return CommonCost; 9626 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; 9627 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0)); 9628 return CommonCost + 9629 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind, 9630 VecOpcode == Opcode ? VI : nullptr); 9631 }; 9632 return GetCostDiff(GetScalarCost, GetVectorCost); 9633 } 9634 case Instruction::FCmp: 9635 case Instruction::ICmp: 9636 case Instruction::Select: { 9637 CmpInst::Predicate VecPred, SwappedVecPred; 9638 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value()); 9639 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) || 9640 match(VL0, MatchCmp)) 9641 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred); 9642 else 9643 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy() 9644 ? CmpInst::BAD_FCMP_PREDICATE 9645 : CmpInst::BAD_ICMP_PREDICATE; 9646 auto GetScalarCost = [&](unsigned Idx) { 9647 auto *VI = cast<Instruction>(UniqueValues[Idx]); 9648 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy() 9649 ? CmpInst::BAD_FCMP_PREDICATE 9650 : CmpInst::BAD_ICMP_PREDICATE; 9651 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); 9652 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) && 9653 !match(VI, MatchCmp)) || 9654 (CurrentPred != VecPred && CurrentPred != SwappedVecPred)) 9655 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() 9656 ? CmpInst::BAD_FCMP_PREDICATE 9657 : CmpInst::BAD_ICMP_PREDICATE; 9658 9659 InstructionCost ScalarCost = TTI->getCmpSelInstrCost( 9660 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, 9661 CostKind, VI); 9662 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI); 9663 if (MinMaxID != Intrinsic::not_intrinsic) { 9664 Type *CanonicalType = OrigScalarTy; 9665 if (CanonicalType->isPtrOrPtrVectorTy()) 9666 CanonicalType = CanonicalType->getWithNewType(IntegerType::get( 9667 CanonicalType->getContext(), 9668 DL->getTypeSizeInBits(CanonicalType->getScalarType()))); 9669 9670 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, 9671 {CanonicalType, CanonicalType}); 9672 InstructionCost IntrinsicCost = 9673 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 9674 // If the selects are the only uses of the compares, they will be 9675 // dead and we can adjust the cost by removing their cost. 
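        // E.g. (illustrative): for
        //   %c = icmp slt i32 %x, %y
        //   %m = select i1 %c, i32 %x, i32 %y
        // the pair is costed as an smin intrinsic, and the compare's cost is
        // subtracted because the select is its only user.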
9676 if (SelectOnly) { 9677 auto *CI = cast<CmpInst>(VI->getOperand(0)); 9678 IntrinsicCost -= TTI->getCmpSelInstrCost( 9679 CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), 9680 CI->getPredicate(), CostKind, CI); 9681 } 9682 ScalarCost = std::min(ScalarCost, IntrinsicCost); 9683 } 9684 9685 return ScalarCost; 9686 }; 9687 auto GetVectorCost = [&](InstructionCost CommonCost) { 9688 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); 9689 9690 InstructionCost VecCost = TTI->getCmpSelInstrCost( 9691 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); 9692 // Check if it is possible and profitable to use min/max for selects 9693 // in VL. 9694 // 9695 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL); 9696 if (MinMaxID != Intrinsic::not_intrinsic) { 9697 Type *CanonicalType = VecTy; 9698 if (CanonicalType->isPtrOrPtrVectorTy()) 9699 CanonicalType = CanonicalType->getWithNewType(IntegerType::get( 9700 CanonicalType->getContext(), 9701 DL->getTypeSizeInBits(CanonicalType->getScalarType()))); 9702 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, 9703 {CanonicalType, CanonicalType}); 9704 InstructionCost IntrinsicCost = 9705 TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 9706 // If the selects are the only uses of the compares, they will be 9707 // dead and we can adjust the cost by removing their cost. 9708 if (SelectOnly) { 9709 auto *CI = 9710 cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0)); 9711 IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy, 9712 MaskTy, VecPred, CostKind); 9713 } 9714 VecCost = std::min(VecCost, IntrinsicCost); 9715 } 9716 return VecCost + CommonCost; 9717 }; 9718 return GetCostDiff(GetScalarCost, GetVectorCost); 9719 } 9720 case Instruction::FNeg: 9721 case Instruction::Add: 9722 case Instruction::FAdd: 9723 case Instruction::Sub: 9724 case Instruction::FSub: 9725 case Instruction::Mul: 9726 case Instruction::FMul: 9727 case Instruction::UDiv: 9728 case Instruction::SDiv: 9729 case Instruction::FDiv: 9730 case Instruction::URem: 9731 case Instruction::SRem: 9732 case Instruction::FRem: 9733 case Instruction::Shl: 9734 case Instruction::LShr: 9735 case Instruction::AShr: 9736 case Instruction::And: 9737 case Instruction::Or: 9738 case Instruction::Xor: { 9739 auto GetScalarCost = [&](unsigned Idx) { 9740 auto *VI = cast<Instruction>(UniqueValues[Idx]); 9741 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1; 9742 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); 9743 TTI::OperandValueInfo Op2Info = 9744 TTI::getOperandInfo(VI->getOperand(OpIdx)); 9745 SmallVector<const Value *> Operands(VI->operand_values()); 9746 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, 9747 Op1Info, Op2Info, Operands, VI); 9748 }; 9749 auto GetVectorCost = [=](InstructionCost CommonCost) { 9750 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { 9751 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) { 9752 ArrayRef<Value *> Ops = E->getOperand(I); 9753 if (all_of(Ops, [&](Value *Op) { 9754 auto *CI = dyn_cast<ConstantInt>(Op); 9755 return CI && CI->getValue().countr_one() >= It->second.first; 9756 })) 9757 return CommonCost; 9758 } 9759 } 9760 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 
0 : 1; 9761 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0)); 9762 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx)); 9763 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, 9764 Op2Info, std::nullopt, nullptr, TLI) + 9765 CommonCost; 9766 }; 9767 return GetCostDiff(GetScalarCost, GetVectorCost); 9768 } 9769 case Instruction::GetElementPtr: { 9770 return CommonCost + GetGEPCostDiff(VL, VL0); 9771 } 9772 case Instruction::Load: { 9773 auto GetScalarCost = [&](unsigned Idx) { 9774 auto *VI = cast<LoadInst>(UniqueValues[Idx]); 9775 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy, 9776 VI->getAlign(), VI->getPointerAddressSpace(), 9777 CostKind, TTI::OperandValueInfo(), VI); 9778 }; 9779 auto *LI0 = cast<LoadInst>(VL0); 9780 auto GetVectorCost = [&](InstructionCost CommonCost) { 9781 InstructionCost VecLdCost; 9782 if (E->State == TreeEntry::Vectorize) { 9783 VecLdCost = TTI->getMemoryOpCost( 9784 Instruction::Load, VecTy, LI0->getAlign(), 9785 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); 9786 } else if (E->State == TreeEntry::StridedVectorize) { 9787 Align CommonAlignment = 9788 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef()); 9789 VecLdCost = TTI->getStridedMemoryOpCost( 9790 Instruction::Load, VecTy, LI0->getPointerOperand(), 9791 /*VariableMask=*/false, CommonAlignment, CostKind); 9792 } else { 9793 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); 9794 Align CommonAlignment = 9795 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef()); 9796 VecLdCost = TTI->getGatherScatterOpCost( 9797 Instruction::Load, VecTy, LI0->getPointerOperand(), 9798 /*VariableMask=*/false, CommonAlignment, CostKind); 9799 } 9800 return VecLdCost + CommonCost; 9801 }; 9802 9803 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); 9804 // If this node generates masked gather load then it is not a terminal node. 9805 // Hence address operand cost is estimated separately. 9806 if (E->State == TreeEntry::ScatterVectorize) 9807 return Cost; 9808 9809 // Estimate cost of GEPs since this tree node is a terminator. 9810 SmallVector<Value *> PointerOps(VL.size()); 9811 for (auto [I, V] : enumerate(VL)) 9812 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand(); 9813 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand()); 9814 } 9815 case Instruction::Store: { 9816 bool IsReorder = !E->ReorderIndices.empty(); 9817 auto GetScalarCost = [=](unsigned Idx) { 9818 auto *VI = cast<StoreInst>(VL[Idx]); 9819 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand()); 9820 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy, 9821 VI->getAlign(), VI->getPointerAddressSpace(), 9822 CostKind, OpInfo, VI); 9823 }; 9824 auto *BaseSI = 9825 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); 9826 auto GetVectorCost = [=](InstructionCost CommonCost) { 9827 // We know that we can merge the stores. Calculate the cost. 
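      // Depending on the entry state this is either one wide consecutive
      // store or a strided store; GetCostDiff then weighs it against the sum
      // of the scalar stores.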
9828 InstructionCost VecStCost; 9829 if (E->State == TreeEntry::StridedVectorize) { 9830 Align CommonAlignment = 9831 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef()); 9832 VecStCost = TTI->getStridedMemoryOpCost( 9833 Instruction::Store, VecTy, BaseSI->getPointerOperand(), 9834 /*VariableMask=*/false, CommonAlignment, CostKind); 9835 } else { 9836 assert(E->State == TreeEntry::Vectorize && 9837 "Expected either strided or consecutive stores."); 9838 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0)); 9839 VecStCost = TTI->getMemoryOpCost( 9840 Instruction::Store, VecTy, BaseSI->getAlign(), 9841 BaseSI->getPointerAddressSpace(), CostKind, OpInfo); 9842 } 9843 return VecStCost + CommonCost; 9844 }; 9845 SmallVector<Value *> PointerOps(VL.size()); 9846 for (auto [I, V] : enumerate(VL)) { 9847 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I; 9848 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand(); 9849 } 9850 9851 return GetCostDiff(GetScalarCost, GetVectorCost) + 9852 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); 9853 } 9854 case Instruction::Call: { 9855 auto GetScalarCost = [&](unsigned Idx) { 9856 auto *CI = cast<CallInst>(UniqueValues[Idx]); 9857 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 9858 if (ID != Intrinsic::not_intrinsic) { 9859 IntrinsicCostAttributes CostAttrs(ID, *CI, 1); 9860 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind); 9861 } 9862 return TTI->getCallInstrCost(CI->getCalledFunction(), 9863 CI->getFunctionType()->getReturnType(), 9864 CI->getFunctionType()->params(), CostKind); 9865 }; 9866 auto GetVectorCost = [=](InstructionCost CommonCost) { 9867 auto *CI = cast<CallInst>(VL0); 9868 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 9869 SmallVector<Type *> ArgTys = 9870 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(), 9871 It != MinBWs.end() ? It->second.first : 0); 9872 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); 9873 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost; 9874 }; 9875 return GetCostDiff(GetScalarCost, GetVectorCost); 9876 } 9877 case Instruction::ShuffleVector: { 9878 assert(E->isAltShuffle() && 9879 ((Instruction::isBinaryOp(E->getOpcode()) && 9880 Instruction::isBinaryOp(E->getAltOpcode())) || 9881 (Instruction::isCast(E->getOpcode()) && 9882 Instruction::isCast(E->getAltOpcode())) || 9883 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && 9884 "Invalid Shuffle Vector Operand"); 9885 // Try to find the previous shuffle node with the same operands and same 9886 // main/alternate ops. 9887 auto TryFindNodeWithEqualOperands = [=]() { 9888 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { 9889 if (TE.get() == E) 9890 break; 9891 if (TE->isAltShuffle() && 9892 ((TE->getOpcode() == E->getOpcode() && 9893 TE->getAltOpcode() == E->getAltOpcode()) || 9894 (TE->getOpcode() == E->getAltOpcode() && 9895 TE->getAltOpcode() == E->getOpcode())) && 9896 TE->hasEqualOperands(*E)) 9897 return true; 9898 } 9899 return false; 9900 }; 9901 auto GetScalarCost = [&](unsigned Idx) { 9902 auto *VI = cast<Instruction>(UniqueValues[Idx]); 9903 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode"); 9904 (void)E; 9905 return TTI->getInstructionCost(VI, CostKind); 9906 }; 9907 // Need to clear CommonCost since the final shuffle cost is included into 9908 // vector cost. 
9909 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) { 9910 // VecCost is equal to sum of the cost of creating 2 vectors 9911 // and the cost of creating shuffle. 9912 InstructionCost VecCost = 0; 9913 if (TryFindNodeWithEqualOperands()) { 9914 LLVM_DEBUG({ 9915 dbgs() << "SLP: diamond match for alternate node found.\n"; 9916 E->dump(); 9917 }); 9918 // No need to add new vector costs here since we're going to reuse 9919 // same main/alternate vector ops, just do different shuffling. 9920 } else if (Instruction::isBinaryOp(E->getOpcode())) { 9921 VecCost = 9922 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); 9923 VecCost += 9924 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); 9925 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 9926 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); 9927 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, 9928 CI0->getPredicate(), CostKind, VL0); 9929 VecCost += TTIRef.getCmpSelInstrCost( 9930 E->getOpcode(), VecTy, MaskTy, 9931 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, 9932 E->getAltOp()); 9933 } else { 9934 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType(); 9935 auto *SrcTy = getWidenedType(SrcSclTy, VL.size()); 9936 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { 9937 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 9938 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 9939 unsigned SrcBWSz = 9940 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType()); 9941 if (SrcIt != MinBWs.end()) { 9942 SrcBWSz = SrcIt->second.first; 9943 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz); 9944 SrcTy = getWidenedType(SrcSclTy, VL.size()); 9945 } 9946 if (BWSz <= SrcBWSz) { 9947 if (BWSz < SrcBWSz) 9948 VecCost = 9949 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy, 9950 TTI::CastContextHint::None, CostKind); 9951 LLVM_DEBUG({ 9952 dbgs() 9953 << "SLP: alternate extension, which should be truncated.\n"; 9954 E->dump(); 9955 }); 9956 return VecCost; 9957 } 9958 } 9959 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy, 9960 TTI::CastContextHint::None, CostKind); 9961 VecCost += 9962 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy, 9963 TTI::CastContextHint::None, CostKind); 9964 } 9965 SmallVector<int> Mask; 9966 E->buildAltOpShuffleMask( 9967 [E](Instruction *I) { 9968 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 9969 return I->getOpcode() == E->getAltOpcode(); 9970 }, 9971 Mask); 9972 VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc, 9973 FinalVecTy, Mask); 9974 // Patterns like [fadd,fsub] can be combined into a single instruction 9975 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we 9976 // need to take into account their order when looking for the most used 9977 // order. 9978 unsigned Opcode0 = E->getOpcode(); 9979 unsigned Opcode1 = E->getAltOpcode(); 9980 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1)); 9981 // If this pattern is supported by the target then we consider the 9982 // order. 9983 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { 9984 InstructionCost AltVecCost = TTIRef.getAltInstrCost( 9985 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); 9986 return AltVecCost < VecCost ? AltVecCost : VecCost; 9987 } 9988 // TODO: Check the reverse order too. 
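      // Illustrative case: for scalars <fadd, fsub, fadd, fsub> the
      // OpcodeMask above marks the alternate (fsub) lanes, and when the
      // target reports the alternating pattern as a single legal instruction
      // the cheaper of the two estimates is returned.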
9989       return VecCost;
9990     };
9991     return GetCostDiff(GetScalarCost, GetVectorCost);
9992   }
9993   default:
9994     llvm_unreachable("Unknown instruction");
9995   }
9996 }
9997 
9998 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000                     << VectorizableTree.size() << " is fully vectorizable.\n");
10001 
10002   auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003     SmallVector<int> Mask;
10004     return TE->isGather() &&
10005            !any_of(TE->Scalars,
10006                    [this](Value *V) { return EphValues.contains(V); }) &&
10007            (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10008             TE->Scalars.size() < Limit ||
10009             ((TE->getOpcode() == Instruction::ExtractElement ||
10010               all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10011              isFixedVectorShuffle(TE->Scalars, Mask)) ||
10012             (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013              !TE->isAltShuffle()));
10014   };
10015 
10016   // We only handle trees of heights 1 and 2.
10017   if (VectorizableTree.size() == 1 &&
10018       (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019        (ForReduction &&
10020         AreVectorizableGathers(VectorizableTree[0].get(),
10021                                VectorizableTree[0]->Scalars.size()) &&
10022         VectorizableTree[0]->getVectorFactor() > 2)))
10023     return true;
10024 
10025   if (VectorizableTree.size() != 2)
10026     return false;
10027 
10028   // Handle splat and all-constant stores. Also try to vectorize tiny trees
10029   // whose second node is a gather with fewer scalar operands than the first
10030   // node (it may be profitable to shuffle the second gather), or whose
10031   // scalars are extractelements that form a shuffle.
10032   SmallVector<int> Mask;
10033   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034       AreVectorizableGathers(VectorizableTree[1].get(),
10035                              VectorizableTree[0]->Scalars.size()))
10036     return true;
10037 
10038   // Gathering cost would be too much for tiny trees.
10039   if (VectorizableTree[0]->isGather() ||
10040       (VectorizableTree[1]->isGather() &&
10041        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042        VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043     return false;
10044 
10045   return true;
10046 }
10047 
10048 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049                                        TargetTransformInfo *TTI,
10050                                        bool MustMatchOrInst) {
10051   // Look past the root to find a source value. Arbitrarily follow the
10052   // path through operand 0 of any 'or'. Also, peek through optional
10053   // shift-left-by-multiple-of-8-bits.
10054   Value *ZextLoad = Root;
10055   const APInt *ShAmtC;
10056   bool FoundOr = false;
10057   while (!isa<ConstantExpr>(ZextLoad) &&
10058          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10059           (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10060            ShAmtC->urem(8) == 0))) {
10061     auto *BinOp = cast<BinaryOperator>(ZextLoad);
10062     ZextLoad = BinOp->getOperand(0);
10063     if (BinOp->getOpcode() == Instruction::Or)
10064       FoundOr = true;
10065   }
10066   // Check if the input is an extended load of the required or/shift expression.
10067   Value *Load;
10068   if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069       !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10070     return false;
10071 
10072   // Require that the total load bit width is a legal integer type.
10073   // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074   // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /* MatchOr */ false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
      return false;
  }
  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable to vectorize, so we can skip it if the cost threshold is left
  // at its default. The cost of vectorized PHI nodes is almost always 0 plus
  // the cost of gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes form an insertelement buildvector
  // somewhere.
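  // I.e. check whether the gathered scalars are themselves consumed by a
  // chain of insertelements forming a buildvector, e.g. (hypothetical IR):
  //   %bv0 = insertelement <4 x i32> poison, i32 %a, i32 0
  //   %bv1 = insertelement <4 x i32> %bv0, i32 %b, i32 1
  // If so, the tree is kept (the function returns false below), since such
  // gathers are likely to fold into the final buildvector.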
10148 bool IsAllowedSingleBVNode = 10149 VectorizableTree.size() > 1 || 10150 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() && 10151 !VectorizableTree.front()->isAltShuffle() && 10152 VectorizableTree.front()->getOpcode() != Instruction::PHI && 10153 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && 10154 allSameBlock(VectorizableTree.front()->Scalars)); 10155 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { 10156 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) { 10157 return isa<ExtractElementInst, UndefValue>(V) || 10158 (IsAllowedSingleBVNode && 10159 !V->hasNUsesOrMore(UsesLimit) && 10160 any_of(V->users(), IsaPred<InsertElementInst>)); 10161 }); 10162 })) 10163 return false; 10164 10165 assert(VectorizableTree.empty() 10166 ? ExternalUses.empty() 10167 : true && "We shouldn't have any external users"); 10168 10169 // Otherwise, we can't vectorize the tree. It is both tiny and not fully 10170 // vectorizable. 10171 return true; 10172 } 10173 10174 InstructionCost BoUpSLP::getSpillCost() const { 10175 // Walk from the bottom of the tree to the top, tracking which values are 10176 // live. When we see a call instruction that is not part of our tree, 10177 // query TTI to see if there is a cost to keeping values live over it 10178 // (for example, if spills and fills are required). 10179 unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); 10180 InstructionCost Cost = 0; 10181 10182 SmallPtrSet<Instruction *, 4> LiveValues; 10183 Instruction *PrevInst = nullptr; 10184 10185 // The entries in VectorizableTree are not necessarily ordered by their 10186 // position in basic blocks. Collect them and order them by dominance so later 10187 // instructions are guaranteed to be visited first. For instructions in 10188 // different basic blocks, we only scan to the beginning of the block, so 10189 // their order does not matter, as long as all instructions in a basic block 10190 // are grouped together. Using dominance ensures a deterministic order. 10191 SmallVector<Instruction *, 16> OrderedScalars; 10192 for (const auto &TEPtr : VectorizableTree) { 10193 if (TEPtr->State != TreeEntry::Vectorize) 10194 continue; 10195 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); 10196 if (!Inst) 10197 continue; 10198 OrderedScalars.push_back(Inst); 10199 } 10200 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { 10201 auto *NodeA = DT->getNode(A->getParent()); 10202 auto *NodeB = DT->getNode(B->getParent()); 10203 assert(NodeA && "Should only process reachable instructions"); 10204 assert(NodeB && "Should only process reachable instructions"); 10205 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 10206 "Different nodes should have different DFS numbers"); 10207 if (NodeA != NodeB) 10208 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn(); 10209 return B->comesBefore(A); 10210 }); 10211 10212 for (Instruction *Inst : OrderedScalars) { 10213 if (!PrevInst) { 10214 PrevInst = Inst; 10215 continue; 10216 } 10217 10218 // Update LiveValues. 
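    // PrevInst itself is no longer live past this point; any of its operands
    // that are vectorized tree instructions become live until the walk
    // reaches their definitions. E.g., for a hypothetical "%c = add i32 %a,
    // %b" with %a and %b both in the tree, %c leaves LiveValues and %a and %b
    // are added.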
10219 LiveValues.erase(PrevInst); 10220 for (auto &J : PrevInst->operands()) { 10221 if (isa<Instruction>(&*J) && getTreeEntry(&*J)) 10222 LiveValues.insert(cast<Instruction>(&*J)); 10223 } 10224 10225 LLVM_DEBUG({ 10226 dbgs() << "SLP: #LV: " << LiveValues.size(); 10227 for (auto *X : LiveValues) 10228 dbgs() << " " << X->getName(); 10229 dbgs() << ", Looking at "; 10230 Inst->dump(); 10231 }); 10232 10233 // Now find the sequence of instructions between PrevInst and Inst. 10234 unsigned NumCalls = 0; 10235 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), 10236 PrevInstIt = 10237 PrevInst->getIterator().getReverse(); 10238 while (InstIt != PrevInstIt) { 10239 if (PrevInstIt == PrevInst->getParent()->rend()) { 10240 PrevInstIt = Inst->getParent()->rbegin(); 10241 continue; 10242 } 10243 10244 auto NoCallIntrinsic = [this](Instruction *I) { 10245 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 10246 if (II->isAssumeLikeIntrinsic()) 10247 return true; 10248 FastMathFlags FMF; 10249 SmallVector<Type *, 4> Tys; 10250 for (auto &ArgOp : II->args()) 10251 Tys.push_back(ArgOp->getType()); 10252 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 10253 FMF = FPMO->getFastMathFlags(); 10254 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, 10255 FMF); 10256 InstructionCost IntrCost = 10257 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput); 10258 InstructionCost CallCost = TTI->getCallInstrCost( 10259 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput); 10260 if (IntrCost < CallCost) 10261 return true; 10262 } 10263 return false; 10264 }; 10265 10266 // Debug information does not impact spill cost. 10267 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) && 10268 &*PrevInstIt != PrevInst) 10269 NumCalls++; 10270 10271 ++PrevInstIt; 10272 } 10273 10274 if (NumCalls) { 10275 SmallVector<Type *, 4> V; 10276 for (auto *II : LiveValues) { 10277 auto *ScalarTy = II->getType(); 10278 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) 10279 ScalarTy = VectorTy->getElementType(); 10280 V.push_back(getWidenedType(ScalarTy, BundleWidth)); 10281 } 10282 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); 10283 } 10284 10285 PrevInst = Inst; 10286 } 10287 10288 return Cost; 10289 } 10290 10291 /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the 10292 /// buildvector sequence. 10293 static bool isFirstInsertElement(const InsertElementInst *IE1, 10294 const InsertElementInst *IE2) { 10295 if (IE1 == IE2) 10296 return false; 10297 const auto *I1 = IE1; 10298 const auto *I2 = IE2; 10299 const InsertElementInst *PrevI1; 10300 const InsertElementInst *PrevI2; 10301 unsigned Idx1 = *getElementIndex(IE1); 10302 unsigned Idx2 = *getElementIndex(IE2); 10303 do { 10304 if (I2 == IE1) 10305 return true; 10306 if (I1 == IE2) 10307 return false; 10308 PrevI1 = I1; 10309 PrevI2 = I2; 10310 if (I1 && (I1 == IE1 || I1->hasOneUse()) && 10311 getElementIndex(I1).value_or(Idx2) != Idx2) 10312 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0)); 10313 if (I2 && ((I2 == IE2 || I2->hasOneUse())) && 10314 getElementIndex(I2).value_or(Idx1) != Idx1) 10315 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0)); 10316 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); 10317 llvm_unreachable("Two different buildvectors not expected."); 10318 } 10319 10320 namespace { 10321 /// Returns incoming Value *, if the requested type is Value * too, or a default 10322 /// value, otherwise. 
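/// For instance, ValueSelect::get<Value *>(V) returns V itself, while
/// ValueSelect::get<const TreeEntry *>(V) returns a default-constructed
/// (null) pointer. This lets performExtractsShuffleAction below be
/// instantiated for cost estimation (T = const TreeEntry, see getTreeCost)
/// as well as for types that really carry the base Value.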
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). The other shuffle masks are combined with the result of
/// this first stage and processed as a shuffle of 2 input vectors.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining the shuffles between the
    // steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size;
      // we can shuffle them directly.
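      // For example (hypothetical masks), with Vec1VF == Vec2VF == 4,
      //   Mask    = {0, P, 1, P}   (P = PoisonMaskElem)
      //   SecMask = {P, 2, P, 3}
      // the combined two-source mask becomes {0, 6, 1, 7}: lanes taken from
      // the second vector are biased by Vec1VF.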
10394 ArrayRef<int> SecMask = VMIt->second; 10395 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 10396 if (SecMask[I] != PoisonMaskElem) { 10397 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); 10398 Mask[I] = SecMask[I] + Vec1VF; 10399 } 10400 } 10401 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); 10402 } else { 10403 // Vectors of different sizes - resize and reshuffle. 10404 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask, 10405 /*ForSingleMask=*/false); 10406 std::pair<T *, bool> Res2 = 10407 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 10408 ArrayRef<int> SecMask = VMIt->second; 10409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 10410 if (Mask[I] != PoisonMaskElem) { 10411 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars."); 10412 if (Res1.second) 10413 Mask[I] = I; 10414 } else if (SecMask[I] != PoisonMaskElem) { 10415 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars."); 10416 Mask[I] = (Res2.second ? I : SecMask[I]) + VF; 10417 } 10418 } 10419 Prev = Action(Mask, {Res1.first, Res2.first}); 10420 } 10421 VMIt = std::next(VMIt); 10422 } 10423 bool IsBaseNotUndef = !IsBaseUndef.all(); 10424 (void)IsBaseNotUndef; 10425 // Perform requested actions for the remaining masks/vectors. 10426 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { 10427 // Shuffle other input vectors, if any. 10428 std::pair<T *, bool> Res = 10429 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); 10430 ArrayRef<int> SecMask = VMIt->second; 10431 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { 10432 if (SecMask[I] != PoisonMaskElem) { 10433 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) && 10434 "Multiple uses of scalars."); 10435 Mask[I] = (Res.second ? I : SecMask[I]) + VF; 10436 } else if (Mask[I] != PoisonMaskElem) { 10437 Mask[I] = I; 10438 } 10439 } 10440 Prev = Action(Mask, {Prev, Res.first}); 10441 } 10442 return Prev; 10443 } 10444 10445 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { 10446 InstructionCost Cost = 0; 10447 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " 10448 << VectorizableTree.size() << ".\n"); 10449 10450 unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); 10451 10452 SmallPtrSet<Value *, 4> CheckedExtracts; 10453 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { 10454 TreeEntry &TE = *VectorizableTree[I]; 10455 if (TE.isGather()) { 10456 if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); 10457 E && E->getVectorFactor() == TE.getVectorFactor() && 10458 E->isSame(TE.Scalars)) { 10459 // Some gather nodes might be absolutely the same as some vectorizable 10460 // nodes after reordering, need to handle it. 
10461 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " 10462 << shortBundleName(TE.Scalars) << ".\n" 10463 << "SLP: Current total cost = " << Cost << "\n"); 10464 continue; 10465 } 10466 } 10467 10468 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); 10469 Cost += C; 10470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " 10471 << shortBundleName(TE.Scalars) << ".\n" 10472 << "SLP: Current total cost = " << Cost << "\n"); 10473 } 10474 10475 SmallPtrSet<Value *, 16> ExtractCostCalculated; 10476 InstructionCost ExtractCost = 0; 10477 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks; 10478 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers; 10479 SmallVector<APInt> DemandedElts; 10480 SmallDenseSet<Value *, 4> UsedInserts; 10481 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts; 10482 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses; 10483 for (ExternalUser &EU : ExternalUses) { 10484 // We only add extract cost once for the same scalar. 10485 if (!isa_and_nonnull<InsertElementInst>(EU.User) && 10486 !ExtractCostCalculated.insert(EU.Scalar).second) 10487 continue; 10488 10489 // Uses by ephemeral values are free (because the ephemeral value will be 10490 // removed prior to code generation, and so the extraction will be 10491 // removed as well). 10492 if (EphValues.count(EU.User)) 10493 continue; 10494 10495 // No extract cost for vector "scalar" 10496 if (isa<FixedVectorType>(EU.Scalar->getType())) 10497 continue; 10498 10499 // If found user is an insertelement, do not calculate extract cost but try 10500 // to detect it as a final shuffled/identity match. 10501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User); 10502 VU && VU->getOperand(1) == EU.Scalar) { 10503 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { 10504 if (!UsedInserts.insert(VU).second) 10505 continue; 10506 std::optional<unsigned> InsertIdx = getElementIndex(VU); 10507 if (InsertIdx) { 10508 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); 10509 auto *It = find_if( 10510 FirstUsers, 10511 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) { 10512 return areTwoInsertFromSameBuildVector( 10513 VU, cast<InsertElementInst>(Pair.first), 10514 [this](InsertElementInst *II) -> Value * { 10515 Value *Op0 = II->getOperand(0); 10516 if (getTreeEntry(II) && !getTreeEntry(Op0)) 10517 return nullptr; 10518 return Op0; 10519 }); 10520 }); 10521 int VecId = -1; 10522 if (It == FirstUsers.end()) { 10523 (void)ShuffleMasks.emplace_back(); 10524 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE]; 10525 if (Mask.empty()) 10526 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 10527 // Find the insertvector, vectorized in tree, if any. 10528 Value *Base = VU; 10529 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { 10530 if (IEBase != EU.User && 10531 (!IEBase->hasOneUse() || 10532 getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) 10533 break; 10534 // Build the mask for the vectorized insertelement instructions. 
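              // E.g. for a hypothetical chain where %i0..%i2 all belong to
              // the same vectorized entry:
              //   %i0 = insertelement <4 x float> poison, float %a, i32 0
              //   %i1 = insertelement <4 x float> %i0, float %b, i32 1
              //   %i2 = insertelement <4 x float> %i1, float %c, i32 2
              // the loop below records Mask = {0, 1, 2, P}, marking the lanes
              // already produced by the vectorized buildvector.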
10535 if (const TreeEntry *E = getTreeEntry(IEBase)) { 10536 VU = IEBase; 10537 do { 10538 IEBase = cast<InsertElementInst>(Base); 10539 int Idx = *getElementIndex(IEBase); 10540 assert(Mask[Idx] == PoisonMaskElem && 10541 "InsertElementInstruction used already."); 10542 Mask[Idx] = Idx; 10543 Base = IEBase->getOperand(0); 10544 } while (E == getTreeEntry(Base)); 10545 break; 10546 } 10547 Base = cast<InsertElementInst>(Base)->getOperand(0); 10548 } 10549 FirstUsers.emplace_back(VU, ScalarTE); 10550 DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); 10551 VecId = FirstUsers.size() - 1; 10552 auto It = MinBWs.find(ScalarTE); 10553 if (It != MinBWs.end() && 10554 VectorCasts 10555 .insert(std::make_pair(ScalarTE, FTy->getElementType())) 10556 .second) { 10557 unsigned BWSz = It->second.first; 10558 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType()); 10559 unsigned VecOpcode; 10560 if (DstBWSz < BWSz) 10561 VecOpcode = Instruction::Trunc; 10562 else 10563 VecOpcode = 10564 It->second.second ? Instruction::SExt : Instruction::ZExt; 10565 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 10566 InstructionCost C = TTI->getCastInstrCost( 10567 VecOpcode, FTy, 10568 getWidenedType(IntegerType::get(FTy->getContext(), BWSz), 10569 FTy->getNumElements()), 10570 TTI::CastContextHint::None, CostKind); 10571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 10572 << " for extending externally used vector with " 10573 "non-equal minimum bitwidth.\n"); 10574 Cost += C; 10575 } 10576 } else { 10577 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first))) 10578 It->first = VU; 10579 VecId = std::distance(FirstUsers.begin(), It); 10580 } 10581 int InIdx = *InsertIdx; 10582 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE]; 10583 if (Mask.empty()) 10584 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 10585 Mask[InIdx] = EU.Lane; 10586 DemandedElts[VecId].setBit(InIdx); 10587 continue; 10588 } 10589 } 10590 } 10591 // Leave the GEPs as is, they are free in most cases and better to keep them 10592 // as GEPs. 10593 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 10594 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) { 10595 if (!ValueToExtUses) { 10596 ValueToExtUses.emplace(); 10597 for_each(enumerate(ExternalUses), [&](const auto &P) { 10598 ValueToExtUses->try_emplace(P.value().Scalar, P.index()); 10599 }); 10600 } 10601 // Can use original GEP, if no operands vectorized or they are marked as 10602 // externally used already. 10603 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) { 10604 if (!getTreeEntry(V)) 10605 return true; 10606 auto It = ValueToExtUses->find(V); 10607 if (It != ValueToExtUses->end()) { 10608 // Replace all uses to avoid compiler crash. 10609 ExternalUses[It->second].User = nullptr; 10610 return true; 10611 } 10612 return false; 10613 }); 10614 if (CanBeUsedAsGEP) { 10615 ExtractCost += TTI->getInstructionCost(GEP, CostKind); 10616 ExternalUsesAsGEPs.insert(EU.Scalar); 10617 continue; 10618 } 10619 } 10620 10621 // If we plan to rewrite the tree in a smaller type, we will need to sign 10622 // extend the extracted value back to the original type. Here, we account 10623 // for the extract and the added cost of the sign extend if needed. 10624 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth); 10625 auto It = MinBWs.find(getTreeEntry(EU.Scalar)); 10626 if (It != MinBWs.end()) { 10627 auto *MinTy = IntegerType::get(F->getContext(), It->second.first); 10628 unsigned Extend = 10629 It->second.second ? 
Instruction::SExt : Instruction::ZExt; 10630 VecTy = getWidenedType(MinTy, BundleWidth); 10631 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), 10632 VecTy, EU.Lane); 10633 } else { 10634 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 10635 CostKind, EU.Lane); 10636 } 10637 } 10638 // Add reduced value cost, if resized. 10639 if (!VectorizedVals.empty()) { 10640 const TreeEntry &Root = *VectorizableTree.front(); 10641 auto BWIt = MinBWs.find(&Root); 10642 if (BWIt != MinBWs.end()) { 10643 Type *DstTy = Root.Scalars.front()->getType(); 10644 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy); 10645 unsigned SrcSz = 10646 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth; 10647 if (OriginalSz != SrcSz) { 10648 unsigned Opcode = Instruction::Trunc; 10649 if (OriginalSz > SrcSz) 10650 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt; 10651 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz); 10652 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, 10653 TTI::CastContextHint::None, 10654 TTI::TCK_RecipThroughput); 10655 } 10656 } 10657 } 10658 10659 InstructionCost SpillCost = getSpillCost(); 10660 Cost += SpillCost + ExtractCost; 10661 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask, 10662 bool) { 10663 InstructionCost C = 0; 10664 unsigned VF = Mask.size(); 10665 unsigned VecVF = TE->getVectorFactor(); 10666 if (VF != VecVF && 10667 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) || 10668 !ShuffleVectorInst::isIdentityMask(Mask, VF))) { 10669 SmallVector<int> OrigMask(VecVF, PoisonMaskElem); 10670 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), 10671 OrigMask.begin()); 10672 C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, 10673 getWidenedType(TE->getMainOp()->getType(), VecVF), 10674 OrigMask); 10675 LLVM_DEBUG( 10676 dbgs() << "SLP: Adding cost " << C 10677 << " for final shuffle of insertelement external users.\n"; 10678 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); 10679 Cost += C; 10680 return std::make_pair(TE, true); 10681 } 10682 return std::make_pair(TE, false); 10683 }; 10684 // Calculate the cost of the reshuffled vectors, if any. 
10685 for (int I = 0, E = FirstUsers.size(); I < E; ++I) { 10686 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0); 10687 auto Vector = ShuffleMasks[I].takeVector(); 10688 unsigned VF = 0; 10689 auto EstimateShufflesCost = [&](ArrayRef<int> Mask, 10690 ArrayRef<const TreeEntry *> TEs) { 10691 assert((TEs.size() == 1 || TEs.size() == 2) && 10692 "Expected exactly 1 or 2 tree entries."); 10693 if (TEs.size() == 1) { 10694 if (VF == 0) 10695 VF = TEs.front()->getVectorFactor(); 10696 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); 10697 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) && 10698 !all_of(enumerate(Mask), [=](const auto &Data) { 10699 return Data.value() == PoisonMaskElem || 10700 (Data.index() < VF && 10701 static_cast<int>(Data.index()) == Data.value()); 10702 })) { 10703 InstructionCost C = 10704 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); 10705 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 10706 << " for final shuffle of insertelement " 10707 "external users.\n"; 10708 TEs.front()->dump(); 10709 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 10710 Cost += C; 10711 } 10712 } else { 10713 if (VF == 0) { 10714 if (TEs.front() && 10715 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) 10716 VF = TEs.front()->getVectorFactor(); 10717 else 10718 VF = Mask.size(); 10719 } 10720 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); 10721 InstructionCost C = 10722 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); 10723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C 10724 << " for final shuffle of vector node and external " 10725 "insertelement users.\n"; 10726 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); 10727 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 10728 Cost += C; 10729 } 10730 VF = Mask.size(); 10731 return TEs.back(); 10732 }; 10733 (void)performExtractsShuffleAction<const TreeEntry>( 10734 MutableArrayRef(Vector.data(), Vector.size()), Base, 10735 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, 10736 EstimateShufflesCost); 10737 InstructionCost InsertCost = TTI->getScalarizationOverhead( 10738 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I], 10739 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput); 10740 Cost -= InsertCost; 10741 } 10742 10743 // Add the cost for reduced value resize (if required). 10744 if (ReductionBitWidth != 0) { 10745 assert(UserIgnoreList && "Expected reduction tree."); 10746 const TreeEntry &E = *VectorizableTree.front(); 10747 auto It = MinBWs.find(&E); 10748 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) { 10749 unsigned SrcSize = It->second.first; 10750 unsigned DstSize = ReductionBitWidth; 10751 unsigned Opcode = Instruction::Trunc; 10752 if (SrcSize < DstSize) 10753 Opcode = It->second.second ? 
Instruction::SExt : Instruction::ZExt; 10754 auto *SrcVecTy = 10755 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor()); 10756 auto *DstVecTy = 10757 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor()); 10758 TTI::CastContextHint CCH = getCastContextHint(E); 10759 InstructionCost CastCost; 10760 switch (E.getOpcode()) { 10761 case Instruction::SExt: 10762 case Instruction::ZExt: 10763 case Instruction::Trunc: { 10764 const TreeEntry *OpTE = getOperandEntry(&E, 0); 10765 CCH = getCastContextHint(*OpTE); 10766 break; 10767 } 10768 default: 10769 break; 10770 } 10771 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, 10772 TTI::TCK_RecipThroughput); 10773 Cost += CastCost; 10774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost 10775 << " for final resize for reduction from " << SrcVecTy 10776 << " to " << DstVecTy << "\n"; 10777 dbgs() << "SLP: Current total cost = " << Cost << "\n"); 10778 } 10779 } 10780 10781 #ifndef NDEBUG 10782 SmallString<256> Str; 10783 { 10784 raw_svector_ostream OS(Str); 10785 OS << "SLP: Spill Cost = " << SpillCost << ".\n" 10786 << "SLP: Extract Cost = " << ExtractCost << ".\n" 10787 << "SLP: Total Cost = " << Cost << ".\n"; 10788 } 10789 LLVM_DEBUG(dbgs() << Str); 10790 if (ViewSLPTree) 10791 ViewGraph(this, "SLP" + F->getName(), false, Str); 10792 #endif 10793 10794 return Cost; 10795 } 10796 10797 /// Tries to find extractelement instructions with constant indices from fixed 10798 /// vector type and gather such instructions into a bunch, which highly likely 10799 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 10800 /// successful, the matched scalars are replaced by poison values in \p VL for 10801 /// future analysis. 10802 std::optional<TTI::ShuffleKind> 10803 BoUpSLP::tryToGatherSingleRegisterExtractElements( 10804 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { 10805 // Scan list of gathered scalars for extractelements that can be represented 10806 // as shuffles. 10807 MapVector<Value *, SmallVector<int>> VectorOpToIdx; 10808 SmallVector<int> UndefVectorExtracts; 10809 for (int I = 0, E = VL.size(); I < E; ++I) { 10810 auto *EI = dyn_cast<ExtractElementInst>(VL[I]); 10811 if (!EI) { 10812 if (isa<UndefValue>(VL[I])) 10813 UndefVectorExtracts.push_back(I); 10814 continue; 10815 } 10816 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType()); 10817 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand())) 10818 continue; 10819 std::optional<unsigned> Idx = getExtractIndex(EI); 10820 // Undefined index. 10821 if (!Idx) { 10822 UndefVectorExtracts.push_back(I); 10823 continue; 10824 } 10825 SmallBitVector ExtractMask(VecTy->getNumElements(), true); 10826 ExtractMask.reset(*Idx); 10827 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { 10828 UndefVectorExtracts.push_back(I); 10829 continue; 10830 } 10831 VectorOpToIdx[EI->getVectorOperand()].push_back(I); 10832 } 10833 // Sort the vector operands by the maximum number of uses in extractelements. 10834 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors = 10835 VectorOpToIdx.takeVector(); 10836 stable_sort(Vectors, [](const auto &P1, const auto &P2) { 10837 return P1.second.size() > P2.second.size(); 10838 }); 10839 // Find the best pair of the vectors or a single vector. 
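  // E.g. (hypothetically) with 3 extracts from one source vector, 2 extracts
  // from a second one and 1 undef lane, SingleMax = 3 + 1 = 4 and
  // PairMax = 4 + 2 = 6, so a two-source shuffle covers more lanes than a
  // single-source one.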
10840 const int UndefSz = UndefVectorExtracts.size(); 10841 unsigned SingleMax = 0; 10842 unsigned PairMax = 0; 10843 if (!Vectors.empty()) { 10844 SingleMax = Vectors.front().second.size() + UndefSz; 10845 if (Vectors.size() > 1) { 10846 auto *ItNext = std::next(Vectors.begin()); 10847 PairMax = SingleMax + ItNext->second.size(); 10848 } 10849 } 10850 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) 10851 return std::nullopt; 10852 // Check if better to perform a shuffle of 2 vectors or just of a single 10853 // vector. 10854 SmallVector<Value *> SavedVL(VL.begin(), VL.end()); 10855 SmallVector<Value *> GatheredExtracts( 10856 VL.size(), PoisonValue::get(VL.front()->getType())); 10857 if (SingleMax >= PairMax && SingleMax) { 10858 for (int Idx : Vectors.front().second) 10859 std::swap(GatheredExtracts[Idx], VL[Idx]); 10860 } else if (!Vectors.empty()) { 10861 for (unsigned Idx : {0, 1}) 10862 for (int Idx : Vectors[Idx].second) 10863 std::swap(GatheredExtracts[Idx], VL[Idx]); 10864 } 10865 // Add extracts from undefs too. 10866 for (int Idx : UndefVectorExtracts) 10867 std::swap(GatheredExtracts[Idx], VL[Idx]); 10868 // Check that gather of extractelements can be represented as just a 10869 // shuffle of a single/two vectors the scalars are extracted from. 10870 std::optional<TTI::ShuffleKind> Res = 10871 isFixedVectorShuffle(GatheredExtracts, Mask); 10872 if (!Res) { 10873 // TODO: try to check other subsets if possible. 10874 // Restore the original VL if attempt was not successful. 10875 copy(SavedVL, VL.begin()); 10876 return std::nullopt; 10877 } 10878 // Restore unused scalars from mask, if some of the extractelements were not 10879 // selected for shuffle. 10880 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { 10881 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) && 10882 isa<UndefValue>(GatheredExtracts[I])) { 10883 std::swap(VL[I], GatheredExtracts[I]); 10884 continue; 10885 } 10886 auto *EI = dyn_cast<ExtractElementInst>(VL[I]); 10887 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) || 10888 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) || 10889 is_contained(UndefVectorExtracts, I)) 10890 continue; 10891 } 10892 return Res; 10893 } 10894 10895 /// Tries to find extractelement instructions with constant indices from fixed 10896 /// vector type and gather such instructions into a bunch, which highly likely 10897 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 10898 /// successful, the matched scalars are replaced by poison values in \p VL for 10899 /// future analysis. 10900 SmallVector<std::optional<TTI::ShuffleKind>> 10901 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, 10902 SmallVectorImpl<int> &Mask, 10903 unsigned NumParts) const { 10904 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); 10905 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); 10906 Mask.assign(VL.size(), PoisonMaskElem); 10907 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 10908 for (unsigned Part : seq<unsigned>(NumParts)) { 10909 // Scan list of gathered scalars for extractelements that can be represented 10910 // as shuffles. 
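    // Each part covers at most SliceSize contiguous lanes; e.g. for
    // VL.size() == 8 and NumParts == 2 the parts cover lanes [0, 4) and
    // [4, 8), and each part is matched to its own per-register shuffle
    // independently.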
10911 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice( 10912 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); 10913 SmallVector<int> SubMask; 10914 std::optional<TTI::ShuffleKind> Res = 10915 tryToGatherSingleRegisterExtractElements(SubVL, SubMask); 10916 ShufflesRes[Part] = Res; 10917 copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); 10918 } 10919 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) { 10920 return Res.has_value(); 10921 })) 10922 ShufflesRes.clear(); 10923 return ShufflesRes; 10924 } 10925 10926 std::optional<TargetTransformInfo::ShuffleKind> 10927 BoUpSLP::isGatherShuffledSingleRegisterEntry( 10928 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, 10929 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) { 10930 Entries.clear(); 10931 // TODO: currently checking only for Scalars in the tree entry, need to count 10932 // reused elements too for better cost estimation. 10933 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); 10934 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); 10935 const BasicBlock *TEInsertBlock = nullptr; 10936 // Main node of PHI entries keeps the correct order of operands/incoming 10937 // blocks. 10938 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) { 10939 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); 10940 TEInsertPt = TEInsertBlock->getTerminator(); 10941 } else { 10942 TEInsertBlock = TEInsertPt->getParent(); 10943 } 10944 if (!DT->isReachableFromEntry(TEInsertBlock)) 10945 return std::nullopt; 10946 auto *NodeUI = DT->getNode(TEInsertBlock); 10947 assert(NodeUI && "Should only process reachable instructions"); 10948 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end()); 10949 auto CheckOrdering = [&](const Instruction *InsertPt) { 10950 // Argument InsertPt is an instruction where vector code for some other 10951 // tree entry (one that shares one or more scalars with TE) is going to be 10952 // generated. This lambda returns true if insertion point of vector code 10953 // for the TE dominates that point (otherwise dependency is the other way 10954 // around). The other node is not limited to be of a gather kind. Gather 10955 // nodes are not scheduled and their vector code is inserted before their 10956 // first user. If user is PHI, that is supposed to be at the end of a 10957 // predecessor block. Otherwise it is the last instruction among scalars of 10958 // the user node. So, instead of checking dependency between instructions 10959 // themselves, we check dependency between their insertion points for vector 10960 // code (since each scalar instruction ends up as a lane of a vector 10961 // instruction). 10962 const BasicBlock *InsertBlock = InsertPt->getParent(); 10963 auto *NodeEUI = DT->getNode(InsertBlock); 10964 if (!NodeEUI) 10965 return false; 10966 assert((NodeUI == NodeEUI) == 10967 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && 10968 "Different nodes should have different DFS numbers"); 10969 // Check the order of the gather nodes users. 10970 if (TEInsertPt->getParent() != InsertBlock && 10971 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI))) 10972 return false; 10973 if (TEInsertPt->getParent() == InsertBlock && 10974 TEInsertPt->comesBefore(InsertPt)) 10975 return false; 10976 return true; 10977 }; 10978 // Find all tree entries used by the gathered values. If no common entries 10979 // found - not a shuffle. 
10980 // Here we build a set of tree nodes for each gathered value and trying to 10981 // find the intersection between these sets. If we have at least one common 10982 // tree node for each gathered value - we have just a permutation of the 10983 // single vector. If we have 2 different sets, we're in situation where we 10984 // have a permutation of 2 input vectors. 10985 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; 10986 DenseMap<Value *, int> UsedValuesEntry; 10987 for (Value *V : VL) { 10988 if (isConstant(V)) 10989 continue; 10990 // Build a list of tree entries where V is used. 10991 SmallPtrSet<const TreeEntry *, 4> VToTEs; 10992 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { 10993 if (TEPtr == TE) 10994 continue; 10995 assert(any_of(TEPtr->Scalars, 10996 [&](Value *V) { return GatheredScalars.contains(V); }) && 10997 "Must contain at least single gathered value."); 10998 assert(TEPtr->UserTreeIndices.size() == 1 && 10999 "Expected only single user of a gather node."); 11000 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); 11001 11002 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp()); 11003 const Instruction *InsertPt = 11004 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() 11005 : &getLastInstructionInBundle(UseEI.UserTE); 11006 if (TEInsertPt == InsertPt) { 11007 // If 2 gathers are operands of the same entry (regardless of whether 11008 // user is PHI or else), compare operands indices, use the earlier one 11009 // as the base. 11010 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) 11011 continue; 11012 // If the user instruction is used for some reason in different 11013 // vectorized nodes - make it depend on index. 11014 if (TEUseEI.UserTE != UseEI.UserTE && 11015 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) 11016 continue; 11017 } 11018 11019 // Check if the user node of the TE comes after user node of TEPtr, 11020 // otherwise TEPtr depends on TE. 11021 if ((TEInsertBlock != InsertPt->getParent() || 11022 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && 11023 !CheckOrdering(InsertPt)) 11024 continue; 11025 VToTEs.insert(TEPtr); 11026 } 11027 if (const TreeEntry *VTE = getTreeEntry(V)) { 11028 if (ForOrder) { 11029 if (VTE->State != TreeEntry::Vectorize) { 11030 auto It = MultiNodeScalars.find(V); 11031 if (It == MultiNodeScalars.end()) 11032 continue; 11033 VTE = *It->getSecond().begin(); 11034 // Iterate through all vectorized nodes. 11035 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) { 11036 return MTE->State == TreeEntry::Vectorize; 11037 }); 11038 if (MIt == It->getSecond().end()) 11039 continue; 11040 VTE = *MIt; 11041 } 11042 } 11043 Instruction &LastBundleInst = getLastInstructionInBundle(VTE); 11044 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) 11045 continue; 11046 VToTEs.insert(VTE); 11047 } 11048 if (VToTEs.empty()) 11049 continue; 11050 if (UsedTEs.empty()) { 11051 // The first iteration, just insert the list of nodes to vector. 11052 UsedTEs.push_back(VToTEs); 11053 UsedValuesEntry.try_emplace(V, 0); 11054 } else { 11055 // Need to check if there are any previously used tree nodes which use V. 11056 // If there are no such nodes, consider that we have another one input 11057 // vector. 
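      // E.g. (hypothetically), if lanes {%a, %b} are only available from one
      // tree entry and lanes {%c, %d} only from another, UsedTEs ends up with
      // two disjoint sets and the gather becomes a two-source permutation. A
      // value that would require a third source is left out (see the
      // UsedTEs.size() == 2 check below) and stays a regularly gathered
      // scalar.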
11058 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); 11059 unsigned Idx = 0; 11060 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { 11061 // Do we have a non-empty intersection of previously listed tree entries 11062 // and tree entries using current V? 11063 set_intersect(VToTEs, Set); 11064 if (!VToTEs.empty()) { 11065 // Yes, write the new subset and continue analysis for the next 11066 // scalar. 11067 Set.swap(VToTEs); 11068 break; 11069 } 11070 VToTEs = SavedVToTEs; 11071 ++Idx; 11072 } 11073 // No non-empty intersection found - need to add a second set of possible 11074 // source vectors. 11075 if (Idx == UsedTEs.size()) { 11076 // If the number of input vectors is greater than 2 - not a permutation, 11077 // fallback to the regular gather. 11078 // TODO: support multiple reshuffled nodes. 11079 if (UsedTEs.size() == 2) 11080 continue; 11081 UsedTEs.push_back(SavedVToTEs); 11082 Idx = UsedTEs.size() - 1; 11083 } 11084 UsedValuesEntry.try_emplace(V, Idx); 11085 } 11086 } 11087 11088 if (UsedTEs.empty()) { 11089 Entries.clear(); 11090 return std::nullopt; 11091 } 11092 11093 unsigned VF = 0; 11094 if (UsedTEs.size() == 1) { 11095 // Keep the order to avoid non-determinism. 11096 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(), 11097 UsedTEs.front().end()); 11098 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { 11099 return TE1->Idx < TE2->Idx; 11100 }); 11101 // Try to find the perfect match in another gather node at first. 11102 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { 11103 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); 11104 }); 11105 if (It != FirstEntries.end() && 11106 ((*It)->getVectorFactor() == VL.size() || 11107 ((*It)->getVectorFactor() == TE->Scalars.size() && 11108 TE->ReuseShuffleIndices.size() == VL.size() && 11109 (*It)->isSame(TE->Scalars)))) { 11110 Entries.push_back(*It); 11111 if ((*It)->getVectorFactor() == VL.size()) { 11112 std::iota(std::next(Mask.begin(), Part * VL.size()), 11113 std::next(Mask.begin(), (Part + 1) * VL.size()), 0); 11114 } else { 11115 SmallVector<int> CommonMask = TE->getCommonMask(); 11116 copy(CommonMask, Mask.begin()); 11117 } 11118 // Clear undef scalars. 11119 for (int I = 0, Sz = VL.size(); I < Sz; ++I) 11120 if (isa<PoisonValue>(VL[I])) 11121 Mask[I] = PoisonMaskElem; 11122 return TargetTransformInfo::SK_PermuteSingleSrc; 11123 } 11124 // No perfect match, just shuffle, so choose the first tree node from the 11125 // tree. 11126 Entries.push_back(FirstEntries.front()); 11127 } else { 11128 // Try to find nodes with the same vector factor. 11129 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); 11130 // Keep the order of tree nodes to avoid non-determinism. 11131 DenseMap<int, const TreeEntry *> VFToTE; 11132 for (const TreeEntry *TE : UsedTEs.front()) { 11133 unsigned VF = TE->getVectorFactor(); 11134 auto It = VFToTE.find(VF); 11135 if (It != VFToTE.end()) { 11136 if (It->second->Idx > TE->Idx) 11137 It->getSecond() = TE; 11138 continue; 11139 } 11140 VFToTE.try_emplace(VF, TE); 11141 } 11142 // Same, keep the order to avoid non-determinism. 
11143 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(), 11144 UsedTEs.back().end()); 11145 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { 11146 return TE1->Idx < TE2->Idx; 11147 }); 11148 for (const TreeEntry *TE : SecondEntries) { 11149 auto It = VFToTE.find(TE->getVectorFactor()); 11150 if (It != VFToTE.end()) { 11151 VF = It->first; 11152 Entries.push_back(It->second); 11153 Entries.push_back(TE); 11154 break; 11155 } 11156 } 11157 // No 2 source vectors with the same vector factor - just choose 2 with max 11158 // index. 11159 if (Entries.empty()) { 11160 Entries.push_back(*llvm::max_element( 11161 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) { 11162 return TE1->Idx < TE2->Idx; 11163 })); 11164 Entries.push_back(SecondEntries.front()); 11165 VF = std::max(Entries.front()->getVectorFactor(), 11166 Entries.back()->getVectorFactor()); 11167 } 11168 } 11169 11170 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>); 11171 // Checks if the 2 PHIs are compatible in terms of high possibility to be 11172 // vectorized. 11173 auto AreCompatiblePHIs = [&](Value *V, Value *V1) { 11174 auto *PHI = cast<PHINode>(V); 11175 auto *PHI1 = cast<PHINode>(V1); 11176 // Check that all incoming values are compatible/from same parent (if they 11177 // are instructions). 11178 // The incoming values are compatible if they all are constants, or 11179 // instruction with the same/alternate opcodes from the same basic block. 11180 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { 11181 Value *In = PHI->getIncomingValue(I); 11182 Value *In1 = PHI1->getIncomingValue(I); 11183 if (isConstant(In) && isConstant(In1)) 11184 continue; 11185 if (!getSameOpcode({In, In1}, *TLI).getOpcode()) 11186 return false; 11187 if (cast<Instruction>(In)->getParent() != 11188 cast<Instruction>(In1)->getParent()) 11189 return false; 11190 } 11191 return true; 11192 }; 11193 // Check if the value can be ignored during analysis for shuffled gathers. 11194 // We suppose it is better to ignore instruction, which do not form splats, 11195 // are not vectorized/not extractelements (these instructions will be handled 11196 // by extractelements processing) or may form vector node in future. 11197 auto MightBeIgnored = [=](Value *V) { 11198 auto *I = dyn_cast<Instruction>(V); 11199 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && 11200 !isVectorLikeInstWithConstOps(I) && 11201 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I); 11202 }; 11203 // Check that the neighbor instruction may form a full vector node with the 11204 // current instruction V. It is possible, if they have same/alternate opcode 11205 // and same parent basic block. 11206 auto NeighborMightBeIgnored = [&](Value *V, int Idx) { 11207 Value *V1 = VL[Idx]; 11208 bool UsedInSameVTE = false; 11209 auto It = UsedValuesEntry.find(V1); 11210 if (It != UsedValuesEntry.end()) 11211 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; 11212 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && 11213 getSameOpcode({V, V1}, *TLI).getOpcode() && 11214 cast<Instruction>(V)->getParent() == 11215 cast<Instruction>(V1)->getParent() && 11216 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1)); 11217 }; 11218 // Build a shuffle mask for better cost estimation and vector emission. 
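  // The mask element for a lane is Pair.first * VF + <lane within the source
  // entry>, so with (hypothetically) VF == 4 a scalar found in lane 2 of the
  // second selected entry is encoded as 1 * 4 + 2 == 6, matching the usual
  // two-source shufflevector numbering.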
11219 SmallBitVector UsedIdxs(Entries.size()); 11220 SmallVector<std::pair<unsigned, int>> EntryLanes; 11221 for (int I = 0, E = VL.size(); I < E; ++I) { 11222 Value *V = VL[I]; 11223 auto It = UsedValuesEntry.find(V); 11224 if (It == UsedValuesEntry.end()) 11225 continue; 11226 // Do not try to shuffle scalars, if they are constants, or instructions 11227 // that can be vectorized as a result of the following vector build 11228 // vectorization. 11229 if (isConstant(V) || (MightBeIgnored(V) && 11230 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || 11231 (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))) 11232 continue; 11233 unsigned Idx = It->second; 11234 EntryLanes.emplace_back(Idx, I); 11235 UsedIdxs.set(Idx); 11236 } 11237 // Iterate through all shuffled scalars and select entries, which can be used 11238 // for final shuffle. 11239 SmallVector<const TreeEntry *> TempEntries; 11240 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) { 11241 if (!UsedIdxs.test(I)) 11242 continue; 11243 // Fix the entry number for the given scalar. If it is the first entry, set 11244 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). 11245 // These indices are used when calculating final shuffle mask as the vector 11246 // offset. 11247 for (std::pair<unsigned, int> &Pair : EntryLanes) 11248 if (Pair.first == I) 11249 Pair.first = TempEntries.size(); 11250 TempEntries.push_back(Entries[I]); 11251 } 11252 Entries.swap(TempEntries); 11253 if (EntryLanes.size() == Entries.size() && 11254 !VL.equals(ArrayRef(TE->Scalars) 11255 .slice(Part * VL.size(), 11256 std::min<int>(VL.size(), TE->Scalars.size())))) { 11257 // We may have here 1 or 2 entries only. If the number of scalars is equal 11258 // to the number of entries, no need to do the analysis, it is not very 11259 // profitable. Since VL is not the same as TE->Scalars, it means we already 11260 // have some shuffles before. Cut off not profitable case. 11261 Entries.clear(); 11262 return std::nullopt; 11263 } 11264 // Build the final mask, check for the identity shuffle, if possible. 11265 bool IsIdentity = Entries.size() == 1; 11266 // Pair.first is the offset to the vector, while Pair.second is the index of 11267 // scalar in the list. 11268 for (const std::pair<unsigned, int> &Pair : EntryLanes) { 11269 unsigned Idx = Part * VL.size() + Pair.second; 11270 Mask[Idx] = 11271 Pair.first * VF + 11272 (ForOrder ? std::distance( 11273 Entries[Pair.first]->Scalars.begin(), 11274 find(Entries[Pair.first]->Scalars, VL[Pair.second])) 11275 : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); 11276 IsIdentity &= Mask[Idx] == Pair.second; 11277 } 11278 switch (Entries.size()) { 11279 case 1: 11280 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) 11281 return TargetTransformInfo::SK_PermuteSingleSrc; 11282 break; 11283 case 2: 11284 if (EntryLanes.size() > 2 || VL.size() <= 2) 11285 return TargetTransformInfo::SK_PermuteTwoSrc; 11286 break; 11287 default: 11288 break; 11289 } 11290 Entries.clear(); 11291 // Clear the corresponding mask elements. 
11292 std::fill(std::next(Mask.begin(), Part * VL.size()), 11293 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); 11294 return std::nullopt; 11295 } 11296 11297 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> 11298 BoUpSLP::isGatherShuffledEntry( 11299 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, 11300 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts, 11301 bool ForOrder) { 11302 assert(NumParts > 0 && NumParts < VL.size() && 11303 "Expected positive number of registers."); 11304 Entries.clear(); 11305 // No need to check for the topmost gather node. 11306 if (TE == VectorizableTree.front().get()) 11307 return {}; 11308 // FIXME: Gathering for non-power-of-2 nodes not implemented yet. 11309 if (TE->isNonPowOf2Vec()) 11310 return {}; 11311 Mask.assign(VL.size(), PoisonMaskElem); 11312 assert(TE->UserTreeIndices.size() == 1 && 11313 "Expected only single user of the gather node."); 11314 assert(VL.size() % NumParts == 0 && 11315 "Number of scalars must be divisible by NumParts."); 11316 unsigned SliceSize = getPartNumElems(VL.size(), NumParts); 11317 SmallVector<std::optional<TTI::ShuffleKind>> Res; 11318 for (unsigned Part : seq<unsigned>(NumParts)) { 11319 ArrayRef<Value *> SubVL = 11320 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); 11321 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); 11322 std::optional<TTI::ShuffleKind> SubRes = 11323 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, 11324 ForOrder); 11325 if (!SubRes) 11326 SubEntries.clear(); 11327 Res.push_back(SubRes); 11328 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && 11329 SubEntries.front()->getVectorFactor() == VL.size() && 11330 (SubEntries.front()->isSame(TE->Scalars) || 11331 SubEntries.front()->isSame(VL))) { 11332 SmallVector<const TreeEntry *> LocalSubEntries; 11333 LocalSubEntries.swap(SubEntries); 11334 Entries.clear(); 11335 Res.clear(); 11336 std::iota(Mask.begin(), Mask.end(), 0); 11337 // Clear undef scalars. 11338 for (int I = 0, Sz = VL.size(); I < Sz; ++I) 11339 if (isa<PoisonValue>(VL[I])) 11340 Mask[I] = PoisonMaskElem; 11341 Entries.emplace_back(1, LocalSubEntries.front()); 11342 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); 11343 return Res; 11344 } 11345 } 11346 if (all_of(Res, 11347 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { 11348 Entries.clear(); 11349 return {}; 11350 } 11351 return Res; 11352 } 11353 11354 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, 11355 Type *ScalarTy) const { 11356 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 11357 bool DuplicateNonConst = false; 11358 // Find the cost of inserting/extracting values from the vector. 11359 // Check if the same elements are inserted several times and count them as 11360 // shuffle candidates. 
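  // E.g. for a hypothetical VL = {%a, %b, %a, %c}, lane 2 repeats lane 0, so
  // ShuffleMask becomes {0, 1, 0, 3}; only the unique scalars are priced as
  // inserts and the duplicates are covered by one extra single-source
  // shuffle.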
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }

    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }

    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc)
    Cost =
        TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
                                      /*Extract*/ false, CostKind);
  if (DuplicateNonConst)
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                VecTy, ShuffleMask);
  return Cost;
}

// Perform operand reordering on the instructions in VL and return the
// reordered operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right,
                                             const BoUpSLP &R) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}

Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
  if (Res.second)
    return *Res.second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices).
11429 auto *Front = E->getMainOp(); 11430 auto *BB = Front->getParent(); 11431 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { 11432 if (E->getOpcode() == Instruction::GetElementPtr && 11433 !isa<GetElementPtrInst>(V)) 11434 return true; 11435 auto *I = cast<Instruction>(V); 11436 return !E->isOpcodeOrAlt(I) || I->getParent() == BB || 11437 isVectorLikeInstWithConstOps(I); 11438 })); 11439 11440 auto FindLastInst = [&]() { 11441 Instruction *LastInst = Front; 11442 for (Value *V : E->Scalars) { 11443 auto *I = dyn_cast<Instruction>(V); 11444 if (!I) 11445 continue; 11446 if (LastInst->getParent() == I->getParent()) { 11447 if (LastInst->comesBefore(I)) 11448 LastInst = I; 11449 continue; 11450 } 11451 assert(((E->getOpcode() == Instruction::GetElementPtr && 11452 !isa<GetElementPtrInst>(I)) || 11453 (isVectorLikeInstWithConstOps(LastInst) && 11454 isVectorLikeInstWithConstOps(I))) && 11455 "Expected vector-like or non-GEP in GEP node insts only."); 11456 if (!DT->isReachableFromEntry(LastInst->getParent())) { 11457 LastInst = I; 11458 continue; 11459 } 11460 if (!DT->isReachableFromEntry(I->getParent())) 11461 continue; 11462 auto *NodeA = DT->getNode(LastInst->getParent()); 11463 auto *NodeB = DT->getNode(I->getParent()); 11464 assert(NodeA && "Should only process reachable instructions"); 11465 assert(NodeB && "Should only process reachable instructions"); 11466 assert((NodeA == NodeB) == 11467 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 11468 "Different nodes should have different DFS numbers"); 11469 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) 11470 LastInst = I; 11471 } 11472 BB = LastInst->getParent(); 11473 return LastInst; 11474 }; 11475 11476 auto FindFirstInst = [&]() { 11477 Instruction *FirstInst = Front; 11478 for (Value *V : E->Scalars) { 11479 auto *I = dyn_cast<Instruction>(V); 11480 if (!I) 11481 continue; 11482 if (FirstInst->getParent() == I->getParent()) { 11483 if (I->comesBefore(FirstInst)) 11484 FirstInst = I; 11485 continue; 11486 } 11487 assert(((E->getOpcode() == Instruction::GetElementPtr && 11488 !isa<GetElementPtrInst>(I)) || 11489 (isVectorLikeInstWithConstOps(FirstInst) && 11490 isVectorLikeInstWithConstOps(I))) && 11491 "Expected vector-like or non-GEP in GEP node insts only."); 11492 if (!DT->isReachableFromEntry(FirstInst->getParent())) { 11493 FirstInst = I; 11494 continue; 11495 } 11496 if (!DT->isReachableFromEntry(I->getParent())) 11497 continue; 11498 auto *NodeA = DT->getNode(FirstInst->getParent()); 11499 auto *NodeB = DT->getNode(I->getParent()); 11500 assert(NodeA && "Should only process reachable instructions"); 11501 assert(NodeB && "Should only process reachable instructions"); 11502 assert((NodeA == NodeB) == 11503 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && 11504 "Different nodes should have different DFS numbers"); 11505 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn()) 11506 FirstInst = I; 11507 } 11508 return FirstInst; 11509 }; 11510 11511 // Set the insert point to the beginning of the basic block if the entry 11512 // should not be scheduled. 
11513 if (doesNotNeedToSchedule(E->Scalars) || 11514 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) { 11515 if ((E->getOpcode() == Instruction::GetElementPtr && 11516 any_of(E->Scalars, 11517 [](Value *V) { 11518 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V); 11519 })) || 11520 all_of(E->Scalars, 11521 [](Value *V) { 11522 return !isVectorLikeInstWithConstOps(V) && 11523 isUsedOutsideBlock(V); 11524 }) || 11525 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) { 11526 return isa<ExtractElementInst, UndefValue>(V) || 11527 areAllOperandsNonInsts(V); 11528 }))) 11529 Res.second = FindLastInst(); 11530 else 11531 Res.second = FindFirstInst(); 11532 return *Res.second; 11533 } 11534 11535 // Find the last instruction. The common case should be that BB has been 11536 // scheduled, and the last instruction is VL.back(). So we start with 11537 // VL.back() and iterate over schedule data until we reach the end of the 11538 // bundle. The end of the bundle is marked by null ScheduleData. 11539 if (BlocksSchedules.count(BB)) { 11540 Value *V = E->isOneOf(E->Scalars.back()); 11541 if (doesNotNeedToBeScheduled(V)) 11542 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); 11543 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); 11544 if (Bundle && Bundle->isPartOfBundle()) 11545 for (; Bundle; Bundle = Bundle->NextInBundle) 11546 if (Bundle->OpValue == Bundle->Inst) 11547 Res.second = Bundle->Inst; 11548 } 11549 11550 // LastInst can still be null at this point if there's either not an entry 11551 // for BB in BlocksSchedules or there's no ScheduleData available for 11552 // VL.back(). This can be the case if buildTree_rec aborts for various 11553 // reasons (e.g., the maximum recursion depth is reached, the maximum region 11554 // size is reached, etc.). ScheduleData is initialized in the scheduling 11555 // "dry-run". 11556 // 11557 // If this happens, we can still find the last instruction by brute force. We 11558 // iterate forwards from Front (inclusive) until we either see all 11559 // instructions in the bundle or reach the end of the block. If Front is the 11560 // last instruction in program order, LastInst will be set to Front, and we 11561 // will visit all the remaining instructions in the block. 11562 // 11563 // One of the reasons we exit early from buildTree_rec is to place an upper 11564 // bound on compile-time. Thus, taking an additional compile-time hit here is 11565 // not ideal. However, this should be exceedingly rare since it requires that 11566 // we both exit early from buildTree_rec and that the bundle be out-of-order 11567 // (causing us to iterate all the way to the end of the block). 11568 if (!Res.second) 11569 Res.second = FindLastInst(); 11570 assert(Res.second && "Failed to find last instruction in bundle"); 11571 return *Res.second; 11572 } 11573 11574 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { 11575 auto *Front = E->getMainOp(); 11576 Instruction *LastInst = &getLastInstructionInBundle(E); 11577 assert(LastInst && "Failed to find last instruction in bundle"); 11578 BasicBlock::iterator LastInstIt = LastInst->getIterator(); 11579 // If the instruction is PHI, set the insert point after all the PHIs. 
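  // Illustrative sketch (editorial, not from a real test): if the bundle ends
  // in a PHI, IR requires all PHIs to stay grouped at the top of the block, so
  // the vectorized code must be emitted at the first non-PHI position rather
  // than directly after the PHI:
  //   %p0 = phi i32 [ ... ], [ ... ]
  //   %p1 = phi i32 [ ... ], [ ... ]   ; last instruction of the bundle
  //   ; <- insertion point chosen via getFirstNonPHIIt()
  //   %use = add i32 %p1, 1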
11580 bool IsPHI = isa<PHINode>(LastInst); 11581 if (IsPHI) 11582 LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); 11583 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) { 11584 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt); 11585 } else { 11586 // Set the insertion point after the last instruction in the bundle. Set the 11587 // debug location to Front. 11588 Builder.SetInsertPoint( 11589 LastInst->getParent(), 11590 LastInst->getNextNonDebugInstruction()->getIterator()); 11591 } 11592 Builder.SetCurrentDebugLocation(Front->getDebugLoc()); 11593 } 11594 11595 Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) { 11596 // List of instructions/lanes from current block and/or the blocks which are 11597 // part of the current loop. These instructions will be inserted at the end to 11598 // make it possible to optimize loops and hoist invariant instructions out of 11599 // the loops body with better chances for success. 11600 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; 11601 SmallSet<int, 4> PostponedIndices; 11602 Loop *L = LI->getLoopFor(Builder.GetInsertBlock()); 11603 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { 11604 SmallPtrSet<BasicBlock *, 4> Visited; 11605 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second) 11606 InsertBB = InsertBB->getSinglePredecessor(); 11607 return InsertBB && InsertBB == InstBB; 11608 }; 11609 for (int I = 0, E = VL.size(); I < E; ++I) { 11610 if (auto *Inst = dyn_cast<Instruction>(VL[I])) 11611 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || 11612 getTreeEntry(Inst) || 11613 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) && 11614 PostponedIndices.insert(I).second) 11615 PostponedInsts.emplace_back(Inst, I); 11616 } 11617 11618 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos, 11619 Type *Ty) { 11620 Value *Scalar = V; 11621 if (Scalar->getType() != Ty) { 11622 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() && 11623 "Expected integer types only."); 11624 Value *V = Scalar; 11625 if (auto *CI = dyn_cast<CastInst>(Scalar); 11626 isa_and_nonnull<SExtInst, ZExtInst>(CI)) { 11627 Value *Op = CI->getOperand(0); 11628 if (auto *IOp = dyn_cast<Instruction>(Op); 11629 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp))) 11630 V = Op; 11631 } 11632 Scalar = Builder.CreateIntCast( 11633 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL))); 11634 } 11635 11636 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); 11637 auto *InsElt = dyn_cast<InsertElementInst>(Vec); 11638 if (!InsElt) 11639 return Vec; 11640 GatherShuffleExtractSeq.insert(InsElt); 11641 CSEBlocks.insert(InsElt->getParent()); 11642 // Add to our 'need-to-extract' list. 11643 if (isa<Instruction>(V)) { 11644 if (TreeEntry *Entry = getTreeEntry(V)) { 11645 // Find which lane we need to extract. 11646 User *UserOp = nullptr; 11647 if (Scalar != V) { 11648 if (auto *SI = dyn_cast<Instruction>(Scalar)) 11649 UserOp = SI; 11650 } else { 11651 UserOp = InsElt; 11652 } 11653 if (UserOp) { 11654 unsigned FoundLane = Entry->findLaneForValue(V); 11655 ExternalUses.emplace_back(V, UserOp, FoundLane); 11656 } 11657 } 11658 } 11659 return Vec; 11660 }; 11661 auto *VecTy = getWidenedType(ScalarTy, VL.size()); 11662 Value *Vec = Root ? Root : PoisonValue::get(VecTy); 11663 SmallVector<int> NonConsts; 11664 // Insert constant values at first. 
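  // Sketch of the intent (assumption, comment only): inserting the constants
  // first lets the builder fold them into a single constant base vector, e.g.
  // for VL = {1, %x, 2, %y} the first loop ideally produces
  //   <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
  // and only %x and %y need real insertelement instructions afterwards.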
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, at the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}

/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and the
/// amount of further analysis/transformations.
/// The class also looks through previously emitted shuffle instructions and
/// properly marks indices in the mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If the 2 operands are of different sizes, the smaller one is resized and
/// the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands. If a 3rd one is going to be added, the first 2 are combined into
  /// a shuffle with the \p CommonMask mask, the first operand is set to be the
  /// resulting shuffle and the second operand is set to be the newly added
  /// operand. The \p CommonMask is transformed accordingly after that.
  SmallVector<Value *, 2> InVectors;
  Type *ScalarTy = nullptr;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given mask,
    /// if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match their sizes, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ?
V1 : V2; 11819 Op = Builder.CreateShuffleVector(Op, IdentityMask); 11820 if (auto *I = dyn_cast<Instruction>(Op)) { 11821 GatherShuffleExtractSeq.insert(I); 11822 CSEBlocks.insert(I->getParent()); 11823 } 11824 if (MinVF == V1VF) 11825 V1 = Op; 11826 else 11827 V2 = Op; 11828 } 11829 }; 11830 11831 /// Smart shuffle instruction emission, walks through shuffles trees and 11832 /// tries to find the best matching vector for the actual shuffle 11833 /// instruction. 11834 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) { 11835 assert(V1 && "Expected at least one vector value."); 11836 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, 11837 R.CSEBlocks, *R.DL); 11838 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask, 11839 ShuffleBuilder); 11840 } 11841 11842 /// Transforms mask \p CommonMask per given \p Mask to make proper set after 11843 /// shuffle emission. 11844 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, 11845 ArrayRef<int> Mask) { 11846 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 11847 if (Mask[Idx] != PoisonMaskElem) 11848 CommonMask[Idx] = Idx; 11849 } 11850 11851 /// Cast value \p V to the vector type with the same number of elements, but 11852 /// the base type \p ScalarTy. 11853 Value *castToScalarTyElem(Value *V, 11854 std::optional<bool> IsSigned = std::nullopt) { 11855 auto *VecTy = cast<VectorType>(V->getType()); 11856 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0); 11857 if (VecTy->getElementType() == ScalarTy->getScalarType()) 11858 return V; 11859 return Builder.CreateIntCast( 11860 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()), 11861 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); 11862 } 11863 11864 public: 11865 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) 11866 : ScalarTy(ScalarTy), Builder(Builder), R(R) {} 11867 11868 /// Adjusts extractelements after reusing them. 11869 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, 11870 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, 11871 unsigned NumParts, bool &UseVecBaseAsInput) { 11872 UseVecBaseAsInput = false; 11873 SmallPtrSet<Value *, 4> UniqueBases; 11874 Value *VecBase = nullptr; 11875 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { 11876 int Idx = Mask[I]; 11877 if (Idx == PoisonMaskElem) 11878 continue; 11879 auto *EI = cast<ExtractElementInst>(E->Scalars[I]); 11880 VecBase = EI->getVectorOperand(); 11881 if (const TreeEntry *TE = R.getTreeEntry(VecBase)) 11882 VecBase = TE->VectorizedValue; 11883 assert(VecBase && "Expected vectorized value."); 11884 UniqueBases.insert(VecBase); 11885 // If the only one use is vectorized - can delete the extractelement 11886 // itself. 
11887 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) || 11888 any_of(EI->users(), [&](User *U) { 11889 const TreeEntry *UTE = R.getTreeEntry(U); 11890 return !UTE || R.MultiNodeScalars.contains(U) || 11891 (isa<GetElementPtrInst>(U) && 11892 !R.areAllUsersVectorized(cast<Instruction>(U))) || 11893 count_if(R.VectorizableTree, 11894 [&](const std::unique_ptr<TreeEntry> &TE) { 11895 return any_of(TE->UserTreeIndices, 11896 [&](const EdgeInfo &Edge) { 11897 return Edge.UserTE == UTE; 11898 }) && 11899 is_contained(TE->Scalars, EI); 11900 }) != 1; 11901 })) 11902 continue; 11903 R.eraseInstruction(EI); 11904 } 11905 if (NumParts == 1 || UniqueBases.size() == 1) { 11906 assert(VecBase && "Expected vectorized value."); 11907 return castToScalarTyElem(VecBase); 11908 } 11909 UseVecBaseAsInput = true; 11910 auto TransformToIdentity = [](MutableArrayRef<int> Mask) { 11911 for (auto [I, Idx] : enumerate(Mask)) 11912 if (Idx != PoisonMaskElem) 11913 Idx = I; 11914 }; 11915 // Perform multi-register vector shuffle, joining them into a single virtual 11916 // long vector. 11917 // Need to shuffle each part independently and then insert all this parts 11918 // into a long virtual vector register, forming the original vector. 11919 Value *Vec = nullptr; 11920 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); 11921 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts); 11922 for (unsigned Part : seq<unsigned>(NumParts)) { 11923 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part); 11924 ArrayRef<Value *> VL = 11925 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit); 11926 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit); 11927 constexpr int MaxBases = 2; 11928 SmallVector<Value *, MaxBases> Bases(MaxBases); 11929 auto VLMask = zip(VL, SubMask); 11930 const unsigned VF = std::accumulate( 11931 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) { 11932 if (std::get<1>(D) == PoisonMaskElem) 11933 return S; 11934 Value *VecOp = 11935 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand(); 11936 if (const TreeEntry *TE = R.getTreeEntry(VecOp)) 11937 VecOp = TE->VectorizedValue; 11938 assert(VecOp && "Expected vectorized value."); 11939 const unsigned Size = 11940 cast<FixedVectorType>(VecOp->getType())->getNumElements(); 11941 return std::max(S, Size); 11942 }); 11943 for (const auto [V, I] : VLMask) { 11944 if (I == PoisonMaskElem) 11945 continue; 11946 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand(); 11947 if (const TreeEntry *TE = R.getTreeEntry(VecOp)) 11948 VecOp = TE->VectorizedValue; 11949 assert(VecOp && "Expected vectorized value."); 11950 VecOp = castToScalarTyElem(VecOp); 11951 Bases[I / VF] = VecOp; 11952 } 11953 if (!Bases.front()) 11954 continue; 11955 Value *SubVec; 11956 if (Bases.back()) { 11957 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); 11958 TransformToIdentity(SubMask); 11959 } else { 11960 SubVec = Bases.front(); 11961 } 11962 if (!Vec) { 11963 Vec = SubVec; 11964 assert((Part == 0 || all_of(seq<unsigned>(0, Part), 11965 [&](unsigned P) { 11966 ArrayRef<int> SubMask = 11967 Mask.slice(P * SliceSize, 11968 getNumElems(Mask.size(), 11969 SliceSize, P)); 11970 return all_of(SubMask, [](int Idx) { 11971 return Idx == PoisonMaskElem; 11972 }); 11973 })) && 11974 "Expected first part or all previous parts masked."); 11975 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); 11976 } else { 11977 unsigned NewVF = 11978 
cast<FixedVectorType>(Vec->getType())->getNumElements(); 11979 if (Vec->getType() != SubVec->getType()) { 11980 unsigned SubVecVF = 11981 cast<FixedVectorType>(SubVec->getType())->getNumElements(); 11982 NewVF = std::max(NewVF, SubVecVF); 11983 } 11984 // Adjust SubMask. 11985 for (int &Idx : SubMask) 11986 if (Idx != PoisonMaskElem) 11987 Idx += NewVF; 11988 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); 11989 Vec = createShuffle(Vec, SubVec, VecMask); 11990 TransformToIdentity(VecMask); 11991 } 11992 } 11993 copy(VecMask, Mask.begin()); 11994 return Vec; 11995 } 11996 /// Checks if the specified entry \p E needs to be delayed because of its 11997 /// dependency nodes. 11998 std::optional<Value *> 11999 needToDelay(const TreeEntry *E, 12000 ArrayRef<SmallVector<const TreeEntry *>> Deps) const { 12001 // No need to delay emission if all deps are ready. 12002 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) { 12003 return all_of( 12004 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); 12005 })) 12006 return std::nullopt; 12007 // Postpone gather emission, will be emitted after the end of the 12008 // process to keep correct order. 12009 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor()); 12010 return Builder.CreateAlignedLoad( 12011 ResVecTy, 12012 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())), 12013 MaybeAlign()); 12014 } 12015 /// Adds 2 input vectors (in form of tree entries) and the mask for their 12016 /// shuffling. 12017 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { 12018 Value *V1 = E1.VectorizedValue; 12019 if (V1->getType()->isIntOrIntVectorTy()) 12020 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { 12021 return !isKnownNonNegative( 12022 V, SimplifyQuery(*R.DL)); 12023 })); 12024 Value *V2 = E2.VectorizedValue; 12025 if (V2->getType()->isIntOrIntVectorTy()) 12026 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) { 12027 return !isKnownNonNegative( 12028 V, SimplifyQuery(*R.DL)); 12029 })); 12030 add(V1, V2, Mask); 12031 } 12032 /// Adds single input vector (in form of tree entry) and the mask for its 12033 /// shuffling. 12034 void add(const TreeEntry &E1, ArrayRef<int> Mask) { 12035 Value *V1 = E1.VectorizedValue; 12036 if (V1->getType()->isIntOrIntVectorTy()) 12037 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { 12038 return !isKnownNonNegative( 12039 V, SimplifyQuery(*R.DL)); 12040 })); 12041 add(V1, Mask); 12042 } 12043 /// Adds 2 input vectors and the mask for their shuffling. 
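  /// Illustrative note (editorial sketch, not a contract): if the builder is
  /// already tracking a combined vector of width Sz, the newly added pair is
  /// first folded into a single value with \p Mask, and every lane that \p Mask
  /// defines is then recorded in \p CommonMask as 'Idx + Sz', i.e. as a lane of
  /// the second source of the eventual two-input shuffle.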
12044 void add(Value *V1, Value *V2, ArrayRef<int> Mask) { 12045 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); 12046 V1 = castToScalarTyElem(V1); 12047 V2 = castToScalarTyElem(V2); 12048 if (InVectors.empty()) { 12049 InVectors.push_back(V1); 12050 InVectors.push_back(V2); 12051 CommonMask.assign(Mask.begin(), Mask.end()); 12052 return; 12053 } 12054 Value *Vec = InVectors.front(); 12055 if (InVectors.size() == 2) { 12056 Vec = createShuffle(Vec, InVectors.back(), CommonMask); 12057 transformMaskAfterShuffle(CommonMask, CommonMask); 12058 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() != 12059 Mask.size()) { 12060 Vec = createShuffle(Vec, nullptr, CommonMask); 12061 transformMaskAfterShuffle(CommonMask, CommonMask); 12062 } 12063 V1 = createShuffle(V1, V2, Mask); 12064 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 12065 if (Mask[Idx] != PoisonMaskElem) 12066 CommonMask[Idx] = Idx + Sz; 12067 InVectors.front() = Vec; 12068 if (InVectors.size() == 2) 12069 InVectors.back() = V1; 12070 else 12071 InVectors.push_back(V1); 12072 } 12073 /// Adds another one input vector and the mask for the shuffling. 12074 void add(Value *V1, ArrayRef<int> Mask, bool = false) { 12075 V1 = castToScalarTyElem(V1); 12076 if (InVectors.empty()) { 12077 if (!isa<FixedVectorType>(V1->getType())) { 12078 V1 = createShuffle(V1, nullptr, CommonMask); 12079 CommonMask.assign(Mask.size(), PoisonMaskElem); 12080 transformMaskAfterShuffle(CommonMask, Mask); 12081 } 12082 InVectors.push_back(V1); 12083 CommonMask.assign(Mask.begin(), Mask.end()); 12084 return; 12085 } 12086 const auto *It = find(InVectors, V1); 12087 if (It == InVectors.end()) { 12088 if (InVectors.size() == 2 || 12089 InVectors.front()->getType() != V1->getType() || 12090 !isa<FixedVectorType>(V1->getType())) { 12091 Value *V = InVectors.front(); 12092 if (InVectors.size() == 2) { 12093 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask); 12094 transformMaskAfterShuffle(CommonMask, CommonMask); 12095 } else if (cast<FixedVectorType>(V->getType())->getNumElements() != 12096 CommonMask.size()) { 12097 V = createShuffle(InVectors.front(), nullptr, CommonMask); 12098 transformMaskAfterShuffle(CommonMask, CommonMask); 12099 } 12100 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 12101 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem) 12102 CommonMask[Idx] = 12103 V->getType() != V1->getType() 12104 ? Idx + Sz 12105 : Mask[Idx] + cast<FixedVectorType>(V1->getType()) 12106 ->getNumElements(); 12107 if (V->getType() != V1->getType()) 12108 V1 = createShuffle(V1, nullptr, Mask); 12109 InVectors.front() = V; 12110 if (InVectors.size() == 2) 12111 InVectors.back() = V1; 12112 else 12113 InVectors.push_back(V1); 12114 return; 12115 } 12116 // Check if second vector is required if the used elements are already 12117 // used from the first one. 12118 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 12119 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) { 12120 InVectors.push_back(V1); 12121 break; 12122 } 12123 } 12124 int VF = CommonMask.size(); 12125 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType())) 12126 VF = FTy->getNumElements(); 12127 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 12128 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) 12129 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 
0 : VF); 12130 } 12131 /// Adds another one input vector and the mask for the shuffling. 12132 void addOrdered(Value *V1, ArrayRef<unsigned> Order) { 12133 SmallVector<int> NewMask; 12134 inversePermutation(Order, NewMask); 12135 add(V1, NewMask); 12136 } 12137 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, 12138 Value *Root = nullptr) { 12139 return R.gather(VL, Root, ScalarTy); 12140 } 12141 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } 12142 /// Finalize emission of the shuffles. 12143 /// \param Action the action (if any) to be performed before final applying of 12144 /// the \p ExtMask mask. 12145 Value * 12146 finalize(ArrayRef<int> ExtMask, unsigned VF = 0, 12147 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) { 12148 IsFinalized = true; 12149 if (Action) { 12150 Value *Vec = InVectors.front(); 12151 if (InVectors.size() == 2) { 12152 Vec = createShuffle(Vec, InVectors.back(), CommonMask); 12153 InVectors.pop_back(); 12154 } else { 12155 Vec = createShuffle(Vec, nullptr, CommonMask); 12156 } 12157 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) 12158 if (CommonMask[Idx] != PoisonMaskElem) 12159 CommonMask[Idx] = Idx; 12160 assert(VF > 0 && 12161 "Expected vector length for the final value before action."); 12162 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); 12163 if (VecVF < VF) { 12164 SmallVector<int> ResizeMask(VF, PoisonMaskElem); 12165 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0); 12166 Vec = createShuffle(Vec, nullptr, ResizeMask); 12167 } 12168 Action(Vec, CommonMask); 12169 InVectors.front() = Vec; 12170 } 12171 if (!ExtMask.empty()) { 12172 if (CommonMask.empty()) { 12173 CommonMask.assign(ExtMask.begin(), ExtMask.end()); 12174 } else { 12175 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); 12176 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { 12177 if (ExtMask[I] == PoisonMaskElem) 12178 continue; 12179 NewMask[I] = CommonMask[ExtMask[I]]; 12180 } 12181 CommonMask.swap(NewMask); 12182 } 12183 } 12184 if (CommonMask.empty()) { 12185 assert(InVectors.size() == 1 && "Expected only one vector with no mask"); 12186 return InVectors.front(); 12187 } 12188 if (InVectors.size() == 2) 12189 return createShuffle(InVectors.front(), InVectors.back(), CommonMask); 12190 return createShuffle(InVectors.front(), nullptr, CommonMask); 12191 } 12192 12193 ~ShuffleInstructionBuilder() { 12194 assert((IsFinalized || CommonMask.empty()) && 12195 "Shuffle construction must be finalized."); 12196 } 12197 }; 12198 12199 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, 12200 bool PostponedPHIs) { 12201 ValueList &VL = E->getOperand(NodeIdx); 12202 const unsigned VF = VL.size(); 12203 InstructionsState S = getSameOpcode(VL, *TLI); 12204 // Special processing for GEPs bundle, which may include non-gep values. 
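  // Hypothetical example of such a bundle (illustration only):
  //   VL = { getelementptr i32, ptr %p, i64 1,
  //          getelementptr i32, ptr %p, i64 2,
  //          ptr %q }
  // getSameOpcode() sees no common opcode for the whole list, so the state is
  // derived from the first GEP found instead, which is enough to look up a
  // matching vectorized tree entry below.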
12205 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { 12206 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>); 12207 if (It != VL.end()) 12208 S = getSameOpcode(*It, *TLI); 12209 } 12210 if (S.getOpcode()) { 12211 auto CheckSameVE = [&](const TreeEntry *VE) { 12212 return VE->isSame(VL) && 12213 (any_of(VE->UserTreeIndices, 12214 [E, NodeIdx](const EdgeInfo &EI) { 12215 return EI.UserTE == E && EI.EdgeIdx == NodeIdx; 12216 }) || 12217 any_of(VectorizableTree, 12218 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) { 12219 return TE->isOperandGatherNode({E, NodeIdx}) && 12220 VE->isSame(TE->Scalars); 12221 })); 12222 }; 12223 TreeEntry *VE = getTreeEntry(S.OpValue); 12224 bool IsSameVE = VE && CheckSameVE(VE); 12225 if (!IsSameVE) { 12226 auto It = MultiNodeScalars.find(S.OpValue); 12227 if (It != MultiNodeScalars.end()) { 12228 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { 12229 return TE != VE && CheckSameVE(TE); 12230 }); 12231 if (I != It->getSecond().end()) { 12232 VE = *I; 12233 IsSameVE = true; 12234 } 12235 } 12236 } 12237 if (IsSameVE) { 12238 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) { 12239 ShuffleInstructionBuilder ShuffleBuilder( 12240 cast<VectorType>(V->getType())->getElementType(), Builder, *this); 12241 ShuffleBuilder.add(V, Mask); 12242 return ShuffleBuilder.finalize(std::nullopt); 12243 }; 12244 Value *V = vectorizeTree(VE, PostponedPHIs); 12245 if (VF * getNumElements(VL[0]->getType()) != 12246 cast<FixedVectorType>(V->getType())->getNumElements()) { 12247 if (!VE->ReuseShuffleIndices.empty()) { 12248 // Reshuffle to get only unique values. 12249 // If some of the scalars are duplicated in the vectorization 12250 // tree entry, we do not vectorize them but instead generate a 12251 // mask for the reuses. But if there are several users of the 12252 // same entry, they may have different vectorization factors. 12253 // This is especially important for PHI nodes. In this case, we 12254 // need to adapt the resulting instruction for the user 12255 // vectorization factor and have to reshuffle it again to take 12256 // only unique elements of the vector. Without this code the 12257 // function incorrectly returns reduced vector instruction with 12258 // the same elements, not with the unique ones. 12259 12260 // block: 12261 // %phi = phi <2 x > { .., %entry} {%shuffle, %block} 12262 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> 12263 // ... (use %2) 12264 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} 12265 // br %block 12266 SmallVector<int> Mask(VF, PoisonMaskElem); 12267 for (auto [I, V] : enumerate(VL)) { 12268 if (isa<PoisonValue>(V)) 12269 continue; 12270 Mask[I] = VE->findLaneForValue(V); 12271 } 12272 V = FinalShuffle(V, Mask); 12273 } else { 12274 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && 12275 "Expected vectorization factor less " 12276 "than original vector size."); 12277 SmallVector<int> UniformMask(VF, 0); 12278 std::iota(UniformMask.begin(), UniformMask.end(), 0); 12279 V = FinalShuffle(V, UniformMask); 12280 } 12281 } 12282 // Need to update the operand gather node, if actually the operand is not a 12283 // vectorized node, but the buildvector/gather node, which matches one of 12284 // the vectorized nodes. 
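      // Editorial note (hedged): the operand may have been recorded in the
      // graph as a separate gather/buildvector entry that merely mirrors an
      // already-vectorized node. The lookup below finds that gather entry and
      // stores the reused vector value in it, so later queries treat it as
      // materialized.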
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->isGather() &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it.
  // This allows us to be more accurate with tree/graph transformations and
  // checks the correctness of the transformations in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
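  // Worked example (illustrative): assuming E->ReorderIndices = {1, 2, 0},
  // inversePermutation() yields ReorderMask = {2, 0, 1} (Mask[Indices[I]] = I),
  // and reorderScalars() then permutes GatheredScalars with that mask before
  // any shuffle analysis is done.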
12327 SmallVector<int> ReorderMask; 12328 inversePermutation(E->ReorderIndices, ReorderMask); 12329 if (!ReorderMask.empty()) 12330 reorderScalars(GatheredScalars, ReorderMask); 12331 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF, 12332 unsigned I, unsigned SliceSize) { 12333 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { 12334 return isa<UndefValue>(V) && !isa<PoisonValue>(V); 12335 })) 12336 return false; 12337 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; 12338 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; 12339 if (UserTE->getNumOperands() != 2) 12340 return false; 12341 auto *It = 12342 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) { 12343 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { 12344 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; 12345 }) != TE->UserTreeIndices.end(); 12346 }); 12347 if (It == VectorizableTree.end()) 12348 return false; 12349 int Idx; 12350 if ((Mask.size() < InputVF && 12351 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) && 12352 Idx == 0) || 12353 (Mask.size() == InputVF && 12354 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) { 12355 std::iota( 12356 std::next(Mask.begin(), I * SliceSize), 12357 std::next(Mask.begin(), 12358 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), 12359 0); 12360 } else { 12361 unsigned IVal = 12362 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); 12363 std::fill( 12364 std::next(Mask.begin(), I * SliceSize), 12365 std::next(Mask.begin(), 12366 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)), 12367 IVal); 12368 } 12369 return true; 12370 }; 12371 BVTy ShuffleBuilder(ScalarTy, Params...); 12372 ResTy Res = ResTy(); 12373 SmallVector<int> Mask; 12374 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem); 12375 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles; 12376 Value *ExtractVecBase = nullptr; 12377 bool UseVecBaseAsInput = false; 12378 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; 12379 SmallVector<SmallVector<const TreeEntry *>> Entries; 12380 Type *OrigScalarTy = GatheredScalars.front()->getType(); 12381 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size()); 12382 unsigned NumParts = TTI->getNumberOfParts(VecTy); 12383 if (NumParts == 0 || NumParts >= GatheredScalars.size()) 12384 NumParts = 1; 12385 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) { 12386 // Check for gathered extracts. 12387 bool Resized = false; 12388 ExtractShuffles = 12389 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); 12390 if (!ExtractShuffles.empty()) { 12391 SmallVector<const TreeEntry *> ExtractEntries; 12392 for (auto [Idx, I] : enumerate(ExtractMask)) { 12393 if (I == PoisonMaskElem) 12394 continue; 12395 if (const auto *TE = getTreeEntry( 12396 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand())) 12397 ExtractEntries.push_back(TE); 12398 } 12399 if (std::optional<ResTy> Delayed = 12400 ShuffleBuilder.needToDelay(E, ExtractEntries)) { 12401 // Delay emission of gathers which are not ready yet. 12402 PostponedGathers.insert(E); 12403 // Postpone gather emission, will be emitted after the end of the 12404 // process to keep correct order. 
12405 return *Delayed; 12406 } 12407 if (Value *VecBase = ShuffleBuilder.adjustExtracts( 12408 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { 12409 ExtractVecBase = VecBase; 12410 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType())) 12411 if (VF == VecBaseTy->getNumElements() && 12412 GatheredScalars.size() != VF) { 12413 Resized = true; 12414 GatheredScalars.append(VF - GatheredScalars.size(), 12415 PoisonValue::get(OrigScalarTy)); 12416 } 12417 } 12418 } 12419 // Gather extracts after we check for full matched gathers only. 12420 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || 12421 E->isAltShuffle() || 12422 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || 12423 isSplat(E->Scalars) || 12424 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { 12425 GatherShuffles = 12426 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); 12427 } 12428 if (!GatherShuffles.empty()) { 12429 if (std::optional<ResTy> Delayed = 12430 ShuffleBuilder.needToDelay(E, Entries)) { 12431 // Delay emission of gathers which are not ready yet. 12432 PostponedGathers.insert(E); 12433 // Postpone gather emission, will be emitted after the end of the 12434 // process to keep correct order. 12435 return *Delayed; 12436 } 12437 if (GatherShuffles.size() == 1 && 12438 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && 12439 Entries.front().front()->isSame(E->Scalars)) { 12440 // Perfect match in the graph, will reuse the previously vectorized 12441 // node. Cost is 0. 12442 LLVM_DEBUG( 12443 dbgs() 12444 << "SLP: perfect diamond match for gather bundle " 12445 << shortBundleName(E->Scalars) << ".\n"); 12446 // Restore the mask for previous partially matched values. 12447 Mask.resize(E->Scalars.size()); 12448 const TreeEntry *FrontTE = Entries.front().front(); 12449 if (FrontTE->ReorderIndices.empty() && 12450 ((FrontTE->ReuseShuffleIndices.empty() && 12451 E->Scalars.size() == FrontTE->Scalars.size()) || 12452 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { 12453 std::iota(Mask.begin(), Mask.end(), 0); 12454 } else { 12455 for (auto [I, V] : enumerate(E->Scalars)) { 12456 if (isa<PoisonValue>(V)) { 12457 Mask[I] = PoisonMaskElem; 12458 continue; 12459 } 12460 Mask[I] = FrontTE->findLaneForValue(V); 12461 } 12462 } 12463 ShuffleBuilder.add(*FrontTE, Mask); 12464 Res = ShuffleBuilder.finalize(E->getCommonMask()); 12465 return Res; 12466 } 12467 if (!Resized) { 12468 if (GatheredScalars.size() != VF && 12469 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) { 12470 return any_of(TEs, [&](const TreeEntry *TE) { 12471 return TE->getVectorFactor() == VF; 12472 }); 12473 })) 12474 GatheredScalars.append(VF - GatheredScalars.size(), 12475 PoisonValue::get(OrigScalarTy)); 12476 } 12477 // Remove shuffled elements from list of gathers. 12478 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { 12479 if (Mask[I] != PoisonMaskElem) 12480 GatheredScalars[I] = PoisonValue::get(OrigScalarTy); 12481 } 12482 } 12483 } 12484 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars, 12485 SmallVectorImpl<int> &ReuseMask, 12486 bool IsRootPoison) { 12487 // For splats with can emit broadcasts instead of gathers, so try to find 12488 // such sequences. 
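    // For instance (sketch), a splat gather of {%a, %a, %a, %a} is better
    // emitted as a single insert plus a zero-mask shuffle:
    //   %ins = insertelement <4 x ty> poison, ty %a, i64 0
    //   %bc  = shufflevector <4 x ty> %ins, <4 x ty> poison,
    //                        <4 x i32> zeroinitializer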
12489 bool IsSplat = IsRootPoison && isSplat(Scalars) && 12490 (Scalars.size() > 2 || Scalars.front() == Scalars.back()); 12491 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy)); 12492 SmallVector<int> UndefPos; 12493 DenseMap<Value *, unsigned> UniquePositions; 12494 // Gather unique non-const values and all constant values. 12495 // For repeated values, just shuffle them. 12496 int NumNonConsts = 0; 12497 int SinglePos = 0; 12498 for (auto [I, V] : enumerate(Scalars)) { 12499 if (isa<UndefValue>(V)) { 12500 if (!isa<PoisonValue>(V)) { 12501 ReuseMask[I] = I; 12502 UndefPos.push_back(I); 12503 } 12504 continue; 12505 } 12506 if (isConstant(V)) { 12507 ReuseMask[I] = I; 12508 continue; 12509 } 12510 ++NumNonConsts; 12511 SinglePos = I; 12512 Value *OrigV = V; 12513 Scalars[I] = PoisonValue::get(OrigScalarTy); 12514 if (IsSplat) { 12515 Scalars.front() = OrigV; 12516 ReuseMask[I] = 0; 12517 } else { 12518 const auto Res = UniquePositions.try_emplace(OrigV, I); 12519 Scalars[Res.first->second] = OrigV; 12520 ReuseMask[I] = Res.first->second; 12521 } 12522 } 12523 if (NumNonConsts == 1) { 12524 // Restore single insert element. 12525 if (IsSplat) { 12526 ReuseMask.assign(VF, PoisonMaskElem); 12527 std::swap(Scalars.front(), Scalars[SinglePos]); 12528 if (!UndefPos.empty() && UndefPos.front() == 0) 12529 Scalars.front() = UndefValue::get(OrigScalarTy); 12530 } 12531 ReuseMask[SinglePos] = SinglePos; 12532 } else if (!UndefPos.empty() && IsSplat) { 12533 // For undef values, try to replace them with the simple broadcast. 12534 // We can do it if the broadcasted value is guaranteed to be 12535 // non-poisonous, or by freezing the incoming scalar value first. 12536 auto *It = find_if(Scalars, [this, E](Value *V) { 12537 return !isa<UndefValue>(V) && 12538 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || 12539 (E->UserTreeIndices.size() == 1 && 12540 any_of(V->uses(), [E](const Use &U) { 12541 // Check if the value already used in the same operation in 12542 // one of the nodes already. 12543 return E->UserTreeIndices.front().EdgeIdx != 12544 U.getOperandNo() && 12545 is_contained( 12546 E->UserTreeIndices.front().UserTE->Scalars, 12547 U.getUser()); 12548 }))); 12549 }); 12550 if (It != Scalars.end()) { 12551 // Replace undefs by the non-poisoned scalars and emit broadcast. 12552 int Pos = std::distance(Scalars.begin(), It); 12553 for (int I : UndefPos) { 12554 // Set the undef position to the non-poisoned scalar. 12555 ReuseMask[I] = Pos; 12556 // Replace the undef by the poison, in the mask it is replaced by 12557 // non-poisoned scalar already. 12558 if (I != Pos) 12559 Scalars[I] = PoisonValue::get(OrigScalarTy); 12560 } 12561 } else { 12562 // Replace undefs by the poisons, emit broadcast and then emit 12563 // freeze. 12564 for (int I : UndefPos) { 12565 ReuseMask[I] = PoisonMaskElem; 12566 if (isa<UndefValue>(Scalars[I])) 12567 Scalars[I] = PoisonValue::get(OrigScalarTy); 12568 } 12569 NeedFreeze = true; 12570 } 12571 } 12572 }; 12573 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { 12574 bool IsNonPoisoned = true; 12575 bool IsUsedInExpr = true; 12576 Value *Vec1 = nullptr; 12577 if (!ExtractShuffles.empty()) { 12578 // Gather of extractelements can be represented as just a shuffle of 12579 // a single/two vectors the scalars are extracted from. 12580 // Find input vectors. 
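      // For example (illustrative only): if the gathered scalars are
      //   %e0 = extractelement <4 x ty> %v, i64 0
      //   %e2 = extractelement <4 x ty> %v, i64 2
      // the whole gather reduces to
      //   %g = shufflevector <4 x ty> %v, <4 x ty> poison, <2 x i32> <i32 0, i32 2>
      // reusing %v (or its vectorized counterpart) as the input vector.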
12581 Value *Vec2 = nullptr; 12582 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { 12583 if (!Mask.empty() && Mask[I] != PoisonMaskElem) 12584 ExtractMask[I] = PoisonMaskElem; 12585 } 12586 if (UseVecBaseAsInput) { 12587 Vec1 = ExtractVecBase; 12588 } else { 12589 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { 12590 if (ExtractMask[I] == PoisonMaskElem) 12591 continue; 12592 if (isa<UndefValue>(E->Scalars[I])) 12593 continue; 12594 auto *EI = cast<ExtractElementInst>(E->Scalars[I]); 12595 Value *VecOp = EI->getVectorOperand(); 12596 if (const auto *TE = getTreeEntry(VecOp)) 12597 if (TE->VectorizedValue) 12598 VecOp = TE->VectorizedValue; 12599 if (!Vec1) { 12600 Vec1 = VecOp; 12601 } else if (Vec1 != VecOp) { 12602 assert((!Vec2 || Vec2 == VecOp) && 12603 "Expected only 1 or 2 vectors shuffle."); 12604 Vec2 = VecOp; 12605 } 12606 } 12607 } 12608 if (Vec2) { 12609 IsUsedInExpr = false; 12610 IsNonPoisoned &= 12611 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); 12612 ShuffleBuilder.add(Vec1, Vec2, ExtractMask); 12613 } else if (Vec1) { 12614 IsUsedInExpr &= FindReusedSplat( 12615 ExtractMask, 12616 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0, 12617 ExtractMask.size()); 12618 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); 12619 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); 12620 } else { 12621 IsUsedInExpr = false; 12622 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask, 12623 /*ForExtracts=*/true); 12624 } 12625 } 12626 if (!GatherShuffles.empty()) { 12627 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts); 12628 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); 12629 for (const auto [I, TEs] : enumerate(Entries)) { 12630 if (TEs.empty()) { 12631 assert(!GatherShuffles[I] && 12632 "No shuffles with empty entries list expected."); 12633 continue; 12634 } 12635 assert((TEs.size() == 1 || TEs.size() == 2) && 12636 "Expected shuffle of 1 or 2 entries."); 12637 unsigned Limit = getNumElems(Mask.size(), SliceSize, I); 12638 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit); 12639 VecMask.assign(VecMask.size(), PoisonMaskElem); 12640 copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); 12641 if (TEs.size() == 1) { 12642 IsUsedInExpr &= FindReusedSplat( 12643 VecMask, TEs.front()->getVectorFactor(), I, SliceSize); 12644 ShuffleBuilder.add(*TEs.front(), VecMask); 12645 if (TEs.front()->VectorizedValue) 12646 IsNonPoisoned &= 12647 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); 12648 } else { 12649 IsUsedInExpr = false; 12650 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); 12651 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) 12652 IsNonPoisoned &= 12653 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && 12654 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); 12655 } 12656 } 12657 } 12658 // Try to figure out best way to combine values: build a shuffle and insert 12659 // elements or just build several shuffles. 12660 // Insert non-constant scalars. 12661 SmallVector<Value *> NonConstants(GatheredScalars); 12662 int EMSz = ExtractMask.size(); 12663 int MSz = Mask.size(); 12664 // Try to build constant vector and shuffle with it only if currently we 12665 // have a single permutation and more than 1 scalar constants. 
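  // Hypothetical example (editorial): for GatheredScalars = {1, %x, 2, %y} a
  // constant base vector <i32 1, i32 poison, i32 2, i32 poison> can be built
  // once and the non-constant lanes %x and %y blended in with a shuffle, but
  // this only pays off when a single permutation suffices and more than one
  // lane is a real constant.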
12666 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); 12667 bool IsIdentityShuffle = 12668 ((UseVecBaseAsInput || 12669 all_of(ExtractShuffles, 12670 [](const std::optional<TTI::ShuffleKind> &SK) { 12671 return SK.value_or(TTI::SK_PermuteTwoSrc) == 12672 TTI::SK_PermuteSingleSrc; 12673 })) && 12674 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && 12675 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || 12676 (!GatherShuffles.empty() && 12677 all_of(GatherShuffles, 12678 [](const std::optional<TTI::ShuffleKind> &SK) { 12679 return SK.value_or(TTI::SK_PermuteTwoSrc) == 12680 TTI::SK_PermuteSingleSrc; 12681 }) && 12682 none_of(Mask, [&](int I) { return I >= MSz; }) && 12683 ShuffleVectorInst::isIdentityMask(Mask, MSz)); 12684 bool EnoughConstsForShuffle = 12685 IsSingleShuffle && 12686 (none_of(GatheredScalars, 12687 [](Value *V) { 12688 return isa<UndefValue>(V) && !isa<PoisonValue>(V); 12689 }) || 12690 any_of(GatheredScalars, 12691 [](Value *V) { 12692 return isa<Constant>(V) && !isa<UndefValue>(V); 12693 })) && 12694 (!IsIdentityShuffle || 12695 (GatheredScalars.size() == 2 && 12696 any_of(GatheredScalars, 12697 [](Value *V) { return !isa<UndefValue>(V); })) || 12698 count_if(GatheredScalars, [](Value *V) { 12699 return isa<Constant>(V) && !isa<PoisonValue>(V); 12700 }) > 1); 12701 // NonConstants array contains just non-constant values, GatheredScalars 12702 // contains only constant to build final vector and then shuffle. 12703 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { 12704 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I])) 12705 NonConstants[I] = PoisonValue::get(OrigScalarTy); 12706 else 12707 GatheredScalars[I] = PoisonValue::get(OrigScalarTy); 12708 } 12709 // Generate constants for final shuffle and build a mask for them. 12710 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) { 12711 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); 12712 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); 12713 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); 12714 ShuffleBuilder.add(BV, BVMask); 12715 } 12716 if (all_of(NonConstants, [=](Value *V) { 12717 return isa<PoisonValue>(V) || 12718 (IsSingleShuffle && ((IsIdentityShuffle && 12719 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V)); 12720 })) 12721 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); 12722 else 12723 Res = ShuffleBuilder.finalize( 12724 E->ReuseShuffleIndices, E->Scalars.size(), 12725 [&](Value *&Vec, SmallVectorImpl<int> &Mask) { 12726 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); 12727 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); 12728 }); 12729 } else if (!allConstant(GatheredScalars)) { 12730 // Gather unique scalars and all constants. 12731 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); 12732 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); 12733 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); 12734 ShuffleBuilder.add(BV, ReuseMask); 12735 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); 12736 } else { 12737 // Gather all constants. 
12738 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem); 12739 for (auto [I, V] : enumerate(E->Scalars)) { 12740 if (!isa<PoisonValue>(V)) 12741 Mask[I] = I; 12742 } 12743 Value *BV = ShuffleBuilder.gather(E->Scalars); 12744 ShuffleBuilder.add(BV, Mask); 12745 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); 12746 } 12747 12748 if (NeedFreeze) 12749 Res = ShuffleBuilder.createFreeze(Res); 12750 return Res; 12751 } 12752 12753 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { 12754 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy, 12755 Builder, *this); 12756 } 12757 12758 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { 12759 IRBuilderBase::InsertPointGuard Guard(Builder); 12760 12761 if (E->VectorizedValue && 12762 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || 12763 E->isAltShuffle())) { 12764 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 12765 return E->VectorizedValue; 12766 } 12767 12768 Value *V = E->Scalars.front(); 12769 Type *ScalarTy = V->getType(); 12770 if (auto *Store = dyn_cast<StoreInst>(V)) 12771 ScalarTy = Store->getValueOperand()->getType(); 12772 else if (auto *IE = dyn_cast<InsertElementInst>(V)) 12773 ScalarTy = IE->getOperand(1)->getType(); 12774 auto It = MinBWs.find(E); 12775 if (It != MinBWs.end()) 12776 ScalarTy = IntegerType::get(F->getContext(), It->second.first); 12777 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); 12778 if (E->isGather()) { 12779 // Set insert point for non-reduction initial nodes. 12780 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) 12781 setInsertPointAfterBundle(E); 12782 Value *Vec = createBuildVector(E, ScalarTy); 12783 E->VectorizedValue = Vec; 12784 return Vec; 12785 } 12786 12787 bool IsReverseOrder = isReverseOrder(E->ReorderIndices); 12788 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) { 12789 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); 12790 if (E->getOpcode() == Instruction::Store && 12791 E->State == TreeEntry::Vectorize) { 12792 ArrayRef<int> Mask = 12793 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), 12794 E->ReorderIndices.size()); 12795 ShuffleBuilder.add(V, Mask); 12796 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) { 12797 ShuffleBuilder.addOrdered(V, std::nullopt); 12798 } else { 12799 ShuffleBuilder.addOrdered(V, E->ReorderIndices); 12800 } 12801 return ShuffleBuilder.finalize(E->ReuseShuffleIndices); 12802 }; 12803 12804 assert((E->State == TreeEntry::Vectorize || 12805 E->State == TreeEntry::ScatterVectorize || 12806 E->State == TreeEntry::StridedVectorize) && 12807 "Unhandled state"); 12808 unsigned ShuffleOrOp = 12809 E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); 12810 Instruction *VL0 = E->getMainOp(); 12811 auto GetOperandSignedness = [&](unsigned Idx) { 12812 const TreeEntry *OpE = getOperandEntry(E, Idx); 12813 bool IsSigned = false; 12814 auto It = MinBWs.find(OpE); 12815 if (It != MinBWs.end()) 12816 IsSigned = It->second.second; 12817 else 12818 IsSigned = any_of(OpE->Scalars, [&](Value *R) { 12819 return !isKnownNonNegative(R, SimplifyQuery(*DL)); 12820 }); 12821 return IsSigned; 12822 }; 12823 switch (ShuffleOrOp) { 12824 case Instruction::PHI: { 12825 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() || 12826 E != VectorizableTree.front().get() || 12827 !E->UserTreeIndices.empty()) && 12828 "PHI reordering is free."); 12829 if (PostponedPHIs && E->VectorizedValue) 12830 return E->VectorizedValue; 12831 auto *PH = cast<PHINode>(VL0); 12832 Builder.SetInsertPoint(PH->getParent(), 12833 PH->getParent()->getFirstNonPHIIt()); 12834 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 12835 if (PostponedPHIs || !E->VectorizedValue) { 12836 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 12837 E->PHI = NewPhi; 12838 Value *V = NewPhi; 12839 12840 // Adjust insertion point once all PHI's have been generated. 12841 Builder.SetInsertPoint(PH->getParent(), 12842 PH->getParent()->getFirstInsertionPt()); 12843 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 12844 12845 V = FinalShuffle(V, E, VecTy); 12846 12847 E->VectorizedValue = V; 12848 if (PostponedPHIs) 12849 return V; 12850 } 12851 PHINode *NewPhi = cast<PHINode>(E->PHI); 12852 // If phi node is fully emitted - exit. 12853 if (NewPhi->getNumIncomingValues() != 0) 12854 return NewPhi; 12855 12856 // PHINodes may have multiple entries from the same block. We want to 12857 // visit every block once. 12858 SmallPtrSet<BasicBlock *, 4> VisitedBBs; 12859 12860 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { 12861 ValueList Operands; 12862 BasicBlock *IBB = PH->getIncomingBlock(I); 12863 12864 // Stop emission if all incoming values are generated. 
12865 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { 12866 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 12867 return NewPhi; 12868 } 12869 12870 if (!VisitedBBs.insert(IBB).second) { 12871 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 12872 continue; 12873 } 12874 12875 Builder.SetInsertPoint(IBB->getTerminator()); 12876 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 12877 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); 12878 if (VecTy != Vec->getType()) { 12879 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || 12880 MinBWs.contains(getOperandEntry(E, I))) && 12881 "Expected item in MinBWs."); 12882 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I)); 12883 } 12884 NewPhi->addIncoming(Vec, IBB); 12885 } 12886 12887 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 12888 "Invalid number of incoming values"); 12889 return NewPhi; 12890 } 12891 12892 case Instruction::ExtractElement: { 12893 Value *V = E->getSingleOperand(0); 12894 if (const TreeEntry *TE = getTreeEntry(V)) 12895 V = TE->VectorizedValue; 12896 setInsertPointAfterBundle(E); 12897 V = FinalShuffle(V, E, VecTy); 12898 E->VectorizedValue = V; 12899 return V; 12900 } 12901 case Instruction::ExtractValue: { 12902 auto *LI = cast<LoadInst>(E->getSingleOperand(0)); 12903 Builder.SetInsertPoint(LI); 12904 Value *Ptr = LI->getPointerOperand(); 12905 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); 12906 Value *NewV = propagateMetadata(V, E->Scalars); 12907 NewV = FinalShuffle(NewV, E, VecTy); 12908 E->VectorizedValue = NewV; 12909 return NewV; 12910 } 12911 case Instruction::InsertElement: { 12912 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); 12913 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back())); 12914 Value *V = vectorizeOperand(E, 1, PostponedPHIs); 12915 ArrayRef<Value *> Op = E->getOperand(1); 12916 Type *ScalarTy = Op.front()->getType(); 12917 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) { 12918 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs."); 12919 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1)); 12920 assert(Res.first > 0 && "Expected item in MinBWs."); 12921 V = Builder.CreateIntCast( 12922 V, 12923 getWidenedType( 12924 ScalarTy, 12925 cast<FixedVectorType>(V->getType())->getNumElements()), 12926 Res.second); 12927 } 12928 12929 // Create InsertVector shuffle if necessary 12930 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { 12931 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); 12932 })); 12933 const unsigned NumElts = 12934 cast<FixedVectorType>(FirstInsert->getType())->getNumElements(); 12935 const unsigned NumScalars = E->Scalars.size(); 12936 12937 unsigned Offset = *getElementIndex(VL0); 12938 assert(Offset < NumElts && "Failed to find vector index offset"); 12939 12940 // Create shuffle to resize vector 12941 SmallVector<int> Mask; 12942 if (!E->ReorderIndices.empty()) { 12943 inversePermutation(E->ReorderIndices, Mask); 12944 Mask.append(NumElts - NumScalars, PoisonMaskElem); 12945 } else { 12946 Mask.assign(NumElts, PoisonMaskElem); 12947 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); 12948 } 12949 // Create InsertVector shuffle if necessary 12950 bool IsIdentity = true; 12951 SmallVector<int> PrevMask(NumElts, PoisonMaskElem); 12952 Mask.swap(PrevMask); 12953 for (unsigned I = 0; I < NumScalars; ++I) { 12954 
Value *Scalar = E->Scalars[PrevMask[I]]; 12955 unsigned InsertIdx = *getElementIndex(Scalar); 12956 IsIdentity &= InsertIdx - Offset == I; 12957 Mask[InsertIdx - Offset] = I; 12958 } 12959 if (!IsIdentity || NumElts != NumScalars) { 12960 Value *V2 = nullptr; 12961 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V); 12962 SmallVector<int> InsertMask(Mask); 12963 if (NumElts != NumScalars && Offset == 0) { 12964 // Follow all insert element instructions from the current buildvector 12965 // sequence. 12966 InsertElementInst *Ins = cast<InsertElementInst>(VL0); 12967 do { 12968 std::optional<unsigned> InsertIdx = getElementIndex(Ins); 12969 if (!InsertIdx) 12970 break; 12971 if (InsertMask[*InsertIdx] == PoisonMaskElem) 12972 InsertMask[*InsertIdx] = *InsertIdx; 12973 if (!Ins->hasOneUse()) 12974 break; 12975 Ins = dyn_cast_or_null<InsertElementInst>( 12976 Ins->getUniqueUndroppableUser()); 12977 } while (Ins); 12978 SmallBitVector UseMask = 12979 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); 12980 SmallBitVector IsFirstPoison = 12981 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 12982 SmallBitVector IsFirstUndef = 12983 isUndefVector(FirstInsert->getOperand(0), UseMask); 12984 if (!IsFirstPoison.all()) { 12985 unsigned Idx = 0; 12986 for (unsigned I = 0; I < NumElts; I++) { 12987 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) && 12988 IsFirstUndef.test(I)) { 12989 if (IsVNonPoisonous) { 12990 InsertMask[I] = I < NumScalars ? I : 0; 12991 continue; 12992 } 12993 if (!V2) 12994 V2 = UndefValue::get(V->getType()); 12995 if (Idx >= NumScalars) 12996 Idx = NumScalars - 1; 12997 InsertMask[I] = NumScalars + Idx; 12998 ++Idx; 12999 } else if (InsertMask[I] != PoisonMaskElem && 13000 Mask[I] == PoisonMaskElem) { 13001 InsertMask[I] = PoisonMaskElem; 13002 } 13003 } 13004 } else { 13005 InsertMask = Mask; 13006 } 13007 } 13008 if (!V2) 13009 V2 = PoisonValue::get(V->getType()); 13010 V = Builder.CreateShuffleVector(V, V2, InsertMask); 13011 if (auto *I = dyn_cast<Instruction>(V)) { 13012 GatherShuffleExtractSeq.insert(I); 13013 CSEBlocks.insert(I->getParent()); 13014 } 13015 } 13016 13017 SmallVector<int> InsertMask(NumElts, PoisonMaskElem); 13018 for (unsigned I = 0; I < NumElts; I++) { 13019 if (Mask[I] != PoisonMaskElem) 13020 InsertMask[Offset + I] = I; 13021 } 13022 SmallBitVector UseMask = 13023 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask); 13024 SmallBitVector IsFirstUndef = 13025 isUndefVector(FirstInsert->getOperand(0), UseMask); 13026 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && 13027 NumElts != NumScalars) { 13028 if (IsFirstUndef.all()) { 13029 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) { 13030 SmallBitVector IsFirstPoison = 13031 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 13032 if (!IsFirstPoison.all()) { 13033 for (unsigned I = 0; I < NumElts; I++) { 13034 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I)) 13035 InsertMask[I] = I + NumElts; 13036 } 13037 } 13038 V = Builder.CreateShuffleVector( 13039 V, 13040 IsFirstPoison.all() ? 
PoisonValue::get(V->getType()) 13041 : FirstInsert->getOperand(0), 13042 InsertMask, cast<Instruction>(E->Scalars.back())->getName()); 13043 if (auto *I = dyn_cast<Instruction>(V)) { 13044 GatherShuffleExtractSeq.insert(I); 13045 CSEBlocks.insert(I->getParent()); 13046 } 13047 } 13048 } else { 13049 SmallBitVector IsFirstPoison = 13050 isUndefVector<true>(FirstInsert->getOperand(0), UseMask); 13051 for (unsigned I = 0; I < NumElts; I++) { 13052 if (InsertMask[I] == PoisonMaskElem) 13053 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I; 13054 else 13055 InsertMask[I] += NumElts; 13056 } 13057 V = Builder.CreateShuffleVector( 13058 FirstInsert->getOperand(0), V, InsertMask, 13059 cast<Instruction>(E->Scalars.back())->getName()); 13060 if (auto *I = dyn_cast<Instruction>(V)) { 13061 GatherShuffleExtractSeq.insert(I); 13062 CSEBlocks.insert(I->getParent()); 13063 } 13064 } 13065 } 13066 13067 ++NumVectorInstructions; 13068 E->VectorizedValue = V; 13069 return V; 13070 } 13071 case Instruction::ZExt: 13072 case Instruction::SExt: 13073 case Instruction::FPToUI: 13074 case Instruction::FPToSI: 13075 case Instruction::FPExt: 13076 case Instruction::PtrToInt: 13077 case Instruction::IntToPtr: 13078 case Instruction::SIToFP: 13079 case Instruction::UIToFP: 13080 case Instruction::Trunc: 13081 case Instruction::FPTrunc: 13082 case Instruction::BitCast: { 13083 setInsertPointAfterBundle(E); 13084 13085 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); 13086 if (E->VectorizedValue) { 13087 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13088 return E->VectorizedValue; 13089 } 13090 13091 auto *CI = cast<CastInst>(VL0); 13092 Instruction::CastOps VecOpcode = CI->getOpcode(); 13093 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType(); 13094 auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); 13095 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() && 13096 (SrcIt != MinBWs.end() || It != MinBWs.end() || 13097 SrcScalarTy != CI->getOperand(0)->getType())) { 13098 // Check if the values are candidates to demote. 13099 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy); 13100 if (SrcIt != MinBWs.end()) 13101 SrcBWSz = SrcIt->second.first; 13102 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 13103 if (BWSz == SrcBWSz) { 13104 VecOpcode = Instruction::BitCast; 13105 } else if (BWSz < SrcBWSz) { 13106 VecOpcode = Instruction::Trunc; 13107 } else if (It != MinBWs.end()) { 13108 assert(BWSz > SrcBWSz && "Invalid cast!"); 13109 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; 13110 } else if (SrcIt != MinBWs.end()) { 13111 assert(BWSz > SrcBWSz && "Invalid cast!"); 13112 VecOpcode = 13113 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt; 13114 } 13115 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && 13116 !SrcIt->second.second) { 13117 VecOpcode = Instruction::UIToFP; 13118 } 13119 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) 13120 ? 
InVec 13121 : Builder.CreateCast(VecOpcode, InVec, VecTy); 13122 V = FinalShuffle(V, E, VecTy); 13123 13124 E->VectorizedValue = V; 13125 ++NumVectorInstructions; 13126 return V; 13127 } 13128 case Instruction::FCmp: 13129 case Instruction::ICmp: { 13130 setInsertPointAfterBundle(E); 13131 13132 Value *L = vectorizeOperand(E, 0, PostponedPHIs); 13133 if (E->VectorizedValue) { 13134 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13135 return E->VectorizedValue; 13136 } 13137 Value *R = vectorizeOperand(E, 1, PostponedPHIs); 13138 if (E->VectorizedValue) { 13139 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13140 return E->VectorizedValue; 13141 } 13142 if (L->getType() != R->getType()) { 13143 assert((getOperandEntry(E, 0)->isGather() || 13144 getOperandEntry(E, 1)->isGather() || 13145 MinBWs.contains(getOperandEntry(E, 0)) || 13146 MinBWs.contains(getOperandEntry(E, 1))) && 13147 "Expected item in MinBWs."); 13148 if (cast<VectorType>(L->getType()) 13149 ->getElementType() 13150 ->getIntegerBitWidth() < cast<VectorType>(R->getType()) 13151 ->getElementType() 13152 ->getIntegerBitWidth()) { 13153 Type *CastTy = R->getType(); 13154 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0)); 13155 } else { 13156 Type *CastTy = L->getType(); 13157 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1)); 13158 } 13159 } 13160 13161 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); 13162 Value *V = Builder.CreateCmp(P0, L, R); 13163 propagateIRFlags(V, E->Scalars, VL0); 13164 // Do not cast for cmps. 13165 VecTy = cast<FixedVectorType>(V->getType()); 13166 V = FinalShuffle(V, E, VecTy); 13167 13168 E->VectorizedValue = V; 13169 ++NumVectorInstructions; 13170 return V; 13171 } 13172 case Instruction::Select: { 13173 setInsertPointAfterBundle(E); 13174 13175 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); 13176 if (E->VectorizedValue) { 13177 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13178 return E->VectorizedValue; 13179 } 13180 Value *True = vectorizeOperand(E, 1, PostponedPHIs); 13181 if (E->VectorizedValue) { 13182 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13183 return E->VectorizedValue; 13184 } 13185 Value *False = vectorizeOperand(E, 2, PostponedPHIs); 13186 if (E->VectorizedValue) { 13187 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13188 return E->VectorizedValue; 13189 } 13190 if (True->getType() != VecTy || False->getType() != VecTy) { 13191 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() || 13192 getOperandEntry(E, 2)->isGather() || 13193 MinBWs.contains(getOperandEntry(E, 1)) || 13194 MinBWs.contains(getOperandEntry(E, 2))) && 13195 "Expected item in MinBWs."); 13196 if (True->getType() != VecTy) 13197 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1)); 13198 if (False->getType() != VecTy) 13199 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2)); 13200 } 13201 13202 Value *V = Builder.CreateSelect(Cond, True, False); 13203 V = FinalShuffle(V, E, VecTy); 13204 13205 E->VectorizedValue = V; 13206 ++NumVectorInstructions; 13207 return V; 13208 } 13209 case Instruction::FNeg: { 13210 setInsertPointAfterBundle(E); 13211 13212 Value *Op = vectorizeOperand(E, 0, PostponedPHIs); 13213 13214 if (E->VectorizedValue) { 13215 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13216 return E->VectorizedValue; 13217 } 13218 13219 Value *V = Builder.CreateUnOp( 13220 
static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); 13221 propagateIRFlags(V, E->Scalars, VL0); 13222 if (auto *I = dyn_cast<Instruction>(V)) 13223 V = propagateMetadata(I, E->Scalars); 13224 13225 V = FinalShuffle(V, E, VecTy); 13226 13227 E->VectorizedValue = V; 13228 ++NumVectorInstructions; 13229 13230 return V; 13231 } 13232 case Instruction::Add: 13233 case Instruction::FAdd: 13234 case Instruction::Sub: 13235 case Instruction::FSub: 13236 case Instruction::Mul: 13237 case Instruction::FMul: 13238 case Instruction::UDiv: 13239 case Instruction::SDiv: 13240 case Instruction::FDiv: 13241 case Instruction::URem: 13242 case Instruction::SRem: 13243 case Instruction::FRem: 13244 case Instruction::Shl: 13245 case Instruction::LShr: 13246 case Instruction::AShr: 13247 case Instruction::And: 13248 case Instruction::Or: 13249 case Instruction::Xor: { 13250 setInsertPointAfterBundle(E); 13251 13252 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); 13253 if (E->VectorizedValue) { 13254 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13255 return E->VectorizedValue; 13256 } 13257 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); 13258 if (E->VectorizedValue) { 13259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13260 return E->VectorizedValue; 13261 } 13262 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { 13263 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) { 13264 ArrayRef<Value *> Ops = E->getOperand(I); 13265 if (all_of(Ops, [&](Value *Op) { 13266 auto *CI = dyn_cast<ConstantInt>(Op); 13267 return CI && CI->getValue().countr_one() >= It->second.first; 13268 })) { 13269 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy); 13270 E->VectorizedValue = V; 13271 ++NumVectorInstructions; 13272 return V; 13273 } 13274 } 13275 } 13276 if (LHS->getType() != VecTy || RHS->getType() != VecTy) { 13277 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || 13278 getOperandEntry(E, 1)->isGather() || 13279 MinBWs.contains(getOperandEntry(E, 0)) || 13280 MinBWs.contains(getOperandEntry(E, 1))) && 13281 "Expected item in MinBWs."); 13282 if (LHS->getType() != VecTy) 13283 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0)); 13284 if (RHS->getType() != VecTy) 13285 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1)); 13286 } 13287 13288 Value *V = Builder.CreateBinOp( 13289 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, 13290 RHS); 13291 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end()); 13292 if (auto *I = dyn_cast<Instruction>(V)) { 13293 V = propagateMetadata(I, E->Scalars); 13294 // Drop nuw flags for abs(sub(commutative), true). 13295 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub && 13296 any_of(E->Scalars, [](Value *V) { 13297 return isCommutative(cast<Instruction>(V)); 13298 })) 13299 I->setHasNoUnsignedWrap(/*b=*/false); 13300 } 13301 13302 V = FinalShuffle(V, E, VecTy); 13303 13304 E->VectorizedValue = V; 13305 ++NumVectorInstructions; 13306 13307 return V; 13308 } 13309 case Instruction::Load: { 13310 // Loads are inserted at the head of the tree because we don't want to 13311 // sink them all the way down past store instructions. 
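    // Depending on the tree entry state the load is emitted either as a plain
    // wide load (consecutive pointers), as a strided-load intrinsic, or as a
    // masked gather. A minimal illustrative sketch of the consecutive case for
    // four i32 loads (value names made up):
    //   %wide = load <4 x i32>, ptr %base, align 4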
13312 setInsertPointAfterBundle(E); 13313 13314 LoadInst *LI = cast<LoadInst>(VL0); 13315 Instruction *NewLI; 13316 Value *PO = LI->getPointerOperand(); 13317 if (E->State == TreeEntry::Vectorize) { 13318 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); 13319 } else if (E->State == TreeEntry::StridedVectorize) { 13320 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand(); 13321 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand(); 13322 PO = IsReverseOrder ? PtrN : Ptr0; 13323 std::optional<int> Diff = getPointersDiff( 13324 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); 13325 Type *StrideTy = DL->getIndexType(PO->getType()); 13326 Value *StrideVal; 13327 if (Diff) { 13328 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1); 13329 StrideVal = 13330 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride * 13331 DL->getTypeAllocSize(ScalarTy)); 13332 } else { 13333 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr); 13334 transform(E->Scalars, PointerOps.begin(), [](Value *V) { 13335 return cast<LoadInst>(V)->getPointerOperand(); 13336 }); 13337 OrdersType Order; 13338 std::optional<Value *> Stride = 13339 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, 13340 &*Builder.GetInsertPoint()); 13341 Value *NewStride = 13342 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); 13343 StrideVal = Builder.CreateMul( 13344 NewStride, 13345 ConstantInt::get( 13346 StrideTy, 13347 (IsReverseOrder ? -1 : 1) * 13348 static_cast<int>(DL->getTypeAllocSize(ScalarTy)))); 13349 } 13350 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars); 13351 auto *Inst = Builder.CreateIntrinsic( 13352 Intrinsic::experimental_vp_strided_load, 13353 {VecTy, PO->getType(), StrideTy}, 13354 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()), 13355 Builder.getInt32(E->Scalars.size())}); 13356 Inst->addParamAttr( 13357 /*ArgNo=*/0, 13358 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); 13359 NewLI = Inst; 13360 } else { 13361 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); 13362 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); 13363 if (E->VectorizedValue) { 13364 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13365 return E->VectorizedValue; 13366 } 13367 // Use the minimum alignment of the gathered loads. 
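      // That is, the smallest alignment among the scalar loads, which is the
      // largest alignment that is guaranteed for every lane of the gather.
      // Illustrative shape of the emitted call (operand names made up):
      //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(
      //            <4 x ptr> %ptrs, i32 4,
      //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
      //            <4 x i32> poison)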
13368 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars); 13369 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); 13370 } 13371 Value *V = propagateMetadata(NewLI, E->Scalars); 13372 13373 V = FinalShuffle(V, E, VecTy); 13374 E->VectorizedValue = V; 13375 ++NumVectorInstructions; 13376 return V; 13377 } 13378 case Instruction::Store: { 13379 auto *SI = cast<StoreInst>(VL0); 13380 13381 setInsertPointAfterBundle(E); 13382 13383 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); 13384 if (VecValue->getType() != VecTy) 13385 VecValue = 13386 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0)); 13387 VecValue = FinalShuffle(VecValue, E, VecTy); 13388 13389 Value *Ptr = SI->getPointerOperand(); 13390 Instruction *ST; 13391 if (E->State == TreeEntry::Vectorize) { 13392 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); 13393 } else { 13394 assert(E->State == TreeEntry::StridedVectorize && 13395 "Expected either strided or conseutive stores."); 13396 if (!E->ReorderIndices.empty()) { 13397 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]); 13398 Ptr = SI->getPointerOperand(); 13399 } 13400 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars); 13401 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType()); 13402 auto *Inst = Builder.CreateIntrinsic( 13403 Intrinsic::experimental_vp_strided_store, 13404 {VecTy, Ptr->getType(), StrideTy}, 13405 {VecValue, Ptr, 13406 ConstantInt::get( 13407 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))), 13408 Builder.getAllOnesMask(VecTy->getElementCount()), 13409 Builder.getInt32(E->Scalars.size())}); 13410 Inst->addParamAttr( 13411 /*ArgNo=*/1, 13412 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); 13413 ST = Inst; 13414 } 13415 13416 Value *V = propagateMetadata(ST, E->Scalars); 13417 13418 E->VectorizedValue = V; 13419 ++NumVectorInstructions; 13420 return V; 13421 } 13422 case Instruction::GetElementPtr: { 13423 auto *GEP0 = cast<GetElementPtrInst>(VL0); 13424 setInsertPointAfterBundle(E); 13425 13426 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); 13427 if (E->VectorizedValue) { 13428 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13429 return E->VectorizedValue; 13430 } 13431 13432 SmallVector<Value *> OpVecs; 13433 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { 13434 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); 13435 if (E->VectorizedValue) { 13436 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13437 return E->VectorizedValue; 13438 } 13439 OpVecs.push_back(OpVec); 13440 } 13441 13442 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs); 13443 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) { 13444 SmallVector<Value *> GEPs; 13445 for (Value *V : E->Scalars) { 13446 if (isa<GetElementPtrInst>(V)) 13447 GEPs.push_back(V); 13448 } 13449 V = propagateMetadata(I, GEPs); 13450 } 13451 13452 V = FinalShuffle(V, E, VecTy); 13453 13454 E->VectorizedValue = V; 13455 ++NumVectorInstructions; 13456 13457 return V; 13458 } 13459 case Instruction::Call: { 13460 CallInst *CI = cast<CallInst>(VL0); 13461 setInsertPointAfterBundle(E); 13462 13463 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 13464 13465 SmallVector<Type *> ArgTys = 13466 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(), 13467 It != MinBWs.end() ? 
                                It->second.first : 0);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      ValueList OpVL;
      // Some intrinsics have scalar arguments. Such arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(
                           static_cast<unsigned>(VecTy->getNumElements())),
                       false /*HasGlobalPred*/);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");

    Value *LHS = nullptr, *RHS = nullptr;
    if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
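        // vectorizeOperand() can recursively complete this entry when the
        // operand node and this node form a "diamond" in the tree; in that
        // case reuse the already-created vector value rather than emitting a
        // second copy.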
13554 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13555 return E->VectorizedValue; 13556 } 13557 RHS = vectorizeOperand(E, 1, PostponedPHIs); 13558 } else { 13559 setInsertPointAfterBundle(E); 13560 LHS = vectorizeOperand(E, 0, PostponedPHIs); 13561 } 13562 if (E->VectorizedValue) { 13563 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); 13564 return E->VectorizedValue; 13565 } 13566 if (LHS && RHS && 13567 ((Instruction::isBinaryOp(E->getOpcode()) && 13568 (LHS->getType() != VecTy || RHS->getType() != VecTy)) || 13569 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) { 13570 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || 13571 getOperandEntry(E, 1)->isGather() || 13572 MinBWs.contains(getOperandEntry(E, 0)) || 13573 MinBWs.contains(getOperandEntry(E, 1))) && 13574 "Expected item in MinBWs."); 13575 Type *CastTy = VecTy; 13576 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) { 13577 if (cast<VectorType>(LHS->getType()) 13578 ->getElementType() 13579 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType()) 13580 ->getElementType() 13581 ->getIntegerBitWidth()) 13582 CastTy = RHS->getType(); 13583 else 13584 CastTy = LHS->getType(); 13585 } 13586 if (LHS->getType() != CastTy) 13587 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0)); 13588 if (RHS->getType() != CastTy) 13589 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1)); 13590 } 13591 13592 Value *V0, *V1; 13593 if (Instruction::isBinaryOp(E->getOpcode())) { 13594 V0 = Builder.CreateBinOp( 13595 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); 13596 V1 = Builder.CreateBinOp( 13597 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); 13598 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { 13599 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); 13600 auto *AltCI = cast<CmpInst>(E->getAltOp()); 13601 CmpInst::Predicate AltPred = AltCI->getPredicate(); 13602 V1 = Builder.CreateCmp(AltPred, LHS, RHS); 13603 } else { 13604 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) { 13605 unsigned SrcBWSz = DL->getTypeSizeInBits( 13606 cast<VectorType>(LHS->getType())->getElementType()); 13607 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); 13608 if (BWSz <= SrcBWSz) { 13609 if (BWSz < SrcBWSz) 13610 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first); 13611 assert(LHS->getType() == VecTy && "Expected same type as operand."); 13612 if (auto *I = dyn_cast<Instruction>(LHS)) 13613 LHS = propagateMetadata(I, E->Scalars); 13614 E->VectorizedValue = LHS; 13615 ++NumVectorInstructions; 13616 return LHS; 13617 } 13618 } 13619 V0 = Builder.CreateCast( 13620 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); 13621 V1 = Builder.CreateCast( 13622 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); 13623 } 13624 // Add V0 and V1 to later analysis to try to find and remove matching 13625 // instruction, if any. 13626 for (Value *V : {V0, V1}) { 13627 if (auto *I = dyn_cast<Instruction>(V)) { 13628 GatherShuffleExtractSeq.insert(I); 13629 CSEBlocks.insert(I->getParent()); 13630 } 13631 } 13632 13633 // Create shuffle to take alternate operations from the vector. 13634 // Also, gather up main and alt scalar ops to propagate IR flags to 13635 // each vector operation. 
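    // Illustrative result (made-up values) for an alternating add/sub bundle
    // of four i32 scalars: both full-width operations are emitted and a single
    // shuffle picks the main-opcode lanes from the first and the alternate
    // lanes from the second:
    //   %v0 = add <4 x i32> %lhs, %rhs
    //   %v1 = sub <4 x i32> %lhs, %rhs
    //   %v  = shufflevector <4 x i32> %v0, <4 x i32> %v1,
    //                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>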
13636 ValueList OpScalars, AltScalars; 13637 SmallVector<int> Mask; 13638 E->buildAltOpShuffleMask( 13639 [E, this](Instruction *I) { 13640 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); 13641 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), 13642 *TLI); 13643 }, 13644 Mask, &OpScalars, &AltScalars); 13645 13646 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end()); 13647 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end()); 13648 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) { 13649 // Drop nuw flags for abs(sub(commutative), true). 13650 if (auto *I = dyn_cast<Instruction>(Vec); 13651 I && Opcode == Instruction::Sub && !MinBWs.contains(E) && 13652 any_of(E->Scalars, [](Value *V) { 13653 auto *IV = cast<Instruction>(V); 13654 return IV->getOpcode() == Instruction::Sub && 13655 isCommutative(cast<Instruction>(IV)); 13656 })) 13657 I->setHasNoUnsignedWrap(/*b=*/false); 13658 }; 13659 DropNuwFlag(V0, E->getOpcode()); 13660 DropNuwFlag(V1, E->getAltOpcode()); 13661 13662 Value *V = Builder.CreateShuffleVector(V0, V1, Mask); 13663 if (auto *I = dyn_cast<Instruction>(V)) { 13664 V = propagateMetadata(I, E->Scalars); 13665 GatherShuffleExtractSeq.insert(I); 13666 CSEBlocks.insert(I->getParent()); 13667 } 13668 13669 E->VectorizedValue = V; 13670 ++NumVectorInstructions; 13671 13672 return V; 13673 } 13674 default: 13675 llvm_unreachable("unknown inst"); 13676 } 13677 return nullptr; 13678 } 13679 13680 Value *BoUpSLP::vectorizeTree() { 13681 ExtraValueToDebugLocsMap ExternallyUsedValues; 13682 SmallVector<std::pair<Value *, Value *>> ReplacedExternals; 13683 return vectorizeTree(ExternallyUsedValues, ReplacedExternals); 13684 } 13685 13686 namespace { 13687 /// Data type for handling buildvector sequences with the reused scalars from 13688 /// other tree entries. 13689 struct ShuffledInsertData { 13690 /// List of insertelements to be replaced by shuffles. 13691 SmallVector<InsertElementInst *> InsertElements; 13692 /// The parent vectors and shuffle mask for the given list of inserts. 13693 MapVector<Value *, SmallVector<int>> ValueMasks; 13694 }; 13695 } // namespace 13696 13697 Value *BoUpSLP::vectorizeTree( 13698 const ExtraValueToDebugLocsMap &ExternallyUsedValues, 13699 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals, 13700 Instruction *ReductionRoot) { 13701 // All blocks must be scheduled before any instructions are inserted. 13702 for (auto &BSIter : BlocksSchedules) { 13703 scheduleBlock(BSIter.second.get()); 13704 } 13705 // Clean Entry-to-LastInstruction table. It can be affected after scheduling, 13706 // need to rebuild it. 13707 EntryToLastInstruction.clear(); 13708 13709 if (ReductionRoot) 13710 Builder.SetInsertPoint(ReductionRoot->getParent(), 13711 ReductionRoot->getIterator()); 13712 else 13713 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); 13714 13715 // Postpone emission of PHIs operands to avoid cyclic dependencies issues. 13716 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); 13717 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) 13718 if (TE->State == TreeEntry::Vectorize && 13719 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() && 13720 TE->VectorizedValue) 13721 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); 13722 // Run through the list of postponed gathers and emit them, replacing the temp 13723 // emitted allocas with actual vector instructions. 
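  // Each postponed gather node was given a placeholder ("stub") vector value
  // during the first pass; it is now re-vectorized for real, the stub's uses
  // are rewritten to the actual gather/shuffle sequence, and the stub is
  // erased.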
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were
    // some unresolved dependencies at the moment its stub instruction was
    // emitted. If any of these dependencies turn out to be an operand of
    // another PHI coming from this same block, the position of the stub
    // instruction becomes invalid. This is because the source vector that is
    // supposed to feed this gather node was inserted at the end of the block
    // [after the stub instruction]. So we need to adjust the insertion point
    // to the end of the block again.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
13811 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE); 13812 if (It != MinBWs.end()) 13813 IsSigned = It->second.second; 13814 } 13815 assert(IsSigned && 13816 "Expected user node or perfect diamond match in MinBWs."); 13817 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned); 13818 } 13819 PrevVec->replaceAllUsesWith(Vec); 13820 PostponedValues.try_emplace(Vec).first->second.push_back(TE); 13821 // Replace the stub vector node, if it was used before for one of the 13822 // buildvector nodes already. 13823 auto It = PostponedValues.find(PrevVec); 13824 if (It != PostponedValues.end()) { 13825 for (TreeEntry *VTE : It->getSecond()) 13826 VTE->VectorizedValue = Vec; 13827 } 13828 eraseInstruction(PrevVec); 13829 } 13830 13831 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() 13832 << " values .\n"); 13833 13834 SmallVector<ShuffledInsertData> ShuffledInserts; 13835 // Maps vector instruction to original insertelement instruction 13836 DenseMap<Value *, InsertElementInst *> VectorToInsertElement; 13837 // Maps extract Scalar to the corresponding extractelement instruction in the 13838 // basic block. Only one extractelement per block should be emitted. 13839 DenseMap<Value *, 13840 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>> 13841 ScalarToEEs; 13842 SmallDenseSet<Value *, 4> UsedInserts; 13843 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts; 13844 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser; 13845 // Extract all of the elements with the external uses. 13846 for (const auto &ExternalUse : ExternalUses) { 13847 Value *Scalar = ExternalUse.Scalar; 13848 llvm::User *User = ExternalUse.User; 13849 13850 // Skip users that we already RAUW. This happens when one instruction 13851 // has multiple uses of the same value. 13852 if (User && !is_contained(Scalar->users(), User)) 13853 continue; 13854 TreeEntry *E = getTreeEntry(Scalar); 13855 assert(E && "Invalid scalar"); 13856 assert(!E->isGather() && "Extracting from a gather list"); 13857 // Non-instruction pointers are not deleted, just skip them. 13858 if (E->getOpcode() == Instruction::GetElementPtr && 13859 !isa<GetElementPtrInst>(Scalar)) 13860 continue; 13861 13862 Value *Vec = E->VectorizedValue; 13863 assert(Vec && "Can't find vectorizable value"); 13864 13865 Value *Lane = Builder.getInt32(ExternalUse.Lane); 13866 auto ExtractAndExtendIfNeeded = [&](Value *Vec) { 13867 if (Scalar->getType() != Vec->getType()) { 13868 Value *Ex = nullptr; 13869 Value *ExV = nullptr; 13870 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar); 13871 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP); 13872 auto It = ScalarToEEs.find(Scalar); 13873 if (It != ScalarToEEs.end()) { 13874 // No need to emit many extracts, just move the only one in the 13875 // current block. 13876 auto EEIt = It->second.find(Builder.GetInsertBlock()); 13877 if (EEIt != It->second.end()) { 13878 Instruction *I = EEIt->second.first; 13879 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && 13880 Builder.GetInsertPoint()->comesBefore(I)) { 13881 I->moveBefore(*Builder.GetInsertPoint()->getParent(), 13882 Builder.GetInsertPoint()); 13883 if (auto *CI = EEIt->second.second) 13884 CI->moveAfter(I); 13885 } 13886 Ex = I; 13887 ExV = EEIt->second.second ? EEIt->second.second : Ex; 13888 } 13889 } 13890 if (!Ex) { 13891 // "Reuse" the existing extract to improve final codegen. 
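          // Three ways to materialize the scalar here: mirror an existing
          // extractelement (re-using its index but reading from the
          // vectorized operand), clone an externally used GEP, or emit a
          // fresh extractelement from the vectorized value at this scalar's
          // lane.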
13892 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { 13893 Value *V = ES->getVectorOperand(); 13894 if (const TreeEntry *ETE = getTreeEntry(V)) 13895 V = ETE->VectorizedValue; 13896 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); 13897 } else if (ReplaceGEP) { 13898 // Leave the GEPs as is, they are free in most cases and better to 13899 // keep them as GEPs. 13900 auto *CloneGEP = GEP->clone(); 13901 if (isa<Instruction>(Vec)) 13902 CloneGEP->insertBefore(*Builder.GetInsertBlock(), 13903 Builder.GetInsertPoint()); 13904 else 13905 CloneGEP->insertBefore(GEP); 13906 if (GEP->hasName()) 13907 CloneGEP->takeName(GEP); 13908 Ex = CloneGEP; 13909 } else { 13910 Ex = Builder.CreateExtractElement(Vec, Lane); 13911 } 13912 // If necessary, sign-extend or zero-extend ScalarRoot 13913 // to the larger type. 13914 ExV = Ex; 13915 if (Scalar->getType() != Ex->getType()) 13916 ExV = Builder.CreateIntCast(Ex, Scalar->getType(), 13917 MinBWs.find(E)->second.second); 13918 if (auto *I = dyn_cast<Instruction>(Ex)) 13919 ScalarToEEs[Scalar].try_emplace( 13920 Builder.GetInsertBlock(), 13921 std::make_pair(I, cast<Instruction>(ExV))); 13922 } 13923 // The then branch of the previous if may produce constants, since 0 13924 // operand might be a constant. 13925 if (auto *ExI = dyn_cast<Instruction>(Ex)) { 13926 GatherShuffleExtractSeq.insert(ExI); 13927 CSEBlocks.insert(ExI->getParent()); 13928 } 13929 return ExV; 13930 } 13931 assert(isa<FixedVectorType>(Scalar->getType()) && 13932 isa<InsertElementInst>(Scalar) && 13933 "In-tree scalar of vector type is not insertelement?"); 13934 auto *IE = cast<InsertElementInst>(Scalar); 13935 VectorToInsertElement.try_emplace(Vec, IE); 13936 return Vec; 13937 }; 13938 // If User == nullptr, the Scalar remains as scalar in vectorized 13939 // instructions or is used as extra arg. Generate ExtractElement instruction 13940 // and update the record for this scalar in ExternallyUsedValues. 13941 if (!User) { 13942 if (!ScalarsWithNullptrUser.insert(Scalar).second) 13943 continue; 13944 assert((ExternallyUsedValues.count(Scalar) || 13945 Scalar->hasNUsesOrMore(UsesLimit) || 13946 any_of(Scalar->users(), 13947 [&](llvm::User *U) { 13948 if (ExternalUsesAsGEPs.contains(U)) 13949 return true; 13950 TreeEntry *UseEntry = getTreeEntry(U); 13951 return UseEntry && 13952 (UseEntry->State == TreeEntry::Vectorize || 13953 UseEntry->State == 13954 TreeEntry::StridedVectorize) && 13955 (E->State == TreeEntry::Vectorize || 13956 E->State == TreeEntry::StridedVectorize) && 13957 doesInTreeUserNeedToExtract( 13958 Scalar, 13959 cast<Instruction>(UseEntry->Scalars.front()), 13960 TLI); 13961 })) && 13962 "Scalar with nullptr User must be registered in " 13963 "ExternallyUsedValues map or remain as scalar in vectorized " 13964 "instructions"); 13965 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 13966 if (auto *PHI = dyn_cast<PHINode>(VecI)) 13967 Builder.SetInsertPoint(PHI->getParent(), 13968 PHI->getParent()->getFirstNonPHIIt()); 13969 else 13970 Builder.SetInsertPoint(VecI->getParent(), 13971 std::next(VecI->getIterator())); 13972 } else { 13973 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); 13974 } 13975 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 13976 // Required to update internally referenced instructions. 
13977 Scalar->replaceAllUsesWith(NewInst); 13978 ReplacedExternals.emplace_back(Scalar, NewInst); 13979 continue; 13980 } 13981 13982 if (auto *VU = dyn_cast<InsertElementInst>(User); 13983 VU && VU->getOperand(1) == Scalar) { 13984 // Skip if the scalar is another vector op or Vec is not an instruction. 13985 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) { 13986 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) { 13987 if (!UsedInserts.insert(VU).second) 13988 continue; 13989 // Need to use original vector, if the root is truncated. 13990 auto BWIt = MinBWs.find(E); 13991 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { 13992 auto *ScalarTy = FTy->getElementType(); 13993 auto Key = std::make_pair(Vec, ScalarTy); 13994 auto VecIt = VectorCasts.find(Key); 13995 if (VecIt == VectorCasts.end()) { 13996 IRBuilderBase::InsertPointGuard Guard(Builder); 13997 if (auto *IVec = dyn_cast<PHINode>(Vec)) 13998 Builder.SetInsertPoint( 13999 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime()); 14000 else if (auto *IVec = dyn_cast<Instruction>(Vec)) 14001 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); 14002 Vec = Builder.CreateIntCast( 14003 Vec, 14004 getWidenedType( 14005 ScalarTy, 14006 cast<FixedVectorType>(Vec->getType())->getNumElements()), 14007 BWIt->second.second); 14008 VectorCasts.try_emplace(Key, Vec); 14009 } else { 14010 Vec = VecIt->second; 14011 } 14012 } 14013 14014 std::optional<unsigned> InsertIdx = getElementIndex(VU); 14015 if (InsertIdx) { 14016 auto *It = 14017 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { 14018 // Checks if 2 insertelements are from the same buildvector. 14019 InsertElementInst *VecInsert = Data.InsertElements.front(); 14020 return areTwoInsertFromSameBuildVector( 14021 VU, VecInsert, 14022 [](InsertElementInst *II) { return II->getOperand(0); }); 14023 }); 14024 unsigned Idx = *InsertIdx; 14025 if (It == ShuffledInserts.end()) { 14026 (void)ShuffledInserts.emplace_back(); 14027 It = std::next(ShuffledInserts.begin(), 14028 ShuffledInserts.size() - 1); 14029 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; 14030 if (Mask.empty()) 14031 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 14032 // Find the insertvector, vectorized in tree, if any. 14033 Value *Base = VU; 14034 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { 14035 if (IEBase != User && 14036 (!IEBase->hasOneUse() || 14037 getElementIndex(IEBase).value_or(Idx) == Idx)) 14038 break; 14039 // Build the mask for the vectorized insertelement instructions. 14040 if (const TreeEntry *E = getTreeEntry(IEBase)) { 14041 do { 14042 IEBase = cast<InsertElementInst>(Base); 14043 int IEIdx = *getElementIndex(IEBase); 14044 assert(Mask[IEIdx] == PoisonMaskElem && 14045 "InsertElementInstruction used already."); 14046 Mask[IEIdx] = IEIdx; 14047 Base = IEBase->getOperand(0); 14048 } while (E == getTreeEntry(Base)); 14049 break; 14050 } 14051 Base = cast<InsertElementInst>(Base)->getOperand(0); 14052 // After the vectorization the def-use chain has changed, need 14053 // to look through original insertelement instructions, if they 14054 // get replaced by vector instructions. 
14055 auto It = VectorToInsertElement.find(Base); 14056 if (It != VectorToInsertElement.end()) 14057 Base = It->second; 14058 } 14059 } 14060 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; 14061 if (Mask.empty()) 14062 Mask.assign(FTy->getNumElements(), PoisonMaskElem); 14063 Mask[Idx] = ExternalUse.Lane; 14064 It->InsertElements.push_back(cast<InsertElementInst>(User)); 14065 continue; 14066 } 14067 } 14068 } 14069 } 14070 14071 // Generate extracts for out-of-tree users. 14072 // Find the insertion point for the extractelement lane. 14073 if (auto *VecI = dyn_cast<Instruction>(Vec)) { 14074 if (PHINode *PH = dyn_cast<PHINode>(User)) { 14075 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) { 14076 if (PH->getIncomingValue(I) == Scalar) { 14077 Instruction *IncomingTerminator = 14078 PH->getIncomingBlock(I)->getTerminator(); 14079 if (isa<CatchSwitchInst>(IncomingTerminator)) { 14080 Builder.SetInsertPoint(VecI->getParent(), 14081 std::next(VecI->getIterator())); 14082 } else { 14083 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator()); 14084 } 14085 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 14086 PH->setOperand(I, NewInst); 14087 } 14088 } 14089 } else { 14090 Builder.SetInsertPoint(cast<Instruction>(User)); 14091 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 14092 User->replaceUsesOfWith(Scalar, NewInst); 14093 } 14094 } else { 14095 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); 14096 Value *NewInst = ExtractAndExtendIfNeeded(Vec); 14097 User->replaceUsesOfWith(Scalar, NewInst); 14098 } 14099 14100 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 14101 } 14102 14103 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { 14104 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); 14105 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); 14106 int VF = cast<FixedVectorType>(V1->getType())->getNumElements(); 14107 for (int I = 0, E = Mask.size(); I < E; ++I) { 14108 if (Mask[I] < VF) 14109 CombinedMask1[I] = Mask[I]; 14110 else 14111 CombinedMask2[I] = Mask[I] - VF; 14112 } 14113 ShuffleInstructionBuilder ShuffleBuilder( 14114 cast<VectorType>(V1->getType())->getElementType(), Builder, *this); 14115 ShuffleBuilder.add(V1, CombinedMask1); 14116 if (V2) 14117 ShuffleBuilder.add(V2, CombinedMask2); 14118 return ShuffleBuilder.finalize(std::nullopt); 14119 }; 14120 14121 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask, 14122 bool ForSingleMask) { 14123 unsigned VF = Mask.size(); 14124 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements(); 14125 if (VF != VecVF) { 14126 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { 14127 Vec = CreateShuffle(Vec, nullptr, Mask); 14128 return std::make_pair(Vec, true); 14129 } 14130 if (!ForSingleMask) { 14131 SmallVector<int> ResizeMask(VF, PoisonMaskElem); 14132 for (unsigned I = 0; I < VF; ++I) { 14133 if (Mask[I] != PoisonMaskElem) 14134 ResizeMask[Mask[I]] = Mask[I]; 14135 } 14136 Vec = CreateShuffle(Vec, nullptr, ResizeMask); 14137 } 14138 } 14139 14140 return std::make_pair(Vec, false); 14141 }; 14142 // Perform shuffling of the vectorize tree entries for better handling of 14143 // external extracts. 14144 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { 14145 // Find the first and the last instruction in the list of insertelements. 
14146 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement); 14147 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); 14148 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); 14149 Builder.SetInsertPoint(LastInsert); 14150 auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); 14151 Value *NewInst = performExtractsShuffleAction<Value>( 14152 MutableArrayRef(Vector.data(), Vector.size()), 14153 FirstInsert->getOperand(0), 14154 [](Value *Vec) { 14155 return cast<VectorType>(Vec->getType()) 14156 ->getElementCount() 14157 .getKnownMinValue(); 14158 }, 14159 ResizeToVF, 14160 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, 14161 ArrayRef<Value *> Vals) { 14162 assert((Vals.size() == 1 || Vals.size() == 2) && 14163 "Expected exactly 1 or 2 input values."); 14164 if (Vals.size() == 1) { 14165 // Do not create shuffle if the mask is a simple identity 14166 // non-resizing mask. 14167 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType()) 14168 ->getNumElements() || 14169 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) 14170 return CreateShuffle(Vals.front(), nullptr, Mask); 14171 return Vals.front(); 14172 } 14173 return CreateShuffle(Vals.front() ? Vals.front() 14174 : FirstInsert->getOperand(0), 14175 Vals.back(), Mask); 14176 }); 14177 auto It = ShuffledInserts[I].InsertElements.rbegin(); 14178 // Rebuild buildvector chain. 14179 InsertElementInst *II = nullptr; 14180 if (It != ShuffledInserts[I].InsertElements.rend()) 14181 II = *It; 14182 SmallVector<Instruction *> Inserts; 14183 while (It != ShuffledInserts[I].InsertElements.rend()) { 14184 assert(II && "Must be an insertelement instruction."); 14185 if (*It == II) 14186 ++It; 14187 else 14188 Inserts.push_back(cast<Instruction>(II)); 14189 II = dyn_cast<InsertElementInst>(II->getOperand(0)); 14190 } 14191 for (Instruction *II : reverse(Inserts)) { 14192 II->replaceUsesOfWith(II->getOperand(0), NewInst); 14193 if (auto *NewI = dyn_cast<Instruction>(NewInst)) 14194 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI)) 14195 II->moveAfter(NewI); 14196 NewInst = II; 14197 } 14198 LastInsert->replaceAllUsesWith(NewInst); 14199 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) { 14200 IE->replaceUsesOfWith(IE->getOperand(0), 14201 PoisonValue::get(IE->getOperand(0)->getType())); 14202 IE->replaceUsesOfWith(IE->getOperand(1), 14203 PoisonValue::get(IE->getOperand(1)->getType())); 14204 eraseInstruction(IE); 14205 } 14206 CSEBlocks.insert(LastInsert->getParent()); 14207 } 14208 14209 SmallVector<Instruction *> RemovedInsts; 14210 // For each vectorized value: 14211 for (auto &TEPtr : VectorizableTree) { 14212 TreeEntry *Entry = TEPtr.get(); 14213 14214 // No need to handle users of gathered values. 14215 if (Entry->isGather()) 14216 continue; 14217 14218 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 14219 14220 // For each lane: 14221 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 14222 Value *Scalar = Entry->Scalars[Lane]; 14223 14224 if (Entry->getOpcode() == Instruction::GetElementPtr && 14225 !isa<GetElementPtrInst>(Scalar)) 14226 continue; 14227 #ifndef NDEBUG 14228 Type *Ty = Scalar->getType(); 14229 if (!Ty->isVoidTy()) { 14230 for (User *U : Scalar->users()) { 14231 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 14232 14233 // It is legal to delete users in the ignorelist. 
14234 assert((getTreeEntry(U) || 14235 (UserIgnoreList && UserIgnoreList->contains(U)) || 14236 (isa_and_nonnull<Instruction>(U) && 14237 isDeleted(cast<Instruction>(U)))) && 14238 "Deleting out-of-tree value"); 14239 } 14240 } 14241 #endif 14242 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 14243 auto *I = cast<Instruction>(Scalar); 14244 RemovedInsts.push_back(I); 14245 } 14246 } 14247 14248 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the 14249 // new vector instruction. 14250 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue)) 14251 V->mergeDIAssignID(RemovedInsts); 14252 14253 // Clear up reduction references, if any. 14254 if (UserIgnoreList) { 14255 for (Instruction *I : RemovedInsts) { 14256 if (getTreeEntry(I)->Idx != 0) 14257 continue; 14258 SmallVector<SelectInst *> LogicalOpSelects; 14259 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { 14260 // Do not replace condition of the logical op in form select <cond>. 14261 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) && 14262 (match(U.getUser(), m_LogicalAnd()) || 14263 match(U.getUser(), m_LogicalOr())) && 14264 U.getOperandNo() == 0; 14265 if (IsPoisoningLogicalOp) { 14266 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser())); 14267 return false; 14268 } 14269 return UserIgnoreList->contains(U.getUser()); 14270 }); 14271 // Replace conditions of the poisoning logical ops with the non-poison 14272 // constant value. 14273 for (SelectInst *SI : LogicalOpSelects) 14274 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType())); 14275 } 14276 } 14277 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias 14278 // cache correctness. 14279 // NOTE: removeInstructionAndOperands only marks the instruction for deletion 14280 // - instructions are not deleted until later. 14281 removeInstructionsAndOperands(ArrayRef(RemovedInsts)); 14282 14283 Builder.ClearInsertionPoint(); 14284 InstrElementSize.clear(); 14285 14286 const TreeEntry &RootTE = *VectorizableTree.front(); 14287 Value *Vec = RootTE.VectorizedValue; 14288 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 && 14289 It != MinBWs.end() && 14290 ReductionBitWidth != It->second.first) { 14291 IRBuilder<>::InsertPointGuard Guard(Builder); 14292 Builder.SetInsertPoint(ReductionRoot->getParent(), 14293 ReductionRoot->getIterator()); 14294 Vec = Builder.CreateIntCast( 14295 Vec, 14296 VectorType::get(Builder.getIntNTy(ReductionBitWidth), 14297 cast<VectorType>(Vec->getType())->getElementCount()), 14298 It->second.second); 14299 } 14300 return Vec; 14301 } 14302 14303 void BoUpSLP::optimizeGatherSequence() { 14304 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size() 14305 << " gather sequences instructions.\n"); 14306 // LICM InsertElementInst sequences. 14307 for (Instruction *I : GatherShuffleExtractSeq) { 14308 if (isDeleted(I)) 14309 continue; 14310 14311 // Check if this block is inside a loop. 14312 Loop *L = LI->getLoopFor(I->getParent()); 14313 if (!L) 14314 continue; 14315 14316 // Check if it has a preheader. 14317 BasicBlock *PreHeader = L->getLoopPreheader(); 14318 if (!PreHeader) 14319 continue; 14320 14321 // If the vector or the element that we insert into it are 14322 // instructions that are defined in this basic block then we can't 14323 // hoist this instruction. 
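    // Only operands defined outside the loop are acceptable; otherwise the
    // hoisted instruction would use a value before that value is defined.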
14324 if (any_of(I->operands(), [L](Value *V) { 14325 auto *OpI = dyn_cast<Instruction>(V); 14326 return OpI && L->contains(OpI); 14327 })) 14328 continue; 14329 14330 // We can hoist this instruction. Move it to the pre-header. 14331 I->moveBefore(PreHeader->getTerminator()); 14332 CSEBlocks.insert(PreHeader); 14333 } 14334 14335 // Make a list of all reachable blocks in our CSE queue. 14336 SmallVector<const DomTreeNode *, 8> CSEWorkList; 14337 CSEWorkList.reserve(CSEBlocks.size()); 14338 for (BasicBlock *BB : CSEBlocks) 14339 if (DomTreeNode *N = DT->getNode(BB)) { 14340 assert(DT->isReachableFromEntry(N)); 14341 CSEWorkList.push_back(N); 14342 } 14343 14344 // Sort blocks by domination. This ensures we visit a block after all blocks 14345 // dominating it are visited. 14346 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) { 14347 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) && 14348 "Different nodes should have different DFS numbers"); 14349 return A->getDFSNumIn() < B->getDFSNumIn(); 14350 }); 14351 14352 // Less defined shuffles can be replaced by the more defined copies. 14353 // Between two shuffles one is less defined if it has the same vector operands 14354 // and its mask indeces are the same as in the first one or undefs. E.g. 14355 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, 14356 // poison, <0, 0, 0, 0>. 14357 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, 14358 SmallVectorImpl<int> &NewMask) { 14359 if (I1->getType() != I2->getType()) 14360 return false; 14361 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); 14362 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); 14363 if (!SI1 || !SI2) 14364 return I1->isIdenticalTo(I2); 14365 if (SI1->isIdenticalTo(SI2)) 14366 return true; 14367 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) 14368 if (SI1->getOperand(I) != SI2->getOperand(I)) 14369 return false; 14370 // Check if the second instruction is more defined than the first one. 14371 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); 14372 ArrayRef<int> SM1 = SI1->getShuffleMask(); 14373 // Count trailing undefs in the mask to check the final number of used 14374 // registers. 14375 unsigned LastUndefsCnt = 0; 14376 for (int I = 0, E = NewMask.size(); I < E; ++I) { 14377 if (SM1[I] == PoisonMaskElem) 14378 ++LastUndefsCnt; 14379 else 14380 LastUndefsCnt = 0; 14381 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem && 14382 NewMask[I] != SM1[I]) 14383 return false; 14384 if (NewMask[I] == PoisonMaskElem) 14385 NewMask[I] = SM1[I]; 14386 } 14387 // Check if the last undefs actually change the final number of used vector 14388 // registers. 14389 return SM1.size() - LastUndefsCnt > 1 && 14390 TTI->getNumberOfParts(SI1->getType()) == 14391 TTI->getNumberOfParts( 14392 getWidenedType(SI1->getType()->getElementType(), 14393 SM1.size() - LastUndefsCnt)); 14394 }; 14395 // Perform O(N^2) search over the gather/shuffle sequences and merge identical 14396 // instructions. TODO: We can further optimize this scan if we split the 14397 // instructions into different buckets based on the insert lane. 
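  // Visited holds the candidates seen so far. Because the worklist is sorted
  // in dominator-tree DFS order, earlier candidates tend to dominate later
  // ones, but dominance is still verified explicitly before any replacement.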
14398 SmallVector<Instruction *, 16> Visited; 14399 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 14400 assert(*I && 14401 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 14402 "Worklist not sorted properly!"); 14403 BasicBlock *BB = (*I)->getBlock(); 14404 // For all instructions in blocks containing gather sequences: 14405 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 14406 if (isDeleted(&In)) 14407 continue; 14408 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) && 14409 !GatherShuffleExtractSeq.contains(&In)) 14410 continue; 14411 14412 // Check if we can replace this instruction with any of the 14413 // visited instructions. 14414 bool Replaced = false; 14415 for (Instruction *&V : Visited) { 14416 SmallVector<int> NewMask; 14417 if (IsIdenticalOrLessDefined(&In, V, NewMask) && 14418 DT->dominates(V->getParent(), In.getParent())) { 14419 In.replaceAllUsesWith(V); 14420 eraseInstruction(&In); 14421 if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) 14422 if (!NewMask.empty()) 14423 SI->setShuffleMask(NewMask); 14424 Replaced = true; 14425 break; 14426 } 14427 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && 14428 GatherShuffleExtractSeq.contains(V) && 14429 IsIdenticalOrLessDefined(V, &In, NewMask) && 14430 DT->dominates(In.getParent(), V->getParent())) { 14431 In.moveAfter(V); 14432 V->replaceAllUsesWith(&In); 14433 eraseInstruction(V); 14434 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) 14435 if (!NewMask.empty()) 14436 SI->setShuffleMask(NewMask); 14437 V = &In; 14438 Replaced = true; 14439 break; 14440 } 14441 } 14442 if (!Replaced) { 14443 assert(!is_contained(Visited, &In)); 14444 Visited.push_back(&In); 14445 } 14446 } 14447 } 14448 CSEBlocks.clear(); 14449 GatherShuffleExtractSeq.clear(); 14450 } 14451 14452 BoUpSLP::ScheduleData * 14453 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { 14454 ScheduleData *Bundle = nullptr; 14455 ScheduleData *PrevInBundle = nullptr; 14456 for (Value *V : VL) { 14457 if (doesNotNeedToBeScheduled(V)) 14458 continue; 14459 ScheduleData *BundleMember = getScheduleData(V); 14460 assert(BundleMember && 14461 "no ScheduleData for bundle member " 14462 "(maybe not in same basic block)"); 14463 assert(BundleMember->isSchedulingEntity() && 14464 "bundle member already part of other bundle"); 14465 if (PrevInBundle) { 14466 PrevInBundle->NextInBundle = BundleMember; 14467 } else { 14468 Bundle = BundleMember; 14469 } 14470 14471 // Group the instructions to a bundle. 14472 BundleMember->FirstInBundle = Bundle; 14473 PrevInBundle = BundleMember; 14474 } 14475 assert(Bundle && "Failed to find schedule bundle"); 14476 return Bundle; 14477 } 14478 14479 // Groups the instructions to a bundle (which is then a single scheduling entity) 14480 // and schedules instructions until the bundle gets ready. 14481 std::optional<BoUpSLP::ScheduleData *> 14482 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, 14483 const InstructionsState &S) { 14484 // No need to schedule PHIs, insertelement, extractelement and extractvalue 14485 // instructions. 14486 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || 14487 doesNotNeedToSchedule(VL)) 14488 return nullptr; 14489 14490 // Initialize the instruction bundle. 
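  // Remember the current end of the scheduling region: if the region has to be
  // extended below to cover this bundle, all previously computed dependencies
  // become stale and must be recalculated.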
14491 Instruction *OldScheduleEnd = ScheduleEnd;
14492 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14493
14494 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14495 ScheduleData *Bundle) {
14496 // The scheduling region got new instructions at the lower end (or it is a
14497 // new region for the first bundle). This makes it necessary to
14498 // recalculate all dependencies.
14499 // It is seldom that this needs to be done a second time after adding the
14500 // initial bundle to the region.
14501 if (ScheduleEnd != OldScheduleEnd) {
14502 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14503 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14504 ReSchedule = true;
14505 }
14506 if (Bundle) {
14507 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14508 << " in block " << BB->getName() << "\n");
14509 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14510 }
14511
14512 if (ReSchedule) {
14513 resetSchedule();
14514 initialFillReadyList(ReadyInsts);
14515 }
14516
14517 // Now try to schedule the new bundle or (if no bundle) just calculate
14518 // dependencies. As soon as the bundle is "ready" it means that there are no
14519 // cyclic dependencies and we can schedule it. Note that it's important that
14520 // we don't "schedule" the bundle yet (see cancelScheduling).
14521 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14522 !ReadyInsts.empty()) {
14523 ScheduleData *Picked = ReadyInsts.pop_back_val();
14524 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14525 "must be ready to schedule");
14526 schedule(Picked, ReadyInsts);
14527 }
14528 };
14529
14530 // Make sure that the scheduling region contains all
14531 // instructions of the bundle.
14532 for (Value *V : VL) {
14533 if (doesNotNeedToBeScheduled(V))
14534 continue;
14535 if (!extendSchedulingRegion(V, S)) {
14536 // If the scheduling region got new instructions at the lower end (or it
14537 // is a new region for the first bundle), it is necessary to recalculate
14538 // all dependencies.
14539 // Otherwise the compiler may crash trying to incorrectly calculate
14540 // dependencies and emit instructions in the wrong order at the actual
14541 // scheduling.
14542 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14543 return std::nullopt;
14544 }
14545 }
14546
14547 bool ReSchedule = false;
14548 for (Value *V : VL) {
14549 if (doesNotNeedToBeScheduled(V))
14550 continue;
14551 ScheduleData *BundleMember = getScheduleData(V);
14552 assert(BundleMember &&
14553 "no ScheduleData for bundle member (maybe not in same basic block)");
14554
14555 // Make sure we don't leave the pieces of the bundle in the ready list when
14556 // the whole bundle might not be ready.
14557 ReadyInsts.remove(BundleMember);
14558
14559 if (!BundleMember->IsScheduled)
14560 continue;
14561 // A bundle member was scheduled as a single instruction before and now
14562 // needs to be scheduled as part of the bundle. We just get rid of the
14563 // existing schedule.
14564 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 14565 << " was already scheduled\n"); 14566 ReSchedule = true; 14567 } 14568 14569 auto *Bundle = buildBundle(VL); 14570 TryScheduleBundleImpl(ReSchedule, Bundle); 14571 if (!Bundle->isReady()) { 14572 cancelScheduling(VL, S.OpValue); 14573 return std::nullopt; 14574 } 14575 return Bundle; 14576 } 14577 14578 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, 14579 Value *OpValue) { 14580 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) || 14581 doesNotNeedToSchedule(VL)) 14582 return; 14583 14584 if (doesNotNeedToBeScheduled(OpValue)) 14585 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); 14586 ScheduleData *Bundle = getScheduleData(OpValue); 14587 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 14588 assert(!Bundle->IsScheduled && 14589 "Can't cancel bundle which is already scheduled"); 14590 assert(Bundle->isSchedulingEntity() && 14591 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && 14592 "tried to unbundle something which is not a bundle"); 14593 14594 // Remove the bundle from the ready list. 14595 if (Bundle->isReady()) 14596 ReadyInsts.remove(Bundle); 14597 14598 // Un-bundle: make single instructions out of the bundle. 14599 ScheduleData *BundleMember = Bundle; 14600 while (BundleMember) { 14601 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 14602 BundleMember->FirstInBundle = BundleMember; 14603 ScheduleData *Next = BundleMember->NextInBundle; 14604 BundleMember->NextInBundle = nullptr; 14605 BundleMember->TE = nullptr; 14606 if (BundleMember->unscheduledDepsInBundle() == 0) { 14607 ReadyInsts.insert(BundleMember); 14608 } 14609 BundleMember = Next; 14610 } 14611 } 14612 14613 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { 14614 // Allocate a new ScheduleData for the instruction. 14615 if (ChunkPos >= ChunkSize) { 14616 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); 14617 ChunkPos = 0; 14618 } 14619 return &(ScheduleDataChunks.back()[ChunkPos++]); 14620 } 14621 14622 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, 14623 const InstructionsState &S) { 14624 if (getScheduleData(V, isOneOf(S, V))) 14625 return true; 14626 Instruction *I = dyn_cast<Instruction>(V); 14627 assert(I && "bundle member must be an instruction"); 14628 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && 14629 !doesNotNeedToBeScheduled(I) && 14630 "phi nodes/insertelements/extractelements/extractvalues don't need to " 14631 "be scheduled"); 14632 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { 14633 ScheduleData *ISD = getScheduleData(I); 14634 if (!ISD) 14635 return false; 14636 assert(isInSchedulingRegion(ISD) && 14637 "ScheduleData not in scheduling region"); 14638 ScheduleData *SD = allocateScheduleDataChunks(); 14639 SD->Inst = I; 14640 SD->init(SchedulingRegionID, S.OpValue); 14641 ExtraScheduleDataMap[I][S.OpValue] = SD; 14642 return true; 14643 }; 14644 if (CheckScheduleForI(I)) 14645 return true; 14646 if (!ScheduleStart) { 14647 // It's the first instruction in the new region. 
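// (Illustration: the fresh region is the half-open range
// [I, I->getNextNode()), i.e. it initially covers only I itself, and it is
// grown upwards or downwards as further bundle members are encountered.)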
14648 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 14649 ScheduleStart = I; 14650 ScheduleEnd = I->getNextNode(); 14651 if (isOneOf(S, I) != I) 14652 CheckScheduleForI(I); 14653 assert(ScheduleEnd && "tried to vectorize a terminator?"); 14654 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 14655 return true; 14656 } 14657 // Search up and down at the same time, because we don't know if the new 14658 // instruction is above or below the existing scheduling region. 14659 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted 14660 // against the budget. Otherwise debug info could affect codegen. 14661 BasicBlock::reverse_iterator UpIter = 14662 ++ScheduleStart->getIterator().getReverse(); 14663 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 14664 BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); 14665 BasicBlock::iterator LowerEnd = BB->end(); 14666 auto IsAssumeLikeIntr = [](const Instruction &I) { 14667 if (auto *II = dyn_cast<IntrinsicInst>(&I)) 14668 return II->isAssumeLikeIntrinsic(); 14669 return false; 14670 }; 14671 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 14672 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 14673 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && 14674 &*DownIter != I) { 14675 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { 14676 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); 14677 return false; 14678 } 14679 14680 ++UpIter; 14681 ++DownIter; 14682 14683 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr); 14684 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr); 14685 } 14686 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { 14687 assert(I->getParent() == ScheduleStart->getParent() && 14688 "Instruction is in wrong basic block."); 14689 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 14690 ScheduleStart = I; 14691 if (isOneOf(S, I) != I) 14692 CheckScheduleForI(I); 14693 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I 14694 << "\n"); 14695 return true; 14696 } 14697 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && 14698 "Expected to reach top of the basic block or instruction down the " 14699 "lower end."); 14700 assert(I->getParent() == ScheduleEnd->getParent() && 14701 "Instruction is in wrong basic block."); 14702 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 14703 nullptr); 14704 ScheduleEnd = I->getNextNode(); 14705 if (isOneOf(S, I) != I) 14706 CheckScheduleForI(I); 14707 assert(ScheduleEnd && "tried to vectorize a terminator?"); 14708 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 14709 return true; 14710 } 14711 14712 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 14713 Instruction *ToI, 14714 ScheduleData *PrevLoadStore, 14715 ScheduleData *NextLoadStore) { 14716 ScheduleData *CurrentLoadStore = PrevLoadStore; 14717 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 14718 // No need to allocate data for non-schedulable instructions. 
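// (Illustration of the chain maintained below, assuming a fresh region that
// contains memory accesses L1, S1 and L2 in that order: the loop sets
// FirstLoadStoreInRegion to L1, links L1 -> S1 -> L2 via NextLoadStore, and
// LastLoadStoreInRegion ends up as L2; sideeffect and pseudoprobe intrinsics
// are deliberately kept out of this chain.)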
14719 if (doesNotNeedToBeScheduled(I))
14720 continue;
14721 ScheduleData *SD = ScheduleDataMap.lookup(I);
14722 if (!SD) {
14723 SD = allocateScheduleDataChunks();
14724 ScheduleDataMap[I] = SD;
14725 SD->Inst = I;
14726 }
14727 assert(!isInSchedulingRegion(SD) &&
14728 "new ScheduleData already in scheduling region");
14729 SD->init(SchedulingRegionID, I);
14730
14731 if (I->mayReadOrWriteMemory() &&
14732 (!isa<IntrinsicInst>(I) ||
14733 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14735 Intrinsic::pseudoprobe))) {
14736 // Update the linked list of memory accessing instructions.
14737 if (CurrentLoadStore) {
14738 CurrentLoadStore->NextLoadStore = SD;
14739 } else {
14740 FirstLoadStoreInRegion = SD;
14741 }
14742 CurrentLoadStore = SD;
14743 }
14744
14745 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14746 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14747 RegionHasStackSave = true;
14748 }
14749 if (NextLoadStore) {
14750 if (CurrentLoadStore)
14751 CurrentLoadStore->NextLoadStore = NextLoadStore;
14752 } else {
14753 LastLoadStoreInRegion = CurrentLoadStore;
14754 }
14755 }
14756
14757 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14758 bool InsertInReadyList,
14759 BoUpSLP *SLP) {
14760 assert(SD->isSchedulingEntity());
14761
14762 SmallVector<ScheduleData *, 10> WorkList;
14763 WorkList.push_back(SD);
14764
14765 while (!WorkList.empty()) {
14766 ScheduleData *SD = WorkList.pop_back_val();
14767 for (ScheduleData *BundleMember = SD; BundleMember;
14768 BundleMember = BundleMember->NextInBundle) {
14769 assert(isInSchedulingRegion(BundleMember));
14770 if (BundleMember->hasValidDependencies())
14771 continue;
14772
14773 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14774 << "\n");
14775 BundleMember->Dependencies = 0;
14776 BundleMember->resetUnscheduledDeps();
14777
14778 // Handle def-use chain dependencies.
14779 if (BundleMember->OpValue != BundleMember->Inst) {
14780 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14781 BundleMember->Dependencies++;
14782 ScheduleData *DestBundle = UseSD->FirstInBundle;
14783 if (!DestBundle->IsScheduled)
14784 BundleMember->incrementUnscheduledDeps(1);
14785 if (!DestBundle->hasValidDependencies())
14786 WorkList.push_back(DestBundle);
14787 }
14788 } else {
14789 for (User *U : BundleMember->Inst->users()) {
14790 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14791 BundleMember->Dependencies++;
14792 ScheduleData *DestBundle = UseSD->FirstInBundle;
14793 if (!DestBundle->IsScheduled)
14794 BundleMember->incrementUnscheduledDeps(1);
14795 if (!DestBundle->hasValidDependencies())
14796 WorkList.push_back(DestBundle);
14797 }
14798 }
14799 }
14800
14801 auto MakeControlDependent = [&](Instruction *I) {
14802 auto *DepDest = getScheduleData(I);
14803 assert(DepDest && "must be in schedule window");
14804 DepDest->ControlDependencies.push_back(BundleMember);
14805 BundleMember->Dependencies++;
14806 ScheduleData *DestBundle = DepDest->FirstInBundle;
14807 if (!DestBundle->IsScheduled)
14808 BundleMember->incrementUnscheduledDeps(1);
14809 if (!DestBundle->hasValidDependencies())
14810 WorkList.push_back(DestBundle);
14811 };
14812
14813 // Any instruction which isn't safe to speculate at the beginning of the
14814 // block is control dependent on any early exit or non-willreturn call
14815 // which precedes it.
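// For example (illustrative only): a load that might fault or a division
// that might trap cannot be hoisted above a call that could throw or loop
// forever, so the loop below adds a control dependence for every such
// instruction that follows this bundle member.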
14816 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14817 for (Instruction *I = BundleMember->Inst->getNextNode();
14818 I != ScheduleEnd; I = I->getNextNode()) {
14819 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14820 continue;
14821
14822 // Add the dependency
14823 MakeControlDependent(I);
14824
14825 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14826 // Everything past here must be control dependent on I.
14827 break;
14828 }
14829 }
14830
14831 if (RegionHasStackSave) {
14832 // If we have an inalloca alloca instruction, it needs to be scheduled
14833 // after any preceding stacksave. We also need to prevent any alloca
14834 // from reordering above a preceding stackrestore.
14835 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14836 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14837 for (Instruction *I = BundleMember->Inst->getNextNode();
14838 I != ScheduleEnd; I = I->getNextNode()) {
14839 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14840 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14841 // Any allocas past here must be control dependent on I, and I
14842 // must be memory dependent on BundleMember->Inst.
14843 break;
14844
14845 if (!isa<AllocaInst>(I))
14846 continue;
14847
14848 // Add the dependency
14849 MakeControlDependent(I);
14850 }
14851 }
14852
14853 // In addition to the cases handled just above, we need to prevent
14854 // allocas and loads/stores from moving below a stacksave or a
14855 // stackrestore. Preventing allocas from moving below a stackrestore is
14856 // currently thought to be only a conservative precaution. Moving
14857 // loads/stores below a stackrestore can lead to incorrect code.
14858 if (isa<AllocaInst>(BundleMember->Inst) ||
14859 BundleMember->Inst->mayReadOrWriteMemory()) {
14860 for (Instruction *I = BundleMember->Inst->getNextNode();
14861 I != ScheduleEnd; I = I->getNextNode()) {
14862 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14863 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14864 continue;
14865
14866 // Add the dependency
14867 MakeControlDependent(I);
14868 break;
14869 }
14870 }
14871 }
14872
14873 // Handle the memory dependencies (if any).
14874 ScheduleData *DepDest = BundleMember->NextLoadStore;
14875 if (!DepDest)
14876 continue;
14877 Instruction *SrcInst = BundleMember->Inst;
14878 assert(SrcInst->mayReadOrWriteMemory() &&
14879 "NextLoadStore list for non memory affecting bundle?");
14880 MemoryLocation SrcLoc = getLocation(SrcInst);
14881 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14882 unsigned NumAliased = 0;
14883 unsigned DistToSrc = 1;
14884
14885 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14886 assert(isInSchedulingRegion(DepDest));
14887
14888 // We have two limits to reduce the complexity:
14889 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14890 // SLP->isAliased (which is the expensive part in this loop).
14891 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14892 // the whole loop (even if the loop is fast, it's quadratic).
14893 // It's important for the loop break condition (see below) to
14894 // check this limit even between two read-only instructions.
14895 if (DistToSrc >= MaxMemDepDistance || 14896 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && 14897 (NumAliased >= AliasedCheckLimit || 14898 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { 14899 14900 // We increment the counter only if the locations are aliased 14901 // (instead of counting all alias checks). This gives a better 14902 // balance between reduced runtime and accurate dependencies. 14903 NumAliased++; 14904 14905 DepDest->MemoryDependencies.push_back(BundleMember); 14906 BundleMember->Dependencies++; 14907 ScheduleData *DestBundle = DepDest->FirstInBundle; 14908 if (!DestBundle->IsScheduled) { 14909 BundleMember->incrementUnscheduledDeps(1); 14910 } 14911 if (!DestBundle->hasValidDependencies()) { 14912 WorkList.push_back(DestBundle); 14913 } 14914 } 14915 14916 // Example, explaining the loop break condition: Let's assume our 14917 // starting instruction is i0 and MaxMemDepDistance = 3. 14918 // 14919 // +--------v--v--v 14920 // i0,i1,i2,i3,i4,i5,i6,i7,i8 14921 // +--------^--^--^ 14922 // 14923 // MaxMemDepDistance let us stop alias-checking at i3 and we add 14924 // dependencies from i0 to i3,i4,.. (even if they are not aliased). 14925 // Previously we already added dependencies from i3 to i6,i7,i8 14926 // (because of MaxMemDepDistance). As we added a dependency from 14927 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 14928 // and we can abort this loop at i6. 14929 if (DistToSrc >= 2 * MaxMemDepDistance) 14930 break; 14931 DistToSrc++; 14932 } 14933 } 14934 if (InsertInReadyList && SD->isReady()) { 14935 ReadyInsts.insert(SD); 14936 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst 14937 << "\n"); 14938 } 14939 } 14940 } 14941 14942 void BoUpSLP::BlockScheduling::resetSchedule() { 14943 assert(ScheduleStart && 14944 "tried to reset schedule on block which has not been scheduled"); 14945 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 14946 doForAllOpcodes(I, [&](ScheduleData *SD) { 14947 assert(isInSchedulingRegion(SD) && 14948 "ScheduleData not in scheduling region"); 14949 SD->IsScheduled = false; 14950 SD->resetUnscheduledDeps(); 14951 }); 14952 } 14953 ReadyInsts.clear(); 14954 } 14955 14956 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 14957 if (!BS->ScheduleStart) 14958 return; 14959 14960 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 14961 14962 // A key point - if we got here, pre-scheduling was able to find a valid 14963 // scheduling of the sub-graph of the scheduling window which consists 14964 // of all vector bundles and their transitive users. As such, we do not 14965 // need to reschedule anything *outside of* that subgraph. 14966 14967 BS->resetSchedule(); 14968 14969 // For the real scheduling we use a more sophisticated ready-list: it is 14970 // sorted by the original instruction location. This lets the final schedule 14971 // be as close as possible to the original instruction order. 14972 // WARNING: If changing this order causes a correctness issue, that means 14973 // there is some missing dependence edge in the schedule data graph. 14974 struct ScheduleDataCompare { 14975 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { 14976 return SD2->SchedulingPriority < SD1->SchedulingPriority; 14977 } 14978 }; 14979 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 14980 14981 // Ensure that all dependency data is updated (for nodes in the sub-graph) 14982 // and fill the ready-list with initial instructions. 
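// (Illustration of the priority scheme: SchedulingPriority is simply the
// position in the original block order, assigned in the loop below, so with
// the comparator above the ready-list yields the ready bundle that appears
// latest in the block first, which matches the bottom-up placement that
// starts from ScheduleEnd further down.)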
14983 int Idx = 0; 14984 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 14985 I = I->getNextNode()) { 14986 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { 14987 TreeEntry *SDTE = getTreeEntry(SD->Inst); 14988 (void)SDTE; 14989 assert((isVectorLikeInstWithConstOps(SD->Inst) || 14990 SD->isPartOfBundle() == 14991 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && 14992 "scheduler and vectorizer bundle mismatch"); 14993 SD->FirstInBundle->SchedulingPriority = Idx++; 14994 14995 if (SD->isSchedulingEntity() && SD->isPartOfBundle()) 14996 BS->calculateDependencies(SD, false, this); 14997 }); 14998 } 14999 BS->initialFillReadyList(ReadyInsts); 15000 15001 Instruction *LastScheduledInst = BS->ScheduleEnd; 15002 15003 // Do the "real" scheduling. 15004 while (!ReadyInsts.empty()) { 15005 ScheduleData *Picked = *ReadyInsts.begin(); 15006 ReadyInsts.erase(ReadyInsts.begin()); 15007 15008 // Move the scheduled instruction(s) to their dedicated places, if not 15009 // there yet. 15010 for (ScheduleData *BundleMember = Picked; BundleMember; 15011 BundleMember = BundleMember->NextInBundle) { 15012 Instruction *PickedInst = BundleMember->Inst; 15013 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) 15014 PickedInst->moveAfter(LastScheduledInst->getPrevNode()); 15015 LastScheduledInst = PickedInst; 15016 } 15017 15018 BS->schedule(Picked, ReadyInsts); 15019 } 15020 15021 // Check that we didn't break any of our invariants. 15022 #ifdef EXPENSIVE_CHECKS 15023 BS->verify(); 15024 #endif 15025 15026 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) 15027 // Check that all schedulable entities got scheduled 15028 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { 15029 BS->doForAllOpcodes(I, [&](ScheduleData *SD) { 15030 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) { 15031 assert(SD->IsScheduled && "must be scheduled at this point"); 15032 } 15033 }); 15034 } 15035 #endif 15036 15037 // Avoid duplicate scheduling of the block. 15038 BS->ScheduleStart = nullptr; 15039 } 15040 15041 unsigned BoUpSLP::getVectorElementSize(Value *V) { 15042 // If V is a store, just return the width of the stored value (or value 15043 // truncated just before storing) without traversing the expression tree. 15044 // This is the common case. 15045 if (auto *Store = dyn_cast<StoreInst>(V)) 15046 return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); 15047 15048 if (auto *IEI = dyn_cast<InsertElementInst>(V)) 15049 return getVectorElementSize(IEI->getOperand(1)); 15050 15051 auto E = InstrElementSize.find(V); 15052 if (E != InstrElementSize.end()) 15053 return E->second; 15054 15055 // If V is not a store, we can traverse the expression tree to find loads 15056 // that feed it. The type of the loaded value may indicate a more suitable 15057 // width than V's type. We want to base the vector element size on the width 15058 // of memory operations where possible. 15059 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist; 15060 SmallPtrSet<Instruction *, 16> Visited; 15061 if (auto *I = dyn_cast<Instruction>(V)) { 15062 Worklist.emplace_back(I, I->getParent(), 0); 15063 Visited.insert(I); 15064 } 15065 15066 // Traverse the expression tree in bottom-up order looking for loads. If we 15067 // encounter an instruction we don't yet handle, we give up. 
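// Illustrative example (assumed IR, not taken from the code above): for
//   %l = load i16, ptr %p
//   %z = zext i16 %l to i32
//   %a = add i32 %z, %z
// a query for %a walks through the zext to the i16 load, so a width of 16 is
// preferred over the 32 bits of %a's own type.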
15068 auto Width = 0u;
15069 Value *FirstNonBool = nullptr;
15070 while (!Worklist.empty()) {
15071 auto [I, Parent, Level] = Worklist.pop_back_val();
15072
15073 // We should only be looking at scalar instructions here. If the current
15074 // instruction has a vector type, skip.
15075 auto *Ty = I->getType();
15076 if (isa<VectorType>(Ty))
15077 continue;
15078 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15079 FirstNonBool = I;
15080 if (Level > RecursionMaxDepth)
15081 continue;
15082
15083 // If the current instruction is a load, update Width to reflect the
15084 // width of the loaded value.
15085 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15086 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15087
15088 // Otherwise, we need to visit the operands of the instruction. We only
15089 // handle the interesting cases from buildTree here. If an operand is an
15090 // instruction we haven't yet visited and from the same basic block as the
15091 // user or the use is a PHI node, we add it to the worklist.
15092 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15093 BinaryOperator, UnaryOperator>(I)) {
15094 for (Use &U : I->operands()) {
15095 if (auto *J = dyn_cast<Instruction>(U.get()))
15096 if (Visited.insert(J).second &&
15097 (isa<PHINode>(I) || J->getParent() == Parent)) {
15098 Worklist.emplace_back(J, J->getParent(), Level + 1);
15099 continue;
15100 }
15101 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15102 FirstNonBool = U.get();
15103 }
15104 } else {
15105 break;
15106 }
15107 }
15108
15109 // If we didn't encounter a memory access in the expression tree, or if we
15110 // gave up for some reason, just return the width of V. Otherwise, return the
15111 // maximum width we found.
15112 if (!Width) {
15113 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15114 V = FirstNonBool;
15115 Width = DL->getTypeSizeInBits(V->getType());
15116 }
15117
15118 for (Instruction *I : Visited)
15119 InstrElementSize[I] = Width;
15120
15121 return Width;
15122 }
15123
15124 bool BoUpSLP::collectValuesToDemote(
15125 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15126 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15127 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15128 bool IsTruncRoot) const {
15129 // We can always demote constants.
15130 if (all_of(E.Scalars, IsaPred<Constant>))
15131 return true;
15132
15133 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
15134 if (OrigBitWidth == BitWidth) {
15135 MaxDepthLevel = 1;
15136 return true;
15137 }
15138
15139 // If the value is not a vectorized instruction in the expression, is not
15140 // used by an insertelement instruction, and is not used in multiple vector
15141 // nodes, it cannot be demoted.
15142 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15143 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15144 });
15145 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15146 if (MultiNodeScalars.contains(V))
15147 return false;
15148 // For the last shuffle of sext/zext with many uses, we need to check the
15149 // extra bit for unsigned values; otherwise we may generate incorrect casts
15150 // for the reused scalars.
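// (Illustration with assumed widths: for OrigBitWidth = 32 and a candidate
// BitWidth = 16, a value whose upper 16 bits are known to be zero can be
// truncated to i16 and zero-extended back unchanged, so it passes the mask
// check just below when the signedness condition allows it.)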
15151 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL)); 15152 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) { 15153 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 15154 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL))) 15155 return true; 15156 } 15157 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); 15158 unsigned BitWidth1 = OrigBitWidth - NumSignBits; 15159 if (IsSignedNode) 15160 ++BitWidth1; 15161 if (auto *I = dyn_cast<Instruction>(V)) { 15162 APInt Mask = DB->getDemandedBits(I); 15163 unsigned BitWidth2 = 15164 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero()); 15165 while (!IsSignedNode && BitWidth2 < OrigBitWidth) { 15166 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1); 15167 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL))) 15168 break; 15169 BitWidth2 *= 2; 15170 } 15171 BitWidth1 = std::min(BitWidth1, BitWidth2); 15172 } 15173 BitWidth = std::max(BitWidth, BitWidth1); 15174 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); 15175 }; 15176 using namespace std::placeholders; 15177 auto FinalAnalysis = [&]() { 15178 if (!IsProfitableToDemote) 15179 return false; 15180 bool Res = all_of( 15181 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth))); 15182 // Demote gathers. 15183 if (Res && E.isGather()) { 15184 // Check possible extractelement instructions bases and final vector 15185 // length. 15186 SmallPtrSet<Value *, 4> UniqueBases; 15187 for (Value *V : E.Scalars) { 15188 auto *EE = dyn_cast<ExtractElementInst>(V); 15189 if (!EE) 15190 continue; 15191 UniqueBases.insert(EE->getVectorOperand()); 15192 } 15193 const unsigned VF = E.Scalars.size(); 15194 Type *OrigScalarTy = E.Scalars.front()->getType(); 15195 if (UniqueBases.size() <= 2 || 15196 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) == 15197 TTI->getNumberOfParts(getWidenedType( 15198 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF))) 15199 ToDemote.push_back(E.Idx); 15200 } 15201 return Res; 15202 }; 15203 if (E.isGather() || !Visited.insert(&E).second || 15204 any_of(E.Scalars, [&](Value *V) { 15205 return all_of(V->users(), [&](User *U) { 15206 return isa<InsertElementInst>(U) && !getTreeEntry(U); 15207 }); 15208 })) 15209 return FinalAnalysis(); 15210 15211 if (any_of(E.Scalars, [&](Value *V) { 15212 return !all_of(V->users(), [=](User *U) { 15213 return getTreeEntry(U) || 15214 (E.Idx == 0 && UserIgnoreList && 15215 UserIgnoreList->contains(U)) || 15216 (!isa<CmpInst>(U) && U->getType()->isSized() && 15217 !U->getType()->isScalableTy() && 15218 DL->getTypeSizeInBits(U->getType()) <= BitWidth); 15219 }) && !IsPotentiallyTruncated(V, BitWidth); 15220 })) 15221 return false; 15222 15223 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands, 15224 bool &NeedToExit) { 15225 NeedToExit = false; 15226 unsigned InitLevel = MaxDepthLevel; 15227 for (const TreeEntry *Op : Operands) { 15228 unsigned Level = InitLevel; 15229 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth, 15230 ToDemote, Visited, Level, IsProfitableToDemote, 15231 IsTruncRoot)) { 15232 if (!IsProfitableToDemote) 15233 return false; 15234 NeedToExit = true; 15235 if (!FinalAnalysis()) 15236 return false; 15237 continue; 15238 } 15239 MaxDepthLevel = std::max(MaxDepthLevel, Level); 15240 } 15241 return true; 15242 }; 15243 auto AttemptCheckBitwidth = 15244 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) { 15245 // Try all bitwidth < OrigBitWidth. 
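// (Illustration with assumed widths: starting from BitWidth = 8 with
// OrigBitWidth = 64, the loop below tries 8, 16 and 32 in turn, stops at the
// first width the Checker accepts, and otherwise falls back to the first
// width for which FinalAnalysis() succeeded, if there was one.)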
15246 NeedToExit = false; 15247 unsigned BestFailBitwidth = 0; 15248 for (; BitWidth < OrigBitWidth; BitWidth *= 2) { 15249 if (Checker(BitWidth, OrigBitWidth)) 15250 return true; 15251 if (BestFailBitwidth == 0 && FinalAnalysis()) 15252 BestFailBitwidth = BitWidth; 15253 } 15254 if (BitWidth >= OrigBitWidth) { 15255 if (BestFailBitwidth == 0) { 15256 BitWidth = OrigBitWidth; 15257 return false; 15258 } 15259 MaxDepthLevel = 1; 15260 BitWidth = BestFailBitwidth; 15261 NeedToExit = true; 15262 return true; 15263 } 15264 return false; 15265 }; 15266 auto TryProcessInstruction = 15267 [&](unsigned &BitWidth, 15268 ArrayRef<const TreeEntry *> Operands = std::nullopt, 15269 function_ref<bool(unsigned, unsigned)> Checker = {}) { 15270 if (Operands.empty()) { 15271 if (!IsTruncRoot) 15272 MaxDepthLevel = 1; 15273 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1, 15274 std::ref(BitWidth))); 15275 } else { 15276 // Several vectorized uses? Check if we can truncate it, otherwise - 15277 // exit. 15278 if (E.UserTreeIndices.size() > 1 && 15279 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1, 15280 std::ref(BitWidth)))) 15281 return false; 15282 bool NeedToExit = false; 15283 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit)) 15284 return false; 15285 if (NeedToExit) 15286 return true; 15287 if (!ProcessOperands(Operands, NeedToExit)) 15288 return false; 15289 if (NeedToExit) 15290 return true; 15291 } 15292 15293 ++MaxDepthLevel; 15294 // Record the entry that we can demote. 15295 ToDemote.push_back(E.Idx); 15296 return IsProfitableToDemote; 15297 }; 15298 switch (E.getOpcode()) { 15299 15300 // We can always demote truncations and extensions. Since truncations can 15301 // seed additional demotion, we save the truncated value. 15302 case Instruction::Trunc: 15303 if (IsProfitableToDemoteRoot) 15304 IsProfitableToDemote = true; 15305 return TryProcessInstruction(BitWidth); 15306 case Instruction::ZExt: 15307 case Instruction::SExt: 15308 IsProfitableToDemote = true; 15309 return TryProcessInstruction(BitWidth); 15310 15311 // We can demote certain binary operations if we can demote both of their 15312 // operands. 15313 case Instruction::Add: 15314 case Instruction::Sub: 15315 case Instruction::Mul: 15316 case Instruction::And: 15317 case Instruction::Or: 15318 case Instruction::Xor: { 15319 return TryProcessInstruction( 15320 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}); 15321 } 15322 case Instruction::Shl: { 15323 // If we are truncating the result of this SHL, and if it's a shift of an 15324 // inrange amount, we can always perform a SHL in a smaller type. 15325 auto ShlChecker = [&](unsigned BitWidth, unsigned) { 15326 return all_of(E.Scalars, [&](Value *V) { 15327 auto *I = cast<Instruction>(V); 15328 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 15329 return AmtKnownBits.getMaxValue().ult(BitWidth); 15330 }); 15331 }; 15332 return TryProcessInstruction( 15333 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker); 15334 } 15335 case Instruction::LShr: { 15336 // If this is a truncate of a logical shr, we can truncate it to a smaller 15337 // lshr iff we know that the bits we would otherwise be shifting in are 15338 // already zeros. 
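// For example (illustrative values): (lshr i32 %x, 4) can be narrowed to an
// i16 lshr when bits 16..31 of %x are known to be zero and the shift amount
// is known to be below 16, which is what the checker below verifies.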
15339 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 15340 return all_of(E.Scalars, [&](Value *V) { 15341 auto *I = cast<Instruction>(V); 15342 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 15343 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 15344 return AmtKnownBits.getMaxValue().ult(BitWidth) && 15345 MaskedValueIsZero(I->getOperand(0), ShiftedBits, 15346 SimplifyQuery(*DL)); 15347 }); 15348 }; 15349 return TryProcessInstruction( 15350 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, 15351 LShrChecker); 15352 } 15353 case Instruction::AShr: { 15354 // If this is a truncate of an arithmetic shr, we can truncate it to a 15355 // smaller ashr iff we know that all the bits from the sign bit of the 15356 // original type and the sign bit of the truncate type are similar. 15357 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 15358 return all_of(E.Scalars, [&](Value *V) { 15359 auto *I = cast<Instruction>(V); 15360 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); 15361 unsigned ShiftedBits = OrigBitWidth - BitWidth; 15362 return AmtKnownBits.getMaxValue().ult(BitWidth) && 15363 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, 15364 nullptr, DT); 15365 }); 15366 }; 15367 return TryProcessInstruction( 15368 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, 15369 AShrChecker); 15370 } 15371 case Instruction::UDiv: 15372 case Instruction::URem: { 15373 // UDiv and URem can be truncated if all the truncated bits are zero. 15374 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 15375 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 15376 return all_of(E.Scalars, [&](Value *V) { 15377 auto *I = cast<Instruction>(V); 15378 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 15379 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && 15380 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); 15381 }); 15382 }; 15383 return TryProcessInstruction( 15384 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker); 15385 } 15386 15387 // We can demote selects if we can demote their true and false values. 15388 case Instruction::Select: { 15389 return TryProcessInstruction( 15390 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)}); 15391 } 15392 15393 // We can demote phis if we can demote all their incoming operands. Note that 15394 // we don't need to worry about cycles since we ensure single use above. 
15395 case Instruction::PHI: { 15396 const unsigned NumOps = E.getNumOperands(); 15397 SmallVector<const TreeEntry *> Ops(NumOps); 15398 transform(seq<unsigned>(0, NumOps), Ops.begin(), 15399 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1)); 15400 15401 return TryProcessInstruction(BitWidth, Ops); 15402 } 15403 15404 case Instruction::Call: { 15405 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp()); 15406 if (!IC) 15407 break; 15408 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI); 15409 if (ID != Intrinsic::abs && ID != Intrinsic::smin && 15410 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax) 15411 break; 15412 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0)); 15413 function_ref<bool(unsigned, unsigned)> CallChecker; 15414 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 15415 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 15416 return all_of(E.Scalars, [&](Value *V) { 15417 auto *I = cast<Instruction>(V); 15418 if (ID == Intrinsic::umin || ID == Intrinsic::umax) { 15419 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); 15420 return MaskedValueIsZero(I->getOperand(0), Mask, 15421 SimplifyQuery(*DL)) && 15422 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); 15423 } 15424 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) && 15425 "Expected min/max intrinsics only."); 15426 unsigned SignBits = OrigBitWidth - BitWidth; 15427 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); 15428 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, 15429 nullptr, DT); 15430 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, 15431 nullptr, DT); 15432 return SignBits <= Op0SignBits && 15433 ((SignBits != Op0SignBits && 15434 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) || 15435 MaskedValueIsZero(I->getOperand(0), Mask, 15436 SimplifyQuery(*DL))) && 15437 SignBits <= Op1SignBits && 15438 ((SignBits != Op1SignBits && 15439 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) || 15440 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL))); 15441 }); 15442 }; 15443 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { 15444 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); 15445 return all_of(E.Scalars, [&](Value *V) { 15446 auto *I = cast<Instruction>(V); 15447 unsigned SignBits = OrigBitWidth - BitWidth; 15448 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); 15449 unsigned Op0SignBits = 15450 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT); 15451 return SignBits <= Op0SignBits && 15452 ((SignBits != Op0SignBits && 15453 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) || 15454 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))); 15455 }); 15456 }; 15457 if (ID != Intrinsic::abs) { 15458 Operands.push_back(getOperandEntry(&E, 1)); 15459 CallChecker = CompChecker; 15460 } else { 15461 CallChecker = AbsChecker; 15462 } 15463 InstructionCost BestCost = 15464 std::numeric_limits<InstructionCost::CostType>::max(); 15465 unsigned BestBitWidth = BitWidth; 15466 unsigned VF = E.Scalars.size(); 15467 // Choose the best bitwidth based on cost estimations. 
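// (Note on the pattern below: the checker intentionally always returns
// false, so AttemptCheckBitwidth sweeps every candidate width while the
// lambda records the cheapest one in BestBitWidth, which is then used.)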
15468 auto Checker = [&](unsigned BitWidth, unsigned) { 15469 unsigned MinBW = PowerOf2Ceil(BitWidth); 15470 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW); 15471 auto VecCallCosts = getVectorCallCosts( 15472 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF), 15473 TTI, TLI, ArgTys); 15474 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second); 15475 if (Cost < BestCost) { 15476 BestCost = Cost; 15477 BestBitWidth = BitWidth; 15478 } 15479 return false; 15480 }; 15481 [[maybe_unused]] bool NeedToExit; 15482 (void)AttemptCheckBitwidth(Checker, NeedToExit); 15483 BitWidth = BestBitWidth; 15484 return TryProcessInstruction(BitWidth, Operands, CallChecker); 15485 } 15486 15487 // Otherwise, conservatively give up. 15488 default: 15489 break; 15490 } 15491 MaxDepthLevel = 1; 15492 return FinalAnalysis(); 15493 } 15494 15495 static RecurKind getRdxKind(Value *V); 15496 15497 void BoUpSLP::computeMinimumValueSizes() { 15498 // We only attempt to truncate integer expressions. 15499 bool IsStoreOrInsertElt = 15500 VectorizableTree.front()->getOpcode() == Instruction::Store || 15501 VectorizableTree.front()->getOpcode() == Instruction::InsertElement; 15502 if ((IsStoreOrInsertElt || UserIgnoreList) && 15503 ExtraBitWidthNodes.size() <= 1 && 15504 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || 15505 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2)) 15506 return; 15507 15508 unsigned NodeIdx = 0; 15509 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather()) 15510 NodeIdx = 1; 15511 15512 // Ensure the roots of the vectorizable tree don't form a cycle. 15513 if (VectorizableTree[NodeIdx]->isGather() || 15514 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) || 15515 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices, 15516 [NodeIdx](const EdgeInfo &EI) { 15517 return EI.UserTE->Idx > 15518 static_cast<int>(NodeIdx); 15519 }))) 15520 return; 15521 15522 // The first value node for store/insertelement is sext/zext/trunc? Skip it, 15523 // resize to the final type. 15524 bool IsTruncRoot = false; 15525 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt; 15526 SmallVector<unsigned> RootDemotes; 15527 if (NodeIdx != 0 && 15528 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && 15529 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { 15530 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph."); 15531 IsTruncRoot = true; 15532 RootDemotes.push_back(NodeIdx); 15533 IsProfitableToDemoteRoot = true; 15534 ++NodeIdx; 15535 } 15536 15537 // Analyzed the reduction already and not profitable - exit. 15538 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front())) 15539 return; 15540 15541 SmallVector<unsigned> ToDemote; 15542 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, 15543 bool IsProfitableToDemoteRoot, unsigned Opcode, 15544 unsigned Limit, bool IsTruncRoot, 15545 bool IsSignedCmp) -> unsigned { 15546 ToDemote.clear(); 15547 // Check if the root is trunc and the next node is gather/buildvector, then 15548 // keep trunc in scalars, which is free in most cases. 15549 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && 15550 E.Idx > (IsStoreOrInsertElt ? 
2 : 1) && 15551 all_of(E.Scalars, [&](Value *V) { 15552 return V->hasOneUse() || isa<Constant>(V) || 15553 (!V->hasNUsesOrMore(UsesLimit) && 15554 none_of(V->users(), [&](User *U) { 15555 const TreeEntry *TE = getTreeEntry(U); 15556 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; 15557 if (TE == UserTE || !TE) 15558 return false; 15559 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, 15560 SelectInst>(U) || 15561 !isa<CastInst, BinaryOperator, FreezeInst, PHINode, 15562 SelectInst>(UserTE->getMainOp())) 15563 return true; 15564 unsigned UserTESz = DL->getTypeSizeInBits( 15565 UserTE->Scalars.front()->getType()); 15566 auto It = MinBWs.find(TE); 15567 if (It != MinBWs.end() && It->second.first > UserTESz) 15568 return true; 15569 return DL->getTypeSizeInBits(U->getType()) > UserTESz; 15570 })); 15571 })) { 15572 ToDemote.push_back(E.Idx); 15573 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; 15574 auto It = MinBWs.find(UserTE); 15575 if (It != MinBWs.end()) 15576 return It->second.first; 15577 unsigned MaxBitWidth = 15578 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType()); 15579 MaxBitWidth = bit_ceil(MaxBitWidth); 15580 if (MaxBitWidth < 8 && MaxBitWidth > 1) 15581 MaxBitWidth = 8; 15582 return MaxBitWidth; 15583 } 15584 15585 unsigned VF = E.getVectorFactor(); 15586 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType()); 15587 if (!TreeRootIT || !Opcode) 15588 return 0u; 15589 15590 if (any_of(E.Scalars, 15591 [&](Value *V) { return AnalyzedMinBWVals.contains(V); })) 15592 return 0u; 15593 15594 unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF)); 15595 15596 // The maximum bit width required to represent all the values that can be 15597 // demoted without loss of precision. It would be safe to truncate the roots 15598 // of the expression to this width. 15599 unsigned MaxBitWidth = 1u; 15600 15601 // True if the roots can be zero-extended back to their original type, 15602 // rather than sign-extended. We know that if the leading bits are not 15603 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to 15604 // True. 15605 // Determine if the sign bit of all the roots is known to be zero. If not, 15606 // IsKnownPositive is set to False. 15607 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) { 15608 KnownBits Known = computeKnownBits(R, *DL); 15609 return Known.isNonNegative(); 15610 }); 15611 15612 // We first check if all the bits of the roots are demanded. If they're not, 15613 // we can truncate the roots to this narrower type. 15614 for (Value *Root : E.Scalars) { 15615 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT); 15616 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType()); 15617 unsigned BitWidth1 = NumTypeBits - NumSignBits; 15618 // If we can't prove that the sign bit is zero, we must add one to the 15619 // maximum bit width to account for the unknown sign bit. This preserves 15620 // the existing sign bit so we can safely sign-extend the root back to the 15621 // original type. Otherwise, if we know the sign bit is zero, we will 15622 // zero-extend the root instead. 15623 // 15624 // FIXME: This is somewhat suboptimal, as there will be cases where adding 15625 // one to the maximum bit width will yield a larger-than-necessary 15626 // type. In general, we need to add an extra bit only if we can't 15627 // prove that the upper bit of the original type is equal to the 15628 // upper bit of the proposed smaller type. 
If these two bits are 15629 // the same (either zero or one) we know that sign-extending from 15630 // the smaller type will result in the same value. Here, since we 15631 // can't yet prove this, we are just making the proposed smaller 15632 // type larger to ensure correctness. 15633 if (!IsKnownPositive) 15634 ++BitWidth1; 15635 15636 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root)); 15637 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); 15638 MaxBitWidth = 15639 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth); 15640 } 15641 15642 if (MaxBitWidth < 8 && MaxBitWidth > 1) 15643 MaxBitWidth = 8; 15644 15645 // If the original type is large, but reduced type does not improve the reg 15646 // use - ignore it. 15647 if (NumParts > 1 && 15648 NumParts == 15649 TTI->getNumberOfParts(getWidenedType( 15650 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) 15651 return 0u; 15652 15653 bool IsProfitableToDemote = Opcode == Instruction::Trunc || 15654 Opcode == Instruction::SExt || 15655 Opcode == Instruction::ZExt || NumParts > 1; 15656 // Conservatively determine if we can actually truncate the roots of the 15657 // expression. Collect the values that can be demoted in ToDemote and 15658 // additional roots that require investigating in Roots. 15659 DenseSet<const TreeEntry *> Visited; 15660 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1; 15661 bool NeedToDemote = IsProfitableToDemote; 15662 15663 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth, 15664 ToDemote, Visited, MaxDepthLevel, NeedToDemote, 15665 IsTruncRoot) || 15666 (MaxDepthLevel <= Limit && 15667 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && 15668 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) || 15669 DL->getTypeSizeInBits(TreeRootIT) / 15670 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front()) 15671 ->getOperand(0) 15672 ->getType()) > 15673 2))))) 15674 return 0u; 15675 // Round MaxBitWidth up to the next power-of-two. 15676 MaxBitWidth = bit_ceil(MaxBitWidth); 15677 15678 return MaxBitWidth; 15679 }; 15680 15681 // If we can truncate the root, we must collect additional values that might 15682 // be demoted as a result. That is, those seeded by truncations we will 15683 // modify. 15684 // Add reduction ops sizes, if any. 
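// (Worked example with assumed numbers: reducing i32 values that each need
// about 10 significant bits yields a per-value width of 10, which the code
// below clamps to at least 8 and rounds up to a power of two, so
// ReductionBitWidth becomes 16.)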
15685 if (UserIgnoreList && 15686 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) { 15687 for (Value *V : *UserIgnoreList) { 15688 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT); 15689 auto NumTypeBits = DL->getTypeSizeInBits(V->getType()); 15690 unsigned BitWidth1 = NumTypeBits - NumSignBits; 15691 if (!isKnownNonNegative(V, SimplifyQuery(*DL))) 15692 ++BitWidth1; 15693 unsigned BitWidth2 = BitWidth1; 15694 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) { 15695 auto Mask = DB->getDemandedBits(cast<Instruction>(V)); 15696 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); 15697 } 15698 ReductionBitWidth = 15699 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth); 15700 } 15701 if (ReductionBitWidth < 8 && ReductionBitWidth > 1) 15702 ReductionBitWidth = 8; 15703 15704 ReductionBitWidth = bit_ceil(ReductionBitWidth); 15705 } 15706 bool IsTopRoot = NodeIdx == 0; 15707 while (NodeIdx < VectorizableTree.size() && 15708 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && 15709 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { 15710 RootDemotes.push_back(NodeIdx); 15711 ++NodeIdx; 15712 IsTruncRoot = true; 15713 } 15714 bool IsSignedCmp = false; 15715 while (NodeIdx < VectorizableTree.size()) { 15716 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars; 15717 unsigned Limit = 2; 15718 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); 15719 if (IsTopRoot && 15720 ReductionBitWidth == 15721 DL->getTypeSizeInBits( 15722 VectorizableTree.front()->Scalars.front()->getType())) 15723 Limit = 3; 15724 unsigned MaxBitWidth = ComputeMaxBitWidth( 15725 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, 15726 Limit, IsTruncRoot, IsSignedCmp); 15727 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { 15728 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) 15729 ReductionBitWidth = bit_ceil(MaxBitWidth); 15730 else if (MaxBitWidth == 0) 15731 ReductionBitWidth = 0; 15732 } 15733 15734 for (unsigned Idx : RootDemotes) { 15735 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) { 15736 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType()); 15737 if (OrigBitWidth > MaxBitWidth) { 15738 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth); 15739 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); 15740 } 15741 return false; 15742 })) 15743 ToDemote.push_back(Idx); 15744 } 15745 RootDemotes.clear(); 15746 IsTopRoot = false; 15747 IsProfitableToDemoteRoot = true; 15748 15749 if (ExtraBitWidthNodes.empty()) { 15750 NodeIdx = VectorizableTree.size(); 15751 } else { 15752 unsigned NewIdx = 0; 15753 do { 15754 NewIdx = *ExtraBitWidthNodes.begin(); 15755 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin()); 15756 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty()); 15757 NodeIdx = NewIdx; 15758 IsTruncRoot = 15759 NodeIdx < VectorizableTree.size() && 15760 any_of(VectorizableTree[NodeIdx]->UserTreeIndices, 15761 [](const EdgeInfo &EI) { 15762 return EI.EdgeIdx == 0 && 15763 EI.UserTE->getOpcode() == Instruction::Trunc && 15764 !EI.UserTE->isAltShuffle(); 15765 }); 15766 IsSignedCmp = 15767 NodeIdx < VectorizableTree.size() && 15768 any_of(VectorizableTree[NodeIdx]->UserTreeIndices, 15769 [&](const EdgeInfo &EI) { 15770 return EI.UserTE->getOpcode() == Instruction::ICmp && 15771 any_of(EI.UserTE->Scalars, [&](Value *V) { 15772 auto *IC = dyn_cast<ICmpInst>(V); 15773 return IC && 15774 (IC->isSigned() || 15775 
!isKnownNonNegative(IC->getOperand(0), 15776 SimplifyQuery(*DL)) || 15777 !isKnownNonNegative(IC->getOperand(1), 15778 SimplifyQuery(*DL))); 15779 }); 15780 }); 15781 } 15782 15783 // If the maximum bit width we compute is less than the with of the roots' 15784 // type, we can proceed with the narrowing. Otherwise, do nothing. 15785 if (MaxBitWidth == 0 || 15786 MaxBitWidth >= 15787 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) { 15788 if (UserIgnoreList) 15789 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end()); 15790 continue; 15791 } 15792 15793 // Finally, map the values we can demote to the maximum bit with we 15794 // computed. 15795 for (unsigned Idx : ToDemote) { 15796 TreeEntry *TE = VectorizableTree[Idx].get(); 15797 if (MinBWs.contains(TE)) 15798 continue; 15799 bool IsSigned = any_of(TE->Scalars, [&](Value *R) { 15800 return !isKnownNonNegative(R, SimplifyQuery(*DL)); 15801 }); 15802 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned); 15803 } 15804 } 15805 } 15806 15807 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { 15808 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 15809 auto *TTI = &AM.getResult<TargetIRAnalysis>(F); 15810 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); 15811 auto *AA = &AM.getResult<AAManager>(F); 15812 auto *LI = &AM.getResult<LoopAnalysis>(F); 15813 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); 15814 auto *AC = &AM.getResult<AssumptionAnalysis>(F); 15815 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); 15816 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 15817 15818 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); 15819 if (!Changed) 15820 return PreservedAnalyses::all(); 15821 15822 PreservedAnalyses PA; 15823 PA.preserveSet<CFGAnalyses>(); 15824 return PA; 15825 } 15826 15827 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, 15828 TargetTransformInfo *TTI_, 15829 TargetLibraryInfo *TLI_, AAResults *AA_, 15830 LoopInfo *LI_, DominatorTree *DT_, 15831 AssumptionCache *AC_, DemandedBits *DB_, 15832 OptimizationRemarkEmitter *ORE_) { 15833 if (!RunSLPVectorization) 15834 return false; 15835 SE = SE_; 15836 TTI = TTI_; 15837 TLI = TLI_; 15838 AA = AA_; 15839 LI = LI_; 15840 DT = DT_; 15841 AC = AC_; 15842 DB = DB_; 15843 DL = &F.getDataLayout(); 15844 15845 Stores.clear(); 15846 GEPs.clear(); 15847 bool Changed = false; 15848 15849 // If the target claims to have no vector registers don't attempt 15850 // vectorization. 15851 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { 15852 LLVM_DEBUG( 15853 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); 15854 return false; 15855 } 15856 15857 // Don't vectorize when the attribute NoImplicitFloat is used. 15858 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 15859 return false; 15860 15861 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 15862 15863 // Use the bottom up slp vectorizer to construct chains that start with 15864 // store instructions. 15865 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); 15866 15867 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 15868 // delete instructions. 15869 15870 // Update DFS numbers now so that we can use them for ordering. 15871 DT->updateDFSNumbers(); 15872 15873 // Scan the blocks in the function in post order. 15874 for (auto *BB : post_order(&F.getEntryBlock())) { 15875 // Start new block - clear the list of reduction roots. 
15876 R.clearReductionData(); 15877 collectSeedInstructions(BB); 15878 15879 // Vectorize trees that end at stores. 15880 if (!Stores.empty()) { 15881 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() 15882 << " underlying objects.\n"); 15883 Changed |= vectorizeStoreChains(R); 15884 } 15885 15886 // Vectorize trees that end at reductions. 15887 Changed |= vectorizeChainsInBlock(BB, R); 15888 15889 // Vectorize the index computations of getelementptr instructions. This 15890 // is primarily intended to catch gather-like idioms ending at 15891 // non-consecutive loads. 15892 if (!GEPs.empty()) { 15893 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() 15894 << " underlying objects.\n"); 15895 Changed |= vectorizeGEPIndices(BB, R); 15896 } 15897 } 15898 15899 if (Changed) { 15900 R.optimizeGatherSequence(); 15901 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 15902 } 15903 return Changed; 15904 } 15905 15906 std::optional<bool> 15907 SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, 15908 unsigned Idx, unsigned MinVF, 15909 unsigned &Size) { 15910 Size = 0; 15911 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() 15912 << "\n"); 15913 const unsigned Sz = R.getVectorElementSize(Chain[0]); 15914 unsigned VF = Chain.size(); 15915 15916 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) { 15917 // Check if vectorizing with a non-power-of-2 VF should be considered. At 15918 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost 15919 // all vector lanes are used. 15920 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) 15921 return false; 15922 } 15923 15924 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx 15925 << "\n"); 15926 15927 SetVector<Value *> ValOps; 15928 for (Value *V : Chain) 15929 ValOps.insert(cast<StoreInst>(V)->getValueOperand()); 15930 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. 15931 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); 15932 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) { 15933 DenseSet<Value *> Stores(Chain.begin(), Chain.end()); 15934 bool IsPowerOf2 = 15935 isPowerOf2_32(ValOps.size()) || 15936 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1)); 15937 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load && 15938 (!S.MainOp->isSafeToRemove() || 15939 any_of(ValOps.getArrayRef(), 15940 [&](Value *V) { 15941 return !isa<ExtractElementInst>(V) && 15942 (V->getNumUses() > Chain.size() || 15943 any_of(V->users(), [&](User *U) { 15944 return !Stores.contains(U); 15945 })); 15946 }))) || 15947 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) { 15948 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2; 15949 return false; 15950 } 15951 } 15952 if (R.isLoadCombineCandidate(Chain)) 15953 return true; 15954 R.buildTree(Chain); 15955 // Check if tree tiny and store itself or its value is not vectorized. 
15956 if (R.isTreeTinyAndNotFullyVectorizable()) { 15957 if (R.isGathered(Chain.front()) || 15958 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand())) 15959 return std::nullopt; 15960 Size = R.getTreeSize(); 15961 return false; 15962 } 15963 R.reorderTopToBottom(); 15964 R.reorderBottomToTop(); 15965 R.buildExternalUses(); 15966 15967 R.computeMinimumValueSizes(); 15968 R.transformNodes(); 15969 15970 Size = R.getTreeSize(); 15971 if (S.getOpcode() == Instruction::Load) 15972 Size = 2; // cut off masked gather small trees 15973 InstructionCost Cost = R.getTreeCost(); 15974 15975 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); 15976 if (Cost < -SLPCostThreshold) { 15977 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); 15978 15979 using namespace ore; 15980 15981 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", 15982 cast<StoreInst>(Chain[0])) 15983 << "Stores SLP vectorized with cost " << NV("Cost", Cost) 15984 << " and with tree size " 15985 << NV("TreeSize", R.getTreeSize())); 15986 15987 R.vectorizeTree(); 15988 return true; 15989 } 15990 15991 return false; 15992 } 15993 15994 /// Checks that the recorded tree sizes are roughly uniform: the variance of the sizes (ignoring entries equal to 1) must stay below Mean*Mean/81, i.e. the standard deviation is less than about one ninth of the mean size. 15995 static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes, 15996 bool First) { 15997 unsigned Num = 0; 15998 uint64_t Sum = std::accumulate( 15999 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0), 16000 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { 16001 unsigned Size = First ? Val.first : Val.second; 16002 if (Size == 1) 16003 return V; 16004 ++Num; 16005 return V + Size; 16006 }); 16007 if (Num == 0) 16008 return true; 16009 uint64_t Mean = Sum / Num; 16010 if (Mean == 0) 16011 return true; 16012 uint64_t Dev = std::accumulate( 16013 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0), 16014 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { 16015 unsigned P = First ? Val.first : Val.second; 16016 if (P == 1) 16017 return V; 16018 return V + (P - Mean) * (P - Mean); 16019 }) / 16020 Num; 16021 return Dev * 81 / (Mean * Mean) == 0; 16022 } 16023 16024 bool SLPVectorizerPass::vectorizeStores( 16025 ArrayRef<StoreInst *> Stores, BoUpSLP &R, 16026 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> 16027 &Visited) { 16028 // We may run into multiple chains that merge into a single chain. We mark the 16029 // stores that we vectorized so that we don't visit the same store twice. 16030 BoUpSLP::ValueSet VectorizedStores; 16031 bool Changed = false; 16032 16033 struct StoreDistCompare { 16034 bool operator()(const std::pair<unsigned, int> &Op1, 16035 const std::pair<unsigned, int> &Op2) const { 16036 return Op1.second < Op2.second; 16037 } 16038 }; 16039 // A set of pairs (index of store in Stores array ref, distance of the store 16040 // address relative to base store address in units). 16041 using StoreIndexToDistSet = 16042 std::set<std::pair<unsigned, int>, StoreDistCompare>; 16043 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) { 16044 int PrevDist = -1; 16045 BoUpSLP::ValueList Operands; 16046 // Collect the chain into a list.
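// Illustrative example: for stores recorded at distances {0, 1, 2, 5, 6} the
// loop below accumulates the consecutive run [0, 1, 2], attempts to vectorize
// it, and then starts a fresh run at distance 5.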
16047 for (auto [Idx, Data] : enumerate(Set)) { 16048 if (Operands.empty() || Data.second - PrevDist == 1) { 16049 Operands.push_back(Stores[Data.first]); 16050 PrevDist = Data.second; 16051 if (Idx != Set.size() - 1) 16052 continue; 16053 } 16054 auto E = make_scope_exit([&, &DataVar = Data]() { 16055 Operands.clear(); 16056 Operands.push_back(Stores[DataVar.first]); 16057 PrevDist = DataVar.second; 16058 }); 16059 16060 if (Operands.size() <= 1 || 16061 !Visited 16062 .insert({Operands.front(), 16063 cast<StoreInst>(Operands.front())->getValueOperand(), 16064 Operands.back(), 16065 cast<StoreInst>(Operands.back())->getValueOperand(), 16066 Operands.size()}) 16067 .second) 16068 continue; 16069 16070 unsigned MaxVecRegSize = R.getMaxVecRegSize(); 16071 unsigned EltSize = R.getVectorElementSize(Operands[0]); 16072 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); 16073 16074 unsigned MaxVF = 16075 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); 16076 unsigned MaxRegVF = MaxVF; 16077 auto *Store = cast<StoreInst>(Operands[0]); 16078 Type *StoreTy = Store->getValueOperand()->getType(); 16079 Type *ValueTy = StoreTy; 16080 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) 16081 ValueTy = Trunc->getSrcTy(); 16082 if (ValueTy == StoreTy && 16083 R.getVectorElementSize(Store->getValueOperand()) <= EltSize) 16084 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size())); 16085 unsigned MinVF = std::max<unsigned>( 16086 2, PowerOf2Ceil(TTI->getStoreMinimumVF( 16087 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, 16088 ValueTy))); 16089 16090 if (MaxVF < MinVF) { 16091 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF 16092 << ") < " 16093 << "MinVF (" << MinVF << ")\n"); 16094 continue; 16095 } 16096 16097 unsigned NonPowerOf2VF = 0; 16098 if (VectorizeNonPowerOf2) { 16099 // First try vectorizing with a non-power-of-2 VF. At the moment, only 16100 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector 16101 // lanes are used. 16102 unsigned CandVF = Operands.size(); 16103 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF) 16104 NonPowerOf2VF = CandVF; 16105 } 16106 16107 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); 16108 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0)); 16109 unsigned Size = MinVF; 16110 for_each(reverse(CandidateVFs), [&](unsigned &VF) { 16111 VF = Size > MaxVF ? NonPowerOf2VF : Size; 16112 Size *= 2; 16113 }); 16114 unsigned End = Operands.size(); 16115 unsigned Repeat = 0; 16116 constexpr unsigned MaxAttempts = 4; 16117 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size()); 16118 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) { 16119 P.first = P.second = 1; 16120 }); 16121 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable; 16122 auto IsNotVectorized = [](bool First, 16123 const std::pair<unsigned, unsigned> &P) { 16124 return First ? P.first > 0 : P.second > 0; 16125 }; 16126 auto IsVectorized = [](bool First, 16127 const std::pair<unsigned, unsigned> &P) { 16128 return First ? P.first == 0 : P.second == 0; 16129 }; 16130 auto VFIsProfitable = [](bool First, unsigned Size, 16131 const std::pair<unsigned, unsigned> &P) { 16132 return First ? 
Size >= P.first : Size >= P.second; 16133 }; 16134 auto FirstSizeSame = [](unsigned Size, 16135 const std::pair<unsigned, unsigned> &P) { 16136 return Size == P.first; 16137 }; 16138 while (true) { 16139 ++Repeat; 16140 bool RepeatChanged = false; 16141 bool AnyProfitableGraph = false; 16142 for (unsigned Size : CandidateVFs) { 16143 AnyProfitableGraph = false; 16144 unsigned StartIdx = std::distance( 16145 RangeSizes.begin(), 16146 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF, 16147 std::placeholders::_1))); 16148 while (StartIdx < End) { 16149 unsigned EndIdx = 16150 std::distance(RangeSizes.begin(), 16151 find_if(RangeSizes.drop_front(StartIdx), 16152 std::bind(IsVectorized, Size >= MaxRegVF, 16153 std::placeholders::_1))); 16154 unsigned Sz = EndIdx >= End ? End : EndIdx; 16155 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) { 16156 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size), 16157 Size >= MaxRegVF)) { 16158 ++Cnt; 16159 continue; 16160 } 16161 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size); 16162 assert(all_of(Slice, 16163 [&](Value *V) { 16164 return cast<StoreInst>(V) 16165 ->getValueOperand() 16166 ->getType() == 16167 cast<StoreInst>(Slice.front()) 16168 ->getValueOperand() 16169 ->getType(); 16170 }) && 16171 "Expected all operands of same type."); 16172 if (!NonSchedulable.empty()) { 16173 auto [NonSchedSizeMax, NonSchedSizeMin] = 16174 NonSchedulable.lookup(Slice.front()); 16175 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) { 16176 Cnt += NonSchedSizeMax; 16177 continue; 16178 } 16179 } 16180 unsigned TreeSize; 16181 std::optional<bool> Res = 16182 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize); 16183 if (!Res) { 16184 NonSchedulable 16185 .try_emplace(Slice.front(), std::make_pair(Size, Size)) 16186 .first->getSecond() 16187 .second = Size; 16188 } else if (*Res) { 16189 // Mark the vectorized stores so that we don't vectorize them 16190 // again. 16191 VectorizedStores.insert(Slice.begin(), Slice.end()); 16192 // Mark the vectorized stores so that we don't vectorize them 16193 // again. 16194 AnyProfitableGraph = RepeatChanged = Changed = true; 16195 // If we vectorized initial block, no need to try to vectorize 16196 // it again. 16197 for_each(RangeSizes.slice(Cnt, Size), 16198 [](std::pair<unsigned, unsigned> &P) { 16199 P.first = P.second = 0; 16200 }); 16201 if (Cnt < StartIdx + MinVF) { 16202 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx), 16203 [](std::pair<unsigned, unsigned> &P) { 16204 P.first = P.second = 0; 16205 }); 16206 StartIdx = Cnt + Size; 16207 } 16208 if (Cnt > Sz - Size - MinVF) { 16209 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)), 16210 [](std::pair<unsigned, unsigned> &P) { 16211 P.first = P.second = 0; 16212 }); 16213 if (Sz == End) 16214 End = Cnt; 16215 Sz = Cnt; 16216 } 16217 Cnt += Size; 16218 continue; 16219 } 16220 if (Size > 2 && Res && 16221 !all_of(RangeSizes.slice(Cnt, Size), 16222 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize, 16223 std::placeholders::_1))) { 16224 Cnt += Size; 16225 continue; 16226 } 16227 // Check for the very big VFs that we're not rebuilding same 16228 // trees, just with larger number of elements. 
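// (E.g., if every store in the candidate slice already recorded the same
// tree size, a wider slice would only rebuild that same tree, so skip past
// the whole run of equal recorded sizes.)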
16229 if (Size > MaxRegVF && TreeSize > 1 && 16230 all_of(RangeSizes.slice(Cnt, Size), 16231 std::bind(FirstSizeSame, TreeSize, 16232 std::placeholders::_1))) { 16233 Cnt += Size; 16234 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize) 16235 ++Cnt; 16236 continue; 16237 } 16238 if (TreeSize > 1) 16239 for_each(RangeSizes.slice(Cnt, Size), 16240 [&](std::pair<unsigned, unsigned> &P) { 16241 if (Size >= MaxRegVF) 16242 P.second = std::max(P.second, TreeSize); 16243 else 16244 P.first = std::max(P.first, TreeSize); 16245 }); 16246 ++Cnt; 16247 AnyProfitableGraph = true; 16248 } 16249 if (StartIdx >= End) 16250 break; 16251 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF) 16252 AnyProfitableGraph = true; 16253 StartIdx = std::distance( 16254 RangeSizes.begin(), 16255 find_if(RangeSizes.drop_front(Sz), 16256 std::bind(IsNotVectorized, Size >= MaxRegVF, 16257 std::placeholders::_1))); 16258 } 16259 if (!AnyProfitableGraph && Size >= MaxRegVF) 16260 break; 16261 } 16262 // All values vectorized - exit. 16263 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) { 16264 return P.first == 0 && P.second == 0; 16265 })) 16266 break; 16267 // Check if tried all attempts or no need for the last attempts at all. 16268 if (Repeat >= MaxAttempts || 16269 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph))) 16270 break; 16271 constexpr unsigned StoresLimit = 64; 16272 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>( 16273 Operands.size(), 16274 static_cast<unsigned>( 16275 End - 16276 std::distance( 16277 RangeSizes.begin(), 16278 find_if(RangeSizes, std::bind(IsNotVectorized, true, 16279 std::placeholders::_1))) + 16280 1))); 16281 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2; 16282 if (VF > MaxTotalNum || VF >= StoresLimit) 16283 break; 16284 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) { 16285 if (P.first != 0) 16286 P.first = std::max(P.second, P.first); 16287 }); 16288 // Last attempt to vectorize max number of elements, if all previous 16289 // attempts were unsuccessful because of the cost issues. 16290 CandidateVFs.clear(); 16291 CandidateVFs.push_back(VF); 16292 } 16293 } 16294 }; 16295 16296 // Stores pair (first: index of the store into Stores array ref, address of 16297 // which taken as base, second: sorted set of pairs {index, dist}, which are 16298 // indices of stores in the set and their store location distances relative to 16299 // the base address). 16300 16301 // Need to store the index of the very first store separately, since the set 16302 // may be reordered after the insertion and the first store may be moved. This 16303 // container allows to reduce number of calls of getPointersDiff() function. 16304 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores; 16305 // Inserts the specified store SI with the given index Idx to the set of the 16306 // stores. If the store with the same distance is found already - stop 16307 // insertion, try to vectorize already found stores. If some stores from this 16308 // sequence were not vectorized - try to vectorize them with the new store 16309 // later. But this logic is applied only to the stores, that come before the 16310 // previous store with the same distance. 16311 // Example: 16312 // 1. store x, %p 16313 // 2. store y, %p+1 16314 // 3. store z, %p+2 16315 // 4. store a, %p 16316 // 5. store b, %p+3 16317 // - Scan this from the last to first store. 
The very first bunch of stores is 16318 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores 16319 // vector). 16320 // - The next store in the list - #1 - has the same distance from store #5 as 16321 // the store #4. 16322 // - Try to vectorize sequence of stores 4,2,3,5. 16323 // - If all these stores are vectorized - just drop them. 16324 // - If some of them are not vectorized (say, #3 and #5), do extra analysis. 16325 // - Start a new store sequence. 16326 // The new bunch of stores is {1, {1, 0}}. 16327 // - Add the stores from the previous sequence that were not vectorized. 16328 // Here we consider the stores in reverse order, rather than the order in 16329 // which they are used in the IR (Stores are reversed already, see the vectorizeStoreChains() function). 16330 // Store #3 can be added -> comes after store #4 with the same distance as 16331 // store #1. 16332 // Store #5 cannot be added - comes before store #4. 16333 // This logic improves compile time: we assume that the stores coming 16334 // after the previous store with the same distance most likely have memory 16335 // dependencies, so there is no need to waste compile time trying to vectorize them. 16336 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 16337 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { 16338 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) { 16339 std::optional<int> Diff = getPointersDiff( 16340 Stores[Set.first]->getValueOperand()->getType(), 16341 Stores[Set.first]->getPointerOperand(), 16342 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, 16343 /*StrictCheck=*/true); 16344 if (!Diff) 16345 continue; 16346 auto It = Set.second.find(std::make_pair(Idx, *Diff)); 16347 if (It == Set.second.end()) { 16348 Set.second.emplace(Idx, *Diff); 16349 return; 16350 } 16351 // Try to vectorize the first found set to avoid duplicate analysis. 16352 TryToVectorize(Set.second); 16353 StoreIndexToDistSet PrevSet; 16354 PrevSet.swap(Set.second); 16355 Set.first = Idx; 16356 Set.second.emplace(Idx, 0); 16357 // Insert stores that followed previous match to try to vectorize them 16358 // with this store. 16359 unsigned StartIdx = It->first + 1; 16360 SmallBitVector UsedStores(Idx - StartIdx); 16361 // Distances to the previously found dup store (or this store, since they 16362 // store to the same addresses). 16363 SmallVector<int> Dists(Idx - StartIdx, 0); 16364 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) { 16365 // Do not try to vectorize sequences we already tried. 16366 if (Pair.first <= It->first || 16367 VectorizedStores.contains(Stores[Pair.first])) 16368 break; 16369 unsigned BI = Pair.first - StartIdx; 16370 UsedStores.set(BI); 16371 Dists[BI] = Pair.second - It->second; 16372 } 16373 for (unsigned I = StartIdx; I < Idx; ++I) { 16374 unsigned BI = I - StartIdx; 16375 if (UsedStores.test(BI)) 16376 Set.second.emplace(I, Dists[BI]); 16377 } 16378 return; 16379 } 16380 auto &Res = SortedStores.emplace_back(); 16381 Res.first = Idx; 16382 Res.second.emplace(Idx, 0); 16383 }; 16384 Type *PrevValTy = nullptr; 16385 for (auto [I, SI] : enumerate(Stores)) { 16386 if (R.isDeleted(SI)) 16387 continue; 16388 if (!PrevValTy) 16389 PrevValTy = SI->getValueOperand()->getType(); 16390 // Check that we do not try to vectorize stores of different types.
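// E.g., if a run of i32 stores is followed by i64 stores, the collected i32
// candidates are flushed (vectorization is attempted for them) before a fresh
// set is started for the i64 stores.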
16391 if (PrevValTy != SI->getValueOperand()->getType()) { 16392 for (auto &Set : SortedStores) 16393 TryToVectorize(Set.second); 16394 SortedStores.clear(); 16395 PrevValTy = SI->getValueOperand()->getType(); 16396 } 16397 FillStoresSet(I, SI); 16398 } 16399 16400 // Final vectorization attempt. 16401 for (auto &Set : SortedStores) 16402 TryToVectorize(Set.second); 16403 16404 return Changed; 16405 } 16406 16407 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { 16408 // Initialize the collections. We will make a single pass over the block. 16409 Stores.clear(); 16410 GEPs.clear(); 16411 16412 // Visit the store and getelementptr instructions in BB and organize them in 16413 // Stores and GEPs according to the underlying objects of their pointer 16414 // operands. 16415 for (Instruction &I : *BB) { 16416 // Ignore store instructions that are volatile or have a pointer operand 16417 // that doesn't point to a scalar type. 16418 if (auto *SI = dyn_cast<StoreInst>(&I)) { 16419 if (!SI->isSimple()) 16420 continue; 16421 if (!isValidElementType(SI->getValueOperand()->getType())) 16422 continue; 16423 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); 16424 } 16425 16426 // Ignore getelementptr instructions that have more than one index, a 16427 // constant index, or a pointer operand that doesn't point to a scalar 16428 // type. 16429 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { 16430 if (GEP->getNumIndices() != 1) 16431 continue; 16432 Value *Idx = GEP->idx_begin()->get(); 16433 if (isa<Constant>(Idx)) 16434 continue; 16435 if (!isValidElementType(Idx->getType())) 16436 continue; 16437 if (GEP->getType()->isVectorTy()) 16438 continue; 16439 GEPs[GEP->getPointerOperand()].push_back(GEP); 16440 } 16441 } 16442 } 16443 16444 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 16445 bool MaxVFOnly) { 16446 if (VL.size() < 2) 16447 return false; 16448 16449 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " 16450 << VL.size() << ".\n"); 16451 16452 // Check that all of the parts are instructions of the same type, 16453 // we permit an alternate opcode via InstructionsState. 16454 InstructionsState S = getSameOpcode(VL, *TLI); 16455 if (!S.getOpcode()) 16456 return false; 16457 16458 Instruction *I0 = cast<Instruction>(S.OpValue); 16459 // Make sure invalid types (including vector type) are rejected before 16460 // determining vectorization factor for scalar instructions. 16461 for (Value *V : VL) { 16462 Type *Ty = V->getType(); 16463 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) { 16464 // NOTE: the following will give user internal llvm type name, which may 16465 // not be useful. 
16466 R.getORE()->emit([&]() { 16467 std::string TypeStr; 16468 llvm::raw_string_ostream rso(TypeStr); 16469 Ty->print(rso); 16470 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) 16471 << "Cannot SLP vectorize list: type " 16472 << TypeStr + " is unsupported by vectorizer"; 16473 }); 16474 return false; 16475 } 16476 } 16477 16478 unsigned Sz = R.getVectorElementSize(I0); 16479 unsigned MinVF = R.getMinVF(Sz); 16480 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF); 16481 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); 16482 if (MaxVF < 2) { 16483 R.getORE()->emit([&]() { 16484 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) 16485 << "Cannot SLP vectorize list: vectorization factor " 16486 << "less than 2 is not supported"; 16487 }); 16488 return false; 16489 } 16490 16491 bool Changed = false; 16492 bool CandidateFound = false; 16493 InstructionCost MinCost = SLPCostThreshold.getValue(); 16494 Type *ScalarTy = VL[0]->getType(); 16495 if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) 16496 ScalarTy = IE->getOperand(1)->getType(); 16497 16498 unsigned NextInst = 0, MaxInst = VL.size(); 16499 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { 16500 // No actual vectorization should happen, if number of parts is the same as 16501 // provided vectorization factor (i.e. the scalar type is used for vector 16502 // code during codegen). 16503 auto *VecTy = getWidenedType(ScalarTy, VF); 16504 if (TTI->getNumberOfParts(VecTy) == VF) 16505 continue; 16506 for (unsigned I = NextInst; I < MaxInst; ++I) { 16507 unsigned ActualVF = std::min(MaxInst - I, VF); 16508 16509 if (!isPowerOf2_32(ActualVF)) 16510 continue; 16511 16512 if (MaxVFOnly && ActualVF < MaxVF) 16513 break; 16514 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2)) 16515 break; 16516 16517 ArrayRef<Value *> Ops = VL.slice(I, ActualVF); 16518 // Check that a previous iteration of this loop did not delete the Value. 16519 if (llvm::any_of(Ops, [&R](Value *V) { 16520 auto *I = dyn_cast<Instruction>(V); 16521 return I && R.isDeleted(I); 16522 })) 16523 continue; 16524 16525 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations " 16526 << "\n"); 16527 16528 R.buildTree(Ops); 16529 if (R.isTreeTinyAndNotFullyVectorizable()) 16530 continue; 16531 R.reorderTopToBottom(); 16532 R.reorderBottomToTop( 16533 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) && 16534 !R.doesRootHaveInTreeUses()); 16535 R.buildExternalUses(); 16536 16537 R.computeMinimumValueSizes(); 16538 R.transformNodes(); 16539 InstructionCost Cost = R.getTreeCost(); 16540 CandidateFound = true; 16541 MinCost = std::min(MinCost, Cost); 16542 16543 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost 16544 << " for VF=" << ActualVF << "\n"); 16545 if (Cost < -SLPCostThreshold) { 16546 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 16547 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", 16548 cast<Instruction>(Ops[0])) 16549 << "SLP vectorized with cost " << ore::NV("Cost", Cost) 16550 << " and with tree size " 16551 << ore::NV("TreeSize", R.getTreeSize())); 16552 16553 R.vectorizeTree(); 16554 // Move to the next bundle. 
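// E.g., with VF == 4 a successful attempt at I == 0 consumes scalars 0..3,
// so after the loop increment the next candidate bundle starts at I == 4.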
16555 I += VF - 1; 16556 NextInst = I + 1; 16557 Changed = true; 16558 } 16559 } 16560 } 16561 16562 if (!Changed && CandidateFound) { 16563 R.getORE()->emit([&]() { 16564 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0) 16565 << "List vectorization was possible but not beneficial with cost " 16566 << ore::NV("Cost", MinCost) << " >= " 16567 << ore::NV("Treshold", -SLPCostThreshold); 16568 }); 16569 } else if (!Changed) { 16570 R.getORE()->emit([&]() { 16571 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0) 16572 << "Cannot SLP vectorize list: vectorization was impossible" 16573 << " with available vectorization factors"; 16574 }); 16575 } 16576 return Changed; 16577 } 16578 16579 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { 16580 if (!I) 16581 return false; 16582 16583 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) 16584 return false; 16585 16586 Value *P = I->getParent(); 16587 16588 // Vectorize in current basic block only. 16589 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); 16590 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); 16591 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) 16592 return false; 16593 16594 // First collect all possible candidates 16595 SmallVector<std::pair<Value *, Value *>, 4> Candidates; 16596 Candidates.emplace_back(Op0, Op1); 16597 16598 auto *A = dyn_cast<BinaryOperator>(Op0); 16599 auto *B = dyn_cast<BinaryOperator>(Op1); 16600 // Try to skip B. 16601 if (A && B && B->hasOneUse()) { 16602 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 16603 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 16604 if (B0 && B0->getParent() == P) 16605 Candidates.emplace_back(A, B0); 16606 if (B1 && B1->getParent() == P) 16607 Candidates.emplace_back(A, B1); 16608 } 16609 // Try to skip A. 16610 if (B && A && A->hasOneUse()) { 16611 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 16612 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 16613 if (A0 && A0->getParent() == P) 16614 Candidates.emplace_back(A0, B); 16615 if (A1 && A1->getParent() == P) 16616 Candidates.emplace_back(A1, B); 16617 } 16618 16619 if (Candidates.size() == 1) 16620 return tryToVectorizeList({Op0, Op1}, R); 16621 16622 // We have multiple options. Try to pick the single best. 16623 std::optional<int> BestCandidate = R.findBestRootPair(Candidates); 16624 if (!BestCandidate) 16625 return false; 16626 return tryToVectorizeList( 16627 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); 16628 } 16629 16630 namespace { 16631 16632 /// Model horizontal reductions. 16633 /// 16634 /// A horizontal reduction is a tree of reduction instructions that has values 16635 /// that can be put into a vector as its leaves. For example: 16636 /// 16637 /// mul mul mul mul 16638 /// \ / \ / 16639 /// + + 16640 /// \ / 16641 /// + 16642 /// This tree has "mul" as its leaf values and "+" as its reduction 16643 /// instructions. A reduction can feed into a store or a binary operation 16644 /// feeding a phi. 16645 /// ... 16646 /// \ / 16647 /// + 16648 /// | 16649 /// phi += 16650 /// 16651 /// Or: 16652 /// ... 16653 /// \ / 16654 /// + 16655 /// | 16656 /// *p = 16657 /// 16658 class HorizontalReduction { 16659 using ReductionOpsType = SmallVector<Value *, 16>; 16660 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; 16661 ReductionOpsListType ReductionOps; 16662 /// List of possibly reduced values. 
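/// Each inner vector groups candidates of one kind (e.g. the same opcode, or
/// loads from related pointers), and the groups are typically processed
/// separately during the reduction attempt.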
16663 SmallVector<SmallVector<Value *>> ReducedVals; 16664 /// Maps reduced value to the corresponding reduction operation. 16665 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; 16666 // Use map vector to make stable output. 16667 MapVector<Instruction *, Value *> ExtraArgs; 16668 WeakTrackingVH ReductionRoot; 16669 /// The type of reduction operation. 16670 RecurKind RdxKind; 16671 /// Checks if the optimization of original scalar identity operations on 16672 /// matched horizontal reductions is enabled and allowed. 16673 bool IsSupportedHorRdxIdentityOp = false; 16674 16675 static bool isCmpSelMinMax(Instruction *I) { 16676 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && 16677 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); 16678 } 16679 16680 // And/or are potentially poison-safe logical patterns like: 16681 // select x, y, false 16682 // select x, true, y 16683 static bool isBoolLogicOp(Instruction *I) { 16684 return isa<SelectInst>(I) && 16685 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr())); 16686 } 16687 16688 /// Checks if instruction is associative and can be vectorized. 16689 static bool isVectorizable(RecurKind Kind, Instruction *I) { 16690 if (Kind == RecurKind::None) 16691 return false; 16692 16693 // Integer ops that map to select instructions or intrinsics are fine. 16694 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || 16695 isBoolLogicOp(I)) 16696 return true; 16697 16698 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { 16699 // FP min/max are associative except for NaN and -0.0. We do not 16700 // have to rule out -0.0 here because the intrinsic semantics do not 16701 // specify a fixed result for it. 16702 return I->getFastMathFlags().noNaNs(); 16703 } 16704 16705 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum) 16706 return true; 16707 16708 return I->isAssociative(); 16709 } 16710 16711 static Value *getRdxOperand(Instruction *I, unsigned Index) { 16712 // Poison-safe 'or' takes the form: select X, true, Y 16713 // To make that work with the normal operand processing, we skip the 16714 // true value operand. 16715 // TODO: Change the code and data structures to handle this without a hack. 16716 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) 16717 return I->getOperand(2); 16718 return I->getOperand(Index); 16719 } 16720 16721 /// Creates reduction operation with the current opcode. 
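/// E.g., for RecurKind::SMax this creates either the llvm.smax intrinsic or,
/// when \p UseSelect is set, the equivalent icmp sgt + select pair.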
16722 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS, 16723 Value *RHS, const Twine &Name, bool UseSelect) { 16724 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); 16725 switch (Kind) { 16726 case RecurKind::Or: 16727 if (UseSelect && 16728 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 16729 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name); 16730 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 16731 Name); 16732 case RecurKind::And: 16733 if (UseSelect && 16734 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType())) 16735 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name); 16736 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 16737 Name); 16738 case RecurKind::Add: 16739 case RecurKind::Mul: 16740 case RecurKind::Xor: 16741 case RecurKind::FAdd: 16742 case RecurKind::FMul: 16743 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, 16744 Name); 16745 case RecurKind::FMax: 16746 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); 16747 case RecurKind::FMin: 16748 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); 16749 case RecurKind::FMaximum: 16750 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); 16751 case RecurKind::FMinimum: 16752 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS); 16753 case RecurKind::SMax: 16754 if (UseSelect) { 16755 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); 16756 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 16757 } 16758 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); 16759 case RecurKind::SMin: 16760 if (UseSelect) { 16761 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); 16762 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 16763 } 16764 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); 16765 case RecurKind::UMax: 16766 if (UseSelect) { 16767 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); 16768 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 16769 } 16770 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); 16771 case RecurKind::UMin: 16772 if (UseSelect) { 16773 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); 16774 return Builder.CreateSelect(Cmp, LHS, RHS, Name); 16775 } 16776 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); 16777 default: 16778 llvm_unreachable("Unknown reduction operation."); 16779 } 16780 } 16781 16782 /// Creates reduction operation with the current opcode with the IR flags 16783 /// from \p ReductionOps, dropping nuw/nsw flags. 16784 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS, 16785 Value *RHS, const Twine &Name, 16786 const ReductionOpsListType &ReductionOps) { 16787 bool UseSelect = ReductionOps.size() == 2 || 16788 // Logical or/and. 
16789 (ReductionOps.size() == 1 && 16790 any_of(ReductionOps.front(), IsaPred<SelectInst>)); 16791 assert((!UseSelect || ReductionOps.size() != 2 || 16792 isa<SelectInst>(ReductionOps[1][0])) && 16793 "Expected cmp + select pairs for reduction"); 16794 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); 16795 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { 16796 if (auto *Sel = dyn_cast<SelectInst>(Op)) { 16797 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr, 16798 /*IncludeWrapFlags=*/false); 16799 propagateIRFlags(Op, ReductionOps[1], nullptr, 16800 /*IncludeWrapFlags=*/false); 16801 return Op; 16802 } 16803 } 16804 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false); 16805 return Op; 16806 } 16807 16808 public: 16809 static RecurKind getRdxKind(Value *V) { 16810 auto *I = dyn_cast<Instruction>(V); 16811 if (!I) 16812 return RecurKind::None; 16813 if (match(I, m_Add(m_Value(), m_Value()))) 16814 return RecurKind::Add; 16815 if (match(I, m_Mul(m_Value(), m_Value()))) 16816 return RecurKind::Mul; 16817 if (match(I, m_And(m_Value(), m_Value())) || 16818 match(I, m_LogicalAnd(m_Value(), m_Value()))) 16819 return RecurKind::And; 16820 if (match(I, m_Or(m_Value(), m_Value())) || 16821 match(I, m_LogicalOr(m_Value(), m_Value()))) 16822 return RecurKind::Or; 16823 if (match(I, m_Xor(m_Value(), m_Value()))) 16824 return RecurKind::Xor; 16825 if (match(I, m_FAdd(m_Value(), m_Value()))) 16826 return RecurKind::FAdd; 16827 if (match(I, m_FMul(m_Value(), m_Value()))) 16828 return RecurKind::FMul; 16829 16830 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) 16831 return RecurKind::FMax; 16832 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) 16833 return RecurKind::FMin; 16834 16835 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value()))) 16836 return RecurKind::FMaximum; 16837 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value()))) 16838 return RecurKind::FMinimum; 16839 // This matches either cmp+select or intrinsics. SLP is expected to handle 16840 // either form. 16841 // TODO: If we are canonicalizing to intrinsics, we can remove several 16842 // special-case paths that deal with selects. 16843 if (match(I, m_SMax(m_Value(), m_Value()))) 16844 return RecurKind::SMax; 16845 if (match(I, m_SMin(m_Value(), m_Value()))) 16846 return RecurKind::SMin; 16847 if (match(I, m_UMax(m_Value(), m_Value()))) 16848 return RecurKind::UMax; 16849 if (match(I, m_UMin(m_Value(), m_Value()))) 16850 return RecurKind::UMin; 16851 16852 if (auto *Select = dyn_cast<SelectInst>(I)) { 16853 // Try harder: look for min/max pattern based on instructions producing 16854 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 16855 // During the intermediate stages of SLP, it's very common to have 16856 // pattern like this (since optimizeGatherSequence is run only once 16857 // at the end): 16858 // %1 = extractelement <2 x i32> %a, i32 0 16859 // %2 = extractelement <2 x i32> %a, i32 1 16860 // %cond = icmp sgt i32 %1, %2 16861 // %3 = extractelement <2 x i32> %a, i32 0 16862 // %4 = extractelement <2 x i32> %a, i32 1 16863 // %select = select i1 %cond, i32 %3, i32 %4 16864 CmpInst::Predicate Pred; 16865 Instruction *L1; 16866 Instruction *L2; 16867 16868 Value *LHS = Select->getTrueValue(); 16869 Value *RHS = Select->getFalseValue(); 16870 Value *Cond = Select->getCondition(); 16871 16872 // TODO: Support inverse predicates. 
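// The three cases below accept a condition of the form (cmp LHS, X),
// (cmp X, RHS) or (cmp X, Y), where X/Y must be instructions identical to the
// select's operands (typically re-materialized extractelements, as in the
// example above).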
16873 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { 16874 if (!isa<ExtractElementInst>(RHS) || 16875 !L2->isIdenticalTo(cast<Instruction>(RHS))) 16876 return RecurKind::None; 16877 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { 16878 if (!isa<ExtractElementInst>(LHS) || 16879 !L1->isIdenticalTo(cast<Instruction>(LHS))) 16880 return RecurKind::None; 16881 } else { 16882 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) 16883 return RecurKind::None; 16884 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || 16885 !L1->isIdenticalTo(cast<Instruction>(LHS)) || 16886 !L2->isIdenticalTo(cast<Instruction>(RHS))) 16887 return RecurKind::None; 16888 } 16889 16890 switch (Pred) { 16891 default: 16892 return RecurKind::None; 16893 case CmpInst::ICMP_SGT: 16894 case CmpInst::ICMP_SGE: 16895 return RecurKind::SMax; 16896 case CmpInst::ICMP_SLT: 16897 case CmpInst::ICMP_SLE: 16898 return RecurKind::SMin; 16899 case CmpInst::ICMP_UGT: 16900 case CmpInst::ICMP_UGE: 16901 return RecurKind::UMax; 16902 case CmpInst::ICMP_ULT: 16903 case CmpInst::ICMP_ULE: 16904 return RecurKind::UMin; 16905 } 16906 } 16907 return RecurKind::None; 16908 } 16909 16910 /// Get the index of the first operand. 16911 static unsigned getFirstOperandIndex(Instruction *I) { 16912 return isCmpSelMinMax(I) ? 1 : 0; 16913 } 16914 16915 private: 16916 /// Total number of operands in the reduction operation. 16917 static unsigned getNumberOfOperands(Instruction *I) { 16918 return isCmpSelMinMax(I) ? 3 : 2; 16919 } 16920 16921 /// Checks if the instruction is in basic block \p BB. 16922 /// For a cmp+sel min/max reduction check that both ops are in \p BB. 16923 static bool hasSameParent(Instruction *I, BasicBlock *BB) { 16924 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) { 16925 auto *Sel = cast<SelectInst>(I); 16926 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition()); 16927 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; 16928 } 16929 return I->getParent() == BB; 16930 } 16931 16932 /// Expected number of uses for reduction operations/reduced values. 16933 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { 16934 if (IsCmpSelMinMax) { 16935 // SelectInst must be used twice while the condition op must have single 16936 // use only. 16937 if (auto *Sel = dyn_cast<SelectInst>(I)) 16938 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse(); 16939 return I->hasNUses(2); 16940 } 16941 16942 // Arithmetic reduction operation must be used once only. 16943 return I->hasOneUse(); 16944 } 16945 16946 /// Initializes the list of reduction operations. 16947 void initReductionOps(Instruction *I) { 16948 if (isCmpSelMinMax(I)) 16949 ReductionOps.assign(2, ReductionOpsType()); 16950 else 16951 ReductionOps.assign(1, ReductionOpsType()); 16952 } 16953 16954 /// Add all reduction operations for the reduction instruction \p I. 
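/// For a cmp+select min/max reduction both the compare (operand list 0) and
/// the select itself (operand list 1) are recorded; other reductions record
/// just the reduction instruction.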
16955 void addReductionOps(Instruction *I) { 16956 if (isCmpSelMinMax(I)) { 16957 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); 16958 ReductionOps[1].emplace_back(I); 16959 } else { 16960 ReductionOps[0].emplace_back(I); 16961 } 16962 } 16963 16964 static bool isGoodForReduction(ArrayRef<Value *> Data) { 16965 int Sz = Data.size(); 16966 auto *I = dyn_cast<Instruction>(Data.front()); 16967 return Sz > 1 || isConstant(Data.front()) || 16968 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode())); 16969 } 16970 16971 public: 16972 HorizontalReduction() = default; 16973 16974 /// Try to find a reduction tree. 16975 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, 16976 ScalarEvolution &SE, const DataLayout &DL, 16977 const TargetLibraryInfo &TLI) { 16978 RdxKind = HorizontalReduction::getRdxKind(Root); 16979 if (!isVectorizable(RdxKind, Root)) 16980 return false; 16981 16982 // Analyze "regular" integer/FP types for reductions - no target-specific 16983 // types or pointers. 16984 Type *Ty = Root->getType(); 16985 if (!isValidElementType(Ty) || Ty->isPointerTy()) 16986 return false; 16987 16988 // Though the ultimate reduction may have multiple uses, its condition must 16989 // have only single use. 16990 if (auto *Sel = dyn_cast<SelectInst>(Root)) 16991 if (!Sel->getCondition()->hasOneUse()) 16992 return false; 16993 16994 ReductionRoot = Root; 16995 16996 // Iterate through all the operands of the possible reduction tree and 16997 // gather all the reduced values, sorting them by their value id. 16998 BasicBlock *BB = Root->getParent(); 16999 bool IsCmpSelMinMax = isCmpSelMinMax(Root); 17000 SmallVector<Instruction *> Worklist(1, Root); 17001 // Checks if the operands of the \p TreeN instruction are also reduction 17002 // operations or should be treated as reduced values or an extra argument, 17003 // which is not part of the reduction. 17004 auto CheckOperands = [&](Instruction *TreeN, 17005 SmallVectorImpl<Value *> &ExtraArgs, 17006 SmallVectorImpl<Value *> &PossibleReducedVals, 17007 SmallVectorImpl<Instruction *> &ReductionOps) { 17008 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN), 17009 getNumberOfOperands(TreeN)))) { 17010 Value *EdgeVal = getRdxOperand(TreeN, I); 17011 ReducedValsToOps[EdgeVal].push_back(TreeN); 17012 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); 17013 // Edge has wrong parent - mark as an extra argument. 17014 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && 17015 !hasSameParent(EdgeInst, BB)) { 17016 ExtraArgs.push_back(EdgeVal); 17017 continue; 17018 } 17019 // If the edge is not an instruction, or it is different from the main 17020 // reduction opcode or has too many uses - possible reduced value. 17021 // Also, do not try to reduce const values, if the operation is not 17022 // foldable. 17023 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || 17024 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || 17025 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || 17026 !isVectorizable(RdxKind, EdgeInst) || 17027 (R.isAnalyzedReductionRoot(EdgeInst) && 17028 all_of(EdgeInst->operands(), IsaPred<Constant>))) { 17029 PossibleReducedVals.push_back(EdgeVal); 17030 continue; 17031 } 17032 ReductionOps.push_back(EdgeInst); 17033 } 17034 }; 17035 // Try to regroup reduced values so that it gets more profitable to try to 17036 // reduce them. 
Values are grouped by their value ids, instructions - by 17037 // instruction op id and/or alternate op id, plus do extra analysis for 17038 // loads (grouping them by the distabce between pointers) and cmp 17039 // instructions (grouping them by the predicate). 17040 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> 17041 PossibleReducedVals; 17042 initReductionOps(Root); 17043 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap; 17044 SmallSet<size_t, 2> LoadKeyUsed; 17045 17046 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { 17047 Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); 17048 if (LoadKeyUsed.contains(Key)) { 17049 auto LIt = LoadsMap.find(Ptr); 17050 if (LIt != LoadsMap.end()) { 17051 for (LoadInst *RLI : LIt->second) { 17052 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), 17053 LI->getType(), LI->getPointerOperand(), DL, SE, 17054 /*StrictCheck=*/true)) 17055 return hash_value(RLI->getPointerOperand()); 17056 } 17057 for (LoadInst *RLI : LIt->second) { 17058 if (arePointersCompatible(RLI->getPointerOperand(), 17059 LI->getPointerOperand(), TLI)) { 17060 hash_code SubKey = hash_value(RLI->getPointerOperand()); 17061 return SubKey; 17062 } 17063 } 17064 if (LIt->second.size() > 2) { 17065 hash_code SubKey = 17066 hash_value(LIt->second.back()->getPointerOperand()); 17067 return SubKey; 17068 } 17069 } 17070 } 17071 LoadKeyUsed.insert(Key); 17072 LoadsMap.try_emplace(Ptr).first->second.push_back(LI); 17073 return hash_value(LI->getPointerOperand()); 17074 }; 17075 17076 while (!Worklist.empty()) { 17077 Instruction *TreeN = Worklist.pop_back_val(); 17078 SmallVector<Value *> Args; 17079 SmallVector<Value *> PossibleRedVals; 17080 SmallVector<Instruction *> PossibleReductionOps; 17081 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); 17082 // If too many extra args - mark the instruction itself as a reduction 17083 // value, not a reduction operation. 17084 if (Args.size() < 2) { 17085 addReductionOps(TreeN); 17086 // Add extra args. 17087 if (!Args.empty()) { 17088 assert(Args.size() == 1 && "Expected only single argument."); 17089 ExtraArgs[TreeN] = Args.front(); 17090 } 17091 // Add reduction values. The values are sorted for better vectorization 17092 // results. 17093 for (Value *V : PossibleRedVals) { 17094 size_t Key, Idx; 17095 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, 17096 /*AllowAlternate=*/false); 17097 ++PossibleReducedVals[Key][Idx] 17098 .insert(std::make_pair(V, 0)) 17099 .first->second; 17100 } 17101 Worklist.append(PossibleReductionOps.rbegin(), 17102 PossibleReductionOps.rend()); 17103 } else { 17104 size_t Key, Idx; 17105 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey, 17106 /*AllowAlternate=*/false); 17107 ++PossibleReducedVals[Key][Idx] 17108 .insert(std::make_pair(TreeN, 0)) 17109 .first->second; 17110 } 17111 } 17112 auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); 17113 // Sort values by the total number of values kinds to start the reduction 17114 // from the longest possible reduced values sequences. 
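// E.g., if the grouping produced {%a, %a, %a, %a} and {%b, %c}, the
// four-element group is emitted into ReducedVals first and is therefore
// attempted first.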
17115 for (auto &PossibleReducedVals : PossibleReducedValsVect) { 17116 auto PossibleRedVals = PossibleReducedVals.second.takeVector(); 17117 SmallVector<SmallVector<Value *>> PossibleRedValsVect; 17118 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); 17119 It != E; ++It) { 17120 PossibleRedValsVect.emplace_back(); 17121 auto RedValsVect = It->second.takeVector(); 17122 stable_sort(RedValsVect, llvm::less_second()); 17123 for (const std::pair<Value *, unsigned> &Data : RedValsVect) 17124 PossibleRedValsVect.back().append(Data.second, Data.first); 17125 } 17126 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { 17127 return P1.size() > P2.size(); 17128 }); 17129 int NewIdx = -1; 17130 for (ArrayRef<Value *> Data : PossibleRedValsVect) { 17131 if (NewIdx < 0 || 17132 (!isGoodForReduction(Data) && 17133 (!isa<LoadInst>(Data.front()) || 17134 !isa<LoadInst>(ReducedVals[NewIdx].front()) || 17135 getUnderlyingObject( 17136 cast<LoadInst>(Data.front())->getPointerOperand()) != 17137 getUnderlyingObject( 17138 cast<LoadInst>(ReducedVals[NewIdx].front()) 17139 ->getPointerOperand())))) { 17140 NewIdx = ReducedVals.size(); 17141 ReducedVals.emplace_back(); 17142 } 17143 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); 17144 } 17145 } 17146 // Sort the reduced values by number of same/alternate opcode and/or pointer 17147 // operand. 17148 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { 17149 return P1.size() > P2.size(); 17150 }); 17151 return true; 17152 } 17153 17154 /// Attempt to vectorize the tree found by matchAssociativeReduction. 17155 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, 17156 const TargetLibraryInfo &TLI) { 17157 constexpr int ReductionLimit = 4; 17158 constexpr unsigned RegMaxNumber = 4; 17159 constexpr unsigned RedValsMaxNumber = 128; 17160 // If there are a sufficient number of reduction values, reduce 17161 // to a nearby power-of-2. We can safely generate oversized 17162 // vectors and rely on the backend to split them to legal sizes. 17163 unsigned NumReducedVals = 17164 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0, 17165 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned { 17166 if (!isGoodForReduction(Vals)) 17167 return Num; 17168 return Num + Vals.size(); 17169 }); 17170 if (NumReducedVals < ReductionLimit && 17171 (!AllowHorRdxIdenityOptimization || 17172 all_of(ReducedVals, [](ArrayRef<Value *> RedV) { 17173 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV); 17174 }))) { 17175 for (ReductionOpsType &RdxOps : ReductionOps) 17176 for (Value *RdxOp : RdxOps) 17177 V.analyzedReductionRoot(cast<Instruction>(RdxOp)); 17178 return nullptr; 17179 } 17180 17181 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(), 17182 TargetFolder(DL)); 17183 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); 17184 17185 // Track the reduced values in case if they are replaced by extractelement 17186 // because of the vectorization. 17187 DenseMap<Value *, WeakTrackingVH> TrackedVals( 17188 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size()); 17189 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; 17190 SmallVector<std::pair<Value *, Value *>> ReplacedExternals; 17191 ExternallyUsedValues.reserve(ExtraArgs.size() + 1); 17192 // The same extra argument may be used several times, so log each attempt 17193 // to use it. 
17194 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { 17195 assert(Pair.first && "DebugLoc must be set."); 17196 ExternallyUsedValues[Pair.second].push_back(Pair.first); 17197 TrackedVals.try_emplace(Pair.second, Pair.second); 17198 } 17199 17200 // The compare instruction of a min/max is the insertion point for new 17201 // instructions and may be replaced with a new compare instruction. 17202 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { 17203 assert(isa<SelectInst>(RdxRootInst) && 17204 "Expected min/max reduction to have select root instruction"); 17205 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); 17206 assert(isa<Instruction>(ScalarCond) && 17207 "Expected min/max reduction to have compare condition"); 17208 return cast<Instruction>(ScalarCond); 17209 }; 17210 17211 // Return new VectorizedTree, based on previous value. 17212 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { 17213 if (VectorizedTree) { 17214 // Update the final value in the reduction. 17215 Builder.SetCurrentDebugLocation( 17216 cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); 17217 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) || 17218 (isGuaranteedNotToBePoison(Res) && 17219 !isGuaranteedNotToBePoison(VectorizedTree))) { 17220 auto It = ReducedValsToOps.find(Res); 17221 if (It != ReducedValsToOps.end() && 17222 any_of(It->getSecond(), 17223 [](Instruction *I) { return isBoolLogicOp(I); })) 17224 std::swap(VectorizedTree, Res); 17225 } 17226 17227 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx", 17228 ReductionOps); 17229 } 17230 // Initialize the final value in the reduction. 17231 return Res; 17232 }; 17233 bool AnyBoolLogicOp = 17234 any_of(ReductionOps.back(), [](Value *V) { 17235 return isBoolLogicOp(cast<Instruction>(V)); 17236 }); 17237 // The reduction root is used as the insertion point for new instructions, 17238 // so set it as externally used to prevent it from being deleted. 17239 ExternallyUsedValues[ReductionRoot]; 17240 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() * 17241 ReductionOps.front().size()); 17242 for (ReductionOpsType &RdxOps : ReductionOps) 17243 for (Value *RdxOp : RdxOps) { 17244 if (!RdxOp) 17245 continue; 17246 IgnoreList.insert(RdxOp); 17247 } 17248 // Intersect the fast-math-flags from all reduction operations. 17249 FastMathFlags RdxFMF; 17250 RdxFMF.set(); 17251 for (Value *U : IgnoreList) 17252 if (auto *FPMO = dyn_cast<FPMathOperator>(U)) 17253 RdxFMF &= FPMO->getFastMathFlags(); 17254 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); 17255 17256 // Need to track reduced vals, they may be changed during vectorization of 17257 // subvectors. 17258 for (ArrayRef<Value *> Candidates : ReducedVals) 17259 for (Value *V : Candidates) 17260 TrackedVals.try_emplace(V, V); 17261 17262 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size()); 17263 // List of the values that were reduced in other trees as part of gather 17264 // nodes and thus requiring extract if fully vectorized in other trees. 17265 SmallPtrSet<Value *, 4> RequiredExtract; 17266 Value *VectorizedTree = nullptr; 17267 bool CheckForReusedReductionOps = false; 17268 // Try to vectorize elements based on their type. 
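// The per-group InstructionsState computed here is reused in the loop below
// to filter out candidates that were folded or replaced by values that are
// no longer compatible with their group.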
17269 SmallVector<InstructionsState> States; 17270 for (ArrayRef<Value *> RV : ReducedVals) 17271 States.push_back(getSameOpcode(RV, TLI)); 17272 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { 17273 ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; 17274 InstructionsState S = States[I]; 17275 SmallVector<Value *> Candidates; 17276 Candidates.reserve(2 * OrigReducedVals.size()); 17277 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size()); 17278 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { 17279 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; 17280 // Check if the reduction value was not overriden by the extractelement 17281 // instruction because of the vectorization and exclude it, if it is not 17282 // compatible with other values. 17283 // Also check if the instruction was folded to constant/other value. 17284 auto *Inst = dyn_cast<Instruction>(RdxVal); 17285 if ((Inst && isVectorLikeInstWithConstOps(Inst) && 17286 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || 17287 (S.getOpcode() && !Inst)) 17288 continue; 17289 Candidates.push_back(RdxVal); 17290 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); 17291 } 17292 bool ShuffledExtracts = false; 17293 // Try to handle shuffled extractelements. 17294 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && 17295 I + 1 < E) { 17296 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI); 17297 if (NextS.getOpcode() == Instruction::ExtractElement && 17298 !NextS.isAltShuffle()) { 17299 SmallVector<Value *> CommonCandidates(Candidates); 17300 for (Value *RV : ReducedVals[I + 1]) { 17301 Value *RdxVal = TrackedVals.find(RV)->second; 17302 // Check if the reduction value was not overriden by the 17303 // extractelement instruction because of the vectorization and 17304 // exclude it, if it is not compatible with other values. 17305 if (auto *Inst = dyn_cast<Instruction>(RdxVal)) 17306 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst)) 17307 continue; 17308 CommonCandidates.push_back(RdxVal); 17309 TrackedToOrig.try_emplace(RdxVal, RV); 17310 } 17311 SmallVector<int> Mask; 17312 if (isFixedVectorShuffle(CommonCandidates, Mask)) { 17313 ++I; 17314 Candidates.swap(CommonCandidates); 17315 ShuffledExtracts = true; 17316 } 17317 } 17318 } 17319 17320 // Emit code for constant values. 17321 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 && 17322 allConstant(Candidates)) { 17323 Value *Res = Candidates.front(); 17324 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond(); 17325 for (Value *VC : ArrayRef(Candidates).drop_front()) { 17326 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps); 17327 ++VectorizedVals.try_emplace(VC, 0).first->getSecond(); 17328 if (auto *ResI = dyn_cast<Instruction>(Res)) 17329 V.analyzedReductionRoot(ResI); 17330 } 17331 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res); 17332 continue; 17333 } 17334 17335 unsigned NumReducedVals = Candidates.size(); 17336 if (NumReducedVals < ReductionLimit && 17337 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization || 17338 !isSplat(Candidates))) 17339 continue; 17340 17341 // Check if we support repeated scalar values processing (optimization of 17342 // original scalar identity operations on matched horizontal reductions). 17343 IsSupportedHorRdxIdentityOp = 17344 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul && 17345 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd; 17346 // Gather same values. 
17347 MapVector<Value *, unsigned> SameValuesCounter; 17348 if (IsSupportedHorRdxIdentityOp) 17349 for (Value *V : Candidates) 17350 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second; 17351 // Used to check if the reduced values used same number of times. In this 17352 // case the compiler may produce better code. E.g. if reduced values are 17353 // aabbccdd (8 x values), then the first node of the tree will have a node 17354 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 17355 // Plus, the final reduction will be performed on <8 x aabbccdd>. 17356 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4 17357 // x abcd) * 2. 17358 // Currently it only handles add/fadd/xor. and/or/min/max do not require 17359 // this analysis, other operations may require an extra estimation of 17360 // the profitability. 17361 bool SameScaleFactor = false; 17362 bool OptReusedScalars = IsSupportedHorRdxIdentityOp && 17363 SameValuesCounter.size() != Candidates.size(); 17364 if (OptReusedScalars) { 17365 SameScaleFactor = 17366 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || 17367 RdxKind == RecurKind::Xor) && 17368 all_of(drop_begin(SameValuesCounter), 17369 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { 17370 return P.second == SameValuesCounter.front().second; 17371 }); 17372 Candidates.resize(SameValuesCounter.size()); 17373 transform(SameValuesCounter, Candidates.begin(), 17374 [](const auto &P) { return P.first; }); 17375 NumReducedVals = Candidates.size(); 17376 // Have a reduction of the same element. 17377 if (NumReducedVals == 1) { 17378 Value *OrigV = TrackedToOrig.find(Candidates.front())->second; 17379 unsigned Cnt = SameValuesCounter.lookup(OrigV); 17380 Value *RedVal = 17381 emitScaleForReusedOps(Candidates.front(), Builder, Cnt); 17382 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); 17383 VectorizedVals.try_emplace(OrigV, Cnt); 17384 continue; 17385 } 17386 } 17387 17388 unsigned MaxVecRegSize = V.getMaxVecRegSize(); 17389 unsigned EltSize = V.getVectorElementSize(Candidates[0]); 17390 unsigned MaxElts = 17391 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize); 17392 17393 unsigned ReduxWidth = std::min<unsigned>( 17394 llvm::bit_floor(NumReducedVals), 17395 std::clamp<unsigned>(MaxElts, RedValsMaxNumber, 17396 RegMaxNumber * RedValsMaxNumber)); 17397 unsigned Start = 0; 17398 unsigned Pos = Start; 17399 // Restarts vectorization attempt with lower vector factor. 17400 unsigned PrevReduxWidth = ReduxWidth; 17401 bool CheckForReusedReductionOpsLocal = false; 17402 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, 17403 &CheckForReusedReductionOpsLocal, 17404 &PrevReduxWidth, &V, 17405 &IgnoreList](bool IgnoreVL = false) { 17406 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); 17407 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { 17408 // Check if any of the reduction ops are gathered. If so, worth 17409 // trying again with less number of reduction ops. 17410 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; 17411 } 17412 ++Pos; 17413 if (Pos < NumReducedVals - ReduxWidth + 1) 17414 return IsAnyRedOpGathered; 17415 Pos = Start; 17416 ReduxWidth /= 2; 17417 return IsAnyRedOpGathered; 17418 }; 17419 bool AnyVectorized = false; 17420 while (Pos < NumReducedVals - ReduxWidth + 1 && 17421 ReduxWidth >= ReductionLimit) { 17422 // Dependency in tree of the reduction ops - drop this attempt, try 17423 // later. 
17424 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && 17425 Start == 0) { 17426 CheckForReusedReductionOps = true; 17427 break; 17428 } 17429 PrevReduxWidth = ReduxWidth; 17430 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); 17431 // Beeing analyzed already - skip. 17432 if (V.areAnalyzedReductionVals(VL)) { 17433 (void)AdjustReducedVals(/*IgnoreVL=*/true); 17434 continue; 17435 } 17436 // Early exit if any of the reduction values were deleted during 17437 // previous vectorization attempts. 17438 if (any_of(VL, [&V](Value *RedVal) { 17439 auto *RedValI = dyn_cast<Instruction>(RedVal); 17440 if (!RedValI) 17441 return false; 17442 return V.isDeleted(RedValI); 17443 })) 17444 break; 17445 V.buildTree(VL, IgnoreList); 17446 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { 17447 if (!AdjustReducedVals()) 17448 V.analyzedReductionVals(VL); 17449 continue; 17450 } 17451 if (V.isLoadCombineReductionCandidate(RdxKind)) { 17452 if (!AdjustReducedVals()) 17453 V.analyzedReductionVals(VL); 17454 continue; 17455 } 17456 V.reorderTopToBottom(); 17457 // No need to reorder the root node at all. 17458 V.reorderBottomToTop(/*IgnoreReorder=*/true); 17459 // Keep extracted other reduction values, if they are used in the 17460 // vectorization trees. 17461 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( 17462 ExternallyUsedValues); 17463 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { 17464 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) 17465 continue; 17466 for (Value *V : ReducedVals[Cnt]) 17467 if (isa<Instruction>(V)) 17468 LocalExternallyUsedValues[TrackedVals[V]]; 17469 } 17470 if (!IsSupportedHorRdxIdentityOp) { 17471 // Number of uses of the candidates in the vector of values. 17472 assert(SameValuesCounter.empty() && 17473 "Reused values counter map is not empty"); 17474 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { 17475 if (Cnt >= Pos && Cnt < Pos + ReduxWidth) 17476 continue; 17477 Value *V = Candidates[Cnt]; 17478 Value *OrigV = TrackedToOrig.find(V)->second; 17479 ++SameValuesCounter[OrigV]; 17480 } 17481 } 17482 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end()); 17483 // Gather externally used values. 17484 SmallPtrSet<Value *, 4> Visited; 17485 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { 17486 if (Cnt >= Pos && Cnt < Pos + ReduxWidth) 17487 continue; 17488 Value *RdxVal = Candidates[Cnt]; 17489 if (!Visited.insert(RdxVal).second) 17490 continue; 17491 // Check if the scalar was vectorized as part of the vectorization 17492 // tree but not the top node. 17493 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { 17494 LocalExternallyUsedValues[RdxVal]; 17495 continue; 17496 } 17497 Value *OrigV = TrackedToOrig.find(RdxVal)->second; 17498 unsigned NumOps = 17499 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV]; 17500 if (NumOps != ReducedValsToOps.find(OrigV)->second.size()) 17501 LocalExternallyUsedValues[RdxVal]; 17502 } 17503 // Do not need the list of reused scalars in regular mode anymore. 17504 if (!IsSupportedHorRdxIdentityOp) 17505 SameValuesCounter.clear(); 17506 for (Value *RdxVal : VL) 17507 if (RequiredExtract.contains(RdxVal)) 17508 LocalExternallyUsedValues[RdxVal]; 17509 // Update LocalExternallyUsedValues for the scalar, replaced by 17510 // extractelement instructions. 
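// Resolve chains of replacements back to the original external value: if A
// was replaced by B and B in turn by C, the external uses recorded for A
// must be re-attributed to C.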
17511 DenseMap<Value *, Value *> ReplacementToExternal; 17512 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) 17513 ReplacementToExternal.try_emplace(Pair.second, Pair.first); 17514 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) { 17515 Value *Ext = Pair.first; 17516 auto RIt = ReplacementToExternal.find(Ext); 17517 while (RIt != ReplacementToExternal.end()) { 17518 Ext = RIt->second; 17519 RIt = ReplacementToExternal.find(Ext); 17520 } 17521 auto *It = ExternallyUsedValues.find(Ext); 17522 if (It == ExternallyUsedValues.end()) 17523 continue; 17524 LocalExternallyUsedValues[Pair.second].append(It->second); 17525 } 17526 V.buildExternalUses(LocalExternallyUsedValues); 17527 17528 V.computeMinimumValueSizes(); 17529 V.transformNodes(); 17530 17531 // Estimate cost. 17532 InstructionCost TreeCost = V.getTreeCost(VL); 17533 InstructionCost ReductionCost = 17534 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF); 17535 InstructionCost Cost = TreeCost + ReductionCost; 17536 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost 17537 << " for reduction\n"); 17538 if (!Cost.isValid()) 17539 break; 17540 if (Cost >= -SLPCostThreshold) { 17541 V.getORE()->emit([&]() { 17542 return OptimizationRemarkMissed( 17543 SV_NAME, "HorSLPNotBeneficial", 17544 ReducedValsToOps.find(VL[0])->second.front()) 17545 << "Vectorizing horizontal reduction is possible " 17546 << "but not beneficial with cost " << ore::NV("Cost", Cost) 17547 << " and threshold " 17548 << ore::NV("Threshold", -SLPCostThreshold); 17549 }); 17550 if (!AdjustReducedVals()) 17551 V.analyzedReductionVals(VL); 17552 continue; 17553 } 17554 17555 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" 17556 << Cost << ". (HorRdx)\n"); 17557 V.getORE()->emit([&]() { 17558 return OptimizationRemark( 17559 SV_NAME, "VectorizedHorizontalReduction", 17560 ReducedValsToOps.find(VL[0])->second.front()) 17561 << "Vectorized horizontal reduction with cost " 17562 << ore::NV("Cost", Cost) << " and with tree size " 17563 << ore::NV("TreeSize", V.getTreeSize()); 17564 }); 17565 17566 Builder.setFastMathFlags(RdxFMF); 17567 17568 // Emit a reduction. If the root is a select (min/max idiom), the insert 17569 // point is the compare condition of that select. 17570 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); 17571 Instruction *InsertPt = RdxRootInst; 17572 if (IsCmpSelMinMax) 17573 InsertPt = GetCmpForMinMaxReduction(RdxRootInst); 17574 17575 // Vectorize a tree. 17576 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, 17577 ReplacedExternals, InsertPt); 17578 17579 Builder.SetInsertPoint(InsertPt); 17580 17581 // To prevent poison from leaking across what used to be sequential, 17582 // safe, scalar boolean logic operations, the reduction operand must be 17583 // frozen. 17584 if ((isBoolLogicOp(RdxRootInst) || 17585 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) && 17586 !isGuaranteedNotToBePoison(VectorizedRoot)) 17587 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); 17588 17589 // Emit code to correctly handle reused reduced values, if required. 
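// E.g. an add reduction over {a, a, a, b, b, b, c, d} builds the tree over
// {a, b, c, d}; emitReusedOps then multiplies the vector lane-wise by
// <3, 3, 1, 1> before the final horizontal add.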
17590 if (OptReusedScalars && !SameScaleFactor) { 17591 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V, 17592 SameValuesCounter, TrackedToOrig); 17593 } 17594 17595 Value *ReducedSubTree = 17596 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); 17597 if (ReducedSubTree->getType() != VL.front()->getType()) { 17598 assert(ReducedSubTree->getType() != VL.front()->getType() && 17599 "Expected different reduction type."); 17600 ReducedSubTree = 17601 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(), 17602 V.isSignedMinBitwidthRootNode()); 17603 } 17604 17605 // Improved analysis for add/fadd/xor reductions with same scale factor 17606 // for all operands of reductions. We can emit scalar ops for them 17607 // instead. 17608 if (OptReusedScalars && SameScaleFactor) 17609 ReducedSubTree = emitScaleForReusedOps( 17610 ReducedSubTree, Builder, SameValuesCounter.front().second); 17611 17612 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); 17613 // Count vectorized reduced values to exclude them from final reduction. 17614 for (Value *RdxVal : VL) { 17615 Value *OrigV = TrackedToOrig.find(RdxVal)->second; 17616 if (IsSupportedHorRdxIdentityOp) { 17617 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]); 17618 continue; 17619 } 17620 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond(); 17621 if (!V.isVectorized(RdxVal)) 17622 RequiredExtract.insert(RdxVal); 17623 } 17624 Pos += ReduxWidth; 17625 Start = Pos; 17626 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); 17627 AnyVectorized = true; 17628 } 17629 if (OptReusedScalars && !AnyVectorized) { 17630 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) { 17631 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second); 17632 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); 17633 Value *OrigV = TrackedToOrig.find(P.first)->second; 17634 VectorizedVals.try_emplace(OrigV, P.second); 17635 } 17636 continue; 17637 } 17638 } 17639 if (VectorizedTree) { 17640 // Reorder operands of bool logical op in the natural order to avoid 17641 // possible problem with poison propagation. If not possible to reorder 17642 // (both operands are originally RHS), emit an extra freeze instruction 17643 // for the LHS operand. 17644 // I.e., if we have original code like this: 17645 // RedOp1 = select i1 ?, i1 LHS, i1 false 17646 // RedOp2 = select i1 RHS, i1 ?, i1 false 17647 17648 // Then, we swap LHS/RHS to create a new op that matches the poison 17649 // semantics of the original code. 17650 17651 // If we have original code like this and both values could be poison: 17652 // RedOp1 = select i1 ?, i1 LHS, i1 false 17653 // RedOp2 = select i1 ?, i1 RHS, i1 false 17654 17655 // Then, we must freeze LHS in the new op. 17656 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, 17657 Instruction *RedOp1, 17658 Instruction *RedOp2, 17659 bool InitStep) { 17660 if (!AnyBoolLogicOp) 17661 return; 17662 if (isBoolLogicOp(RedOp1) && 17663 ((!InitStep && LHS == VectorizedTree) || 17664 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS))) 17665 return; 17666 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) || 17667 getRdxOperand(RedOp2, 0) == RHS || 17668 isGuaranteedNotToBePoison(RHS))) { 17669 std::swap(LHS, RHS); 17670 return; 17671 } 17672 if (LHS != VectorizedTree) 17673 LHS = Builder.CreateFreeze(LHS); 17674 }; 17675 // Finish the reduction. 
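// (The remaining scalars are combined pairwise, giving log2(N) rounds of
// "op.rdx" operations instead of one long serial chain.)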
17676 // Need to add extra arguments and not vectorized possible reduction 17677 // values. 17678 // Try to avoid dependencies between the scalar remainders after 17679 // reductions. 17680 auto FinalGen = 17681 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, 17682 bool InitStep) { 17683 unsigned Sz = InstVals.size(); 17684 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + 17685 Sz % 2); 17686 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { 17687 Instruction *RedOp = InstVals[I + 1].first; 17688 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); 17689 Value *RdxVal1 = InstVals[I].second; 17690 Value *StableRdxVal1 = RdxVal1; 17691 auto It1 = TrackedVals.find(RdxVal1); 17692 if (It1 != TrackedVals.end()) 17693 StableRdxVal1 = It1->second; 17694 Value *RdxVal2 = InstVals[I + 1].second; 17695 Value *StableRdxVal2 = RdxVal2; 17696 auto It2 = TrackedVals.find(RdxVal2); 17697 if (It2 != TrackedVals.end()) 17698 StableRdxVal2 = It2->second; 17699 // To prevent poison from leaking across what used to be 17700 // sequential, safe, scalar boolean logic operations, the 17701 // reduction operand must be frozen. 17702 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, 17703 RedOp, InitStep); 17704 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1, 17705 StableRdxVal2, "op.rdx", ReductionOps); 17706 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed); 17707 } 17708 if (Sz % 2 == 1) 17709 ExtraReds[Sz / 2] = InstVals.back(); 17710 return ExtraReds; 17711 }; 17712 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; 17713 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot), 17714 VectorizedTree); 17715 SmallPtrSet<Value *, 8> Visited; 17716 for (ArrayRef<Value *> Candidates : ReducedVals) { 17717 for (Value *RdxVal : Candidates) { 17718 if (!Visited.insert(RdxVal).second) 17719 continue; 17720 unsigned NumOps = VectorizedVals.lookup(RdxVal); 17721 for (Instruction *RedOp : 17722 ArrayRef(ReducedValsToOps.find(RdxVal)->second) 17723 .drop_back(NumOps)) 17724 ExtraReductions.emplace_back(RedOp, RdxVal); 17725 } 17726 } 17727 for (auto &Pair : ExternallyUsedValues) { 17728 // Add each externally used value to the final reduction. 17729 for (auto *I : Pair.second) 17730 ExtraReductions.emplace_back(I, Pair.first); 17731 } 17732 // Iterate through all not-vectorized reduction values/extra arguments. 17733 bool InitStep = true; 17734 while (ExtraReductions.size() > 1) { 17735 SmallVector<std::pair<Instruction *, Value *>> NewReds = 17736 FinalGen(ExtraReductions, InitStep); 17737 ExtraReductions.swap(NewReds); 17738 InitStep = false; 17739 } 17740 VectorizedTree = ExtraReductions.front().second; 17741 17742 ReductionRoot->replaceAllUsesWith(VectorizedTree); 17743 17744 // The original scalar reduction is expected to have no remaining 17745 // uses outside the reduction tree itself. Assert that we got this 17746 // correct, replace internal uses with undef, and mark for eventual 17747 // deletion. 
17748 #ifndef NDEBUG 17749 SmallSet<Value *, 4> IgnoreSet; 17750 for (ArrayRef<Value *> RdxOps : ReductionOps) 17751 IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); 17752 #endif 17753 for (ArrayRef<Value *> RdxOps : ReductionOps) { 17754 for (Value *Ignore : RdxOps) { 17755 if (!Ignore) 17756 continue; 17757 #ifndef NDEBUG 17758 for (auto *U : Ignore->users()) { 17759 assert(IgnoreSet.count(U) && 17760 "All users must be either in the reduction ops list."); 17761 } 17762 #endif 17763 if (!Ignore->use_empty()) { 17764 Value *P = PoisonValue::get(Ignore->getType()); 17765 Ignore->replaceAllUsesWith(P); 17766 } 17767 } 17768 V.removeInstructionsAndOperands(RdxOps); 17769 } 17770 } else if (!CheckForReusedReductionOps) { 17771 for (ReductionOpsType &RdxOps : ReductionOps) 17772 for (Value *RdxOp : RdxOps) 17773 V.analyzedReductionRoot(cast<Instruction>(RdxOp)); 17774 } 17775 return VectorizedTree; 17776 } 17777 17778 private: 17779 /// Calculate the cost of a reduction. 17780 InstructionCost getReductionCost(TargetTransformInfo *TTI, 17781 ArrayRef<Value *> ReducedVals, 17782 bool IsCmpSelMinMax, unsigned ReduxWidth, 17783 FastMathFlags FMF) { 17784 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 17785 Type *ScalarTy = ReducedVals.front()->getType(); 17786 FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth); 17787 InstructionCost VectorCost = 0, ScalarCost; 17788 // If all of the reduced values are constant, the vector cost is 0, since 17789 // the reduction value can be calculated at the compile time. 17790 bool AllConsts = allConstant(ReducedVals); 17791 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) { 17792 InstructionCost Cost = 0; 17793 // Scalar cost is repeated for N-1 elements. 17794 int Cnt = ReducedVals.size(); 17795 for (Value *RdxVal : ReducedVals) { 17796 if (Cnt == 1) 17797 break; 17798 --Cnt; 17799 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 
3 : 2)) { 17800 Cost += GenCostFn(); 17801 continue; 17802 } 17803 InstructionCost ScalarCost = 0; 17804 for (User *U : RdxVal->users()) { 17805 auto *RdxOp = cast<Instruction>(U); 17806 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { 17807 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); 17808 continue; 17809 } 17810 ScalarCost = InstructionCost::getInvalid(); 17811 break; 17812 } 17813 if (ScalarCost.isValid()) 17814 Cost += ScalarCost; 17815 else 17816 Cost += GenCostFn(); 17817 } 17818 return Cost; 17819 }; 17820 switch (RdxKind) { 17821 case RecurKind::Add: 17822 case RecurKind::Mul: 17823 case RecurKind::Or: 17824 case RecurKind::And: 17825 case RecurKind::Xor: 17826 case RecurKind::FAdd: 17827 case RecurKind::FMul: { 17828 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); 17829 if (!AllConsts) 17830 VectorCost = 17831 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind); 17832 ScalarCost = EvaluateScalarCost([&]() { 17833 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind); 17834 }); 17835 break; 17836 } 17837 case RecurKind::FMax: 17838 case RecurKind::FMin: 17839 case RecurKind::FMaximum: 17840 case RecurKind::FMinimum: 17841 case RecurKind::SMax: 17842 case RecurKind::SMin: 17843 case RecurKind::UMax: 17844 case RecurKind::UMin: { 17845 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 17846 if (!AllConsts) 17847 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind); 17848 ScalarCost = EvaluateScalarCost([&]() { 17849 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF); 17850 return TTI->getIntrinsicInstrCost(ICA, CostKind); 17851 }); 17852 break; 17853 } 17854 default: 17855 llvm_unreachable("Expected arithmetic or min/max reduction operation"); 17856 } 17857 17858 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost 17859 << " for reduction of " << shortBundleName(ReducedVals) 17860 << " (It is a splitting reduction)\n"); 17861 return VectorCost - ScalarCost; 17862 } 17863 17864 /// Emit a horizontal reduction of the vectorized value. 17865 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder, 17866 unsigned ReduxWidth, const TargetTransformInfo *TTI) { 17867 assert(VectorizedValue && "Need to have a vectorized tree node"); 17868 assert(isPowerOf2_32(ReduxWidth) && 17869 "We only handle power-of-two reductions for now"); 17870 assert(RdxKind != RecurKind::FMulAdd && 17871 "A call to the llvm.fmuladd intrinsic is not handled yet"); 17872 17873 ++NumVectorInstructions; 17874 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind); 17875 } 17876 17877 /// Emits optimized code for unique scalar value reused \p Cnt times. 17878 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, 17879 unsigned Cnt) { 17880 assert(IsSupportedHorRdxIdentityOp && 17881 "The optimization of matched scalar identity horizontal reductions " 17882 "must be supported."); 17883 switch (RdxKind) { 17884 case RecurKind::Add: { 17885 // res = mul vv, n 17886 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt); 17887 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " 17888 << VectorizedValue << ". (HorRdx)\n"); 17889 return Builder.CreateMul(VectorizedValue, Scale); 17890 } 17891 case RecurKind::Xor: { 17892 // res = n % 2 ? 0 : vv 17893 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue 17894 << ". 
(HorRdx)\n"); 17895 if (Cnt % 2 == 0) 17896 return Constant::getNullValue(VectorizedValue->getType()); 17897 return VectorizedValue; 17898 } 17899 case RecurKind::FAdd: { 17900 // res = fmul v, n 17901 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt); 17902 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " 17903 << VectorizedValue << ". (HorRdx)\n"); 17904 return Builder.CreateFMul(VectorizedValue, Scale); 17905 } 17906 case RecurKind::And: 17907 case RecurKind::Or: 17908 case RecurKind::SMax: 17909 case RecurKind::SMin: 17910 case RecurKind::UMax: 17911 case RecurKind::UMin: 17912 case RecurKind::FMax: 17913 case RecurKind::FMin: 17914 case RecurKind::FMaximum: 17915 case RecurKind::FMinimum: 17916 // res = vv 17917 return VectorizedValue; 17918 case RecurKind::Mul: 17919 case RecurKind::FMul: 17920 case RecurKind::FMulAdd: 17921 case RecurKind::IAnyOf: 17922 case RecurKind::FAnyOf: 17923 case RecurKind::None: 17924 llvm_unreachable("Unexpected reduction kind for repeated scalar."); 17925 } 17926 return nullptr; 17927 } 17928 17929 /// Emits actual operation for the scalar identity values, found during 17930 /// horizontal reduction analysis. 17931 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, 17932 BoUpSLP &R, 17933 const MapVector<Value *, unsigned> &SameValuesCounter, 17934 const DenseMap<Value *, Value *> &TrackedToOrig) { 17935 assert(IsSupportedHorRdxIdentityOp && 17936 "The optimization of matched scalar identity horizontal reductions " 17937 "must be supported."); 17938 ArrayRef<Value *> VL = R.getRootNodeScalars(); 17939 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType()); 17940 if (VTy->getElementType() != VL.front()->getType()) { 17941 VectorizedValue = Builder.CreateIntCast( 17942 VectorizedValue, 17943 getWidenedType(VL.front()->getType(), VTy->getNumElements()), 17944 R.isSignedMinBitwidthRootNode()); 17945 } 17946 switch (RdxKind) { 17947 case RecurKind::Add: { 17948 // root = mul prev_root, <1, 1, n, 1> 17949 SmallVector<Constant *> Vals; 17950 for (Value *V : VL) { 17951 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); 17952 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); 17953 } 17954 auto *Scale = ConstantVector::get(Vals); 17955 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of " 17956 << VectorizedValue << ". (HorRdx)\n"); 17957 return Builder.CreateMul(VectorizedValue, Scale); 17958 } 17959 case RecurKind::And: 17960 case RecurKind::Or: 17961 // No need for multiple or/and(s). 17962 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue 17963 << ". (HorRdx)\n"); 17964 return VectorizedValue; 17965 case RecurKind::SMax: 17966 case RecurKind::SMin: 17967 case RecurKind::UMax: 17968 case RecurKind::UMin: 17969 case RecurKind::FMax: 17970 case RecurKind::FMin: 17971 case RecurKind::FMaximum: 17972 case RecurKind::FMinimum: 17973 // No need for multiple min/max(s) of the same value. 17974 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue 17975 << ". (HorRdx)\n"); 17976 return VectorizedValue; 17977 case RecurKind::Xor: { 17978 // Replace values with even number of repeats with 0, since 17979 // x xor x = 0. 17980 // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6, 17981 // 7>, if elements 4th and 6th elements have even number of repeats. 
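// E.g. for root scalars {a, b, c, d} with repeat counts {3, 2, 1, 4}, lanes
// 1 and 3 have even counts and are zeroed via the mask <0, vf, 2, vf>
// against a zero vector.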
17982 SmallVector<int> Mask( 17983 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(), 17984 PoisonMaskElem); 17985 std::iota(Mask.begin(), Mask.end(), 0); 17986 bool NeedShuffle = false; 17987 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { 17988 Value *V = VL[I]; 17989 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); 17990 if (Cnt % 2 == 0) { 17991 Mask[I] = VF; 17992 NeedShuffle = true; 17993 } 17994 } 17995 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I 17996 : Mask) dbgs() 17997 << I << " "; 17998 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n"); 17999 if (NeedShuffle) 18000 VectorizedValue = Builder.CreateShuffleVector( 18001 VectorizedValue, 18002 ConstantVector::getNullValue(VectorizedValue->getType()), Mask); 18003 return VectorizedValue; 18004 } 18005 case RecurKind::FAdd: { 18006 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> 18007 SmallVector<Constant *> Vals; 18008 for (Value *V : VL) { 18009 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); 18010 Vals.push_back(ConstantFP::get(V->getType(), Cnt)); 18011 } 18012 auto *Scale = ConstantVector::get(Vals); 18013 return Builder.CreateFMul(VectorizedValue, Scale); 18014 } 18015 case RecurKind::Mul: 18016 case RecurKind::FMul: 18017 case RecurKind::FMulAdd: 18018 case RecurKind::IAnyOf: 18019 case RecurKind::FAnyOf: 18020 case RecurKind::None: 18021 llvm_unreachable("Unexpected reduction kind for reused scalars."); 18022 } 18023 return nullptr; 18024 } 18025 }; 18026 } // end anonymous namespace 18027 18028 /// Gets recurrence kind from the specified value. 18029 static RecurKind getRdxKind(Value *V) { 18030 return HorizontalReduction::getRdxKind(V); 18031 } 18032 static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) { 18033 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) 18034 return cast<FixedVectorType>(IE->getType())->getNumElements(); 18035 18036 unsigned AggregateSize = 1; 18037 auto *IV = cast<InsertValueInst>(InsertInst); 18038 Type *CurrentType = IV->getType(); 18039 do { 18040 if (auto *ST = dyn_cast<StructType>(CurrentType)) { 18041 for (auto *Elt : ST->elements()) 18042 if (Elt != ST->getElementType(0)) // check homogeneity 18043 return std::nullopt; 18044 AggregateSize *= ST->getNumElements(); 18045 CurrentType = ST->getElementType(0); 18046 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { 18047 AggregateSize *= AT->getNumElements(); 18048 CurrentType = AT->getElementType(); 18049 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { 18050 AggregateSize *= VT->getNumElements(); 18051 return AggregateSize; 18052 } else if (CurrentType->isSingleValueType()) { 18053 return AggregateSize; 18054 } else { 18055 return std::nullopt; 18056 } 18057 } while (true); 18058 } 18059 18060 static void findBuildAggregate_rec(Instruction *LastInsertInst, 18061 TargetTransformInfo *TTI, 18062 SmallVectorImpl<Value *> &BuildVectorOpds, 18063 SmallVectorImpl<Value *> &InsertElts, 18064 unsigned OperandOffset) { 18065 do { 18066 Value *InsertedOperand = LastInsertInst->getOperand(1); 18067 std::optional<unsigned> OperandIndex = 18068 getElementIndex(LastInsertInst, OperandOffset); 18069 if (!OperandIndex) 18070 return; 18071 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) { 18072 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, 18073 BuildVectorOpds, InsertElts, *OperandIndex); 18074 18075 } else { 18076 BuildVectorOpds[*OperandIndex] = InsertedOperand; 18077 InsertElts[*OperandIndex] = 
LastInsertInst; 18078 } 18079 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); 18080 } while (LastInsertInst != nullptr && 18081 isa<InsertValueInst, InsertElementInst>(LastInsertInst) && 18082 LastInsertInst->hasOneUse()); 18083 } 18084 18085 /// Recognize construction of vectors like 18086 /// %ra = insertelement <4 x float> poison, float %s0, i32 0 18087 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 18088 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 18089 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 18090 /// starting from the last insertelement or insertvalue instruction. 18091 /// 18092 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, 18093 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. 18094 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. 18095 /// 18096 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. 18097 /// 18098 /// \return true if it matches. 18099 static bool findBuildAggregate(Instruction *LastInsertInst, 18100 TargetTransformInfo *TTI, 18101 SmallVectorImpl<Value *> &BuildVectorOpds, 18102 SmallVectorImpl<Value *> &InsertElts) { 18103 18104 assert((isa<InsertElementInst>(LastInsertInst) || 18105 isa<InsertValueInst>(LastInsertInst)) && 18106 "Expected insertelement or insertvalue instruction!"); 18107 18108 assert((BuildVectorOpds.empty() && InsertElts.empty()) && 18109 "Expected empty result vectors!"); 18110 18111 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); 18112 if (!AggregateSize) 18113 return false; 18114 BuildVectorOpds.resize(*AggregateSize); 18115 InsertElts.resize(*AggregateSize); 18116 18117 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); 18118 llvm::erase(BuildVectorOpds, nullptr); 18119 llvm::erase(InsertElts, nullptr); 18120 if (BuildVectorOpds.size() >= 2) 18121 return true; 18122 18123 return false; 18124 } 18125 18126 /// Try and get a reduction instruction from a phi node. 18127 /// 18128 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions 18129 /// if they come from either \p ParentBB or a containing loop latch. 18130 /// 18131 /// \returns A candidate reduction value if possible, or \code nullptr \endcode 18132 /// if not possible. 18133 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P, 18134 BasicBlock *ParentBB, LoopInfo *LI) { 18135 // There are situations where the reduction value is not dominated by the 18136 // reduction phi. Vectorizing such cases has been reported to cause 18137 // miscompiles. See PR25787. 18138 auto DominatedReduxValue = [&](Value *R) { 18139 return isa<Instruction>(R) && 18140 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent()); 18141 }; 18142 18143 Instruction *Rdx = nullptr; 18144 18145 // Return the incoming value if it comes from the same BB as the phi node. 18146 if (P->getIncomingBlock(0) == ParentBB) { 18147 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); 18148 } else if (P->getIncomingBlock(1) == ParentBB) { 18149 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); 18150 } 18151 18152 if (Rdx && DominatedReduxValue(Rdx)) 18153 return Rdx; 18154 18155 // Otherwise, check whether we have a loop latch to look at. 
18156 Loop *BBL = LI->getLoopFor(ParentBB); 18157 if (!BBL) 18158 return nullptr; 18159 BasicBlock *BBLatch = BBL->getLoopLatch(); 18160 if (!BBLatch) 18161 return nullptr; 18162 18163 // There is a loop latch, return the incoming value if it comes from 18164 // that. This reduction pattern occasionally turns up. 18165 if (P->getIncomingBlock(0) == BBLatch) { 18166 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0)); 18167 } else if (P->getIncomingBlock(1) == BBLatch) { 18168 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1)); 18169 } 18170 18171 if (Rdx && DominatedReduxValue(Rdx)) 18172 return Rdx; 18173 18174 return nullptr; 18175 } 18176 18177 static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { 18178 if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) 18179 return true; 18180 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) 18181 return true; 18182 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) 18183 return true; 18184 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1)))) 18185 return true; 18186 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1)))) 18187 return true; 18188 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1)))) 18189 return true; 18190 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1)))) 18191 return true; 18192 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1)))) 18193 return true; 18194 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1)))) 18195 return true; 18196 return false; 18197 } 18198 18199 /// We could have an initial reduction that is not an add. 18200 /// r *= v1 + v2 + v3 + v4 18201 /// In such a case start looking for a tree rooted in the first '+'. 18202 /// \Returns the new root if found, which may be nullptr if not an instruction. 18203 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi, 18204 Instruction *Root) { 18205 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) || 18206 isa<IntrinsicInst>(Root)) && 18207 "Expected binop, select, or intrinsic for reduction matching"); 18208 Value *LHS = 18209 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root)); 18210 Value *RHS = 18211 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1); 18212 if (LHS == Phi) 18213 return dyn_cast<Instruction>(RHS); 18214 if (RHS == Phi) 18215 return dyn_cast<Instruction>(LHS); 18216 return nullptr; 18217 } 18218 18219 /// \p Returns the first operand of \p I that does not match \p Phi. If 18220 /// operand is not an instruction it returns nullptr. 18221 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) { 18222 Value *Op0 = nullptr; 18223 Value *Op1 = nullptr; 18224 if (!matchRdxBop(I, Op0, Op1)) 18225 return nullptr; 18226 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0); 18227 } 18228 18229 /// \Returns true if \p I is a candidate instruction for reduction vectorization. 
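/// (I.e. a binary operator, a min/max intrinsic, or a select, as matched by
/// matchRdxBop and the select pattern below.)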
18230 static bool isReductionCandidate(Instruction *I) { 18231 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value())); 18232 Value *B0 = nullptr, *B1 = nullptr; 18233 bool IsBinop = matchRdxBop(I, B0, B1); 18234 return IsBinop || IsSelect; 18235 } 18236 18237 bool SLPVectorizerPass::vectorizeHorReduction( 18238 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, 18239 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) { 18240 if (!ShouldVectorizeHor) 18241 return false; 18242 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root); 18243 18244 if (Root->getParent() != BB || isa<PHINode>(Root)) 18245 return false; 18246 18247 // If we can find a secondary reduction root, use that instead. 18248 auto SelectRoot = [&]() { 18249 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) && 18250 HorizontalReduction::getRdxKind(Root) != RecurKind::None) 18251 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root)) 18252 return NewRoot; 18253 return Root; 18254 }; 18255 18256 // Start analysis starting from Root instruction. If horizontal reduction is 18257 // found, try to vectorize it. If it is not a horizontal reduction or 18258 // vectorization is not possible or not effective, and currently analyzed 18259 // instruction is a binary operation, try to vectorize the operands, using 18260 // pre-order DFS traversal order. If the operands were not vectorized, repeat 18261 // the same procedure considering each operand as a possible root of the 18262 // horizontal reduction. 18263 // Interrupt the process if the Root instruction itself was vectorized or all 18264 // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 18265 // If a horizintal reduction was not matched or vectorized we collect 18266 // instructions for possible later attempts for vectorization. 18267 std::queue<std::pair<Instruction *, unsigned>> Stack; 18268 Stack.emplace(SelectRoot(), 0); 18269 SmallPtrSet<Value *, 8> VisitedInstrs; 18270 bool Res = false; 18271 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * { 18272 if (R.isAnalyzedReductionRoot(Inst)) 18273 return nullptr; 18274 if (!isReductionCandidate(Inst)) 18275 return nullptr; 18276 HorizontalReduction HorRdx; 18277 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) 18278 return nullptr; 18279 return HorRdx.tryToReduce(R, *DL, TTI, *TLI); 18280 }; 18281 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { 18282 if (TryOperandsAsNewSeeds && FutureSeed == Root) { 18283 FutureSeed = getNonPhiOperand(Root, P); 18284 if (!FutureSeed) 18285 return false; 18286 } 18287 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their 18288 // analysis is done separately. 18289 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed)) 18290 PostponedInsts.push_back(FutureSeed); 18291 return true; 18292 }; 18293 18294 while (!Stack.empty()) { 18295 Instruction *Inst; 18296 unsigned Level; 18297 std::tie(Inst, Level) = Stack.front(); 18298 Stack.pop(); 18299 // Do not try to analyze instruction that has already been vectorized. 18300 // This may happen when we vectorize instruction operands on a previous 18301 // iteration while stack was populated before that happened. 18302 if (R.isDeleted(Inst)) 18303 continue; 18304 if (Value *VectorizedV = TryToReduce(Inst)) { 18305 Res = true; 18306 if (auto *I = dyn_cast<Instruction>(VectorizedV)) { 18307 // Try to find another reduction. 
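// (The instruction produced for this reduction is re-queued: it may itself
// form or feed another reduction.)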
18308 Stack.emplace(I, Level); 18309 continue; 18310 } 18311 if (R.isDeleted(Inst)) 18312 continue; 18313 } else { 18314 // We could not vectorize `Inst` so try to use it as a future seed. 18315 if (!TryAppendToPostponedInsts(Inst)) { 18316 assert(Stack.empty() && "Expected empty stack"); 18317 break; 18318 } 18319 } 18320 18321 // Try to vectorize operands. 18322 // Continue analysis for the instruction from the same basic block only to 18323 // save compile time. 18324 if (++Level < RecursionMaxDepth) 18325 for (auto *Op : Inst->operand_values()) 18326 if (VisitedInstrs.insert(Op).second) 18327 if (auto *I = dyn_cast<Instruction>(Op)) 18328 // Do not try to vectorize CmpInst operands, this is done 18329 // separately. 18330 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) && 18331 !R.isDeleted(I) && I->getParent() == BB) 18332 Stack.emplace(I, Level); 18333 } 18334 return Res; 18335 } 18336 18337 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, 18338 BasicBlock *BB, BoUpSLP &R, 18339 TargetTransformInfo *TTI) { 18340 SmallVector<WeakTrackingVH> PostponedInsts; 18341 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts); 18342 Res |= tryToVectorize(PostponedInsts, R); 18343 return Res; 18344 } 18345 18346 bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, 18347 BoUpSLP &R) { 18348 bool Res = false; 18349 for (Value *V : Insts) 18350 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst)) 18351 Res |= tryToVectorize(Inst, R); 18352 return Res; 18353 } 18354 18355 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, 18356 BasicBlock *BB, BoUpSLP &R, 18357 bool MaxVFOnly) { 18358 if (!R.canMapToVector(IVI->getType())) 18359 return false; 18360 18361 SmallVector<Value *, 16> BuildVectorOpds; 18362 SmallVector<Value *, 16> BuildVectorInsts; 18363 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) 18364 return false; 18365 18366 if (MaxVFOnly && BuildVectorOpds.size() == 2) { 18367 R.getORE()->emit([&]() { 18368 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI) 18369 << "Cannot SLP vectorize list: only 2 elements of buildvalue, " 18370 "trying reduction first."; 18371 }); 18372 return false; 18373 } 18374 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); 18375 // Aggregate value is unlikely to be processed in vector register. 
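// Instead, try to vectorize the scalar operands that feed the aggregate.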
18376 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly); 18377 } 18378 18379 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, 18380 BasicBlock *BB, BoUpSLP &R, 18381 bool MaxVFOnly) { 18382 SmallVector<Value *, 16> BuildVectorInsts; 18383 SmallVector<Value *, 16> BuildVectorOpds; 18384 SmallVector<int> Mask; 18385 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || 18386 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) && 18387 isFixedVectorShuffle(BuildVectorOpds, Mask))) 18388 return false; 18389 18390 if (MaxVFOnly && BuildVectorInsts.size() == 2) { 18391 R.getORE()->emit([&]() { 18392 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI) 18393 << "Cannot SLP vectorize list: only 2 elements of buildvector, " 18394 "trying reduction first."; 18395 }); 18396 return false; 18397 } 18398 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); 18399 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly); 18400 } 18401 18402 template <typename T> 18403 static bool tryToVectorizeSequence( 18404 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, 18405 function_ref<bool(T *, T *)> AreCompatible, 18406 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, 18407 bool MaxVFOnly, BoUpSLP &R) { 18408 bool Changed = false; 18409 // Sort by type, parent, operands. 18410 stable_sort(Incoming, Comparator); 18411 18412 // Try to vectorize elements base on their type. 18413 SmallVector<T *> Candidates; 18414 SmallVector<T *> VL; 18415 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E; 18416 VL.clear()) { 18417 // Look for the next elements with the same type, parent and operand 18418 // kinds. 18419 auto *I = dyn_cast<Instruction>(*IncIt); 18420 if (!I || R.isDeleted(I)) { 18421 ++IncIt; 18422 continue; 18423 } 18424 auto *SameTypeIt = IncIt; 18425 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || 18426 R.isDeleted(cast<Instruction>(*SameTypeIt)) || 18427 AreCompatible(*SameTypeIt, *IncIt))) { 18428 auto *I = dyn_cast<Instruction>(*SameTypeIt); 18429 ++SameTypeIt; 18430 if (I && !R.isDeleted(I)) 18431 VL.push_back(cast<T>(I)); 18432 } 18433 18434 // Try to vectorize them. 18435 unsigned NumElts = VL.size(); 18436 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" 18437 << NumElts << ")\n"); 18438 // The vectorization is a 3-state attempt: 18439 // 1. Try to vectorize instructions with the same/alternate opcodes with the 18440 // size of maximal register at first. 18441 // 2. Try to vectorize remaining instructions with the same type, if 18442 // possible. This may result in the better vectorization results rather than 18443 // if we try just to vectorize instructions with the same/alternate opcodes. 18444 // 3. Final attempt to try to vectorize all instructions with the 18445 // same/alternate ops only, this may result in some extra final 18446 // vectorization. 18447 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) { 18448 // Success start over because instructions might have been changed. 18449 Changed = true; 18450 VL.swap(Candidates); 18451 Candidates.clear(); 18452 for (T *V : VL) { 18453 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) 18454 Candidates.push_back(V); 18455 } 18456 } else { 18457 /// \Returns the minimum number of elements that we will attempt to 18458 /// vectorize. 
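/// E.g. with a 128-bit maximum register size and 32-bit elements this
/// evaluates to max(2, 128 / 32) = 4.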
18459 auto GetMinNumElements = [&R](Value *V) { 18460 unsigned EltSize = R.getVectorElementSize(V); 18461 return std::max(2U, R.getMaxVecRegSize() / EltSize); 18462 }; 18463 if (NumElts < GetMinNumElements(*IncIt) && 18464 (Candidates.empty() || 18465 Candidates.front()->getType() == (*IncIt)->getType())) { 18466 for (T *V : VL) { 18467 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) 18468 Candidates.push_back(V); 18469 } 18470 } 18471 } 18472 // Final attempt to vectorize instructions with the same types. 18473 if (Candidates.size() > 1 && 18474 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { 18475 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) { 18476 // Success start over because instructions might have been changed. 18477 Changed = true; 18478 } else if (MaxVFOnly) { 18479 // Try to vectorize using small vectors. 18480 SmallVector<T *> VL; 18481 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End; 18482 VL.clear()) { 18483 auto *I = dyn_cast<Instruction>(*It); 18484 if (!I || R.isDeleted(I)) { 18485 ++It; 18486 continue; 18487 } 18488 auto *SameTypeIt = It; 18489 while (SameTypeIt != End && 18490 (!isa<Instruction>(*SameTypeIt) || 18491 R.isDeleted(cast<Instruction>(*SameTypeIt)) || 18492 AreCompatible(*SameTypeIt, *It))) { 18493 auto *I = dyn_cast<Instruction>(*SameTypeIt); 18494 ++SameTypeIt; 18495 if (I && !R.isDeleted(I)) 18496 VL.push_back(cast<T>(I)); 18497 } 18498 unsigned NumElts = VL.size(); 18499 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), 18500 /*MaxVFOnly=*/false)) 18501 Changed = true; 18502 It = SameTypeIt; 18503 } 18504 } 18505 Candidates.clear(); 18506 } 18507 18508 // Start over at the next instruction of a different type (or the end). 18509 IncIt = SameTypeIt; 18510 } 18511 return Changed; 18512 } 18513 18514 /// Compare two cmp instructions. If IsCompatibility is true, function returns 18515 /// true if 2 cmps have same/swapped predicates and mos compatible corresponding 18516 /// operands. If IsCompatibility is false, function implements strict weak 18517 /// ordering relation between two cmp instructions, returning true if the first 18518 /// instruction is "less" than the second, i.e. its predicate is less than the 18519 /// predicate of the second or the operands IDs are less than the operands IDs 18520 /// of the second cmp instruction. 
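/// E.g. "icmp sgt %a, %b" and "icmp slt %c, %d" are keyed by the same base
/// predicate, since slt is the swapped form of sgt and each predicate is
/// canonicalized to the smaller of itself and its swapped counterpart.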
18521 template <bool IsCompatibility> 18522 static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, 18523 const DominatorTree &DT) { 18524 assert(isValidElementType(V->getType()) && 18525 isValidElementType(V2->getType()) && 18526 "Expected valid element types only."); 18527 if (V == V2) 18528 return IsCompatibility; 18529 auto *CI1 = cast<CmpInst>(V); 18530 auto *CI2 = cast<CmpInst>(V2); 18531 if (CI1->getOperand(0)->getType()->getTypeID() < 18532 CI2->getOperand(0)->getType()->getTypeID()) 18533 return !IsCompatibility; 18534 if (CI1->getOperand(0)->getType()->getTypeID() > 18535 CI2->getOperand(0)->getType()->getTypeID()) 18536 return false; 18537 CmpInst::Predicate Pred1 = CI1->getPredicate(); 18538 CmpInst::Predicate Pred2 = CI2->getPredicate(); 18539 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); 18540 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); 18541 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); 18542 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); 18543 if (BasePred1 < BasePred2) 18544 return !IsCompatibility; 18545 if (BasePred1 > BasePred2) 18546 return false; 18547 // Compare operands. 18548 bool CI1Preds = Pred1 == BasePred1; 18549 bool CI2Preds = Pred2 == BasePred1; 18550 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { 18551 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1); 18552 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1); 18553 if (Op1 == Op2) 18554 continue; 18555 if (Op1->getValueID() < Op2->getValueID()) 18556 return !IsCompatibility; 18557 if (Op1->getValueID() > Op2->getValueID()) 18558 return false; 18559 if (auto *I1 = dyn_cast<Instruction>(Op1)) 18560 if (auto *I2 = dyn_cast<Instruction>(Op2)) { 18561 if (IsCompatibility) { 18562 if (I1->getParent() != I2->getParent()) 18563 return false; 18564 } else { 18565 // Try to compare nodes with same parent. 18566 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent()); 18567 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent()); 18568 if (!NodeI1) 18569 return NodeI2 != nullptr; 18570 if (!NodeI2) 18571 return false; 18572 assert((NodeI1 == NodeI2) == 18573 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 18574 "Different nodes should have different DFS numbers"); 18575 if (NodeI1 != NodeI2) 18576 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 18577 } 18578 InstructionsState S = getSameOpcode({I1, I2}, TLI); 18579 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle())) 18580 continue; 18581 if (IsCompatibility) 18582 return false; 18583 if (I1->getOpcode() != I2->getOpcode()) 18584 return I1->getOpcode() < I2->getOpcode(); 18585 } 18586 } 18587 return IsCompatibility; 18588 } 18589 18590 template <typename ItT> 18591 bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, 18592 BasicBlock *BB, BoUpSLP &R) { 18593 bool Changed = false; 18594 // Try to find reductions first. 18595 for (CmpInst *I : CmpInsts) { 18596 if (R.isDeleted(I)) 18597 continue; 18598 for (Value *Op : I->operands()) 18599 if (auto *RootOp = dyn_cast<Instruction>(Op)) { 18600 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI); 18601 if (R.isDeleted(I)) 18602 break; 18603 } 18604 } 18605 // Try to vectorize operands as vector bundles. 18606 for (CmpInst *I : CmpInsts) { 18607 if (R.isDeleted(I)) 18608 continue; 18609 Changed |= tryToVectorize(I, R); 18610 } 18611 // Try to vectorize list of compares. 18612 // Sort by type, compare predicate, etc. 
18613 auto CompareSorter = [&](Value *V, Value *V2) { 18614 if (V == V2) 18615 return false; 18616 return compareCmp<false>(V, V2, *TLI, *DT); 18617 }; 18618 18619 auto AreCompatibleCompares = [&](Value *V1, Value *V2) { 18620 if (V1 == V2) 18621 return true; 18622 return compareCmp<true>(V1, V2, *TLI, *DT); 18623 }; 18624 18625 SmallVector<Value *> Vals; 18626 for (Instruction *V : CmpInsts) 18627 if (!R.isDeleted(V) && isValidElementType(V->getType())) 18628 Vals.push_back(V); 18629 if (Vals.size() <= 1) 18630 return Changed; 18631 Changed |= tryToVectorizeSequence<Value>( 18632 Vals, CompareSorter, AreCompatibleCompares, 18633 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { 18634 // Exclude possible reductions from other blocks. 18635 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) { 18636 return any_of(V->users(), [V](User *U) { 18637 auto *Select = dyn_cast<SelectInst>(U); 18638 return Select && 18639 Select->getParent() != cast<Instruction>(V)->getParent(); 18640 }); 18641 }); 18642 if (ArePossiblyReducedInOtherBlock) 18643 return false; 18644 return tryToVectorizeList(Candidates, R, MaxVFOnly); 18645 }, 18646 /*MaxVFOnly=*/true, R); 18647 return Changed; 18648 } 18649 18650 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, 18651 BasicBlock *BB, BoUpSLP &R) { 18652 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) && 18653 "This function only accepts Insert instructions"); 18654 bool OpsChanged = false; 18655 SmallVector<WeakTrackingVH> PostponedInsts; 18656 for (auto *I : reverse(Instructions)) { 18657 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only. 18658 if (R.isDeleted(I) || isa<CmpInst>(I)) 18659 continue; 18660 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { 18661 OpsChanged |= 18662 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true); 18663 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { 18664 OpsChanged |= 18665 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true); 18666 } 18667 // pass2 - try to vectorize reductions only 18668 if (R.isDeleted(I)) 18669 continue; 18670 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts); 18671 if (R.isDeleted(I) || isa<CmpInst>(I)) 18672 continue; 18673 // pass3 - try to match and vectorize a buildvector sequence. 18674 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { 18675 OpsChanged |= 18676 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false); 18677 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { 18678 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R, 18679 /*MaxVFOnly=*/false); 18680 } 18681 } 18682 // Now try to vectorize postponed instructions. 18683 OpsChanged |= tryToVectorize(PostponedInsts, R); 18684 18685 Instructions.clear(); 18686 return OpsChanged; 18687 } 18688 18689 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { 18690 bool Changed = false; 18691 SmallVector<Value *, 4> Incoming; 18692 SmallPtrSet<Value *, 16> VisitedInstrs; 18693 // Maps phi nodes to the non-phi nodes found in the use tree for each phi 18694 // node. Allows better to identify the chains that can be vectorized in the 18695 // better way. 
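// E.g. for "%p = phi [ %a, ... ], [ %q, ... ]" where %q is itself a phi of
// %b and %c, the map records %p -> {%a, %b, %c}: nested phis are looked
// through and only their non-phi leaves are kept.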
18696 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; 18697 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { 18698 assert(isValidElementType(V1->getType()) && 18699 isValidElementType(V2->getType()) && 18700 "Expected vectorizable types only."); 18701 // It is fine to compare type IDs here, since we expect only vectorizable 18702 // types, like ints, floats and pointers, we don't care about other type. 18703 if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) 18704 return true; 18705 if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) 18706 return false; 18707 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 18708 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 18709 if (Opcodes1.size() < Opcodes2.size()) 18710 return true; 18711 if (Opcodes1.size() > Opcodes2.size()) 18712 return false; 18713 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 18714 { 18715 // Instructions come first. 18716 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]); 18717 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]); 18718 if (I1 && I2) { 18719 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); 18720 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); 18721 if (!NodeI1) 18722 return NodeI2 != nullptr; 18723 if (!NodeI2) 18724 return false; 18725 assert((NodeI1 == NodeI2) == 18726 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && 18727 "Different nodes should have different DFS numbers"); 18728 if (NodeI1 != NodeI2) 18729 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); 18730 InstructionsState S = getSameOpcode({I1, I2}, *TLI); 18731 if (S.getOpcode() && !S.isAltShuffle()) 18732 continue; 18733 return I1->getOpcode() < I2->getOpcode(); 18734 } 18735 if (I1) 18736 return true; 18737 if (I2) 18738 return false; 18739 } 18740 { 18741 // Non-undef constants come next. 18742 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]); 18743 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]); 18744 if (C1 && C2) 18745 continue; 18746 if (C1) 18747 return true; 18748 if (C2) 18749 return false; 18750 } 18751 bool U1 = isa<UndefValue>(Opcodes1[I]); 18752 bool U2 = isa<UndefValue>(Opcodes2[I]); 18753 { 18754 // Non-constant non-instructions come next. 18755 if (!U1 && !U2) { 18756 auto ValID1 = Opcodes1[I]->getValueID(); 18757 auto ValID2 = Opcodes2[I]->getValueID(); 18758 if (ValID1 == ValID2) 18759 continue; 18760 if (ValID1 < ValID2) 18761 return true; 18762 if (ValID1 > ValID2) 18763 return false; 18764 } 18765 if (!U1) 18766 return true; 18767 if (!U2) 18768 return false; 18769 } 18770 // Undefs come last. 18771 assert(U1 && U2 && "The only thing left should be undef & undef."); 18772 continue; 18773 } 18774 return false; 18775 }; 18776 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { 18777 if (V1 == V2) 18778 return true; 18779 if (V1->getType() != V2->getType()) 18780 return false; 18781 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; 18782 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; 18783 if (Opcodes1.size() != Opcodes2.size()) 18784 return false; 18785 for (int I = 0, E = Opcodes1.size(); I < E; ++I) { 18786 // Undefs are compatible with any other value. 
18787 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) 18788 continue; 18789 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) 18790 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { 18791 if (R.isDeleted(I1) || R.isDeleted(I2)) 18792 return false; 18793 if (I1->getParent() != I2->getParent()) 18794 return false; 18795 InstructionsState S = getSameOpcode({I1, I2}, *TLI); 18796 if (S.getOpcode()) 18797 continue; 18798 return false; 18799 } 18800 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) 18801 continue; 18802 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) 18803 return false; 18804 } 18805 return true; 18806 }; 18807 18808 bool HaveVectorizedPhiNodes = false; 18809 do { 18810 // Collect the incoming values from the PHIs. 18811 Incoming.clear(); 18812 for (Instruction &I : *BB) { 18813 auto *P = dyn_cast<PHINode>(&I); 18814 if (!P || P->getNumIncomingValues() > MaxPHINumOperands) 18815 break; 18816 18817 // No need to analyze deleted, vectorized and non-vectorizable 18818 // instructions. 18819 if (!VisitedInstrs.count(P) && !R.isDeleted(P) && 18820 isValidElementType(P->getType())) 18821 Incoming.push_back(P); 18822 } 18823 18824 if (Incoming.size() <= 1) 18825 break; 18826 18827 // Find the corresponding non-phi nodes for better matching when trying to 18828 // build the tree. 18829 for (Value *V : Incoming) { 18830 SmallVectorImpl<Value *> &Opcodes = 18831 PHIToOpcodes.try_emplace(V).first->getSecond(); 18832 if (!Opcodes.empty()) 18833 continue; 18834 SmallVector<Value *, 4> Nodes(1, V); 18835 SmallPtrSet<Value *, 4> Visited; 18836 while (!Nodes.empty()) { 18837 auto *PHI = cast<PHINode>(Nodes.pop_back_val()); 18838 if (!Visited.insert(PHI).second) 18839 continue; 18840 for (Value *V : PHI->incoming_values()) { 18841 if (auto *PHI1 = dyn_cast<PHINode>((V))) { 18842 Nodes.push_back(PHI1); 18843 continue; 18844 } 18845 Opcodes.emplace_back(V); 18846 } 18847 } 18848 } 18849 18850 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>( 18851 Incoming, PHICompare, AreCompatiblePHIs, 18852 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { 18853 return tryToVectorizeList(Candidates, R, MaxVFOnly); 18854 }, 18855 /*MaxVFOnly=*/true, R); 18856 Changed |= HaveVectorizedPhiNodes; 18857 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) { 18858 auto *PHI = dyn_cast<PHINode>(P.first); 18859 return !PHI || R.isDeleted(PHI); 18860 })) 18861 PHIToOpcodes.clear(); 18862 VisitedInstrs.insert(Incoming.begin(), Incoming.end()); 18863 } while (HaveVectorizedPhiNodes); 18864 18865 VisitedInstrs.clear(); 18866 18867 InstSetVector PostProcessInserts; 18868 SmallSetVector<CmpInst *, 8> PostProcessCmps; 18869 // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true 18870 // also vectorizes `PostProcessCmps`. 18871 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { 18872 bool Changed = vectorizeInserts(PostProcessInserts, BB, R); 18873 if (VectorizeCmps) { 18874 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R); 18875 PostProcessCmps.clear(); 18876 } 18877 PostProcessInserts.clear(); 18878 return Changed; 18879 }; 18880 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`. 
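// (Such instructions are deferred to VectorizeInsertsAndCmps above; the
// main scan below must not start a vectorization attempt from them.)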
18881 auto IsInPostProcessInstrs = [&](Instruction *I) { 18882 if (auto *Cmp = dyn_cast<CmpInst>(I)) 18883 return PostProcessCmps.contains(Cmp); 18884 return isa<InsertElementInst, InsertValueInst>(I) && 18885 PostProcessInserts.contains(I); 18886 }; 18887 // Returns true if `I` is an instruction without users, like terminator, or 18888 // function call with ignored return value, store. Ignore unused instructions 18889 // (basing on instruction type, except for CallInst and InvokeInst). 18890 auto HasNoUsers = [](Instruction *I) { 18891 return I->use_empty() && 18892 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I)); 18893 }; 18894 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) { 18895 // Skip instructions with scalable type. The num of elements is unknown at 18896 // compile-time for scalable type. 18897 if (isa<ScalableVectorType>(It->getType())) 18898 continue; 18899 18900 // Skip instructions marked for the deletion. 18901 if (R.isDeleted(&*It)) 18902 continue; 18903 // We may go through BB multiple times so skip the one we have checked. 18904 if (!VisitedInstrs.insert(&*It).second) { 18905 if (HasNoUsers(&*It) && 18906 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { 18907 // We would like to start over since some instructions are deleted 18908 // and the iterator may become invalid value. 18909 Changed = true; 18910 It = BB->begin(); 18911 E = BB->end(); 18912 } 18913 continue; 18914 } 18915 18916 if (isa<DbgInfoIntrinsic>(It)) 18917 continue; 18918 18919 // Try to vectorize reductions that use PHINodes. 18920 if (PHINode *P = dyn_cast<PHINode>(It)) { 18921 // Check that the PHI is a reduction PHI. 18922 if (P->getNumIncomingValues() == 2) { 18923 // Try to match and vectorize a horizontal reduction. 18924 Instruction *Root = getReductionInstr(DT, P, BB, LI); 18925 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { 18926 Changed = true; 18927 It = BB->begin(); 18928 E = BB->end(); 18929 continue; 18930 } 18931 } 18932 // Try to vectorize the incoming values of the PHI, to catch reductions 18933 // that feed into PHIs. 18934 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) { 18935 // Skip if the incoming block is the current BB for now. Also, bypass 18936 // unreachable IR for efficiency and to avoid crashing. 18937 // TODO: Collect the skipped incoming values and try to vectorize them 18938 // after processing BB. 18939 if (BB == P->getIncomingBlock(I) || 18940 !DT->isReachableFromEntry(P->getIncomingBlock(I))) 18941 continue; 18942 18943 // Postponed instructions should not be vectorized here, delay their 18944 // vectorization. 18945 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I)); 18946 PI && !IsInPostProcessInstrs(PI)) { 18947 bool Res = vectorizeRootInstruction(nullptr, PI, 18948 P->getIncomingBlock(I), R, TTI); 18949 Changed |= Res; 18950 if (Res && R.isDeleted(P)) { 18951 It = BB->begin(); 18952 E = BB->end(); 18953 break; 18954 } 18955 } 18956 } 18957 continue; 18958 } 18959 18960 if (HasNoUsers(&*It)) { 18961 bool OpsChanged = false; 18962 auto *SI = dyn_cast<StoreInst>(It); 18963 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; 18964 if (SI) { 18965 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand())); 18966 // Try to vectorize chain in store, if this is the only store to the 18967 // address in the block. 18968 // TODO: This is just a temporarily solution to save compile time. 
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
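    // For intuition (illustrative numbers only): with a 128-bit vector
    // register and 64-bit index expressions, EltSize is 64, so
    // MaxElts = 128 / 64 = 2 and the list below is processed two
    // getelementptrs at a time.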
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to a
      // constant value. If so, they are marked as deleted, so remove them from
      // the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
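      // Illustrative sketch of one lane of such an index bundle in IR
      // (example IR, names invented):
      //
      //   %a0 = load i64, ptr %pa0
      //   %b0 = load i64, ptr %pb0
      //   %i0 = sub i64 %a0, %b0
      //   %g0 = getelementptr inbounds float, ptr %g, i64 %i0
      //
      // Here the bundle would contain %i0, %i1, ..., and vectorizing it may
      // turn the parallel loads and subtractions into vector operations when
      // the cost model finds it profitable.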
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important if
    // there are several stores to the same address: in that case we need to
    // follow the order of the stores (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
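// Illustrative sketch of a store chain handled above (example IR, names
// invented): four stores to consecutive addresses
//
//   store i32 %x0, ptr %p
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %x1, ptr %p1
//   %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   store i32 %x2, ptr %p2
//   %p3 = getelementptr inbounds i32, ptr %p, i64 3
//   store i32 %x3, ptr %p3
//
// may, when the cost model finds it profitable, be replaced by a single
//
//   store <4 x i32> %v, ptr %p
//
// where %v is built from %x0..%x3, ideally by vectorizing their defining
// instructions as well.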