xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 6c05f3a74f30934ee60919cc97e16ec69b542b06)
1  //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10  // stores that can be put together into vector-stores. Next, it attempts to
11  // construct vectorizable tree using the use-def chains. If a profitable tree
12  // was found, the SLP vectorizer performs vectorization on the tree.
13  //
14  // The pass is inspired by the work described in the paper:
15  //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16  //
17  //===----------------------------------------------------------------------===//
18  
19  #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20  #include "llvm/ADT/DenseMap.h"
21  #include "llvm/ADT/DenseSet.h"
22  #include "llvm/ADT/PriorityQueue.h"
23  #include "llvm/ADT/STLExtras.h"
24  #include "llvm/ADT/ScopeExit.h"
25  #include "llvm/ADT/SetOperations.h"
26  #include "llvm/ADT/SetVector.h"
27  #include "llvm/ADT/SmallBitVector.h"
28  #include "llvm/ADT/SmallPtrSet.h"
29  #include "llvm/ADT/SmallSet.h"
30  #include "llvm/ADT/SmallString.h"
31  #include "llvm/ADT/Statistic.h"
32  #include "llvm/ADT/iterator.h"
33  #include "llvm/ADT/iterator_range.h"
34  #include "llvm/Analysis/AliasAnalysis.h"
35  #include "llvm/Analysis/AssumptionCache.h"
36  #include "llvm/Analysis/CodeMetrics.h"
37  #include "llvm/Analysis/ConstantFolding.h"
38  #include "llvm/Analysis/DemandedBits.h"
39  #include "llvm/Analysis/GlobalsModRef.h"
40  #include "llvm/Analysis/IVDescriptors.h"
41  #include "llvm/Analysis/LoopAccessAnalysis.h"
42  #include "llvm/Analysis/LoopInfo.h"
43  #include "llvm/Analysis/MemoryLocation.h"
44  #include "llvm/Analysis/OptimizationRemarkEmitter.h"
45  #include "llvm/Analysis/ScalarEvolution.h"
46  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
47  #include "llvm/Analysis/TargetLibraryInfo.h"
48  #include "llvm/Analysis/TargetTransformInfo.h"
49  #include "llvm/Analysis/ValueTracking.h"
50  #include "llvm/Analysis/VectorUtils.h"
51  #include "llvm/IR/Attributes.h"
52  #include "llvm/IR/BasicBlock.h"
53  #include "llvm/IR/Constant.h"
54  #include "llvm/IR/Constants.h"
55  #include "llvm/IR/DataLayout.h"
56  #include "llvm/IR/DerivedTypes.h"
57  #include "llvm/IR/Dominators.h"
58  #include "llvm/IR/Function.h"
59  #include "llvm/IR/IRBuilder.h"
60  #include "llvm/IR/InstrTypes.h"
61  #include "llvm/IR/Instruction.h"
62  #include "llvm/IR/Instructions.h"
63  #include "llvm/IR/IntrinsicInst.h"
64  #include "llvm/IR/Intrinsics.h"
65  #include "llvm/IR/Module.h"
66  #include "llvm/IR/Operator.h"
67  #include "llvm/IR/PatternMatch.h"
68  #include "llvm/IR/Type.h"
69  #include "llvm/IR/Use.h"
70  #include "llvm/IR/User.h"
71  #include "llvm/IR/Value.h"
72  #include "llvm/IR/ValueHandle.h"
73  #ifdef EXPENSIVE_CHECKS
74  #include "llvm/IR/Verifier.h"
75  #endif
76  #include "llvm/Pass.h"
77  #include "llvm/Support/Casting.h"
78  #include "llvm/Support/CommandLine.h"
79  #include "llvm/Support/Compiler.h"
80  #include "llvm/Support/DOTGraphTraits.h"
81  #include "llvm/Support/Debug.h"
82  #include "llvm/Support/ErrorHandling.h"
83  #include "llvm/Support/GraphWriter.h"
84  #include "llvm/Support/InstructionCost.h"
85  #include "llvm/Support/KnownBits.h"
86  #include "llvm/Support/MathExtras.h"
87  #include "llvm/Support/raw_ostream.h"
88  #include "llvm/Transforms/Utils/InjectTLIMappings.h"
89  #include "llvm/Transforms/Utils/Local.h"
90  #include "llvm/Transforms/Utils/LoopUtils.h"
91  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
92  #include <algorithm>
93  #include <cassert>
94  #include <cstdint>
95  #include <iterator>
96  #include <memory>
97  #include <optional>
98  #include <set>
99  #include <string>
100  #include <tuple>
101  #include <utility>
102  
103  using namespace llvm;
104  using namespace llvm::PatternMatch;
105  using namespace slpvectorizer;
106  
107  #define SV_NAME "slp-vectorizer"
108  #define DEBUG_TYPE "SLP"
109  
110  STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111  
112  static cl::opt<bool>
113      RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114                          cl::desc("Run the SLP vectorization passes"));
115  
116  static cl::opt<bool>
117      SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118               cl::desc("Enable vectorization for wider vector utilization"));
119  
120  static cl::opt<int>
121      SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
122                       cl::desc("Only vectorize if you gain more than this "
123                                "number "));
124  
125  static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
126      "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127      cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128               "heuristics and makes vectorization decision via cost modeling."));
129  
130  static cl::opt<bool>
131  ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132                     cl::desc("Attempt to vectorize horizontal reductions"));
133  
134  static cl::opt<bool> ShouldStartVectorizeHorAtStore(
135      "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136      cl::desc(
137          "Attempt to vectorize horizontal reductions feeding into a store"));
138  
139  // NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140  // even if we match a reduction but do not vectorize in the end.
141  static cl::opt<bool> AllowHorRdxIdenityOptimization(
142      "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143      cl::desc("Allow optimization of original scalar identity operations on "
144               "matched horizontal reductions."));
145  
146  static cl::opt<int>
147  MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
148      cl::desc("Attempt to vectorize for this register size in bits"));
149  
150  static cl::opt<unsigned>
151  MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
152      cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153  
154  /// Limits the size of scheduling regions in a block.
155  /// It avoids long compile times for _very_ large blocks where vector
156  /// instructions are spread over a wide range.
157  /// This limit is way higher than needed by real-world functions.
158  static cl::opt<int>
159  ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160      cl::desc("Limit the size of the SLP scheduling region per block"));
161  
162  static cl::opt<int> MinVectorRegSizeOption(
163      "slp-min-reg-size", cl::init(128), cl::Hidden,
164      cl::desc("Attempt to vectorize for this register size in bits"));
165  
166  static cl::opt<unsigned> RecursionMaxDepth(
167      "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168      cl::desc("Limit the recursion depth when building a vectorizable tree"));
169  
170  static cl::opt<unsigned> MinTreeSize(
171      "slp-min-tree-size", cl::init(3), cl::Hidden,
172      cl::desc("Only vectorize small trees if they are fully vectorizable"));
173  
174  // The maximum depth that the look-ahead score heuristic will explore.
175  // The higher this value, the higher the compilation time overhead.
176  static cl::opt<int> LookAheadMaxDepth(
177      "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178      cl::desc("The maximum look-ahead depth for operand reordering scores"));
179  
180  // The maximum depth that the look-ahead score heuristic will explore
181  // when it is probing among candidates for vectorization tree roots.
182  // The higher this value, the higher the compilation time overhead, but unlike
183  // the similar limit for operand reordering it is used less frequently, so the
184  // impact of a higher value is less noticeable.
185  static cl::opt<int> RootLookAheadMaxDepth(
186      "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187      cl::desc("The maximum look-ahead depth for searching best rooting option"));
188  
189  static cl::opt<unsigned> MinProfitableStridedLoads(
190      "slp-min-strided-loads", cl::init(2), cl::Hidden,
191      cl::desc("The minimum number of loads, which should be considered strided, "
192               "if the stride is > 1 or is runtime value"));
193  
194  static cl::opt<unsigned> MaxProfitableLoadStride(
195      "slp-max-stride", cl::init(8), cl::Hidden,
196      cl::desc("The maximum stride, considered to be profitable."));
197  
198  static cl::opt<bool>
199      ViewSLPTree("view-slp-tree", cl::Hidden,
200                  cl::desc("Display the SLP trees with Graphviz"));
201  
202  static cl::opt<bool> VectorizeNonPowerOf2(
203      "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204      cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205  
206  // Limit the number of alias checks. The limit is chosen so that
207  // it has no negative effect on the llvm benchmarks.
208  static const unsigned AliasedCheckLimit = 10;
209  
210  // Limit of the number of uses for potentially transformed instructions/values,
211  // used in checks to avoid compile-time explosion.
212  static constexpr int UsesLimit = 64;
213  
214  // Another limit for the alias checks: The maximum distance between load/store
215  // instructions where alias checks are done.
216  // This limit is useful for very large basic blocks.
217  static const unsigned MaxMemDepDistance = 160;
218  
219  /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220  /// regions to be handled.
221  static const int MinScheduleRegionSize = 16;
222  
223  /// Maximum allowed number of operands in the PHI nodes.
224  static const unsigned MaxPHINumOperands = 128;
225  
226  /// Predicate for the element types that the SLP vectorizer supports.
227  ///
228  /// The most important things to filter here are types which are invalid in
229  /// LLVM vectors. We also filter target-specific types which have absolutely
230  /// no meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
231  /// avoids spending time checking the cost model and realizing that they will
232  /// be inevitably scalarized.
233  static bool isValidElementType(Type *Ty) {
234    // TODO: Support ScalableVectorType.
235    if (SLPReVec && isa<FixedVectorType>(Ty))
236      Ty = Ty->getScalarType();
237    return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238           !Ty->isPPC_FP128Ty();
239  }
240  
241  /// \returns the number of elements for Ty.
242  static unsigned getNumElements(Type *Ty) {
243    assert(!isa<ScalableVectorType>(Ty) &&
244           "ScalableVectorType is not supported.");
245    if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246      return VecTy->getNumElements();
247    return 1;
248  }
249  
250  /// \returns the vector type of ScalarTy based on vectorization factor.
251  static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252    return FixedVectorType::get(ScalarTy->getScalarType(),
253                                VF * getNumElements(ScalarTy));
254  }
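// For example, getWidenedType(float, 4) produces <4 x float>, and, when
// revectorizing, getWidenedType(<2 x float>, 4) produces <8 x float> because
// the element count of the original vector type is folded into the result.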
255  
256  /// \returns True if the value is a constant (but not globals/constant
257  /// expressions).
258  static bool isConstant(Value *V) {
259    return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
260  }
261  
262  /// Checks if \p V is one of the vector-like instructions, i.e. undef, an
263  /// insertelement/extractelement with constant indices for a fixed vector type,
264  /// or an extractvalue instruction.
265  static bool isVectorLikeInstWithConstOps(Value *V) {
266    if (!isa<InsertElementInst, ExtractElementInst>(V) &&
267        !isa<ExtractValueInst, UndefValue>(V))
268      return false;
269    auto *I = dyn_cast<Instruction>(V);
270    if (!I || isa<ExtractValueInst>(I))
271      return true;
272    if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
273      return false;
274    if (isa<ExtractElementInst>(I))
275      return isConstant(I->getOperand(1));
276    assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277    return isConstant(I->getOperand(2));
278  }
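// For example, 'extractelement <4 x i32> %v, i32 1' and plain undef qualify,
// while 'extractelement <4 x i32> %v, i32 %idx' (non-constant index) or an
// extract from a scalable vector does not.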
279  
280  /// Returns power-of-2 number of elements in a single register (part), given the
281  /// total number of elements \p Size and number of registers (parts) \p
282  /// NumParts.
283  static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284    return PowerOf2Ceil(divideCeil(Size, NumParts));
285  }
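// For example, getPartNumElems(/*Size=*/6, /*NumParts=*/2) returns
// PowerOf2Ceil(divideCeil(6, 2)) = PowerOf2Ceil(3) = 4 elements per part.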
286  
287  /// Returns correct remaining number of elements, considering total amount \p
288  /// Size, (power-of-2 number) of elements in a single register \p PartNumElems
289  /// and current register (part) \p Part.
290  static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
291                              unsigned Part) {
292    return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
293  }
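// Continuing the example above with Size = 6 and PartNumElems = 4, part 0
// holds min(4, 6 - 0) = 4 elements and part 1 holds min(4, 6 - 4) = 2.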
294  
295  #if !defined(NDEBUG)
296  /// Print a short descriptor of the instruction bundle suitable for debug output.
297  static std::string shortBundleName(ArrayRef<Value *> VL) {
298    std::string Result;
299    raw_string_ostream OS(Result);
300    OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301    OS.flush();
302    return Result;
303  }
304  #endif
305  
306  /// \returns true if all of the instructions in \p VL are in the same block or
307  /// false otherwise.
308  static bool allSameBlock(ArrayRef<Value *> VL) {
309    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
310    if (!I0)
311      return false;
312    if (all_of(VL, isVectorLikeInstWithConstOps))
313      return true;
314  
315    BasicBlock *BB = I0->getParent();
316    for (int I = 1, E = VL.size(); I < E; I++) {
317      auto *II = dyn_cast<Instruction>(VL[I]);
318      if (!II)
319        return false;
320  
321      if (BB != II->getParent())
322        return false;
323    }
324    return true;
325  }
326  
327  /// \returns True if all of the values in \p VL are constants (but not
328  /// globals/constant expressions).
329  static bool allConstant(ArrayRef<Value *> VL) {
330    // Constant expressions and globals can't be vectorized like normal integer/FP
331    // constants.
332    return all_of(VL, isConstant);
333  }
334  
335  /// \returns True if all of the values in \p VL are identical or some of them
336  /// are UndefValue.
337  static bool isSplat(ArrayRef<Value *> VL) {
338    Value *FirstNonUndef = nullptr;
339    for (Value *V : VL) {
340      if (isa<UndefValue>(V))
341        continue;
342      if (!FirstNonUndef) {
343        FirstNonUndef = V;
344        continue;
345      }
346      if (V != FirstNonUndef)
347        return false;
348    }
349    return FirstNonUndef != nullptr;
350  }
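// For example, {%a, undef, %a} is a splat and {%a, %b, %a} is not; an all-undef
// list is not considered a splat because there is no defined value to
// broadcast.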
351  
352  /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
353  static bool isCommutative(Instruction *I) {
354    if (auto *Cmp = dyn_cast<CmpInst>(I))
355      return Cmp->isCommutative();
356    if (auto *BO = dyn_cast<BinaryOperator>(I))
357      return BO->isCommutative() ||
358             (BO->getOpcode() == Instruction::Sub &&
359              !BO->hasNUsesOrMore(UsesLimit) &&
360              all_of(
361                  BO->uses(),
362                  [](const Use &U) {
363                    // Commutative, if icmp eq/ne sub, 0
364                    ICmpInst::Predicate Pred;
365                    if (match(U.getUser(),
366                              m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
367                        (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
368                      return true;
369                    // Commutative, if abs(sub nsw, true) or abs(sub, false).
370                    ConstantInt *Flag;
371                    return match(U.getUser(),
372                                 m_Intrinsic<Intrinsic::abs>(
373                                     m_Specific(U.get()), m_ConstantInt(Flag))) &&
374                           (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
375                            Flag->isOne());
376                  })) ||
377             (BO->getOpcode() == Instruction::FSub &&
378              !BO->hasNUsesOrMore(UsesLimit) &&
379              all_of(BO->uses(), [](const Use &U) {
380                return match(U.getUser(),
381                             m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
382              }));
383    return I->isCommutative();
384  }
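// For example, a 'sub %a, %b' whose only user is 'icmp eq (sub %a, %b), 0' is
// treated as commutative here: swapping the operands only negates the
// difference, which does not change an equality comparison against zero.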
385  
386  template <typename T>
387  static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
388                                                       unsigned Offset) {
389    static_assert(std::is_same_v<T, InsertElementInst> ||
390                      std::is_same_v<T, ExtractElementInst>,
391                  "unsupported T");
392    int Index = Offset;
393    if (const auto *IE = dyn_cast<T>(Inst)) {
394      const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
395      if (!VT)
396        return std::nullopt;
397      const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
398      if (!CI)
399        return std::nullopt;
400      if (CI->getValue().uge(VT->getNumElements()))
401        return std::nullopt;
402      Index *= VT->getNumElements();
403      Index += CI->getZExtValue();
404      return Index;
405    }
406    return std::nullopt;
407  }
408  
409  /// \returns inserting or extracting index of InsertElement, ExtractElement or
410  /// InsertValue instruction, using Offset as base offset for index.
411  /// \returns std::nullopt if the index is not an immediate.
412  static std::optional<unsigned> getElementIndex(const Value *Inst,
413                                                 unsigned Offset = 0) {
414    if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
415      return Index;
416    if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
417      return Index;
418  
419    int Index = Offset;
420  
421    const auto *IV = dyn_cast<InsertValueInst>(Inst);
422    if (!IV)
423      return std::nullopt;
424  
425    Type *CurrentType = IV->getType();
426    for (unsigned I : IV->indices()) {
427      if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
428        Index *= ST->getNumElements();
429        CurrentType = ST->getElementType(I);
430      } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
431        Index *= AT->getNumElements();
432        CurrentType = AT->getElementType();
433      } else {
434        return std::nullopt;
435      }
436      Index += I;
437    }
438    return Index;
439  }
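// For example, 'insertelement <4 x i32> %v, i32 %x, i32 3' yields 3, and an
// insertvalue into {[2 x i32], [2 x i32]} at indices 1, 0 yields the flattened
// index 1 * 2 + 0 = 2.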
440  
441  namespace {
442  /// Specifies the way the mask should be analyzed for undefs/poisonous elements
443  /// in the shuffle mask.
444  enum class UseMask {
445    FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
446              ///< check for the mask elements for the first argument (mask
447              ///< indices are in range [0:VF)).
448    SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
449               ///< for the mask elements for the second argument (mask indices
450               ///< are in range [VF:2*VF))
451    UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
452                 ///< future shuffle elements and mark them as used in the
453                 ///< future. Non-undef elements are considered unused since
454                 ///< they're already marked as used in the mask.
455  };
456  } // namespace
457  
458  /// Prepares a use bitset for the given mask either for the first argument or
459  /// for the second.
460  static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461                                     UseMask MaskArg) {
462    SmallBitVector UseMask(VF, true);
463    for (auto [Idx, Value] : enumerate(Mask)) {
464      if (Value == PoisonMaskElem) {
465        if (MaskArg == UseMask::UndefsAsMask)
466          UseMask.reset(Idx);
467        continue;
468      }
469      if (MaskArg == UseMask::FirstArg && Value < VF)
470        UseMask.reset(Value);
471      else if (MaskArg == UseMask::SecondArg && Value >= VF)
472        UseMask.reset(Value - VF);
473    }
474    return UseMask;
475  }
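// For example, with VF = 4 and Mask = {0, 5, -1, 3}: FirstArg clears bits 0 and
// 3 (the elements of the first vector referenced by the mask), SecondArg clears
// bit 1 (mask element 5 refers to element 5 - VF = 1 of the second vector), and
// UndefsAsMask clears only bit 2, the position of the poison mask element.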
476  
477  /// Checks if the given value is actually an undefined constant vector.
478  /// Also, if the \p UseMask is not empty, tries to check if the non-masked
479  /// elements actually mask the insertelement buildvector, if any.
480  template <bool IsPoisonOnly = false>
481  static SmallBitVector isUndefVector(const Value *V,
482                                      const SmallBitVector &UseMask = {}) {
483    SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
484    using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
485    if (isa<T>(V))
486      return Res;
487    auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
488    if (!VecTy)
489      return Res.reset();
490    auto *C = dyn_cast<Constant>(V);
491    if (!C) {
492      if (!UseMask.empty()) {
493        const Value *Base = V;
494        while (auto *II = dyn_cast<InsertElementInst>(Base)) {
495          Base = II->getOperand(0);
496          if (isa<T>(II->getOperand(1)))
497            continue;
498          std::optional<unsigned> Idx = getElementIndex(II);
499          if (!Idx) {
500            Res.reset();
501            return Res;
502          }
503          if (*Idx < UseMask.size() && !UseMask.test(*Idx))
504            Res.reset(*Idx);
505        }
506        // TODO: Add analysis for shuffles here too.
507        if (V == Base) {
508          Res.reset();
509        } else {
510          SmallBitVector SubMask(UseMask.size(), false);
511          Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
512        }
513      } else {
514        Res.reset();
515      }
516      return Res;
517    }
518    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
519      if (Constant *Elem = C->getAggregateElement(I))
520        if (!isa<T>(Elem) &&
521            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
522          Res.reset(I);
523    }
524    return Res;
525  }
526  
527  /// Checks if the vector of instructions can be represented as a shuffle, like:
528  /// %x0 = extractelement <4 x i8> %x, i32 0
529  /// %x3 = extractelement <4 x i8> %x, i32 3
530  /// %y1 = extractelement <4 x i8> %y, i32 1
531  /// %y2 = extractelement <4 x i8> %y, i32 2
532  /// %x0x0 = mul i8 %x0, %x0
533  /// %x3x3 = mul i8 %x3, %x3
534  /// %y1y1 = mul i8 %y1, %y1
535  /// %y2y2 = mul i8 %y2, %y2
536  /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537  /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538  /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539  /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540  /// ret <4 x i8> %ins4
541  /// can be transformed into:
542  /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543  ///                                                         i32 6>
544  /// %2 = mul <4 x i8> %1, %1
545  /// ret <4 x i8> %2
546  /// Mask will return the Shuffle Mask equivalent to the extracted elements.
547  /// TODO: Can we split off and reuse the shuffle mask detection from
548  /// ShuffleVectorInst/getShuffleCost?
549  static std::optional<TargetTransformInfo::ShuffleKind>
550  isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
551    const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
552    if (It == VL.end())
553      return std::nullopt;
554    unsigned Size =
555        std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
556          auto *EI = dyn_cast<ExtractElementInst>(V);
557          if (!EI)
558            return S;
559          auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
560          if (!VTy)
561            return S;
562          return std::max(S, VTy->getNumElements());
563        });
564  
565    Value *Vec1 = nullptr;
566    Value *Vec2 = nullptr;
567    bool HasNonUndefVec = any_of(VL, [](Value *V) {
568      auto *EE = dyn_cast<ExtractElementInst>(V);
569      if (!EE)
570        return false;
571      Value *Vec = EE->getVectorOperand();
572      if (isa<UndefValue>(Vec))
573        return false;
574      return isGuaranteedNotToBePoison(Vec);
575    });
576    enum ShuffleMode { Unknown, Select, Permute };
577    ShuffleMode CommonShuffleMode = Unknown;
578    Mask.assign(VL.size(), PoisonMaskElem);
579    for (unsigned I = 0, E = VL.size(); I < E; ++I) {
580      // Undef can be represented as an undef element in a vector.
581      if (isa<UndefValue>(VL[I]))
582        continue;
583      auto *EI = cast<ExtractElementInst>(VL[I]);
584      if (isa<ScalableVectorType>(EI->getVectorOperandType()))
585        return std::nullopt;
586      auto *Vec = EI->getVectorOperand();
587      // We can extractelement from undef or poison vector.
588      if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
589        continue;
590      // All vector operands must have the same number of vector elements.
591      if (isa<UndefValue>(Vec)) {
592        Mask[I] = I;
593      } else {
594        if (isa<UndefValue>(EI->getIndexOperand()))
595          continue;
596        auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
597        if (!Idx)
598          return std::nullopt;
599        // Undefined behavior if Idx is negative or >= Size.
600        if (Idx->getValue().uge(Size))
601          continue;
602        unsigned IntIdx = Idx->getValue().getZExtValue();
603        Mask[I] = IntIdx;
604      }
605      if (isUndefVector(Vec).all() && HasNonUndefVec)
606        continue;
607      // For correct shuffling we have to have at most 2 different vector operands
608      // in all extractelement instructions.
609      if (!Vec1 || Vec1 == Vec) {
610        Vec1 = Vec;
611      } else if (!Vec2 || Vec2 == Vec) {
612        Vec2 = Vec;
613        Mask[I] += Size;
614      } else {
615        return std::nullopt;
616      }
617      if (CommonShuffleMode == Permute)
618        continue;
619      // If the extract index is not the same as the operation number, it is a
620      // permutation.
621      if (Mask[I] % Size != I) {
622        CommonShuffleMode = Permute;
623        continue;
624      }
625      CommonShuffleMode = Select;
626    }
627    // If we're not crossing lanes in different vectors, consider it as blending.
628    if (CommonShuffleMode == Select && Vec2)
629      return TargetTransformInfo::SK_Select;
630    // If Vec2 was never used, we have a permutation of a single vector, otherwise
631  // we have a permutation of 2 vectors.
632    return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
633                : TargetTransformInfo::SK_PermuteSingleSrc;
634  }
635  
636  /// \returns True if Extract{Value,Element} instruction extracts element Idx.
637  static std::optional<unsigned> getExtractIndex(Instruction *E) {
638    unsigned Opcode = E->getOpcode();
639    assert((Opcode == Instruction::ExtractElement ||
640            Opcode == Instruction::ExtractValue) &&
641           "Expected extractelement or extractvalue instruction.");
642    if (Opcode == Instruction::ExtractElement) {
643      auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
644      if (!CI)
645        return std::nullopt;
646      return CI->getZExtValue();
647    }
648    auto *EI = cast<ExtractValueInst>(E);
649    if (EI->getNumIndices() != 1)
650      return std::nullopt;
651    return *EI->idx_begin();
652  }
653  
654  namespace {
655  
656  /// Main data required for vectorization of instructions.
657  struct InstructionsState {
658    /// The very first instruction in the list with the main opcode.
659    Value *OpValue = nullptr;
660  
661    /// The main/alternate instruction.
662    Instruction *MainOp = nullptr;
663    Instruction *AltOp = nullptr;
664  
665    /// The main/alternate opcodes for the list of instructions.
666    unsigned getOpcode() const {
667      return MainOp ? MainOp->getOpcode() : 0;
668    }
669  
670    unsigned getAltOpcode() const {
671      return AltOp ? AltOp->getOpcode() : 0;
672    }
673  
674    /// Some of the instructions in the list have alternate opcodes.
675    bool isAltShuffle() const { return AltOp != MainOp; }
676  
677    bool isOpcodeOrAlt(Instruction *I) const {
678      unsigned CheckedOpcode = I->getOpcode();
679      return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680    }
681  
682    InstructionsState() = delete;
683    InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684        : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685  };
686  
687  } // end anonymous namespace
688  
689  /// Chooses the correct key for scheduling data. If \p Op has the same (or
690  /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691  /// OpValue.
692  static Value *isOneOf(const InstructionsState &S, Value *Op) {
693    auto *I = dyn_cast<Instruction>(Op);
694    if (I && S.isOpcodeOrAlt(I))
695      return Op;
696    return S.OpValue;
697  }
698  
699  /// \returns true if \p Opcode is allowed as part of the main/alternate
700  /// instruction for SLP vectorization.
701  ///
702  /// Example of unsupported opcode is SDIV that can potentially cause UB if the
703  /// "shuffled out" lane would result in division by zero.
704  static bool isValidForAlternation(unsigned Opcode) {
705    if (Instruction::isIntDivRem(Opcode))
706      return false;
707  
708    return true;
709  }
710  
711  static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712                                         const TargetLibraryInfo &TLI,
713                                         unsigned BaseIndex = 0);
714  
715  /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716  /// compatible instructions or constants, or just some other regular values.
717  static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718                                  Value *Op1, const TargetLibraryInfo &TLI) {
719    return (isConstant(BaseOp0) && isConstant(Op0)) ||
720           (isConstant(BaseOp1) && isConstant(Op1)) ||
721           (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
722            !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
723           BaseOp0 == Op0 || BaseOp1 == Op1 ||
724           getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
725           getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
726  }
727  
728  /// \returns true if a compare instruction \p CI has similar "look" and
729  /// same predicate as \p BaseCI, "as is" or with its operands and predicate
730  /// swapped, false otherwise.
731  static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732                                 const TargetLibraryInfo &TLI) {
733    assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734           "Assessing comparisons of different types?");
735    CmpInst::Predicate BasePred = BaseCI->getPredicate();
736    CmpInst::Predicate Pred = CI->getPredicate();
737    CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
738  
739    Value *BaseOp0 = BaseCI->getOperand(0);
740    Value *BaseOp1 = BaseCI->getOperand(1);
741    Value *Op0 = CI->getOperand(0);
742    Value *Op1 = CI->getOperand(1);
743  
744    return (BasePred == Pred &&
745            areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746           (BasePred == SwappedPred &&
747            areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
748  }
749  
750  /// \returns analysis of the Instructions in \p VL described in
751  /// InstructionsState, i.e. the opcode with which we suppose the whole list
752  /// could be vectorized even if its structure is diverse.
753  static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754                                         const TargetLibraryInfo &TLI,
755                                         unsigned BaseIndex) {
756    // Make sure these are all Instructions.
757    if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
758      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759  
760    bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
761    bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
762    bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
763    CmpInst::Predicate BasePred =
764        IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
765                : CmpInst::BAD_ICMP_PREDICATE;
766    unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
767    unsigned AltOpcode = Opcode;
768    unsigned AltIndex = BaseIndex;
769  
770    bool SwappedPredsCompatible = [&]() {
771      if (!IsCmpOp)
772        return false;
773      SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
774      UniquePreds.insert(BasePred);
775      UniqueNonSwappedPreds.insert(BasePred);
776      for (Value *V : VL) {
777        auto *I = dyn_cast<CmpInst>(V);
778        if (!I)
779          return false;
780        CmpInst::Predicate CurrentPred = I->getPredicate();
781        CmpInst::Predicate SwappedCurrentPred =
782            CmpInst::getSwappedPredicate(CurrentPred);
783        UniqueNonSwappedPreds.insert(CurrentPred);
784        if (!UniquePreds.contains(CurrentPred) &&
785            !UniquePreds.contains(SwappedCurrentPred))
786          UniquePreds.insert(CurrentPred);
787      }
788      // If the total number of predicates is > 2, but only 2 remain when the
789      // swapped predicates are treated as compatible, consider swappable
790      // predicates as compatible opcodes, not alternates.
791      return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
792    }();
793    // Check for one alternate opcode from another BinaryOperator.
794    // TODO - generalize to support all operators (types, calls etc.).
795    auto *IBase = cast<Instruction>(VL[BaseIndex]);
796    Intrinsic::ID BaseID = 0;
797    SmallVector<VFInfo> BaseMappings;
798    if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
799      BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
800      BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
801      if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
802        return InstructionsState(VL[BaseIndex], nullptr, nullptr);
803    }
804    for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
805      auto *I = cast<Instruction>(VL[Cnt]);
806      unsigned InstOpcode = I->getOpcode();
807      if (IsBinOp && isa<BinaryOperator>(I)) {
808        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
809          continue;
810        if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
811            isValidForAlternation(Opcode)) {
812          AltOpcode = InstOpcode;
813          AltIndex = Cnt;
814          continue;
815        }
816      } else if (IsCastOp && isa<CastInst>(I)) {
817        Value *Op0 = IBase->getOperand(0);
818        Type *Ty0 = Op0->getType();
819        Value *Op1 = I->getOperand(0);
820        Type *Ty1 = Op1->getType();
821        if (Ty0 == Ty1) {
822          if (InstOpcode == Opcode || InstOpcode == AltOpcode)
823            continue;
824          if (Opcode == AltOpcode) {
825            assert(isValidForAlternation(Opcode) &&
826                   isValidForAlternation(InstOpcode) &&
827                   "Cast isn't safe for alternation, logic needs to be updated!");
828            AltOpcode = InstOpcode;
829            AltIndex = Cnt;
830            continue;
831          }
832        }
833      } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
834        auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
835        Type *Ty0 = BaseInst->getOperand(0)->getType();
836        Type *Ty1 = Inst->getOperand(0)->getType();
837        if (Ty0 == Ty1) {
838          assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
839          // Check for compatible operands. If the corresponding operands are not
840          // compatible, we need to perform alternate vectorization.
841          CmpInst::Predicate CurrentPred = Inst->getPredicate();
842          CmpInst::Predicate SwappedCurrentPred =
843              CmpInst::getSwappedPredicate(CurrentPred);
844  
845          if ((E == 2 || SwappedPredsCompatible) &&
846              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
847            continue;
848  
849          if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
850            continue;
851          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
852          if (AltIndex != BaseIndex) {
853            if (isCmpSameOrSwapped(AltInst, Inst, TLI))
854              continue;
855          } else if (BasePred != CurrentPred) {
856            assert(
857                isValidForAlternation(InstOpcode) &&
858                "CmpInst isn't safe for alternation, logic needs to be updated!");
859            AltIndex = Cnt;
860            continue;
861          }
862          CmpInst::Predicate AltPred = AltInst->getPredicate();
863          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
864              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
865            continue;
866        }
867      } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
868        if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
869          if (Gep->getNumOperands() != 2 ||
870              Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
871            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
872        } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
873          if (!isVectorLikeInstWithConstOps(EI))
874            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875        } else if (auto *LI = dyn_cast<LoadInst>(I)) {
876          auto *BaseLI = cast<LoadInst>(IBase);
877          if (!LI->isSimple() || !BaseLI->isSimple())
878            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879        } else if (auto *Call = dyn_cast<CallInst>(I)) {
880          auto *CallBase = cast<CallInst>(IBase);
881          if (Call->getCalledFunction() != CallBase->getCalledFunction())
882            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
883          if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
884              !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
885                          Call->op_begin() + Call->getBundleOperandsEndIndex(),
886                          CallBase->op_begin() +
887                              CallBase->getBundleOperandsStartIndex())))
888            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889          Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
890          if (ID != BaseID)
891            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892          if (!ID) {
893            SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
894            if (Mappings.size() != BaseMappings.size() ||
895                Mappings.front().ISA != BaseMappings.front().ISA ||
896                Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
897                Mappings.front().VectorName != BaseMappings.front().VectorName ||
898                Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
899                Mappings.front().Shape.Parameters !=
900                    BaseMappings.front().Shape.Parameters)
901              return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902          }
903        }
904        continue;
905      }
906      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907    }
908  
909    return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
910                             cast<Instruction>(VL[AltIndex]));
911  }
912  
913  /// \returns true if all of the values in \p VL have the same type or false
914  /// otherwise.
915  static bool allSameType(ArrayRef<Value *> VL) {
916    Type *Ty = VL.front()->getType();
917    return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
918  }
919  
920  /// \returns True if an in-tree use also needs an extract. This refers to a
921  /// possible scalar operand in a vectorized instruction.
922  static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923                                          TargetLibraryInfo *TLI) {
924    unsigned Opcode = UserInst->getOpcode();
925    switch (Opcode) {
926    case Instruction::Load: {
927      LoadInst *LI = cast<LoadInst>(UserInst);
928      return (LI->getPointerOperand() == Scalar);
929    }
930    case Instruction::Store: {
931      StoreInst *SI = cast<StoreInst>(UserInst);
932      return (SI->getPointerOperand() == Scalar);
933    }
934    case Instruction::Call: {
935      CallInst *CI = cast<CallInst>(UserInst);
936      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937      return any_of(enumerate(CI->args()), [&](auto &&Arg) {
938        return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939               Arg.value().get() == Scalar;
940      });
941    }
942    default:
943      return false;
944    }
945  }
946  
947  /// \returns the AA location that is being accessed by the instruction.
948  static MemoryLocation getLocation(Instruction *I) {
949    if (StoreInst *SI = dyn_cast<StoreInst>(I))
950      return MemoryLocation::get(SI);
951    if (LoadInst *LI = dyn_cast<LoadInst>(I))
952      return MemoryLocation::get(LI);
953    return MemoryLocation();
954  }
955  
956  /// \returns True if the instruction is not a volatile or atomic load/store.
957  static bool isSimple(Instruction *I) {
958    if (LoadInst *LI = dyn_cast<LoadInst>(I))
959      return LI->isSimple();
960    if (StoreInst *SI = dyn_cast<StoreInst>(I))
961      return SI->isSimple();
962    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
963      return !MI->isVolatile();
964    return true;
965  }
966  
967  /// Shuffles \p Mask in accordance with the given \p SubMask.
968  /// \param ExtendingManyInputs Supports reshuffling of the mask with not only
969  /// one but two input vectors.
970  static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
971                      bool ExtendingManyInputs = false) {
972    if (SubMask.empty())
973      return;
974    assert(
975        (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
976         // Check if input scalars were extended to match the other node's size.
977         (SubMask.size() == Mask.size() &&
978          std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
979                      [](int Idx) { return Idx == PoisonMaskElem; }))) &&
980        "SubMask with many inputs support must be larger than the mask.");
981    if (Mask.empty()) {
982      Mask.append(SubMask.begin(), SubMask.end());
983      return;
984    }
985    SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
986    int TermValue = std::min(Mask.size(), SubMask.size());
987    for (int I = 0, E = SubMask.size(); I < E; ++I) {
988      if (SubMask[I] == PoisonMaskElem ||
989          (!ExtendingManyInputs &&
990           (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
991        continue;
992      NewMask[I] = Mask[SubMask[I]];
993    }
994    Mask.swap(NewMask);
995  }
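// For example, composing Mask = {1, 0, 3, 2} with SubMask = {2, 3, 0, 1}
// (without ExtendingManyInputs) produces {3, 2, 1, 0}, i.e. element I of the
// result is Mask[SubMask[I]].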
996  
997  /// Order may have elements assigned a special value (size) which is out of
998  /// bounds. Such indices appear only in places which correspond to undef values
999  /// (see canReuseExtract for details) and are used to prevent undef values from
1000  /// affecting operand ordering.
1001  /// The first loop below simply finds all unused indices and then the next loop
1002  /// nest assigns these indices to the undef value positions.
1003  /// As an example, below Order has two undef positions and they have been
1004  /// assigned the values 3 and 7 respectively:
1005  /// before:  6 9 5 4 9 2 1 0
1006  /// after:   6 3 5 4 7 2 1 0
1007  static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008    const unsigned Sz = Order.size();
1009    SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010    SmallBitVector MaskedIndices(Sz);
1011    for (unsigned I = 0; I < Sz; ++I) {
1012      if (Order[I] < Sz)
1013        UnusedIndices.reset(Order[I]);
1014      else
1015        MaskedIndices.set(I);
1016    }
1017    if (MaskedIndices.none())
1018      return;
1019    assert(UnusedIndices.count() == MaskedIndices.count() &&
1020           "Non-synced masked/available indices.");
1021    int Idx = UnusedIndices.find_first();
1022    int MIdx = MaskedIndices.find_first();
1023    while (MIdx >= 0) {
1024      assert(Idx >= 0 && "Indices must be synced.");
1025      Order[MIdx] = Idx;
1026      Idx = UnusedIndices.find_next(Idx);
1027      MIdx = MaskedIndices.find_next(MIdx);
1028    }
1029  }
1030  
1031  /// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032  /// Opcode1.
1033  SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034                                 unsigned Opcode1) {
1035    SmallBitVector OpcodeMask(VL.size(), false);
1036    for (unsigned Lane : seq<unsigned>(VL.size()))
1037      if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1038        OpcodeMask.set(Lane);
1039    return OpcodeMask;
1040  }
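// For example, for VL = {add, sub, add, sub} with Opcode0 = Add and
// Opcode1 = Sub, the returned mask is {0, 1, 0, 1}, with bits set for the
// lanes that use Opcode1.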
1041  
1042  namespace llvm {
1043  
1044  static void inversePermutation(ArrayRef<unsigned> Indices,
1045                                 SmallVectorImpl<int> &Mask) {
1046    Mask.clear();
1047    const unsigned E = Indices.size();
1048    Mask.resize(E, PoisonMaskElem);
1049    for (unsigned I = 0; I < E; ++I)
1050      Mask[Indices[I]] = I;
1051  }
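// For example, Indices = {2, 0, 1} yields Mask = {1, 2, 0}: element I of the
// input is placed at position Indices[I], so Mask[Indices[I]] == I.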
1052  
1053  /// Reorders the list of scalars in accordance with the given \p Mask.
1054  static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055                             ArrayRef<int> Mask) {
1056    assert(!Mask.empty() && "Expected non-empty mask.");
1057    SmallVector<Value *> Prev(Scalars.size(),
1058                              PoisonValue::get(Scalars.front()->getType()));
1059    Prev.swap(Scalars);
1060    for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061      if (Mask[I] != PoisonMaskElem)
1062        Scalars[Mask[I]] = Prev[I];
1063  }
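// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a}: the
// scalar at position I moves to position Mask[I], and destination positions not
// selected by any mask element are left as poison placeholders.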
1064  
1065  /// Checks if the provided value does not require scheduling. It does not
1066  /// require scheduling if this is not an instruction or it is an instruction
1067  /// that does not read/write memory and all operands are either not instructions
1068  /// or phi nodes or instructions from different blocks.
1069  static bool areAllOperandsNonInsts(Value *V) {
1070    auto *I = dyn_cast<Instruction>(V);
1071    if (!I)
1072      return true;
1073    return !mayHaveNonDefUseDependency(*I) &&
1074      all_of(I->operands(), [I](Value *V) {
1075        auto *IO = dyn_cast<Instruction>(V);
1076        if (!IO)
1077          return true;
1078        return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1079      });
1080  }
1081  
1082  /// Checks if the provided value does not require scheduling. It does not
1083  /// require scheduling if this is not an instruction or it is an instruction
1084  /// that does not read/write memory and all users are phi nodes or instructions
1085  /// from the different blocks.
1086  static bool isUsedOutsideBlock(Value *V) {
1087    auto *I = dyn_cast<Instruction>(V);
1088    if (!I)
1089      return true;
1090    // Limits the number of uses to save compile time.
1091    return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1092           all_of(I->users(), [I](User *U) {
1093             auto *IU = dyn_cast<Instruction>(U);
1094             if (!IU)
1095               return true;
1096             return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1097           });
1098  }
1099  
1100  /// Checks if the specified value does not require scheduling. It does not
1101  /// require scheduling if all operands and all users do not need to be scheduled
1102  /// in the current basic block.
1103  static bool doesNotNeedToBeScheduled(Value *V) {
1104    return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105  }
1106  
1107  /// Checks if the specified array of instructions does not require scheduling.
1108  /// It is so if all either instructions have operands that do not require
1109  /// scheduling or their users do not require scheduling since they are phis or
1110  /// in other basic blocks.
1111  static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112    return !VL.empty() &&
1113           (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1114  }
1115  
1116  namespace slpvectorizer {
1117  
1118  /// Bottom Up SLP Vectorizer.
1119  class BoUpSLP {
1120    struct TreeEntry;
1121    struct ScheduleData;
1122    class ShuffleCostEstimator;
1123    class ShuffleInstructionBuilder;
1124  
1125  public:
1126    /// Tracks the state we can represent the loads in the given sequence.
1127    enum class LoadsState {
1128      Gather,
1129      Vectorize,
1130      ScatterVectorize,
1131      StridedVectorize
1132    };
1133  
1134    using ValueList = SmallVector<Value *, 8>;
1135    using InstrList = SmallVector<Instruction *, 16>;
1136    using ValueSet = SmallPtrSet<Value *, 16>;
1137    using StoreList = SmallVector<StoreInst *, 8>;
1138    using ExtraValueToDebugLocsMap =
1139        MapVector<Value *, SmallVector<Instruction *, 2>>;
1140    using OrdersType = SmallVector<unsigned, 4>;
1141  
1142    BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1143            TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1144            DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1145            const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1146        : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1147          AC(AC), DB(DB), DL(DL), ORE(ORE),
1148          Builder(Se->getContext(), TargetFolder(*DL)) {
1149      CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1150      // Use the vector register size specified by the target unless overridden
1151      // by a command-line option.
1152      // TODO: It would be better to limit the vectorization factor based on
1153      //       data type rather than just register size. For example, x86 AVX has
1154      //       256-bit registers, but it does not support integer operations
1155      //       at that width (that requires AVX2).
1156      if (MaxVectorRegSizeOption.getNumOccurrences())
1157        MaxVecRegSize = MaxVectorRegSizeOption;
1158      else
1159        MaxVecRegSize =
1160            TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1161                .getFixedValue();
1162  
1163      if (MinVectorRegSizeOption.getNumOccurrences())
1164        MinVecRegSize = MinVectorRegSizeOption;
1165      else
1166        MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1167    }
1168  
1169    /// Vectorize the tree that starts with the elements in \p VL.
1170    /// Returns the vectorized root.
1171    Value *vectorizeTree();
1172  
1173    /// Vectorize the tree but with the list of externally used values \p
1174    /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1175    /// generated extractvalue instructions.
1176    /// \param ReplacedExternals contains the list of replaced external values
1177    /// {scalar, replace} after emitting extractelement for external uses.
1178    Value *
1179    vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180                  SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181                  Instruction *ReductionRoot = nullptr);
1182  
1183    /// \returns the cost incurred by unwanted spills and fills, caused by
1184    /// holding live values over call sites.
1185    InstructionCost getSpillCost() const;
1186  
1187    /// \returns the vectorization cost of the subtree that starts at \p VL.
1188    /// A negative number means that this is profitable.
1189    InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190  
1191    /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192    /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193    void buildTree(ArrayRef<Value *> Roots,
1194                   const SmallDenseSet<Value *> &UserIgnoreLst);
1195  
1196    /// Construct a vectorizable tree that starts at \p Roots.
1197    void buildTree(ArrayRef<Value *> Roots);
1198  
1199    /// Returns whether the root node has in-tree uses.
1200    bool doesRootHaveInTreeUses() const {
1201      return !VectorizableTree.empty() &&
1202             !VectorizableTree.front()->UserTreeIndices.empty();
1203    }
1204  
1205    /// Return the scalars of the root node.
1206    ArrayRef<Value *> getRootNodeScalars() const {
1207      assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208      return VectorizableTree.front()->Scalars;
1209    }
1210  
1211    /// Checks if the root graph node can be emitted with narrower bitwidth at
1212    /// codegen and returns it signedness, if so.
1213    /// codegen and returns its signedness, if so.
1214      return MinBWs.at(VectorizableTree.front().get()).second;
1215    }
1216  
1217    /// Builds external uses of the vectorized scalars, i.e. the list of
1218    /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1219    /// ExternallyUsedValues contains an additional list of external uses to handle
1220    /// vectorization of reductions.
1221    void
1222    buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223  
1224    /// Transforms graph nodes to target specific representations, if profitable.
1225    void transformNodes();
1226  
1227    /// Clear the internal data structures that are created by 'buildTree'.
1228    void deleteTree() {
1229      VectorizableTree.clear();
1230      ScalarToTreeEntry.clear();
1231      MultiNodeScalars.clear();
1232      MustGather.clear();
1233      NonScheduledFirst.clear();
1234      EntryToLastInstruction.clear();
1235      ExternalUses.clear();
1236      ExternalUsesAsGEPs.clear();
1237      for (auto &Iter : BlocksSchedules) {
1238        BlockScheduling *BS = Iter.second.get();
1239        BS->clear();
1240      }
1241      MinBWs.clear();
1242      ReductionBitWidth = 0;
1243      CastMaxMinBWSizes.reset();
1244      ExtraBitWidthNodes.clear();
1245      InstrElementSize.clear();
1246      UserIgnoreList = nullptr;
1247      PostponedGathers.clear();
1248      ValueToGatherNodes.clear();
1249    }
1250  
1251    unsigned getTreeSize() const { return VectorizableTree.size(); }
1252  
1253    /// Perform LICM and CSE on the newly generated gather sequences.
1254    void optimizeGatherSequence();
1255  
1256    /// Checks if the specified gather tree entry \p TE can be represented as a
1257    /// shuffled vector entry + (possibly) permutation with other gathers. It
1258    /// implements the checks only for possibly ordered scalars (Loads,
1259    /// ExtractElement, ExtractValue), which can be part of the graph.
1260    std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261  
1262    /// Sort loads into increasing pointer offsets to allow greater clustering.
1263    std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264  
1265    /// Gets reordering data for the given tree entry. If the entry is vectorized
1266    /// - just return ReorderIndices, otherwise check if the scalars can be
1267    /// reordered and return the optimal order.
1268    /// \return std::nullopt if ordering is not important, empty order, if
1269    /// identity order is important, or the actual order.
1270    /// \param TopToBottom If true, include the order of vectorized stores and
1271    /// insertelement nodes, otherwise skip them.
1272    std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273                                                bool TopToBottom);
1274  
1275    /// Reorders the current graph to the most profitable order starting from the
1276    /// root node to the leaf nodes. The best order is chosen only from the nodes
1277    /// of the same size (vectorization factor). Smaller nodes are considered
1278    /// parts of a subgraph with a smaller VF and they are reordered independently.
1279    /// We can do this because we still need to extend smaller nodes to the wider
1280    /// VF and we can merge reordering shuffles with the widening shuffles.
1281    void reorderTopToBottom();
1282  
1283    /// Reorders the current graph to the most profitable order starting from
1284    /// leaves to the root. It allows rotating small subgraphs and reducing the
1285    /// number of reshuffles if the leaf nodes use the same order. In this case we
1286    /// can merge the orders and just shuffle the user node instead of shuffling
1287    /// its operands. Plus, even if the leaf nodes have different orders, it
1288    /// allows sinking the reordering in the graph closer to the root node and
1289    /// merging it later during analysis.
1290    void reorderBottomToTop(bool IgnoreReorder = false);
1291  
1292    /// \return The vector element size in bits to use when vectorizing the
1293    /// expression tree ending at \p V. If V is a store, the size is the width of
1294    /// the stored value. Otherwise, the size is the width of the largest loaded
1295    /// value reaching V. This method is used by the vectorizer to calculate
1296    /// vectorization factors.
1297    unsigned getVectorElementSize(Value *V);
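        // For illustration (hypothetical IR, not from this file): for
        // `store i32 %x, ptr %p` this returns 32, and for an expression tree fed
        // only by i8 loads it returns 8; helpers such as getMinVF() below can
        // then turn that element size into a vectorization factor.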
1298  
1299    /// Compute the minimum type sizes required to represent the entries in a
1300    /// vectorizable tree.
1301    void computeMinimumValueSizes();
1302  
1303    // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1304    unsigned getMaxVecRegSize() const {
1305      return MaxVecRegSize;
1306    }
1307  
1308    // \returns minimum vector register size as set by cl::opt.
1309    unsigned getMinVecRegSize() const {
1310      return MinVecRegSize;
1311    }
1312  
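        /// \returns the minimum vectorization factor for scalars of \p Sz bits,
        /// never less than 2. For illustration (hypothetical sizes): with
        /// getMinVecRegSize() == 128 and 32-bit scalars this is
        /// std::max(2U, 128 / 32) == 4.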
1313    unsigned getMinVF(unsigned Sz) const {
1314      return std::max(2U, getMinVecRegSize() / Sz);
1315    }
1316  
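        /// \returns the maximum vectorization factor for \p Opcode on elements of
        /// width \p ElemWidth: the MaxVFOption command-line override if it was
        /// given, otherwise the TTI hook; a result of 0 from either is treated as
        /// "no limit" (UINT_MAX).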
1317    unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318      unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320      return MaxVF ? MaxVF : UINT_MAX;
1321    }
1322  
1323    /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324    /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325    /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326    /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327    ///
1328    /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329    unsigned canMapToVector(Type *T) const;
1330  
1331    /// \returns True if the VectorizableTree is both tiny and not fully
1332    /// vectorizable. We do not vectorize such trees.
1333    bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334  
1335    /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336    /// can be load combined in the backend. Load combining may not be allowed in
1337    /// the IR optimizer, so we do not want to alter the pattern. For example,
1338    /// partially transforming a scalar bswap() pattern into vector code is
1339    /// effectively impossible for the backend to undo.
1340    /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341    ///       may not be necessary.
1342    bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343  
1344    /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345    /// can be load combined in the backend. Load combining may not be allowed in
1346    /// the IR optimizer, so we do not want to alter the pattern. For example,
1347    /// partially transforming a scalar bswap() pattern into vector code is
1348    /// effectively impossible for the backend to undo.
1349    /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350    ///       may not be necessary.
1351    bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352  
1353    /// Checks if the given array of loads can be represented as a vectorized
1354    /// load, a scatter, or just a simple gather.
1355    /// \param VL list of loads.
1356    /// \param VL0 main load value.
1357    /// \param Order returned order of load instructions.
1358    /// \param PointerOps returned list of pointer operands.
1359    /// \param TryRecursiveCheck used to check if a long masked gather can be
1360    /// represented as a series of loads/insert subvector, if profitable.
1361    LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362                                 SmallVectorImpl<unsigned> &Order,
1363                                 SmallVectorImpl<Value *> &PointerOps,
1364                                 bool TryRecursiveCheck = true) const;
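        // A minimal call sketch (hypothetical; VL holds the candidate load
        // instructions and the call is made from within the vectorizer):
        //   SmallVector<unsigned> Order;
        //   SmallVector<Value *> PointerOps;
        //   LoadsState LS = canVectorizeLoads(VL, VL.front(), Order, PointerOps);
        // The returned LoadsState describes how the loads can be vectorized (or
        // that they must be gathered), with Order/PointerOps filled in
        // accordingly.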
1365  
1366    OptimizationRemarkEmitter *getORE() { return ORE; }
1367  
1368    /// This structure holds any data we need about the edges being traversed
1369    /// during buildTree_rec(). We keep track of:
1370    /// (i) the user TreeEntry index, and
1371    /// (ii) the index of the edge.
1372    struct EdgeInfo {
1373      EdgeInfo() = default;
1374      EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1375          : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1376      /// The user TreeEntry.
1377      TreeEntry *UserTE = nullptr;
1378      /// The operand index of the use.
1379      unsigned EdgeIdx = UINT_MAX;
1380  #ifndef NDEBUG
1381      friend inline raw_ostream &operator<<(raw_ostream &OS,
1382                                            const BoUpSLP::EdgeInfo &EI) {
1383        EI.dump(OS);
1384        return OS;
1385      }
1386      /// Debug print.
1387      void dump(raw_ostream &OS) const {
1388        OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1389           << " EdgeIdx:" << EdgeIdx << "}";
1390      }
1391      LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1392  #endif
1393      bool operator == (const EdgeInfo &Other) const {
1394        return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1395      }
1396    };
1397  
1398    /// A helper class used for scoring candidates for two consecutive lanes.
1399    class LookAheadHeuristics {
1400      const TargetLibraryInfo &TLI;
1401      const DataLayout &DL;
1402      ScalarEvolution &SE;
1403      const BoUpSLP &R;
1404      int NumLanes; // Total number of lanes (aka vectorization factor).
1405      int MaxLevel; // The maximum recursion depth for accumulating score.
1406  
1407    public:
1408      LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1409                          ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1410                          int MaxLevel)
1411          : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1412            MaxLevel(MaxLevel) {}
1413  
1414      // The hard-coded scores listed here are not very important, though they
1415      // should be higher for better matches to improve the resulting cost. When
1416      // computing the scores of matching one sub-tree with another, we are
1417      // basically counting the number of values that are matching. So even if all
1418      // scores are set to 1, we would still get a decent matching result.
1419      // However, sometimes we have to break ties. For example we may have to
1420      // choose between matching loads vs matching opcodes. This is what these
1421      // scores are helping us with: they provide the order of preference. Also,
1422      // this is important if the scalar is externally used or used in another
1423      // tree entry node in a different lane.
1424  
1425      /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426      static const int ScoreConsecutiveLoads = 4;
1427      /// The same load multiple times. This should have a better score than
1428      /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1429      /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1430      /// a vector load and 1.0 for a broadcast.
1431      static const int ScoreSplatLoads = 3;
1432      /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433      static const int ScoreReversedLoads = 3;
1434      /// A load candidate for masked gather.
1435      static const int ScoreMaskedGatherCandidate = 1;
1436      /// ExtractElementInst from same vector and consecutive indexes.
1437      static const int ScoreConsecutiveExtracts = 4;
1438      /// ExtractElementInst from same vector and reversed indices.
1439      static const int ScoreReversedExtracts = 3;
1440      /// Constants.
1441      static const int ScoreConstants = 2;
1442      /// Instructions with the same opcode.
1443      static const int ScoreSameOpcode = 2;
1444      /// Instructions with alt opcodes (e.g., add + sub).
1445      static const int ScoreAltOpcodes = 1;
1446      /// Identical instructions (a.k.a. splat or broadcast).
1447      static const int ScoreSplat = 1;
1448      /// Matching with an undef is preferable to failing.
1449      static const int ScoreUndef = 1;
1450      /// Score for failing to find a decent match.
1451      static const int ScoreFail = 0;
1452      /// Score if all users are vectorized.
1453      static const int ScoreAllUserVectorized = 1;
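          // Illustrative tie-break using the values above (hypothetical
          // scalars, not from this file): pairing loads of A[i] and A[i+1]
          // scores ScoreConsecutiveLoads (4), while pairing two adds that are
          // otherwise unrelated scores ScoreSameOpcode (2), so the candidate
          // that forms the consecutive-load pair wins.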
1454  
1455      /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456      /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457      /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458      /// MainAltOps.
1459      int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1460                          ArrayRef<Value *> MainAltOps) const {
1461        if (!isValidElementType(V1->getType()) ||
1462            !isValidElementType(V2->getType()))
1463          return LookAheadHeuristics::ScoreFail;
1464  
1465        if (V1 == V2) {
1466          if (isa<LoadInst>(V1)) {
1467          // Returns true if the users of V1 and V2 won't need to be extracted.
1468            auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1469              // Bail out if we have too many uses to save compilation time.
1470              if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1471                return false;
1472  
1473              auto AllUsersVectorized = [U1, U2, this](Value *V) {
1474                return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1475                  return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1476                });
1477              };
1478              return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1479            };
1480            // A broadcast of a load can be cheaper on some targets.
1481            if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1482                                            ElementCount::getFixed(NumLanes)) &&
1483                ((int)V1->getNumUses() == NumLanes ||
1484                 AllUsersAreInternal(V1, V2)))
1485              return LookAheadHeuristics::ScoreSplatLoads;
1486          }
1487          return LookAheadHeuristics::ScoreSplat;
1488        }
1489  
1490        auto CheckSameEntryOrFail = [&]() {
1491          if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1492              TE1 && TE1 == R.getTreeEntry(V2))
1493            return LookAheadHeuristics::ScoreSplatLoads;
1494          return LookAheadHeuristics::ScoreFail;
1495        };
1496  
1497        auto *LI1 = dyn_cast<LoadInst>(V1);
1498        auto *LI2 = dyn_cast<LoadInst>(V2);
1499        if (LI1 && LI2) {
1500          if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1501              !LI2->isSimple())
1502            return CheckSameEntryOrFail();
1503  
1504          std::optional<int> Dist = getPointersDiff(
1505              LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1506              LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1507          if (!Dist || *Dist == 0) {
1508            if (getUnderlyingObject(LI1->getPointerOperand()) ==
1509                    getUnderlyingObject(LI2->getPointerOperand()) &&
1510                R.TTI->isLegalMaskedGather(
1511                    getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1512              return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1513            return CheckSameEntryOrFail();
1514          }
1515          // The distance is too large - still may be profitable to use masked
1516          // loads/gathers.
1517          if (std::abs(*Dist) > NumLanes / 2)
1518            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1519          // This still will detect consecutive loads, but we might have "holes"
1520          // in some cases. It is ok for non-power-2 vectorization and may produce
1521          // better results. It should not affect current vectorization.
1522          return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1523                             : LookAheadHeuristics::ScoreReversedLoads;
1524        }
1525  
1526        auto *C1 = dyn_cast<Constant>(V1);
1527        auto *C2 = dyn_cast<Constant>(V2);
1528        if (C1 && C2)
1529          return LookAheadHeuristics::ScoreConstants;
1530  
1531        // Extracts from consecutive indexes of the same vector get a better
1532        // score, as the extracts could be optimized away.
1533        Value *EV1;
1534        ConstantInt *Ex1Idx;
1535        if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1536          // Undefs are always profitable for extractelements.
1537          // Compiler can easily combine poison and extractelement <non-poison> or
1538          // undef and extractelement <poison>. But combining undef +
1539          // extractelement <non-poison-but-may-produce-poison> requires some
1540          // extra operations.
1541          if (isa<UndefValue>(V2))
1542            return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1543                       ? LookAheadHeuristics::ScoreConsecutiveExtracts
1544                       : LookAheadHeuristics::ScoreSameOpcode;
1545          Value *EV2 = nullptr;
1546          ConstantInt *Ex2Idx = nullptr;
1547          if (match(V2,
1548                    m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1549                                                           m_Undef())))) {
1550            // Undefs are always profitable for extractelements.
1551            if (!Ex2Idx)
1552              return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553            if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1554              return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555            if (EV2 == EV1) {
1556              int Idx1 = Ex1Idx->getZExtValue();
1557              int Idx2 = Ex2Idx->getZExtValue();
1558              int Dist = Idx2 - Idx1;
1559              // The distance is too large - still may be profitable to use
1560              // shuffles.
1561              if (std::abs(Dist) == 0)
1562                return LookAheadHeuristics::ScoreSplat;
1563              if (std::abs(Dist) > NumLanes / 2)
1564                return LookAheadHeuristics::ScoreSameOpcode;
1565              return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566                                : LookAheadHeuristics::ScoreReversedExtracts;
1567            }
1568            return LookAheadHeuristics::ScoreAltOpcodes;
1569          }
1570          return CheckSameEntryOrFail();
1571        }
1572  
1573        auto *I1 = dyn_cast<Instruction>(V1);
1574        auto *I2 = dyn_cast<Instruction>(V2);
1575        if (I1 && I2) {
1576          if (I1->getParent() != I2->getParent())
1577            return CheckSameEntryOrFail();
1578          SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579          Ops.push_back(I1);
1580          Ops.push_back(I2);
1581          InstructionsState S = getSameOpcode(Ops, TLI);
1582          // Note: Only consider instructions with <= 2 operands to avoid
1583          // complexity explosion.
1584          if (S.getOpcode() &&
1585              (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586               !S.isAltShuffle()) &&
1587              all_of(Ops, [&S](Value *V) {
1588                return cast<Instruction>(V)->getNumOperands() ==
1589                       S.MainOp->getNumOperands();
1590              }))
1591            return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592                                    : LookAheadHeuristics::ScoreSameOpcode;
1593        }
1594  
1595        if (isa<UndefValue>(V2))
1596          return LookAheadHeuristics::ScoreUndef;
1597  
1598        return CheckSameEntryOrFail();
1599      }
1600  
1601      /// Go through the operands of \p LHS and \p RHS recursively until
1602      /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1603      /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604      /// of \p U1 and \p U2), except at the beginning of the recursion where
1605      /// these are set to nullptr.
1606      ///
1607      /// For example:
1608      /// \verbatim
1609      ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
1610      ///     \ /         \ /         \ /        \ /
1611      ///      +           +           +          +
1612      ///     G1          G2          G3         G4
1613      /// \endverbatim
1614      /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615      /// each level recursively, accumulating the score. It starts from matching
1616      /// the additions at level 0, then moves on to the loads (level 1). The
1617      /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618      /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619      /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620      /// Please note that the order of the operands does not matter, as we
1621      /// evaluate the score of all profitable combinations of operands. In
1622      /// other words the score of G1 and G4 is the same as G1 and G2. This
1623      /// heuristic is based on ideas described in:
1624      ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
1625      ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626      ///   Luís F. W. Góes
1627      int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628                             Instruction *U2, int CurrLevel,
1629                             ArrayRef<Value *> MainAltOps) const {
1630  
1631        // Get the shallow score of LHS and RHS.
1632        int ShallowScoreAtThisLevel =
1633            getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1634  
1635        // If reached MaxLevel,
1636        //  or if LHS and RHS are not instructions,
1637        //  or if they are SPLAT,
1638        //  or if they are not consecutive,
1639        //  or if profitable to vectorize loads or extractelements, early return
1640        //  the current cost.
1641        auto *I1 = dyn_cast<Instruction>(LHS);
1642        auto *I2 = dyn_cast<Instruction>(RHS);
1643        if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1644            ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1645            (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1646              (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1647              (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1648             ShallowScoreAtThisLevel))
1649          return ShallowScoreAtThisLevel;
1650        assert(I1 && I2 && "Should have early exited.");
1651  
1652        // Contains the I2 operand indexes that got matched with I1 operands.
1653        SmallSet<unsigned, 4> Op2Used;
1654  
1655        // Recursion towards the operands of I1 and I2. We are trying all possible
1656        // operand pairs, and keeping track of the best score.
1657        for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1658             OpIdx1 != NumOperands1; ++OpIdx1) {
1659          // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1660          int MaxTmpScore = 0;
1661          unsigned MaxOpIdx2 = 0;
1662          bool FoundBest = false;
1663          // If I2 is commutative try all combinations.
1664          unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1665          unsigned ToIdx = isCommutative(I2)
1666                               ? I2->getNumOperands()
1667                               : std::min(I2->getNumOperands(), OpIdx1 + 1);
1668          assert(FromIdx <= ToIdx && "Bad index");
1669          for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1670            // Skip operands already paired with OpIdx1.
1671            if (Op2Used.count(OpIdx2))
1672              continue;
1673            // Recursively calculate the cost at each level
1674            int TmpScore =
1675                getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1676                                   I1, I2, CurrLevel + 1, std::nullopt);
1677            // Look for the best score.
1678            if (TmpScore > LookAheadHeuristics::ScoreFail &&
1679                TmpScore > MaxTmpScore) {
1680              MaxTmpScore = TmpScore;
1681              MaxOpIdx2 = OpIdx2;
1682              FoundBest = true;
1683            }
1684          }
1685          if (FoundBest) {
1686            // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1687            Op2Used.insert(MaxOpIdx2);
1688            ShallowScoreAtThisLevel += MaxTmpScore;
1689          }
1690        }
1691        return ShallowScoreAtThisLevel;
1692      }
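          // A minimal usage sketch (hypothetical, mirroring how VLOperands
          // below drives this class):
          //   LookAheadHeuristics LA(TLI, DL, SE, R, /*NumLanes=*/2,
          //                          /*MaxLevel=*/2);
          //   int S = LA.getScoreAtLevelRec(V1, V2, /*U1=*/nullptr,
          //                                 /*U2=*/nullptr, /*CurrLevel=*/1,
          //                                 std::nullopt);
          // A higher S means V1, V2 and their operand sub-trees match better.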
1693    };
1694    /// A helper data structure to hold the operands of a vector of instructions.
1695    /// This supports a fixed vector length for all operand vectors.
1696    class VLOperands {
1697      /// For each operand we need (i) the value, and (ii) the opcode that it
1698      /// would be attached to if the expression was in a left-linearized form.
1699      /// This is required to avoid illegal operand reordering.
1700      /// For example:
1701      /// \verbatim
1702      ///                         0 Op1
1703      ///                         |/
1704      /// Op1 Op2   Linearized    + Op2
1705      ///   \ /     ---------->   |/
1706      ///    -                    -
1707      ///
1708      /// Op1 - Op2            (0 + Op1) - Op2
1709      /// \endverbatim
1710      ///
1711      /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712      ///
1713      /// Another way to think of this is to track all the operations across the
1714      /// path from the operand all the way to the root of the tree and to
1715      /// calculate the operation that corresponds to this path. For example, the
1716      /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717      /// corresponding operation is a '-' (which matches the one in the
1718      /// linearized tree, as shown above).
1719      ///
1720      /// For lack of a better term, we refer to this operation as Accumulated
1721      /// Path Operation (APO).
1722      struct OperandData {
1723        OperandData() = default;
1724        OperandData(Value *V, bool APO, bool IsUsed)
1725            : V(V), APO(APO), IsUsed(IsUsed) {}
1726        /// The operand value.
1727        Value *V = nullptr;
1728        /// TreeEntries only allow a single opcode, or an alternate sequence of
1729        /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1730        /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1731        /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1732        /// (e.g., Add/Mul).
1733        bool APO = false;
1734        /// Helper data for the reordering function.
1735        bool IsUsed = false;
1736      };
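          // Worked APO example (illustrative): for a lane computing X - Y the
          // left-linearized form is (0 + X) - Y, so the OperandData for X has
          // APO == false (attached to the '+') and the OperandData for Y has
          // APO == true (attached to the '-').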
1737  
1738      /// During operand reordering, we are trying to select the operand at lane
1739      /// that matches best with the operand at the neighboring lane. Our
1740      /// selection is based on the type of value we are looking for. For example,
1741      /// if the neighboring lane has a load, we need to look for a load that is
1742      /// accessing a consecutive address. These strategies are summarized in the
1743      /// 'ReorderingMode' enumerator.
1744      enum class ReorderingMode {
1745        Load,     ///< Matching loads to consecutive memory addresses
1746        Opcode,   ///< Matching instructions based on opcode (same or alternate)
1747        Constant, ///< Matching constants
1748        Splat,    ///< Matching the same instruction multiple times (broadcast)
1749        Failed,   ///< We failed to create a vectorizable group
1750      };
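          // Illustrative mode selection (see reorder() below for the actual
          // logic): a first-lane operand that is a LoadInst picks
          // ReorderingMode::Load, a Constant picks ReorderingMode::Constant, an
          // Argument falls back to ReorderingMode::Splat, and other
          // instructions pick Splat or Opcode depending on shouldBroadcast()
          // and canBeVectorized().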
1751  
1752      using OperandDataVec = SmallVector<OperandData, 2>;
1753  
1754      /// A vector of operand vectors.
1755      SmallVector<OperandDataVec, 4> OpsVec;
1756  
1757      const TargetLibraryInfo &TLI;
1758      const DataLayout &DL;
1759      ScalarEvolution &SE;
1760      const BoUpSLP &R;
1761      const Loop *L = nullptr;
1762  
1763      /// \returns the operand data at \p OpIdx and \p Lane.
1764      OperandData &getData(unsigned OpIdx, unsigned Lane) {
1765        return OpsVec[OpIdx][Lane];
1766      }
1767  
1768      /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1769      const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1770        return OpsVec[OpIdx][Lane];
1771      }
1772  
1773      /// Clears the used flag for all entries.
1774      void clearUsed() {
1775        for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1776             OpIdx != NumOperands; ++OpIdx)
1777          for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1778               ++Lane)
1779            OpsVec[OpIdx][Lane].IsUsed = false;
1780      }
1781  
1782      /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1783      void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1784        std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1785      }
1786  
1787      /// \param Lane lane of the operands under analysis.
1788      /// \param OpIdx operand index in \p Lane lane we're looking for the best
1789      /// candidate for.
1790      /// \param Idx operand index of the current candidate value.
1791      /// \returns The additional score due to possible broadcasting of the
1792      /// elements in the lane. It is more profitable to have a power-of-2 number
1793      /// of unique elements in the lane, since such a lane will be vectorized with
1794      /// higher probability after removing duplicates. Currently the SLP vectorizer
1795      /// supports only vectorization of a power-of-2 number of unique scalars.
1796      int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797        Value *IdxLaneV = getData(Idx, Lane).V;
1798        if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799          return 0;
1800        SmallPtrSet<Value *, 4> Uniques;
1801        for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802          if (Ln == Lane)
1803            continue;
1804          Value *OpIdxLnV = getData(OpIdx, Ln).V;
1805          if (!isa<Instruction>(OpIdxLnV))
1806            return 0;
1807          Uniques.insert(OpIdxLnV);
1808        }
1809        int UniquesCount = Uniques.size();
1810        int UniquesCntWithIdxLaneV =
1811            Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812        Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813        int UniquesCntWithOpIdxLaneV =
1814            Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815        if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816          return 0;
1817        return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1818                UniquesCntWithOpIdxLaneV) -
1819               (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820      }
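          // Worked example for the formula above (hypothetical counts): if
          // keeping the current operand leaves 5 unique values in this
          // operand's column (padded to 8 lanes when vectorizing) while the
          // candidate leaves 4 (already a power of 2), the returned score is
          // (8 - 5) - (4 - 4) == 3 in favour of the candidate.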
1821  
1822      /// \param Lane lane of the operands under analysis.
1823      /// \param OpIdx operand index in \p Lane lane we're looking for the best
1824      /// candidate for.
1825      /// \param Idx operand index of the current candidate value.
1826      /// \returns The additional score for the scalar whose users are all
1827      /// vectorized.
1828      int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829        Value *IdxLaneV = getData(Idx, Lane).V;
1830        Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831        // Do not care about number of uses for vector-like instructions
1832        // (extractelement/extractvalue with constant indices), they are extracts
1833        // themselves and already externally used. Vectorization of such
1834        // instructions does not add extra extractelement instruction, just may
1835        // remove it.
1836        if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1837            isVectorLikeInstWithConstOps(OpIdxLaneV))
1838          return LookAheadHeuristics::ScoreAllUserVectorized;
1839        auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1840        if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1841          return 0;
1842        return R.areAllUsersVectorized(IdxLaneI)
1843                   ? LookAheadHeuristics::ScoreAllUserVectorized
1844                   : 0;
1845      }
1846  
1847      /// Score scaling factor for fully compatible instructions but with
1848      /// different number of external uses. Allows better selection of the
1849      /// instructions with fewer external uses.
1850      static const int ScoreScaleFactor = 10;
1851  
1852      /// \Returns the look-ahead score, which tells us how much the sub-trees
1853      /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1854      /// score. This helps break ties in an informed way when we cannot decide on
1855      /// the order of the operands by just considering the immediate
1856      /// predecessors.
1857      int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858                            int Lane, unsigned OpIdx, unsigned Idx,
1859                            bool &IsUsed) {
1860        LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861                                      LookAheadMaxDepth);
1862        // Keep track of the instruction stack as we recurse into the operands
1863        // during the look-ahead score exploration.
1864        int Score =
1865            LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866                                         /*CurrLevel=*/1, MainAltOps);
1867        if (Score) {
1868          int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869          if (Score <= -SplatScore) {
1870            // Set the minimum score for splat-like sequence to avoid setting
1871            // failed state.
1872            Score = 1;
1873          } else {
1874            Score += SplatScore;
1875            // Scale score to see the difference between different operands
1876            // and similar operands but all vectorized/not all vectorized
1877            // uses. It does not affect actual selection of the best
1878            // compatible operand in general, just allows to select the
1879            // operand with all vectorized uses.
1880            Score *= ScoreScaleFactor;
1881            Score += getExternalUseScore(Lane, OpIdx, Idx);
1882            IsUsed = true;
1883          }
1884        }
1885        return Score;
1886      }
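          // For illustration (hypothetical scores): a recursive score of 4
          // with a splat score of 1 becomes (4 + 1) * ScoreScaleFactor == 50,
          // plus 1 more if all users of the candidate are already vectorized
          // (getExternalUseScore), so external-use information only breaks
          // ties between otherwise equally scored operands.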
1887  
1888      /// Best defined scores per lanes between the passes. Used to choose the
1889      /// best operand (with the highest score) between the passes.
1890      /// The key - {Operand Index, Lane}.
1891      /// The value - the best score between the passes for the lane and the
1892      /// operand.
1893      SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894          BestScoresPerLanes;
1895  
1896      // Search all operands in Ops[*][Lane] for the one that matches best
1897      // Ops[OpIdx][LastLane] and return its operand index.
1898      // If no good match can be found, return std::nullopt.
1899      std::optional<unsigned>
1900      getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901                     ArrayRef<ReorderingMode> ReorderingModes,
1902                     ArrayRef<Value *> MainAltOps) {
1903        unsigned NumOperands = getNumOperands();
1904  
1905        // The operand of the previous lane at OpIdx.
1906        Value *OpLastLane = getData(OpIdx, LastLane).V;
1907  
1908        // Our strategy mode for OpIdx.
1909        ReorderingMode RMode = ReorderingModes[OpIdx];
1910        if (RMode == ReorderingMode::Failed)
1911          return std::nullopt;
1912  
1913        // The linearized opcode of the operand at OpIdx, Lane.
1914        bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915  
1916        // The best operand index and its score.
1917        // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918        // are using the score to differentiate between the two.
1919        struct BestOpData {
1920          std::optional<unsigned> Idx;
1921          unsigned Score = 0;
1922        } BestOp;
1923        BestOp.Score =
1924            BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1925                .first->second;
1926  
1927        // Track if the operand must be marked as used. If the operand is set to
1928        // Score 1 explicitly because of a non-power-of-2 number of unique scalars,
1929        // we may want to re-estimate the operands on the following iterations.
1930        bool IsUsed = RMode == ReorderingMode::Splat ||
1931                      RMode == ReorderingMode::Constant ||
1932                      RMode == ReorderingMode::Load;
1933        // Iterate through all unused operands and look for the best.
1934        for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935          // Get the operand at Idx and Lane.
1936          OperandData &OpData = getData(Idx, Lane);
1937          Value *Op = OpData.V;
1938          bool OpAPO = OpData.APO;
1939  
1940          // Skip already selected operands.
1941          if (OpData.IsUsed)
1942            continue;
1943  
1944          // Skip if we are trying to move the operand to a position with a
1945          // different opcode in the linearized tree form. This would break the
1946          // semantics.
1947          if (OpAPO != OpIdxAPO)
1948            continue;
1949  
1950          // Look for an operand that matches the current mode.
1951          switch (RMode) {
1952          case ReorderingMode::Load:
1953          case ReorderingMode::Opcode: {
1954            bool LeftToRight = Lane > LastLane;
1955            Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956            Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1957            int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1958                                          OpIdx, Idx, IsUsed);
1959            if (Score > static_cast<int>(BestOp.Score) ||
1960                (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1961                 Idx == OpIdx)) {
1962              BestOp.Idx = Idx;
1963              BestOp.Score = Score;
1964              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1965            }
1966            break;
1967          }
1968          case ReorderingMode::Constant:
1969            if (isa<Constant>(Op) ||
1970                (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1971              BestOp.Idx = Idx;
1972              if (isa<Constant>(Op)) {
1973                BestOp.Score = LookAheadHeuristics::ScoreConstants;
1974                BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1975                    LookAheadHeuristics::ScoreConstants;
1976              }
1977              if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1978                IsUsed = false;
1979            }
1980            break;
1981          case ReorderingMode::Splat:
1982            if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1983              IsUsed = Op == OpLastLane;
1984              if (Op == OpLastLane) {
1985                BestOp.Score = LookAheadHeuristics::ScoreSplat;
1986                BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1987                    LookAheadHeuristics::ScoreSplat;
1988              }
1989              BestOp.Idx = Idx;
1990            }
1991            break;
1992          case ReorderingMode::Failed:
1993            llvm_unreachable("Not expected Failed reordering mode.");
1994          }
1995        }
1996  
1997        if (BestOp.Idx) {
1998          getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1999          return BestOp.Idx;
2000        }
2001        // If we could not find a good match return std::nullopt.
2002        return std::nullopt;
2003      }
2004  
2005      /// Helper for reorderOperandVecs.
2006      /// \returns the lane that we should start reordering from. This is the one
2007      /// with the fewest operands that can freely move about, or the least
2008      /// profitable one because it already has the most optimal set of operands.
2009      unsigned getBestLaneToStartReordering() const {
2010        unsigned Min = UINT_MAX;
2011        unsigned SameOpNumber = 0;
2012        // std::pair<unsigned, unsigned> is used to implement a simple voting
2013        // algorithm and choose the lane with the fewest operands that can freely
2014        // move about, or the least profitable one because it already has the
2015        // most optimal set of operands. The first unsigned is a counter for
2016        // voting, the second unsigned is the counter of lanes with instructions
2017        // with same/alternate opcodes and same parent basic block.
2018        MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2019        // Try to be closer to the original results, if we have multiple lanes
2020        // with same cost. If 2 lanes have the same cost, use the one with the
2021        // lowest index.
2022        for (int I = getNumLanes(); I > 0; --I) {
2023          unsigned Lane = I - 1;
2024          OperandsOrderData NumFreeOpsHash =
2025              getMaxNumOperandsThatCanBeReordered(Lane);
2026          // Compare the number of operands that can move and choose the one with
2027          // the least number.
2028          if (NumFreeOpsHash.NumOfAPOs < Min) {
2029            Min = NumFreeOpsHash.NumOfAPOs;
2030            SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2031            HashMap.clear();
2032            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2033          } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2034                     NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2035            // Select the most optimal lane in terms of number of operands that
2036            // should be moved around.
2037            SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2038            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2039          } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2040                     NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2041            auto *It = HashMap.find(NumFreeOpsHash.Hash);
2042            if (It == HashMap.end())
2043              HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2044            else
2045              ++It->second.first;
2046          }
2047        }
2048        // Select the lane with the minimum counter.
2049        unsigned BestLane = 0;
2050        unsigned CntMin = UINT_MAX;
2051        for (const auto &Data : reverse(HashMap)) {
2052          if (Data.second.first < CntMin) {
2053            CntMin = Data.second.first;
2054            BestLane = Data.second.second;
2055          }
2056        }
2057        return BestLane;
2058      }
2059  
2060      /// Data structure that helps to reorder operands.
2061      struct OperandsOrderData {
2062        /// The best number of operands with the same APOs, which can be
2063        /// reordered.
2064        unsigned NumOfAPOs = UINT_MAX;
2065        /// Number of operands with the same/alternate instruction opcode and
2066        /// parent.
2067        unsigned NumOpsWithSameOpcodeParent = 0;
2068        /// Hash for the actual operands ordering.
2069        /// Used to count operands, actually their position id and opcode
2070        /// value. It is used in the voting mechanism to find the lane with the
2071        /// fewest operands that can freely move about, or the least profitable one
2072        /// because it already has the most optimal set of operands. It could be
2073        /// replaced with a SmallVector<unsigned> instead, but a hash code is faster
2074        /// and requires less memory.
2075        unsigned Hash = 0;
2076      };
2077      /// \returns the maximum number of operands that are allowed to be reordered
2078      /// for \p Lane and the number of compatible instructions (with the same
2079      /// parent/opcode). This is used as a heuristic for selecting the first lane
2080      /// to start operand reordering.
2081      OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2082        unsigned CntTrue = 0;
2083        unsigned NumOperands = getNumOperands();
2084        // Operands with the same APO can be reordered. We therefore need to count
2085        // how many of them we have for each APO, like this: Cnt[APO] = x.
2086        // Since we only have two APOs, namely true and false, we can avoid using
2087        // a map. Instead we can simply count the number of operands that
2088        // correspond to one of them (in this case the 'true' APO), and calculate
2089        // the other by subtracting it from the total number of operands.
2090        // Operands with the same instruction opcode and parent are more
2091        // profitable since we don't need to move them in many cases, with a high
2092        // probability such lane already can be vectorized effectively.
2093        bool AllUndefs = true;
2094        unsigned NumOpsWithSameOpcodeParent = 0;
2095        Instruction *OpcodeI = nullptr;
2096        BasicBlock *Parent = nullptr;
2097        unsigned Hash = 0;
2098        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2099          const OperandData &OpData = getData(OpIdx, Lane);
2100          if (OpData.APO)
2101            ++CntTrue;
2102          // Use Boyer-Moore majority voting for finding the majority opcode and
2103          // the number of times it occurs.
2104          if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2105            if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2106                I->getParent() != Parent) {
2107              if (NumOpsWithSameOpcodeParent == 0) {
2108                NumOpsWithSameOpcodeParent = 1;
2109                OpcodeI = I;
2110                Parent = I->getParent();
2111              } else {
2112                --NumOpsWithSameOpcodeParent;
2113              }
2114            } else {
2115              ++NumOpsWithSameOpcodeParent;
2116            }
2117          }
2118          Hash = hash_combine(
2119              Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120          AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2121        }
2122        if (AllUndefs)
2123          return {};
2124        OperandsOrderData Data;
2125        Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2126        Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127        Data.Hash = Hash;
2128        return Data;
2129      }
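          // Worked example (two operands per lane, illustrative): an 'add'
          // lane has APOs {false, false}, so NumOfAPOs == max(0, 2) == 2,
          // while a 'sub' lane has APOs {false, true}, so
          // NumOfAPOs == max(1, 1) == 1; getBestLaneToStartReordering() above
          // therefore prefers to start from the 'sub' lane, whose operands
          // cannot be freely swapped.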
2130  
2131      /// Go through the instructions in VL and append their operands.
2132      void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133        assert(!VL.empty() && "Bad VL");
2134        assert((empty() || VL.size() == getNumLanes()) &&
2135               "Expected same number of lanes");
2136        assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137        unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2138        constexpr unsigned IntrinsicNumOperands = 2;
2139        if (isa<IntrinsicInst>(VL[0]))
2140          NumOperands = IntrinsicNumOperands;
2141        OpsVec.resize(NumOperands);
2142        unsigned NumLanes = VL.size();
2143        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144          OpsVec[OpIdx].resize(NumLanes);
2145          for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146            assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147            // Our tree has just 3 nodes: the root and two operands.
2148            // It is therefore trivial to get the APO. We only need to check the
2149            // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150            // RHS operand. The LHS operand of both add and sub is never attached
2151            // to an inverse operation in the linearized form, therefore its APO
2152            // is false. The RHS is true only if VL[Lane] is an inverse operation.
2153  
2154            // Since operand reordering is performed on groups of commutative
2155            // operations or alternating sequences (e.g., +, -), we can safely
2156            // tell the inverse operations by checking commutativity.
2157            bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2158            bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159            OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2160                                   APO, false};
2161          }
2162        }
2163      }
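          // For illustration (hypothetical scalars): for VL = {add a, b;
          // sub c, d} this appends OpsVec[0] = {a, c}, both with APO == false,
          // and OpsVec[1] = {b, d}, where b gets APO == false (commutative
          // add) and d gets APO == true (non-commutative sub).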
2164  
2165      /// \returns the number of operands.
2166      unsigned getNumOperands() const { return OpsVec.size(); }
2167  
2168      /// \returns the number of lanes.
2169      unsigned getNumLanes() const { return OpsVec[0].size(); }
2170  
2171      /// \returns the operand value at \p OpIdx and \p Lane.
2172      Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173        return getData(OpIdx, Lane).V;
2174      }
2175  
2176      /// \returns true if the data structure is empty.
2177      bool empty() const { return OpsVec.empty(); }
2178  
2179      /// Clears the data.
2180      void clear() { OpsVec.clear(); }
2181  
2182      /// \Returns true if there are enough operands identical to \p Op to fill
2183      /// the whole vector, possibly mixed with constants or loop-invariant values.
2184      /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2185      bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186        bool OpAPO = getData(OpIdx, Lane).APO;
2187        bool IsInvariant = L && L->isLoopInvariant(Op);
2188        unsigned Cnt = 0;
2189        for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190          if (Ln == Lane)
2191            continue;
2192          // This is set to true if we found a candidate for broadcast at Lane.
2193          bool FoundCandidate = false;
2194          for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195            OperandData &Data = getData(OpI, Ln);
2196            if (Data.APO != OpAPO || Data.IsUsed)
2197              continue;
2198            Value *OpILane = getValue(OpI, Lane);
2199            bool IsConstantOp = isa<Constant>(OpILane);
2200            // Consider the broadcast candidate if:
2201            // 1. Same value is found in one of the operands.
2202            if (Data.V == Op ||
2203                // 2. The operand in the given lane is not constant but there is a
2204                // constant operand in another lane (which can be moved to the
2205                // given lane). In this case we can represent it as a simple
2206                // permutation of constant and broadcast.
2207                (!IsConstantOp &&
2208                 ((Lns > 2 && isa<Constant>(Data.V)) ||
2209                  // 2.1. If we have only 2 lanes, need to check that value in the
2210                  // next lane does not build same opcode sequence.
2211                  (Lns == 2 &&
2212                   !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2213                        .getOpcode() &&
2214                   isa<Constant>(Data.V)))) ||
2215                // 3. The operand in the current lane is loop invariant (can be
2216                // hoisted out) and another operand is also a loop invariant
2217                // (though not a constant). In this case the whole vector can be
2218                // hoisted out.
2219                // FIXME: need to teach the cost model about this case for better
2220                // estimation.
2221                (IsInvariant && !isa<Constant>(Data.V) &&
2222                 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2223                 L->isLoopInvariant(Data.V))) {
2224              FoundCandidate = true;
2225              Data.IsUsed = Data.V == Op;
2226              if (Data.V == Op)
2227                ++Cnt;
2228              break;
2229            }
2230          }
2231          if (!FoundCandidate)
2232            return false;
2233        }
2234        return getNumLanes() == 2 || Cnt > 1;
2235      }
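          // For illustration (hypothetical operands, 4 lanes): if the values
          // at OpIdx are {x, x, 7, x} and we query lane 0, x is found again in
          // lanes 1 and 3 and the constant 7 can be permuted into place, so
          // this returns true and the operand can be emitted as a broadcast
          // mixed with a constant.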
2236  
2237      /// Checks if there is at least a single compatible operand in lanes other
2238      /// than \p Lane, compatible with the operand \p Op.
2239      bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240        bool OpAPO = getData(OpIdx, Lane).APO;
2241        for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242          if (Ln == Lane)
2243            continue;
2244          if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2245                const OperandData &Data = getData(OpI, Ln);
2246                if (Data.APO != OpAPO || Data.IsUsed)
2247                  return true;
2248                Value *OpILn = getValue(OpI, Ln);
2249                return (L && L->isLoopInvariant(OpILn)) ||
2250                       (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2251                        Op->getParent() == cast<Instruction>(OpILn)->getParent());
2252              }))
2253            return true;
2254        }
2255        return false;
2256      }
2257  
2258    public:
2259      /// Initialize with all the operands of the instruction vector \p RootVL.
2260      VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261          : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262            L(R.LI->getLoopFor(
2263                (cast<Instruction>(RootVL.front())->getParent()))) {
2264        // Append all the operands of RootVL.
2265        appendOperandsOfVL(RootVL);
2266      }
2267  
2268      /// \Returns a value vector with the operands across all lanes for the
2269      /// operand at \p OpIdx.
2270      ValueList getVL(unsigned OpIdx) const {
2271        ValueList OpVL(OpsVec[OpIdx].size());
2272        assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273               "Expected same num of lanes across all operands");
2274        for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275          OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276        return OpVL;
2277      }
2278  
2279      // Performs operand reordering for 2 or more operands.
2280      // The original operands are in OrigOps[OpIdx][Lane].
2281      // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2282      void reorder() {
2283        unsigned NumOperands = getNumOperands();
2284        unsigned NumLanes = getNumLanes();
2285        // Each operand has its own mode. We are using this mode to help us select
2286        // the instructions for each lane, so that they match best with the ones
2287        // we have selected so far.
2288        SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2289  
2290        // This is a greedy single-pass algorithm. We are going over each lane
2291        // once and deciding on the best order right away with no back-tracking.
2292        // However, in order to increase its effectiveness, we start with the lane
2293        // that has operands that can move the least. For example, given the
2294        // following lanes:
2295        //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
2296        //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
2297        //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
2298        //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
2299        // we will start at Lane 1, since the operands of the subtraction cannot
2300        // be reordered. Then we will visit the rest of the lanes in a circular
2301        // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2302  
2303        // Find the first lane that we will start our search from.
2304        unsigned FirstLane = getBestLaneToStartReordering();
2305  
2306        // Initialize the modes.
2307        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2308          Value *OpLane0 = getValue(OpIdx, FirstLane);
2309          // Keep track if we have instructions with all the same opcode on one
2310          // side.
2311          if (isa<LoadInst>(OpLane0))
2312            ReorderingModes[OpIdx] = ReorderingMode::Load;
2313          else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2314            // Check if OpLane0 should be broadcast.
2315            if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2316                !canBeVectorized(OpILane0, OpIdx, FirstLane))
2317              ReorderingModes[OpIdx] = ReorderingMode::Splat;
2318            else
2319              ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2320          } else if (isa<Constant>(OpLane0))
2321            ReorderingModes[OpIdx] = ReorderingMode::Constant;
2322          else if (isa<Argument>(OpLane0))
2323            // Our best hope is a Splat. It may save some cost in some cases.
2324            ReorderingModes[OpIdx] = ReorderingMode::Splat;
2325          else
2326            // NOTE: This should be unreachable.
2327            ReorderingModes[OpIdx] = ReorderingMode::Failed;
2328        }
2329  
2330        // Check that we don't have the same operands. No need to reorder if the
2331        // operands are just a perfect diamond or shuffled diamond match, except
2332        // for possible broadcasts or a non-power-of-2 number of scalars (just for
2333        // now), where reordering is still performed.
2334        auto &&SkipReordering = [this]() {
2335          SmallPtrSet<Value *, 4> UniqueValues;
2336          ArrayRef<OperandData> Op0 = OpsVec.front();
2337          for (const OperandData &Data : Op0)
2338            UniqueValues.insert(Data.V);
2339          for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2340            if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2341                  return !UniqueValues.contains(Data.V);
2342                }))
2343              return false;
2344          }
2345          // TODO: Check if we can remove a check for non-power-2 number of
2346          // scalars after full support of non-power-2 vectorization.
2347          return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2348        };
2349  
2350        // If the initial strategy fails for any of the operand indexes, then we
2351        // perform reordering again in a second pass. This helps avoid assigning
2352        // high priority to the failed strategy, and should improve reordering for
2353        // the non-failed operand indexes.
2354        for (int Pass = 0; Pass != 2; ++Pass) {
2355          // Check if there is no need to reorder the operands, since they are a
2356          // perfect or shuffled diamond match.
2357          // Need to do it to avoid extra external use cost counting for
2358          // shuffled matches, which may cause regressions.
2359          if (SkipReordering())
2360            break;
2361          // Skip the second pass if the first pass did not fail.
2362          bool StrategyFailed = false;
2363          // Mark all operand data as free to use.
2364          clearUsed();
2365          // We keep the original operand order for the FirstLane, so reorder the
2366          // rest of the lanes. We are visiting the nodes in a circular fashion,
2367          // using FirstLane as the center point and increasing the radius
2368          // distance.
2369          SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370          for (unsigned I = 0; I < NumOperands; ++I)
2371            MainAltOps[I].push_back(getData(I, FirstLane).V);
2372  
2373          for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374            // Visit the lane on the right and then the lane on the left.
2375            for (int Direction : {+1, -1}) {
2376              int Lane = FirstLane + Direction * Distance;
2377              if (Lane < 0 || Lane >= (int)NumLanes)
2378                continue;
2379              int LastLane = Lane - Direction;
2380              assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381                     "Out of bounds");
2382              // Look for a good match for each operand.
2383              for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384              // Search for the operand that best matches the OpIdx operand of the
2385                std::optional<unsigned> BestIdx = getBestOperand(
2386                    OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2387                // By not selecting a value, we allow the operands that follow to
2388                // select a better matching value. We will get a non-null value in
2389                // the next run of getBestOperand().
2390                if (BestIdx) {
2391                  // Swap the current operand with the one returned by
2392                  // getBestOperand().
2393                  swap(OpIdx, *BestIdx, Lane);
2394                } else {
2395                  // Enable the second pass.
2396                  StrategyFailed = true;
2397                }
2398                // Try to get the alternate opcode and follow it during analysis.
2399                if (MainAltOps[OpIdx].size() != 2) {
2400                  OperandData &AltOp = getData(OpIdx, Lane);
2401                  InstructionsState OpS =
2402                      getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403                  if (OpS.getOpcode() && OpS.isAltShuffle())
2404                    MainAltOps[OpIdx].push_back(AltOp.V);
2405                }
2406              }
2407            }
2408          }
2409          // Skip second pass if the strategy did not fail.
2410          if (!StrategyFailed)
2411            break;
2412        }
2413      }
2414  
2415  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2416      LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417        switch (RMode) {
2418        case ReorderingMode::Load:
2419          return "Load";
2420        case ReorderingMode::Opcode:
2421          return "Opcode";
2422        case ReorderingMode::Constant:
2423          return "Constant";
2424        case ReorderingMode::Splat:
2425          return "Splat";
2426        case ReorderingMode::Failed:
2427          return "Failed";
2428        }
2429        llvm_unreachable("Unimplemented Reordering Type");
2430      }
2431  
2432      LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433                                                     raw_ostream &OS) {
2434        return OS << getModeStr(RMode);
2435      }
2436  
2437      /// Debug print.
2438      LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439        printMode(RMode, dbgs());
2440      }
2441  
2442      friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443        return printMode(RMode, OS);
2444      }
2445  
2446      LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447        const unsigned Indent = 2;
2448        unsigned Cnt = 0;
2449        for (const OperandDataVec &OpDataVec : OpsVec) {
2450          OS << "Operand " << Cnt++ << "\n";
2451          for (const OperandData &OpData : OpDataVec) {
2452            OS.indent(Indent) << "{";
2453            if (Value *V = OpData.V)
2454              OS << *V;
2455            else
2456              OS << "null";
2457            OS << ", APO:" << OpData.APO << "}\n";
2458          }
2459          OS << "\n";
2460        }
2461        return OS;
2462      }
2463  
2464      /// Debug print.
2465      LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466  #endif
2467    };
2468  
2469    /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2470    /// of the pair with the highest score, i.e. the one deemed to have the best
2471    /// chance to form the root of a profitable tree to vectorize. Return
2472    /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2473    /// \param Limit Lower bound on the score that is considered good enough.
2474    std::optional<int>
2475    findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476                     int Limit = LookAheadHeuristics::ScoreFail) const {
2477      LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478                                    RootLookAheadMaxDepth);
2479      int BestScore = Limit;
2480      std::optional<int> Index;
2481      for (int I : seq<int>(0, Candidates.size())) {
2482        int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2483                                                 Candidates[I].second,
2484                                                 /*U1=*/nullptr, /*U2=*/nullptr,
2485                                                 /*Level=*/1, std::nullopt);
2486        if (Score > BestScore) {
2487          BestScore = Score;
2488          Index = I;
2489        }
2490      }
2491      return Index;
2492    }
2493  
2494    /// Checks if the instruction is marked for deletion.
2495    bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2496  
2497    /// Removes an instruction from its block and eventually deletes it.
2498    /// It's like Instruction::eraseFromParent() except that the actual deletion
2499    /// is delayed until BoUpSLP is destructed.
2500    void eraseInstruction(Instruction *I) {
2501      DeletedInstructions.insert(I);
2502    }
2503  
2504    /// Remove the \p DeadVals instructions from the parent function, clear their
2505    /// operands, and mark any trivially dead operands for deletion.
2506    template <typename T>
2507    void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2508      SmallVector<WeakTrackingVH> DeadInsts;
2509      for (T *V : DeadVals) {
2510        auto *I = cast<Instruction>(V);
2511        DeletedInstructions.insert(I);
2512      }
2513      DenseSet<Value *> Processed;
2514      for (T *V : DeadVals) {
2515        if (!V || !Processed.insert(V).second)
2516          continue;
2517        auto *I = cast<Instruction>(V);
2518        salvageDebugInfo(*I);
2519        SmallVector<const TreeEntry *> Entries;
2520        if (const TreeEntry *Entry = getTreeEntry(I)) {
2521          Entries.push_back(Entry);
2522          auto It = MultiNodeScalars.find(I);
2523          if (It != MultiNodeScalars.end())
2524            Entries.append(It->second.begin(), It->second.end());
2525        }
2526        for (Use &U : I->operands()) {
2527          if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2528              OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2529              wouldInstructionBeTriviallyDead(OpI, TLI) &&
2530              (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2531                 return Entry->VectorizedValue == OpI;
2532               })))
2533            DeadInsts.push_back(OpI);
2534        }
2535        I->dropAllReferences();
2536      }
2537      for (T *V : DeadVals) {
2538        auto *I = cast<Instruction>(V);
2539        if (!I->getParent())
2540          continue;
2541        assert((I->use_empty() || all_of(I->uses(),
2542                                         [&](Use &U) {
2543                                           return isDeleted(
2544                                               cast<Instruction>(U.getUser()));
2545                                         })) &&
2546               "trying to erase instruction with users.");
2547        I->removeFromParent();
2548        SE->forgetValue(I);
2549      }
2550      // Process the dead instruction list until empty.
2551      while (!DeadInsts.empty()) {
2552        Value *V = DeadInsts.pop_back_val();
2553        Instruction *VI = cast_or_null<Instruction>(V);
2554        if (!VI || !VI->getParent())
2555          continue;
2556        assert(isInstructionTriviallyDead(VI, TLI) &&
2557               "Live instruction found in dead worklist!");
2558        assert(VI->use_empty() && "Instructions with uses are not dead.");
2559  
2560        // Don't lose the debug info while deleting the instructions.
2561        salvageDebugInfo(*VI);
2562  
2563        // Null out all of the instruction's operands to see if any operand
2564        // becomes dead as we go.
2565        for (Use &OpU : VI->operands()) {
2566          Value *OpV = OpU.get();
2567          if (!OpV)
2568            continue;
2569          OpU.set(nullptr);
2570  
2571          if (!OpV->use_empty())
2572            continue;
2573  
2574          // If the operand is an instruction that became dead as we nulled out
2575          // the operand, and if it is 'trivially' dead, delete it in a future
2576          // loop iteration.
2577          if (auto *OpI = dyn_cast<Instruction>(OpV))
2578            if (!DeletedInstructions.contains(OpI) &&
2579                isInstructionTriviallyDead(OpI, TLI))
2580              DeadInsts.push_back(OpI);
2581        }
2582  
2583        VI->removeFromParent();
2584        DeletedInstructions.insert(VI);
2585        SE->forgetValue(VI);
2586      }
2587    }
2588  
2589    /// Checks if the instruction was already analyzed for being a possible
2590    /// reduction root.
2591    bool isAnalyzedReductionRoot(Instruction *I) const {
2592      return AnalyzedReductionsRoots.count(I);
2593    }
2594    /// Registers the given instruction as already analyzed for being a possible
2595    /// reduction root.
2596    void analyzedReductionRoot(Instruction *I) {
2597      AnalyzedReductionsRoots.insert(I);
2598    }
2599    /// Checks if the provided list of reduced values was already checked for
2600    /// vectorization.
2601    bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602      return AnalyzedReductionVals.contains(hash_value(VL));
2603    }
2604    /// Adds the list of reduced values to the list of values already checked for
2605    /// vectorization.
2606    void analyzedReductionVals(ArrayRef<Value *> VL) {
2607      AnalyzedReductionVals.insert(hash_value(VL));
2608    }
2609    /// Clear the list of the analyzed reduction root instructions.
2610    void clearReductionData() {
2611      AnalyzedReductionsRoots.clear();
2612      AnalyzedReductionVals.clear();
2613      AnalyzedMinBWVals.clear();
2614    }
2615    /// Checks if the given value is gathered in one of the nodes.
2616    bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617      return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2618    }
2619    /// Checks if the given value is gathered in one of the nodes.
2620    bool isGathered(const Value *V) const {
2621      return MustGather.contains(V);
2622    }
2623    /// Checks if the specified value was not scheduled.
2624    bool isNotScheduled(const Value *V) const {
2625      return NonScheduledFirst.contains(V);
2626    }
2627  
2628    /// Check if the value is vectorized in the tree.
2629    bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630  
2631    ~BoUpSLP();
2632  
2633  private:
2634    /// Determine if a node \p E can be demoted to a smaller type with a
2635    /// truncation. We collect the entries that will be demoted in ToDemote.
2636    /// \param E Node for analysis
2637    /// \param ToDemote indices of the nodes to be demoted.
2638    bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639                               unsigned &BitWidth,
2640                               SmallVectorImpl<unsigned> &ToDemote,
2641                               DenseSet<const TreeEntry *> &Visited,
2642                               unsigned &MaxDepthLevel,
2643                               bool &IsProfitableToDemote,
2644                               bool IsTruncRoot) const;
2645  
2646    /// Check if the operands on the edges \p Edges of the \p UserTE allow
2647    /// reordering (i.e. the operands can be reordered because they have only one
2648    /// user and are reorderable).
2649    /// \param ReorderableGathers List of all gather nodes that require reordering
2650    /// (e.g., gathers of extractelements or partially vectorizable loads).
2651    /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652    /// reordering, a subset of \p NonVectorized.
2653    bool
2654    canReorderOperands(TreeEntry *UserTE,
2655                       SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656                       ArrayRef<TreeEntry *> ReorderableGathers,
2657                       SmallVectorImpl<TreeEntry *> &GatherOps);
2658  
2659    /// Checks if the given \p TE is a gather node with clustered reused scalars
2660    /// and reorders it per given \p Mask.
2661    void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662  
2663    /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664    /// if any. If it is not vectorized (gather node), returns nullptr.
2665    TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666      ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667      TreeEntry *TE = nullptr;
2668      const auto *It = find_if(VL, [&](Value *V) {
2669        TE = getTreeEntry(V);
2670        if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2671          return true;
2672        auto It = MultiNodeScalars.find(V);
2673        if (It != MultiNodeScalars.end()) {
2674          for (TreeEntry *E : It->second) {
2675            if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2676              TE = E;
2677              return true;
2678            }
2679          }
2680        }
2681        return false;
2682      });
2683      if (It != VL.end()) {
2684        assert(TE->isSame(VL) && "Expected same scalars.");
2685        return TE;
2686      }
2687      return nullptr;
2688    }
2689  
2690    /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691    /// if any. If it is not vectorized (gather node), returns nullptr.
2692    const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693                                          unsigned OpIdx) const {
2694      return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695          const_cast<TreeEntry *>(UserTE), OpIdx);
2696    }
2697  
2698    /// Checks if all users of \p I are the part of the vectorization tree.
2699    bool areAllUsersVectorized(
2700        Instruction *I,
2701        const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702  
2703    /// Return information about the vector formed for the specified index
2704    /// of a vector of (the same) instruction.
2705    TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706  
2707    /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708    const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709  
2710    /// \returns Cast context for the given graph node.
2711    TargetTransformInfo::CastContextHint
2712    getCastContextHint(const TreeEntry &TE) const;
2713  
2714    /// \returns the cost of the vectorizable entry.
2715    InstructionCost getEntryCost(const TreeEntry *E,
2716                                 ArrayRef<Value *> VectorizedVals,
2717                                 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718  
2719    /// This is the recursive part of buildTree.
2720    void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721                       const EdgeInfo &EI);
2722  
2723    /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724    /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725    /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726    /// returns false, setting \p CurrentOrder to either an empty vector or a
2727    /// non-identity permutation that allows to reuse extract instructions.
2728    /// non-identity permutation that allows the extract instructions to be reused.
2729    /// extract order.
2730    bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731                         SmallVectorImpl<unsigned> &CurrentOrder,
2732                         bool ResizeAllowed = false) const;
2733  
2734    /// Vectorize a single entry in the tree.
2735    /// \param PostponedPHIs true if the emission of phi nodes needs to be
2736    /// postponed to avoid issues with def-use order.
2737    Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738  
2739    /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740    /// \p E.
2741    /// \param PostponedPHIs true if the emission of phi nodes needs to be
2742    /// postponed to avoid issues with def-use order.
2743    Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744  
2745    /// Create a new vector from a list of scalar values.  Produces a sequence
2746    /// which exploits values reused across lanes, and arranges the inserts
2747    /// for ease of later optimization.
2748    template <typename BVTy, typename ResTy, typename... Args>
2749    ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750  
2751    /// Create a new vector from a list of scalar values.  Produces a sequence
2752    /// which exploits values reused across lanes, and arranges the inserts
2753    /// for ease of later optimization.
2754    Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755  
2756    /// Returns the instruction in the bundle, which can be used as a base point
2757    /// for scheduling. Usually it is the last instruction in the bundle, except
2758    /// for the case when all operands are external (in this case, it is the first
2759    /// instruction in the list).
2760    Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761  
2762    /// Tries to find extractelement instructions with constant indices from a
2763    /// fixed vector type and gathers such instructions into a bunch, which is
2764    /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
2765    /// attempt was successful, the matched scalars are replaced by poison values
2766    /// in \p VL for future analysis.
2767    std::optional<TargetTransformInfo::ShuffleKind>
2768    tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769                                             SmallVectorImpl<int> &Mask) const;
2770  
2771    /// Tries to find extractelement instructions with constant indices from a
2772    /// fixed vector type and gathers such instructions into a bunch, which is
2773    /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
2774    /// attempt was successful, the matched scalars are replaced by poison values
2775    /// in \p VL for future analysis.
2776    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777    tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778                               SmallVectorImpl<int> &Mask,
2779                               unsigned NumParts) const;
2780  
2781    /// Checks if the gathered \p VL can be represented as a single register
2782    /// shuffle(s) of previous tree entries.
2783    /// \param TE Tree entry checked for permutation.
2784    /// \param VL List of scalars (a subset of the TE scalars), checked for
2785    /// permutations. Must form a single-register vector.
2786    /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787    /// forces the mask to be built using the original vector values, without
2788    /// relying on the potential reordering.
2789    /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790    /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791    std::optional<TargetTransformInfo::ShuffleKind>
2792    isGatherShuffledSingleRegisterEntry(
2793        const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794        SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795        bool ForOrder);
2796  
2797    /// Checks if the gathered \p VL can be represented as multi-register
2798    /// shuffle(s) of previous tree entries.
2799    /// \param TE Tree entry checked for permutation.
2800    /// \param VL List of scalars (a subset of the TE scalars), checked for
2801    /// permutations.
2802    /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803    /// forces the mask to be built using the original vector values, without
2804    /// relying on the potential reordering.
2805    /// \returns per-register series of ShuffleKind, if gathered values can be
2806    /// represented as shuffles of previous tree entries. \p Mask is filled with
2807    /// the shuffle mask (also on per-register base).
2808    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809    isGatherShuffledEntry(
2810        const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811        SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812        unsigned NumParts, bool ForOrder = false);
2813  
2814    /// \returns the scalarization cost for this list of values. Assuming that
2815    /// this subtree gets vectorized, we may need to extract the values from the
2816    /// roots. This method calculates the cost of extracting the values.
2817    /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818    InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819                                  Type *ScalarTy) const;
2820  
2821    /// Set the Builder insert point to one after the last instruction in
2822    /// the bundle
2823    void setInsertPointAfterBundle(const TreeEntry *E);
2824  
2825    /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2826    /// specified, the starting vector value is poison.
2827    Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828  
2829    /// \returns whether the VectorizableTree is fully vectorizable and will
2830    /// be beneficial even if the tree height is tiny.
2831    bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832  
2833    /// Reorder commutative or alt operands to get better probability of
2834    /// generating vectorized code.
2835    static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836                                               SmallVectorImpl<Value *> &Left,
2837                                               SmallVectorImpl<Value *> &Right,
2838                                               const BoUpSLP &R);
2839  
2840    /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841    /// users of \p TE and collects the stores. It returns the map from the store
2842    /// pointers to the collected stores.
2843    DenseMap<Value *, SmallVector<StoreInst *>>
2844    collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845  
2846    /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847    /// stores in \p StoresVec can form a vector instruction. If so it returns
2848    /// true and populates \p ReorderIndices with the shuffle indices of the
2849    /// stores when compared to the sorted vector.
2850    bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851                       OrdersType &ReorderIndices) const;
2852  
2853    /// Iterates through the users of \p TE, looking for scalar stores that can be
2854    /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855    /// their order and builds an order index vector for each store bundle. It
2856    /// returns all these order vectors found.
2857    /// We run this after the tree has formed, otherwise we may come across user
2858    /// instructions that are not yet in the tree.
2859    SmallVector<OrdersType, 1>
2860    findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861  
2862    struct TreeEntry {
2863      using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2864      TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865  
2866      /// \returns Common mask for reorder indices and reused scalars.
2867      SmallVector<int> getCommonMask() const {
2868        SmallVector<int> Mask;
2869        inversePermutation(ReorderIndices, Mask);
2870        ::addMask(Mask, ReuseShuffleIndices);
2871        return Mask;
2872      }
2873  
2874      /// \returns true if the scalars in VL are equal to this entry.
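          /// For example (illustrative): with Scalars == {%a, %b}, empty
          /// ReorderIndices and ReuseShuffleIndices == {0, 1, 1, 0}, the list
          /// VL == {%a, %b, %b, %a} is considered the same as this entry.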
2875      bool isSame(ArrayRef<Value *> VL) const {
2876        auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2877          if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2878            return std::equal(VL.begin(), VL.end(), Scalars.begin());
2879          return VL.size() == Mask.size() &&
2880                 std::equal(VL.begin(), VL.end(), Mask.begin(),
2881                            [Scalars](Value *V, int Idx) {
2882                              return (isa<UndefValue>(V) &&
2883                                      Idx == PoisonMaskElem) ||
2884                                     (Idx != PoisonMaskElem && V == Scalars[Idx]);
2885                            });
2886        };
2887        if (!ReorderIndices.empty()) {
2888          // TODO: implement matching if the nodes are just reordered; we can still
2889          // treat the vector as the same if the list of scalars matches VL
2890          // directly, without reordering.
2891          SmallVector<int> Mask;
2892          inversePermutation(ReorderIndices, Mask);
2893          if (VL.size() == Scalars.size())
2894            return IsSame(Scalars, Mask);
2895          if (VL.size() == ReuseShuffleIndices.size()) {
2896            ::addMask(Mask, ReuseShuffleIndices);
2897            return IsSame(Scalars, Mask);
2898          }
2899          return false;
2900        }
2901        return IsSame(Scalars, ReuseShuffleIndices);
2902      }
2903  
2904      bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2905        return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2906               UserTreeIndices.front().UserTE == UserEI.UserTE;
2907      }
2908  
2909      /// \returns true if current entry has same operands as \p TE.
2910      bool hasEqualOperands(const TreeEntry &TE) const {
2911        if (TE.getNumOperands() != getNumOperands())
2912          return false;
2913        SmallBitVector Used(getNumOperands());
2914        for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915          unsigned PrevCount = Used.count();
2916          for (unsigned K = 0; K < E; ++K) {
2917            if (Used.test(K))
2918              continue;
2919            if (getOperand(K) == TE.getOperand(I)) {
2920              Used.set(K);
2921              break;
2922            }
2923          }
2924          // Check if we actually found the matching operand.
2925          if (PrevCount == Used.count())
2926            return false;
2927        }
2928        return true;
2929      }
2930  
2931      /// \return Final vectorization factor for the node. Defined by the total
2932      /// number of vectorized scalars, including those used several times in the
2933      /// entry and counted in \a ReuseShuffleIndices, if any.
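          /// For example (illustrative): a node with 4 scalars and a
          /// ReuseShuffleIndices mask of size 8 has a vector factor of 8.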
2934      unsigned getVectorFactor() const {
2935        if (!ReuseShuffleIndices.empty())
2936          return ReuseShuffleIndices.size();
2937        return Scalars.size();
2938      }
2939  
2940      /// Checks if the current node is a gather node.
2941      bool isGather() const { return State == NeedToGather; }
2942  
2943      /// A vector of scalars.
2944      ValueList Scalars;
2945  
2946      /// The Scalars are vectorized into this value. It is initialized to Null.
2947      WeakTrackingVH VectorizedValue = nullptr;
2948  
2949      /// New vector phi instructions emitted for the vectorized phi nodes.
2950      PHINode *PHI = nullptr;
2951  
2952      /// Do we need to gather this sequence or vectorize it
2953      /// (either with vector instruction or with scatter/gather
2954      /// intrinsics for store/load)?
2955      enum EntryState {
2956        Vectorize,
2957        ScatterVectorize,
2958        StridedVectorize,
2959        NeedToGather
2960      };
2961      EntryState State;
2962  
2963      /// Does this sequence require some shuffling?
2964      SmallVector<int, 4> ReuseShuffleIndices;
2965  
2966      /// Does this entry require reordering?
2967      SmallVector<unsigned, 4> ReorderIndices;
2968  
2969      /// Points back to the VectorizableTree.
2970      ///
2971      /// Only used for Graphviz right now.  Unfortunately GraphTraits::NodeRef has
2972      /// to be a pointer and needs to be able to initialize the child iterator.
2973      /// Thus we need a reference back to the container to translate the indices
2974      /// to entries.
2975      VecTreeTy &Container;
2976  
2977      /// The TreeEntry indices containing the users of this entry.  We can actually
2978      /// have multiple users, so the data structure is not truly a tree.
2979      SmallVector<EdgeInfo, 1> UserTreeIndices;
2980  
2981      /// The index of this treeEntry in VectorizableTree.
2982      int Idx = -1;
2983  
2984    private:
2985      /// The operands of each instruction in each lane Operands[op_index][lane].
2986      /// Note: This helps avoid the replication of the code that performs the
2987      /// reordering of operands during buildTree_rec() and vectorizeTree().
2988      SmallVector<ValueList, 2> Operands;
2989  
2990      /// The main/alternate instruction.
2991      Instruction *MainOp = nullptr;
2992      Instruction *AltOp = nullptr;
2993  
2994    public:
2995      /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2996      void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997        if (Operands.size() < OpIdx + 1)
2998          Operands.resize(OpIdx + 1);
2999        assert(Operands[OpIdx].empty() && "Already resized?");
3000        assert(OpVL.size() <= Scalars.size() &&
3001               "Number of operands is greater than the number of scalars.");
3002        Operands[OpIdx].resize(OpVL.size());
3003        copy(OpVL, Operands[OpIdx].begin());
3004      }
3005  
3006      /// Set the operands of this bundle in their original order.
3007      void setOperandsInOrder() {
3008        assert(Operands.empty() && "Already initialized?");
3009        auto *I0 = cast<Instruction>(Scalars[0]);
3010        Operands.resize(I0->getNumOperands());
3011        unsigned NumLanes = Scalars.size();
3012        for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013             OpIdx != NumOperands; ++OpIdx) {
3014          Operands[OpIdx].resize(NumLanes);
3015          for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016            auto *I = cast<Instruction>(Scalars[Lane]);
3017            assert(I->getNumOperands() == NumOperands &&
3018                   "Expected same number of operands");
3019            Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020          }
3021        }
3022      }
3023  
3024      /// Reorders operands of the node to the given mask \p Mask.
3025      void reorderOperands(ArrayRef<int> Mask) {
3026        for (ValueList &Operand : Operands)
3027          reorderScalars(Operand, Mask);
3028      }
3029  
3030      /// \returns the \p OpIdx operand of this TreeEntry.
3031      ValueList &getOperand(unsigned OpIdx) {
3032        assert(OpIdx < Operands.size() && "Off bounds");
3033        return Operands[OpIdx];
3034      }
3035  
3036      /// \returns the \p OpIdx operand of this TreeEntry.
3037      ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3038        assert(OpIdx < Operands.size() && "Off bounds");
3039        return Operands[OpIdx];
3040      }
3041  
3042      /// \returns the number of operands.
3043      unsigned getNumOperands() const { return Operands.size(); }
3044  
3045      /// \return the single \p OpIdx operand.
3046      Value *getSingleOperand(unsigned OpIdx) const {
3047        assert(OpIdx < Operands.size() && "Off bounds");
3048        assert(!Operands[OpIdx].empty() && "No operand available");
3049        return Operands[OpIdx][0];
3050      }
3051  
3052      /// Some of the instructions in the list have alternate opcodes.
3053      bool isAltShuffle() const { return MainOp != AltOp; }
3054  
3055      bool isOpcodeOrAlt(Instruction *I) const {
3056        unsigned CheckedOpcode = I->getOpcode();
3057        return (getOpcode() == CheckedOpcode ||
3058                getAltOpcode() == CheckedOpcode);
3059      }
3060  
3061      /// Chooses the correct key for scheduling data. If \p Op has the same (or
3062      /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
3063      /// key is the main operation.
3064      Value *isOneOf(Value *Op) const {
3065        auto *I = dyn_cast<Instruction>(Op);
3066        if (I && isOpcodeOrAlt(I))
3067          return Op;
3068        return MainOp;
3069      }
3070  
3071      void setOperations(const InstructionsState &S) {
3072        MainOp = S.MainOp;
3073        AltOp = S.AltOp;
3074      }
3075  
3076      Instruction *getMainOp() const {
3077        return MainOp;
3078      }
3079  
3080      Instruction *getAltOp() const {
3081        return AltOp;
3082      }
3083  
3084      /// The main/alternate opcodes for the list of instructions.
3085      unsigned getOpcode() const {
3086        return MainOp ? MainOp->getOpcode() : 0;
3087      }
3088  
3089      unsigned getAltOpcode() const {
3090        return AltOp ? AltOp->getOpcode() : 0;
3091      }
3092  
3093      /// When ReorderIndices and ReuseShuffleIndices are empty, it just returns the
3094      /// position of \p V within Scalars. Otherwise, it remaps through those indices.
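          /// For example (illustrative): with Scalars == {%a, %b}, empty
          /// ReorderIndices and ReuseShuffleIndices == {1, 0, 1, 0}, the value %b is
          /// at position 1 in Scalars and the first occurrence of 1 in the reuse mask
          /// is at index 0, so findLaneForValue(%b) returns 0.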
3095      int findLaneForValue(Value *V) const {
3096        unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098        if (!ReorderIndices.empty())
3099          FoundLane = ReorderIndices[FoundLane];
3100        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101        if (!ReuseShuffleIndices.empty()) {
3102          FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103                                    find(ReuseShuffleIndices, FoundLane));
3104        }
3105        return FoundLane;
3106      }
3107  
3108      /// Build a shuffle mask for a graph entry which represents a merge of main
3109      /// and alternate operations.
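          /// For example (illustrative): for a 4-lane node alternating add and sub,
          /// the resulting mask is typically {0, 5, 2, 7}, taking the add lanes from
          /// the main-opcode vector and the sub lanes from the alternate-opcode
          /// vector (indices >= VF refer to the second vector).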
3110      void
3111      buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3112                            SmallVectorImpl<int> &Mask,
3113                            SmallVectorImpl<Value *> *OpScalars = nullptr,
3114                            SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115  
3116      /// Return true if this is a non-power-of-2 node.
3117      bool isNonPowOf2Vec() const {
3118        bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119        assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120               "Reshuffling not supported with non-power-of-2 vectors yet.");
3121        return IsNonPowerOf2;
3122      }
3123  
3124  #ifndef NDEBUG
3125      /// Debug printer.
3126      LLVM_DUMP_METHOD void dump() const {
3127        dbgs() << Idx << ".\n";
3128        for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3129          dbgs() << "Operand " << OpI << ":\n";
3130          for (const Value *V : Operands[OpI])
3131            dbgs().indent(2) << *V << "\n";
3132        }
3133        dbgs() << "Scalars: \n";
3134        for (Value *V : Scalars)
3135          dbgs().indent(2) << *V << "\n";
3136        dbgs() << "State: ";
3137        switch (State) {
3138        case Vectorize:
3139          dbgs() << "Vectorize\n";
3140          break;
3141        case ScatterVectorize:
3142          dbgs() << "ScatterVectorize\n";
3143          break;
3144        case StridedVectorize:
3145          dbgs() << "StridedVectorize\n";
3146          break;
3147        case NeedToGather:
3148          dbgs() << "NeedToGather\n";
3149          break;
3150        }
3151        dbgs() << "MainOp: ";
3152        if (MainOp)
3153          dbgs() << *MainOp << "\n";
3154        else
3155          dbgs() << "NULL\n";
3156        dbgs() << "AltOp: ";
3157        if (AltOp)
3158          dbgs() << *AltOp << "\n";
3159        else
3160          dbgs() << "NULL\n";
3161        dbgs() << "VectorizedValue: ";
3162        if (VectorizedValue)
3163          dbgs() << *VectorizedValue << "\n";
3164        else
3165          dbgs() << "NULL\n";
3166        dbgs() << "ReuseShuffleIndices: ";
3167        if (ReuseShuffleIndices.empty())
3168          dbgs() << "Empty";
3169        else
3170          for (int ReuseIdx : ReuseShuffleIndices)
3171            dbgs() << ReuseIdx << ", ";
3172        dbgs() << "\n";
3173        dbgs() << "ReorderIndices: ";
3174        for (unsigned ReorderIdx : ReorderIndices)
3175          dbgs() << ReorderIdx << ", ";
3176        dbgs() << "\n";
3177        dbgs() << "UserTreeIndices: ";
3178        for (const auto &EInfo : UserTreeIndices)
3179          dbgs() << EInfo << ", ";
3180        dbgs() << "\n";
3181      }
3182  #endif
3183    };
3184  
3185  #ifndef NDEBUG
3186    void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187                       InstructionCost VecCost, InstructionCost ScalarCost,
3188                       StringRef Banner) const {
3189      dbgs() << "SLP: " << Banner << ":\n";
3190      E->dump();
3191      dbgs() << "SLP: Costs:\n";
3192      dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193      dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
3194      dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
3195      dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
3196             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197    }
3198  #endif
3199  
3200    /// Create a new VectorizableTree entry.
3201    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202                            std::optional<ScheduleData *> Bundle,
3203                            const InstructionsState &S,
3204                            const EdgeInfo &UserTreeIdx,
3205                            ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206                            ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207      TreeEntry::EntryState EntryState =
3208          Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209      return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210                          ReuseShuffleIndices, ReorderIndices);
3211    }
3212  
3213    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3214                            TreeEntry::EntryState EntryState,
3215                            std::optional<ScheduleData *> Bundle,
3216                            const InstructionsState &S,
3217                            const EdgeInfo &UserTreeIdx,
3218                            ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3219                            ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3220      assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3221              (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3222             "Need to vectorize gather entry?");
3223      VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3224      TreeEntry *Last = VectorizableTree.back().get();
3225      Last->Idx = VectorizableTree.size() - 1;
3226      Last->State = EntryState;
3227      Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3228                                       ReuseShuffleIndices.end());
3229      if (ReorderIndices.empty()) {
3230        Last->Scalars.assign(VL.begin(), VL.end());
3231        Last->setOperations(S);
3232      } else {
3233        // Reorder scalars and build final mask.
3234        Last->Scalars.assign(VL.size(), nullptr);
3235        transform(ReorderIndices, Last->Scalars.begin(),
3236                  [VL](unsigned Idx) -> Value * {
3237                    if (Idx >= VL.size())
3238                      return UndefValue::get(VL.front()->getType());
3239                    return VL[Idx];
3240                  });
3241        InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3242        Last->setOperations(S);
3243        Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3244      }
3245      if (!Last->isGather()) {
3246        for (Value *V : VL) {
3247          const TreeEntry *TE = getTreeEntry(V);
3248          assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3249                 "Scalar already in tree!");
3250          if (TE) {
3251            if (TE != Last)
3252              MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3253            continue;
3254          }
3255          ScalarToTreeEntry[V] = Last;
3256        }
3257        // Update the scheduler bundle to point to this TreeEntry.
3258        ScheduleData *BundleMember = *Bundle;
3259        assert((BundleMember || isa<PHINode>(S.MainOp) ||
3260                isVectorLikeInstWithConstOps(S.MainOp) ||
3261                doesNotNeedToSchedule(VL)) &&
3262               "Bundle and VL out of sync");
3263        if (BundleMember) {
3264          for (Value *V : VL) {
3265            if (doesNotNeedToBeScheduled(V))
3266              continue;
3267            if (!BundleMember)
3268              continue;
3269            BundleMember->TE = Last;
3270            BundleMember = BundleMember->NextInBundle;
3271          }
3272        }
3273        assert(!BundleMember && "Bundle and VL out of sync");
3274      } else {
3275        // Build a map for gathered scalars to the nodes where they are used.
3276        bool AllConstsOrCasts = true;
3277        for (Value *V : VL)
3278          if (!isConstant(V)) {
3279            auto *I = dyn_cast<CastInst>(V);
3280            AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3281            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3282          }
3283        if (AllConstsOrCasts)
3284          CastMaxMinBWSizes =
3285              std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3286        MustGather.insert(VL.begin(), VL.end());
3287      }
3288  
3289      if (UserTreeIdx.UserTE) {
3290        Last->UserTreeIndices.push_back(UserTreeIdx);
3291        assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3292               "Reordering isn't implemented for non-power-of-2 nodes yet");
3293      }
3294      return Last;
3295    }
3296  
3297    /// -- Vectorization State --
3298    /// Holds all of the tree entries.
3299    TreeEntry::VecTreeTy VectorizableTree;
3300  
3301  #ifndef NDEBUG
3302    /// Debug printer.
3303    LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304      for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305        VectorizableTree[Id]->dump();
3306        dbgs() << "\n";
3307      }
3308    }
3309  #endif
3310  
3311    TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3312  
3313    const TreeEntry *getTreeEntry(Value *V) const {
3314      return ScalarToTreeEntry.lookup(V);
3315    }
3316  
3317    /// Check that the operand node of the alternate node does not generate a
3318    /// buildvector sequence. If it does, then it is probably not worth building
3319    /// an alternate shuffle, if the number of buildvector operands plus the
3320    /// alternate instruction exceeds the number of buildvector instructions.
3321    /// \param S the instructions state of the analyzed values.
3322    /// \param VL list of the instructions with alternate opcodes.
3323    bool areAltOperandsProfitable(const InstructionsState &S,
3324                                  ArrayRef<Value *> VL) const;
3325  
3326    /// Checks if the specified list of the instructions/values can be vectorized
3327    /// and fills required data before actual scheduling of the instructions.
3328    TreeEntry::EntryState getScalarsVectorizationState(
3329        InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3330        OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3331  
3332    /// Maps a specific scalar to its tree entry.
3333    SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3334  
3335    /// Maps scalars that are used in several vectorized nodes to the list of
3336    /// those nodes.
3337    SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3338  
3339    /// Maps a value to the proposed vectorizable size.
3340    SmallDenseMap<Value *, unsigned> InstrElementSize;
3341  
3342    /// A list of scalars that we found that we need to keep as scalars.
3343    ValueSet MustGather;
3344  
3345    /// A set of first non-schedulable values.
3346    ValueSet NonScheduledFirst;
3347  
3348    /// A map between the vectorized entries and the last instructions in the
3349    /// bundles. The bundles are built in use order, not in the def order of the
3350    /// instructions, so we cannot rely directly on the last instruction in the
3351    /// bundle being the last instruction in program order during the
3352    /// vectorization process (the basic blocks are affected), and we need to
3353    /// pre-gather them beforehand.
3354    DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3355  
3356    /// List of gather nodes that depend on other gather/vector nodes and should
3357    /// be emitted after the vector instruction emission process to correctly
3358    /// handle the order of the vector instructions and shuffles.
3359    SetVector<const TreeEntry *> PostponedGathers;
3360  
3361    using ValueToGatherNodesMap =
3362        DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363    ValueToGatherNodesMap ValueToGatherNodes;
3364  
3365    /// This POD struct describes one external user in the vectorized tree.
3366    struct ExternalUser {
3367      ExternalUser(Value *S, llvm::User *U, int L)
3368          : Scalar(S), User(U), Lane(L) {}
3369  
3370      // Which scalar in our function.
3371      Value *Scalar;
3372  
3373      // Which user that uses the scalar.
3374      llvm::User *User;
3375  
3376      // Which lane does the scalar belong to.
3377      int Lane;
3378    };
3379    using UserList = SmallVector<ExternalUser, 16>;
3380  
3381    /// Checks if two instructions may access the same memory.
3382    ///
3383    /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384    /// is invariant in the calling loop.
3385    bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386                   Instruction *Inst2) {
3387      if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388        return true;
3389      // First check if the result is already in the cache.
3390      AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391      auto It = AliasCache.find(Key);
3392      if (It != AliasCache.end())
3393        return It->second;
3394      bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395      // Store the result in the cache.
3396      AliasCache.try_emplace(Key, Aliased);
3397      AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398      return Aliased;
3399    }
3400  
3401    using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402  
3403    /// Cache for alias results.
3404    /// TODO: consider moving this to the AliasAnalysis itself.
3405    DenseMap<AliasCacheKey, bool> AliasCache;
3406  
3407    // Cache for pointerMayBeCaptured calls inside AA.  This is preserved
3408    // globally through SLP because we don't perform any action which
3409    // invalidates capture results.
3410    BatchAAResults BatchAA;
3411  
3412    /// Temporary store for deleted instructions. Instructions will be deleted
3413    /// eventually when the BoUpSLP is destructed.  The deferral is required to
3414    /// ensure that there are no incorrect collisions in the AliasCache, which
3415    /// can happen if a new instruction is allocated at the same address as a
3416    /// previously deleted instruction.
3417    DenseSet<Instruction *> DeletedInstructions;
3418  
3419    /// Set of the instructions that have already been analyzed for reductions.
3420    SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421  
3422    /// Set of hashes for the list of reduction values already being analyzed.
3423    DenseSet<size_t> AnalyzedReductionVals;
3424  
3425    /// Values that have already been analyzed for minimal bitwidth and found to
3426    /// be non-profitable.
3427    DenseSet<Value *> AnalyzedMinBWVals;
3428  
3429    /// A list of values that need to be extracted out of the tree.
3430    /// This list holds pairs of (Internal Scalar : External User). External User
3431    /// can be nullptr, it means that this Internal Scalar will be used later,
3432    /// after vectorization.
3433    UserList ExternalUses;
3434  
3435    /// A list of GEPs which can be replaced by scalar GEPs instead of
3436    /// extractelement instructions.
3437    SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438  
3439    /// Values used only by @llvm.assume calls.
3440    SmallPtrSet<const Value *, 32> EphValues;
3441  
3442    /// Holds all of the instructions that we gathered, shuffle instructions and
3443    /// extractelements.
3444    SetVector<Instruction *> GatherShuffleExtractSeq;
3445  
3446    /// A list of blocks that we are going to CSE.
3447    DenseSet<BasicBlock *> CSEBlocks;
3448  
3449    /// Contains all scheduling relevant data for an instruction.
3450    /// A ScheduleData either represents a single instruction or a member of an
3451    /// instruction bundle (= a group of instructions which is combined into a
3452    /// vector instruction).
3453    struct ScheduleData {
3454      // The initial value for the dependency counters. It means that the
3455      // dependencies are not calculated yet.
3456      enum { InvalidDeps = -1 };
3457  
3458      ScheduleData() = default;
3459  
3460      void init(int BlockSchedulingRegionID, Value *OpVal) {
3461        FirstInBundle = this;
3462        NextInBundle = nullptr;
3463        NextLoadStore = nullptr;
3464        IsScheduled = false;
3465        SchedulingRegionID = BlockSchedulingRegionID;
3466        clearDependencies();
3467        OpValue = OpVal;
3468        TE = nullptr;
3469      }
3470  
3471      /// Verify basic self consistency properties
3472      void verify() {
3473        if (hasValidDependencies()) {
3474          assert(UnscheduledDeps <= Dependencies && "invariant");
3475        } else {
3476          assert(UnscheduledDeps == Dependencies && "invariant");
3477        }
3478  
3479        if (IsScheduled) {
3480          assert(isSchedulingEntity() &&
3481                  "unexpected scheduled state");
3482          for (const ScheduleData *BundleMember = this; BundleMember;
3483               BundleMember = BundleMember->NextInBundle) {
3484            assert(BundleMember->hasValidDependencies() &&
3485                   BundleMember->UnscheduledDeps == 0 &&
3486                   "unexpected scheduled state");
3487            assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488                   "only bundle is marked scheduled");
3489          }
3490        }
3491  
3492        assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493               "all bundle members must be in same basic block");
3494      }
3495  
3496      /// Returns true if the dependency information has been calculated.
3497      /// Note that dependency validity can vary between instructions within
3498      /// a single bundle.
3499      bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500  
3501      /// Returns true for single instructions and for bundle representatives
3502      /// (= the head of a bundle).
3503      bool isSchedulingEntity() const { return FirstInBundle == this; }
3504  
3505      /// Returns true if it represents an instruction bundle and not only a
3506      /// single instruction.
3507      bool isPartOfBundle() const {
3508        return NextInBundle != nullptr || FirstInBundle != this || TE;
3509      }
3510  
3511      /// Returns true if it is ready for scheduling, i.e. it has no more
3512      /// unscheduled instructions/bundles that it depends on.
3513      bool isReady() const {
3514        assert(isSchedulingEntity() &&
3515               "can't consider non-scheduling entity for ready list");
3516        return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517      }
3518  
3519      /// Modifies the number of unscheduled dependencies for this instruction,
3520      /// and returns the number of remaining dependencies for the containing
3521      /// bundle.
3522      int incrementUnscheduledDeps(int Incr) {
3523        assert(hasValidDependencies() &&
3524               "increment of unscheduled deps would be meaningless");
3525        UnscheduledDeps += Incr;
3526        return FirstInBundle->unscheduledDepsInBundle();
3527      }
3528  
3529      /// Sets the number of unscheduled dependencies to the number of
3530      /// dependencies.
3531      void resetUnscheduledDeps() {
3532        UnscheduledDeps = Dependencies;
3533      }
3534  
3535      /// Clears all dependency information.
3536      void clearDependencies() {
3537        Dependencies = InvalidDeps;
3538        resetUnscheduledDeps();
3539        MemoryDependencies.clear();
3540        ControlDependencies.clear();
3541      }
3542  
3543      int unscheduledDepsInBundle() const {
3544        assert(isSchedulingEntity() && "only meaningful on the bundle");
3545        int Sum = 0;
3546        for (const ScheduleData *BundleMember = this; BundleMember;
3547             BundleMember = BundleMember->NextInBundle) {
3548          if (BundleMember->UnscheduledDeps == InvalidDeps)
3549            return InvalidDeps;
3550          Sum += BundleMember->UnscheduledDeps;
3551        }
3552        return Sum;
3553      }
3554  
3555      void dump(raw_ostream &os) const {
3556        if (!isSchedulingEntity()) {
3557          os << "/ " << *Inst;
3558        } else if (NextInBundle) {
3559          os << '[' << *Inst;
3560          ScheduleData *SD = NextInBundle;
3561          while (SD) {
3562            os << ';' << *SD->Inst;
3563            SD = SD->NextInBundle;
3564          }
3565          os << ']';
3566        } else {
3567          os << *Inst;
3568        }
3569      }
3570  
3571      Instruction *Inst = nullptr;
3572  
3573      /// Opcode of the current instruction in the schedule data.
3574      Value *OpValue = nullptr;
3575  
3576      /// The TreeEntry that this instruction corresponds to.
3577      TreeEntry *TE = nullptr;
3578  
3579      /// Points to the head in an instruction bundle (and always to this for
3580      /// single instructions).
3581      ScheduleData *FirstInBundle = nullptr;
3582  
3583      /// Singly linked list of all instructions in a bundle. Null if it is a
3584      /// single instruction.
3585      ScheduleData *NextInBundle = nullptr;
3586  
3587      /// Singly linked list of all memory instructions (e.g. load, store, call)
3588      /// in the block - until the end of the scheduling region.
3589      ScheduleData *NextLoadStore = nullptr;
3590  
3591      /// The dependent memory instructions.
3592      /// This list is derived on demand in calculateDependencies().
3593      SmallVector<ScheduleData *, 4> MemoryDependencies;
3594  
3595      /// List of instructions which this instruction could be control dependent
3596      /// on.  Allowing such nodes to be scheduled below this one could introduce
3597      /// a runtime fault which didn't exist in the original program.
3598      /// e.g. this is a load or udiv following a readonly call which loops infinitely
3599      SmallVector<ScheduleData *, 4> ControlDependencies;
3600  
3601      /// This ScheduleData is in the current scheduling region if this matches
3602      /// the current SchedulingRegionID of BlockScheduling.
3603      int SchedulingRegionID = 0;
3604  
3605      /// Used for getting a "good" final ordering of instructions.
3606      int SchedulingPriority = 0;
3607  
3608      /// The number of dependencies. This is the number of users of the
3609      /// instruction plus the number of dependent memory instructions (if any).
3610      /// This value is calculated on demand.
3611      /// If InvalidDeps, the number of dependencies is not calculated yet.
3612      int Dependencies = InvalidDeps;
3613  
3614      /// The number of dependencies minus the number of dependencies of scheduled
3615      /// instructions. As soon as this is zero, the instruction/bundle gets ready
3616      /// for scheduling.
3617      /// Note that this is negative as long as Dependencies is not calculated.
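          /// A bundle becomes ready (see isReady()) once the sum of
          /// UnscheduledDeps over all bundle members drops to zero and the
          /// bundle has not been scheduled yet.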
3618      int UnscheduledDeps = InvalidDeps;
3619  
3620      /// True if this instruction is scheduled (or considered as scheduled in the
3621      /// dry-run).
3622      bool IsScheduled = false;
3623    };
3624  
3625  #ifndef NDEBUG
3626    friend inline raw_ostream &operator<<(raw_ostream &os,
3627                                          const BoUpSLP::ScheduleData &SD) {
3628      SD.dump(os);
3629      return os;
3630    }
3631  #endif
3632  
3633    friend struct GraphTraits<BoUpSLP *>;
3634    friend struct DOTGraphTraits<BoUpSLP *>;
3635  
3636    /// Contains all scheduling data for a basic block.
3637    /// It does not schedule instructions which are not memory read/write
3638    /// instructions and whose operands are either constants, arguments, phis,
3639    /// or instructions from other blocks, or whose users are phis or belong to
3640    /// other blocks. The resulting vector instructions can be placed at the
3641    /// beginning of the basic block without scheduling (if the operands do not
3642    /// need to be scheduled) or at the end of the block (if the users are
3643    /// outside of the block). This saves some compile time and memory used by
3644    /// the compiler.
3645    /// ScheduleData is assigned to each instruction in between the boundaries
3646    /// of the tree entry, even to those which are not part of the graph. It is
3647    /// required to correctly follow the dependencies between the instructions
3648    /// and to schedule them correctly. ScheduleData is not allocated for
3649    /// instructions which do not require scheduling, like phis, nodes with only
3650    /// extractelements/insertelements, or nodes whose instructions have
3651    /// uses/operands outside of the block.
3652    struct BlockScheduling {
3653      BlockScheduling(BasicBlock *BB)
3654          : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655  
3656      void clear() {
3657        ReadyInsts.clear();
3658        ScheduleStart = nullptr;
3659        ScheduleEnd = nullptr;
3660        FirstLoadStoreInRegion = nullptr;
3661        LastLoadStoreInRegion = nullptr;
3662        RegionHasStackSave = false;
3663  
3664        // Reduce the maximum schedule region size by the size of the
3665        // previous scheduling run.
3666        ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667        if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668          ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669        ScheduleRegionSize = 0;
3670  
3671        // Make a new scheduling region, i.e. all existing ScheduleData is not
3672        // in the new region yet.
3673        ++SchedulingRegionID;
3674      }
3675  
3676      ScheduleData *getScheduleData(Instruction *I) {
3677        if (BB != I->getParent())
3678          // Avoid the lookup if it can't possibly be in the map.
3679          return nullptr;
3680        ScheduleData *SD = ScheduleDataMap.lookup(I);
3681        if (SD && isInSchedulingRegion(SD))
3682          return SD;
3683        return nullptr;
3684      }
3685  
3686      ScheduleData *getScheduleData(Value *V) {
3687        if (auto *I = dyn_cast<Instruction>(V))
3688          return getScheduleData(I);
3689        return nullptr;
3690      }
3691  
3692      ScheduleData *getScheduleData(Value *V, Value *Key) {
3693        if (V == Key)
3694          return getScheduleData(V);
3695        auto I = ExtraScheduleDataMap.find(V);
3696        if (I != ExtraScheduleDataMap.end()) {
3697          ScheduleData *SD = I->second.lookup(Key);
3698          if (SD && isInSchedulingRegion(SD))
3699            return SD;
3700        }
3701        return nullptr;
3702      }
3703  
3704      bool isInSchedulingRegion(ScheduleData *SD) const {
3705        return SD->SchedulingRegionID == SchedulingRegionID;
3706      }
3707  
3708      /// Marks an instruction as scheduled and puts all dependent ready
3709      /// instructions into the ready-list.
3710      template <typename ReadyListType>
3711      void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3712        SD->IsScheduled = true;
3713        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
3714  
3715        for (ScheduleData *BundleMember = SD; BundleMember;
3716             BundleMember = BundleMember->NextInBundle) {
3717          if (BundleMember->Inst != BundleMember->OpValue)
3718            continue;
3719  
3720          // Handle the def-use chain dependencies.
3721  
3722          // Decrement the unscheduled counter and insert to ready list if ready.
3723          auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3724            doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3725              if (OpDef && OpDef->hasValidDependencies() &&
3726                  OpDef->incrementUnscheduledDeps(-1) == 0) {
3727                // There are no more unscheduled dependencies after
3728                // decrementing, so we can put the dependent instruction
3729                // into the ready list.
3730                ScheduleData *DepBundle = OpDef->FirstInBundle;
3731                assert(!DepBundle->IsScheduled &&
3732                       "already scheduled bundle gets ready");
3733                ReadyList.insert(DepBundle);
3734                LLVM_DEBUG(dbgs()
3735                           << "SLP:    gets ready (def): " << *DepBundle << "\n");
3736              }
3737            });
3738          };
3739  
3740          // If BundleMember is a vector bundle, its operands may have been
3741          // reordered during buildTree(). We therefore need to get its operands
3742          // through the TreeEntry.
3743          if (TreeEntry *TE = BundleMember->TE) {
3744            // Need to search for the lane since the tree entry can be reordered.
3745            int Lane = std::distance(TE->Scalars.begin(),
3746                                     find(TE->Scalars, BundleMember->Inst));
3747            assert(Lane >= 0 && "Lane not set");
3748  
3749            // Since the vectorization tree is built recursively, this assertion
3750            // ensures that the tree entry has all operands set before reaching
3751            // this code. A couple of exceptions known at the moment are extracts
3752            // where the second (immediate) operand is not added. Since
3753            // immediates do not affect scheduler behavior, this is considered
3754            // okay.
3755            auto *In = BundleMember->Inst;
3756            assert(
3757                In &&
3758                (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3759                 In->getNumOperands() == TE->getNumOperands()) &&
3760                "Missed TreeEntry operands?");
3761            (void)In; // fake use to avoid build failure when assertions disabled
3762  
3763            for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3764                 OpIdx != NumOperands; ++OpIdx)
3765              if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3766                DecrUnsched(I);
3767          } else {
3768            // If BundleMember is a stand-alone instruction, no operand reordering
3769            // has taken place, so we directly access its operands.
3770            for (Use &U : BundleMember->Inst->operands())
3771              if (auto *I = dyn_cast<Instruction>(U.get()))
3772                DecrUnsched(I);
3773          }
3774          // Handle the memory dependencies.
3775          for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3776            if (MemoryDepSD->hasValidDependencies() &&
3777                MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3778              // There are no more unscheduled dependencies after decrementing,
3779              // so we can put the dependent instruction into the ready list.
3780              ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3781              assert(!DepBundle->IsScheduled &&
3782                     "already scheduled bundle gets ready");
3783              ReadyList.insert(DepBundle);
3784              LLVM_DEBUG(dbgs()
3785                         << "SLP:    gets ready (mem): " << *DepBundle << "\n");
3786            }
3787          }
3788          // Handle the control dependencies.
3789          for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3790            if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3791              // There are no more unscheduled dependencies after decrementing,
3792              // so we can put the dependent instruction into the ready list.
3793              ScheduleData *DepBundle = DepSD->FirstInBundle;
3794              assert(!DepBundle->IsScheduled &&
3795                     "already scheduled bundle gets ready");
3796              ReadyList.insert(DepBundle);
3797              LLVM_DEBUG(dbgs()
3798                         << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
3799            }
3800          }
3801        }
3802      }
3803  
3804      /// Verify basic self consistency properties of the data structure.
3805      void verify() {
3806        if (!ScheduleStart)
3807          return;
3808  
3809        assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810               ScheduleStart->comesBefore(ScheduleEnd) &&
3811               "Not a valid scheduling region?");
3812  
3813        for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814          auto *SD = getScheduleData(I);
3815          if (!SD)
3816            continue;
3817          assert(isInSchedulingRegion(SD) &&
3818                 "primary schedule data not in window?");
3819          assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820                 "entire bundle in window!");
3821          (void)SD;
3822          doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823        }
3824  
3825        for (auto *SD : ReadyInsts) {
3826          assert(SD->isSchedulingEntity() && SD->isReady() &&
3827                 "item in ready list not ready?");
3828          (void)SD;
3829        }
3830      }
3831  
3832      void doForAllOpcodes(Value *V,
3833                           function_ref<void(ScheduleData *SD)> Action) {
3834        if (ScheduleData *SD = getScheduleData(V))
3835          Action(SD);
3836        auto I = ExtraScheduleDataMap.find(V);
3837        if (I != ExtraScheduleDataMap.end())
3838          for (auto &P : I->second)
3839            if (isInSchedulingRegion(P.second))
3840              Action(P.second);
3841      }
3842  
3843      /// Put all instructions into the ReadyList which are ready for scheduling.
3844      template <typename ReadyListType>
3845      void initialFillReadyList(ReadyListType &ReadyList) {
3846        for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847          doForAllOpcodes(I, [&](ScheduleData *SD) {
3848            if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849                SD->isReady()) {
3850              ReadyList.insert(SD);
3851              LLVM_DEBUG(dbgs()
3852                         << "SLP:    initially in ready list: " << *SD << "\n");
3853            }
3854          });
3855        }
3856      }
3857  
3858      /// Build a bundle from the ScheduleData nodes corresponding to the
3859      /// scalar instruction for each lane.
3860      ScheduleData *buildBundle(ArrayRef<Value *> VL);
3861  
3862      /// Checks if a bundle of instructions can be scheduled, i.e. has no
3863      /// cyclic dependencies. This is only a dry-run, no instructions are
3864      /// actually moved at this stage.
3865      /// \returns the scheduling bundle. The returned Optional value is not
3866      /// std::nullopt if \p VL is allowed to be scheduled.
3867      std::optional<ScheduleData *>
3868      tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3869                        const InstructionsState &S);
3870  
3871      /// Un-bundles a group of instructions.
3872      void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3873  
3874      /// Allocates schedule data chunk.
3875      ScheduleData *allocateScheduleDataChunks();
3876  
3877      /// Extends the scheduling region so that V is inside the region.
3878      /// \returns true if the region size is within the limit.
3879      bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3880  
3881      /// Initialize the ScheduleData structures for new instructions in the
3882      /// scheduling region.
3883      void initScheduleData(Instruction *FromI, Instruction *ToI,
3884                            ScheduleData *PrevLoadStore,
3885                            ScheduleData *NextLoadStore);
3886  
3887      /// Updates the dependency information of a bundle and of all instructions/
3888      /// bundles which depend on the original bundle.
3889      void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3890                                 BoUpSLP *SLP);
3891  
3892      /// Sets all instructions in the scheduling region to un-scheduled.
3893      void resetSchedule();
3894  
3895      BasicBlock *BB;
3896  
3897      /// Simple memory allocation for ScheduleData.
3898      SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3899  
3900      /// The size of a ScheduleData array in ScheduleDataChunks.
3901      int ChunkSize;
3902  
3903      /// The allocator position in the current chunk, which is the last entry
3904      /// of ScheduleDataChunks.
3905      int ChunkPos;
3906  
3907      /// Attaches ScheduleData to Instruction.
3908      /// Note that the mapping survives during all vectorization iterations, i.e.
3909      /// ScheduleData structures are recycled.
3910      DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3911  
3912      /// Attaches ScheduleData to Instruction with the leading key.
3913      DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3914          ExtraScheduleDataMap;
3915  
3916      /// The ready-list for scheduling (only used for the dry-run).
3917      SetVector<ScheduleData *> ReadyInsts;
3918  
3919      /// The first instruction of the scheduling region.
3920      Instruction *ScheduleStart = nullptr;
3921  
3922      /// The first instruction _after_ the scheduling region.
3923      Instruction *ScheduleEnd = nullptr;
3924  
3925      /// The first memory accessing instruction in the scheduling region
3926      /// (can be null).
3927      ScheduleData *FirstLoadStoreInRegion = nullptr;
3928  
3929      /// The last memory accessing instruction in the scheduling region
3930      /// (can be null).
3931      ScheduleData *LastLoadStoreInRegion = nullptr;
3932  
3933      /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3934      /// region?  Used to optimize the dependence calculation for the
3935      /// common case where there isn't.
3936      bool RegionHasStackSave = false;
3937  
3938      /// The current size of the scheduling region.
3939      int ScheduleRegionSize = 0;
3940  
3941      /// The maximum size allowed for the scheduling region.
3942      int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3943  
3944      /// The ID of the scheduling region. For a new vectorization iteration this
3945      /// is incremented, which "removes" all ScheduleData from the region.
3946      /// Make sure that the initial SchedulingRegionID is greater than the
3947      /// initial SchedulingRegionID in ScheduleData (which is 0).
3948      int SchedulingRegionID = 1;
3949    };
3950  
3951    /// Attaches the BlockScheduling structures to basic blocks.
3952    MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3953  
3954    /// Performs the "real" scheduling. Done before vectorization is actually
3955    /// performed in a basic block.
3956    void scheduleBlock(BlockScheduling *BS);
3957  
3958    /// List of users to ignore during scheduling and that don't need extracting.
3959    const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960  
3961    /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962    /// sorted SmallVectors of unsigned.
3963    struct OrdersTypeDenseMapInfo {
3964      static OrdersType getEmptyKey() {
3965        OrdersType V;
3966        V.push_back(~1U);
3967        return V;
3968      }
3969  
3970      static OrdersType getTombstoneKey() {
3971        OrdersType V;
3972        V.push_back(~2U);
3973        return V;
3974      }
3975  
3976      static unsigned getHashValue(const OrdersType &V) {
3977        return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978      }
3979  
3980      static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981        return LHS == RHS;
3982      }
3983    };
3984  
3985    // Analysis and block reference.
3986    Function *F;
3987    ScalarEvolution *SE;
3988    TargetTransformInfo *TTI;
3989    TargetLibraryInfo *TLI;
3990    LoopInfo *LI;
3991    DominatorTree *DT;
3992    AssumptionCache *AC;
3993    DemandedBits *DB;
3994    const DataLayout *DL;
3995    OptimizationRemarkEmitter *ORE;
3996  
3997    unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3998    unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3999  
4000    /// Instruction builder to construct the vectorized tree.
4001    IRBuilder<TargetFolder> Builder;
4002  
4003    /// A map from tree entries of scalar integer values to the smallest bit
4004    /// width with which they can legally be represented. The entries map to
4005    /// (width, signed) pairs, where "width" indicates the minimum bit width and
4006    /// "signed" is true if the value must be sign-extended, rather than
4007    /// zero-extended, back to its original width.
4008    DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4009  
4010    /// Final size of the reduced vector, if the current graph represents the
4011    /// input for the reduction and it was possible to narrow the size of the
4012    /// reduction.
4013    unsigned ReductionBitWidth = 0;
4014  
4015    /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4016    /// type sizes, used in the tree.
4017    std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4018  
4019    /// Indices of the vectorized nodes, which are supposed to be the roots of a
4020    /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4021    DenseSet<unsigned> ExtraBitWidthNodes;
4022  };
4023  
4024  } // end namespace slpvectorizer
4025  
4026  template <> struct GraphTraits<BoUpSLP *> {
4027    using TreeEntry = BoUpSLP::TreeEntry;
4028  
4029    /// NodeRef has to be a pointer per the GraphWriter.
4030    using NodeRef = TreeEntry *;
4031  
4032    using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4033  
4034    /// Add the VectorizableTree to the index iterator to be able to return
4035    /// TreeEntry pointers.
4036    struct ChildIteratorType
4037        : public iterator_adaptor_base<
4038              ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4039      ContainerTy &VectorizableTree;
4040  
4041      ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4042                        ContainerTy &VT)
4043          : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4044  
4045      NodeRef operator*() { return I->UserTE; }
4046    };
4047  
4048    static NodeRef getEntryNode(BoUpSLP &R) {
4049      return R.VectorizableTree[0].get();
4050    }
4051  
4052    static ChildIteratorType child_begin(NodeRef N) {
4053      return {N->UserTreeIndices.begin(), N->Container};
4054    }
4055  
4056    static ChildIteratorType child_end(NodeRef N) {
4057      return {N->UserTreeIndices.end(), N->Container};
4058    }
4059  
4060    /// For the node iterator we just need to turn the TreeEntry iterator into a
4061    /// TreeEntry* iterator so that it dereferences to NodeRef.
4062    class nodes_iterator {
4063      using ItTy = ContainerTy::iterator;
4064      ItTy It;
4065  
4066    public:
4067      nodes_iterator(const ItTy &It2) : It(It2) {}
4068      NodeRef operator*() { return It->get(); }
4069      nodes_iterator operator++() {
4070        ++It;
4071        return *this;
4072      }
4073      bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4074    };
4075  
4076    static nodes_iterator nodes_begin(BoUpSLP *R) {
4077      return nodes_iterator(R->VectorizableTree.begin());
4078    }
4079  
4080    static nodes_iterator nodes_end(BoUpSLP *R) {
4081      return nodes_iterator(R->VectorizableTree.end());
4082    }
4083  
4084    static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4085  };
4086  
4087  template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088    using TreeEntry = BoUpSLP::TreeEntry;
4089  
4090    DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091  
4092    std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093      std::string Str;
4094      raw_string_ostream OS(Str);
4095      OS << Entry->Idx << ".\n";
4096      if (isSplat(Entry->Scalars))
4097        OS << "<splat> ";
4098      for (auto *V : Entry->Scalars) {
4099        OS << *V;
4100        if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101              return EU.Scalar == V;
4102            }))
4103          OS << " <extract>";
4104        OS << "\n";
4105      }
4106      return Str;
4107    }
4108  
4109    static std::string getNodeAttributes(const TreeEntry *Entry,
4110                                         const BoUpSLP *) {
4111      if (Entry->isGather())
4112        return "color=red";
4113      if (Entry->State == TreeEntry::ScatterVectorize ||
4114          Entry->State == TreeEntry::StridedVectorize)
4115        return "color=blue";
4116      return "";
4117    }
4118  };
4119  
4120  } // end namespace llvm
4121  
4122  BoUpSLP::~BoUpSLP() {
4123    SmallVector<WeakTrackingVH> DeadInsts;
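        // First pass: re-attach detached instructions, queue operands that become
        // trivially dead for later cleanup, and drop all references so that the
        // erasure loop below can run in any order.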
4124    for (auto *I : DeletedInstructions) {
4125      if (!I->getParent()) {
4126        // Temporarily insert the instruction back so that it can be erased
4127        // from its parent and from memory later.
4128        if (isa<PHINode>(I))
4129          // Phi nodes must be the very first instructions in the block.
4130          I->insertBefore(F->getEntryBlock(),
4131                          F->getEntryBlock().getFirstNonPHIIt());
4132        else
4133          I->insertBefore(F->getEntryBlock().getTerminator());
4134        continue;
4135      }
4136      for (Use &U : I->operands()) {
4137        auto *Op = dyn_cast<Instruction>(U.get());
4138        if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4139            wouldInstructionBeTriviallyDead(Op, TLI))
4140          DeadInsts.emplace_back(Op);
4141      }
4142      I->dropAllReferences();
4143    }
4144    for (auto *I : DeletedInstructions) {
4145      assert(I->use_empty() &&
4146             "trying to erase instruction with users.");
4147      I->eraseFromParent();
4148    }
4149  
4150    // Cleanup any dead scalar code feeding the vectorized instructions
4151    RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4152  
4153  #ifdef EXPENSIVE_CHECKS
4154    // If we could guarantee that this call is not extremely slow, we could
4155    // remove the ifdef limitation (see PR47712).
4156    assert(!verifyFunction(*F, &dbgs()));
4157  #endif
4158  }
4159  
4160  /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161  /// contains the original mask for the scalars reused in the node. The
4162  /// procedure transforms this mask in accordance with the given \p Mask.
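      /// For example (illustrative values), Reuses == {0, 1, 1, 0} with
      /// Mask == {1, 0, 3, 2} becomes Reuses == {1, 0, 0, 1}.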
4163  static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164    assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165           "Expected non-empty mask.");
4166    SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167    Prev.swap(Reuses);
4168    for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169      if (Mask[I] != PoisonMaskElem)
4170        Reuses[Mask[I]] = Prev[I];
4171  }
4172  
4173  /// Reorders the given \p Order according to the given \p Mask. \p Order is
4174  /// the original order of the scalars. The procedure transforms the provided
4175  /// order in accordance with the given \p Mask. If the resulting \p Order is
4176  /// just an identity order, \p Order is cleared.
4177  static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4178                           bool BottomOrder = false) {
4179    assert(!Mask.empty() && "Expected non-empty mask.");
4180    unsigned Sz = Mask.size();
4181    if (BottomOrder) {
4182      SmallVector<unsigned> PrevOrder;
4183      if (Order.empty()) {
4184        PrevOrder.resize(Sz);
4185        std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4186      } else {
4187        PrevOrder.swap(Order);
4188      }
4189      Order.assign(Sz, Sz);
4190      for (unsigned I = 0; I < Sz; ++I)
4191        if (Mask[I] != PoisonMaskElem)
4192          Order[I] = PrevOrder[Mask[I]];
4193      if (all_of(enumerate(Order), [&](const auto &Data) {
4194            return Data.value() == Sz || Data.index() == Data.value();
4195          })) {
4196        Order.clear();
4197        return;
4198      }
4199      fixupOrderingIndices(Order);
4200      return;
4201    }
4202    SmallVector<int> MaskOrder;
4203    if (Order.empty()) {
4204      MaskOrder.resize(Sz);
4205      std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4206    } else {
4207      inversePermutation(Order, MaskOrder);
4208    }
4209    reorderReuses(MaskOrder, Mask);
4210    if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4211      Order.clear();
4212      return;
4213    }
4214    Order.assign(Sz, Sz);
4215    for (unsigned I = 0; I < Sz; ++I)
4216      if (MaskOrder[I] != PoisonMaskElem)
4217        Order[MaskOrder[I]] = I;
4218    fixupOrderingIndices(Order);
4219  }
4220  
4221  std::optional<BoUpSLP::OrdersType>
4222  BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223    assert(TE.isGather() && "Expected gather node only.");
4224    // Try to find subvector extract/insert patterns and reorder only such
4225    // patterns.
4226    SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227    Type *ScalarTy = GatheredScalars.front()->getType();
4228    int NumScalars = GatheredScalars.size();
4229    if (!isValidElementType(ScalarTy))
4230      return std::nullopt;
4231    auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4232    int NumParts = TTI->getNumberOfParts(VecTy);
4233    if (NumParts == 0 || NumParts >= NumScalars)
4234      NumParts = 1;
4235    SmallVector<int> ExtractMask;
4236    SmallVector<int> Mask;
4237    SmallVector<SmallVector<const TreeEntry *>> Entries;
4238    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4239        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4240    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4241        isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4242                              /*ForOrder=*/true);
4243    // No shuffled operands - ignore.
4244    if (GatherShuffles.empty() && ExtractShuffles.empty())
4245      return std::nullopt;
4246    OrdersType CurrentOrder(NumScalars, NumScalars);
4247    if (GatherShuffles.size() == 1 &&
4248        *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249        Entries.front().front()->isSame(TE.Scalars)) {
4250      // Perfect match in the graph, will reuse the previously vectorized
4251      // node. Cost is 0.
4252      std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4253      return CurrentOrder;
4254    }
4255    auto IsSplatMask = [](ArrayRef<int> Mask) {
4256      int SingleElt = PoisonMaskElem;
4257      return all_of(Mask, [&](int I) {
4258        if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259          SingleElt = I;
4260        return I == PoisonMaskElem || I == SingleElt;
4261      });
4262    };
4263    // Exclusive broadcast mask - ignore.
4264    if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265         (Entries.size() != 1 ||
4266          Entries.front().front()->ReorderIndices.empty())) ||
4267        (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268      return std::nullopt;
4269    SmallBitVector ShuffledSubMasks(NumParts);
4270    auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271                                    ArrayRef<int> Mask, int PartSz, int NumParts,
4272                                    function_ref<unsigned(unsigned)> GetVF) {
4273      for (int I : seq<int>(0, NumParts)) {
4274        if (ShuffledSubMasks.test(I))
4275          continue;
4276        const int VF = GetVF(I);
4277        if (VF == 0)
4278          continue;
4279        unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4280        MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4281        // Shuffle of at least 2 vectors - ignore.
4282        if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4283          std::fill(Slice.begin(), Slice.end(), NumScalars);
4284          ShuffledSubMasks.set(I);
4285          continue;
4286        }
4287        // Try to include as many elements from the mask as possible.
4288        int FirstMin = INT_MAX;
4289        bool SecondVecFound = false;
4290        for (int K : seq<int>(Limit)) {
4291          int Idx = Mask[I * PartSz + K];
4292          if (Idx == PoisonMaskElem) {
4293            Value *V = GatheredScalars[I * PartSz + K];
4294            if (isConstant(V) && !isa<PoisonValue>(V)) {
4295              SecondVecFound = true;
4296              break;
4297            }
4298            continue;
4299          }
4300          if (Idx < VF) {
4301            if (FirstMin > Idx)
4302              FirstMin = Idx;
4303          } else {
4304            SecondVecFound = true;
4305            break;
4306          }
4307        }
4308        FirstMin = (FirstMin / PartSz) * PartSz;
4309        // Shuffle of at least 2 vectors - ignore.
4310        if (SecondVecFound) {
4311          std::fill(Slice.begin(), Slice.end(), NumScalars);
4312          ShuffledSubMasks.set(I);
4313          continue;
4314        }
4315        for (int K : seq<int>(Limit)) {
4316          int Idx = Mask[I * PartSz + K];
4317          if (Idx == PoisonMaskElem)
4318            continue;
4319          Idx -= FirstMin;
4320          if (Idx >= PartSz) {
4321            SecondVecFound = true;
4322            break;
4323          }
4324          if (CurrentOrder[I * PartSz + Idx] >
4325                  static_cast<unsigned>(I * PartSz + K) &&
4326              CurrentOrder[I * PartSz + Idx] !=
4327                  static_cast<unsigned>(I * PartSz + Idx))
4328            CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329        }
4330        // Shuffle of at least 2 vectors - ignore.
4331        if (SecondVecFound) {
4332          std::fill(Slice.begin(), Slice.end(), NumScalars);
4333          ShuffledSubMasks.set(I);
4334          continue;
4335        }
4336      }
4337    };
4338    int PartSz = getPartNumElems(NumScalars, NumParts);
4339    if (!ExtractShuffles.empty())
4340      TransformMaskToOrder(
4341          CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342            if (!ExtractShuffles[I])
4343              return 0U;
4344            unsigned VF = 0;
4345            unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4346            for (unsigned Idx : seq<unsigned>(Sz)) {
4347              int K = I * PartSz + Idx;
4348              if (ExtractMask[K] == PoisonMaskElem)
4349                continue;
4350              if (!TE.ReuseShuffleIndices.empty())
4351                K = TE.ReuseShuffleIndices[K];
4352              if (!TE.ReorderIndices.empty())
4353                K = std::distance(TE.ReorderIndices.begin(),
4354                                  find(TE.ReorderIndices, K));
4355              auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4356              if (!EI)
4357                continue;
4358              VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4359                                    ->getElementCount()
4360                                    .getKnownMinValue());
4361            }
4362            return VF;
4363          });
4364    // Check special corner case - single shuffle of the same entry.
4365    if (GatherShuffles.size() == 1 && NumParts != 1) {
4366      if (ShuffledSubMasks.any())
4367        return std::nullopt;
4368      PartSz = NumScalars;
4369      NumParts = 1;
4370    }
4371    if (!Entries.empty())
4372      TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373        if (!GatherShuffles[I])
4374          return 0U;
4375        return std::max(Entries[I].front()->getVectorFactor(),
4376                        Entries[I].back()->getVectorFactor());
4377      });
4378    int NumUndefs =
4379        count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4380    if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381      return std::nullopt;
4382    return std::move(CurrentOrder);
4383  }
4384  
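      /// Returns true if \p Ptr1 and \p Ptr2 have the same underlying object and
      /// both are single-index GEPs; unless \p CompareOpcodes is false, the two
      /// indices must additionally both be constants or share a common opcode.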
4385  static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386                                    const TargetLibraryInfo &TLI,
4387                                    bool CompareOpcodes = true) {
4388    if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4389      return false;
4390    auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4391    if (!GEP1)
4392      return false;
4393    auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4394    if (!GEP2)
4395      return false;
4396    return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397           ((isConstant(GEP1->getOperand(1)) &&
4398             isConstant(GEP2->getOperand(1))) ||
4399            !CompareOpcodes ||
4400            getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4401                .getOpcode());
4402  }
4403  
4404  /// Calculates minimal alignment as a common alignment.
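      /// E.g., for loads with alignments {8, 4, 16} the result is Align(4).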
4405  template <typename T>
4406  static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4407    Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408    for (Value *V : VL.drop_front())
4409      CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410    return CommonAlignment;
4411  }
4412  
4413  /// Check if \p Order represents reverse order.
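      /// E.g., {3, 2, 1, 0} is a reverse order for size 4; entries equal to the
      /// order size (unused lanes) are also accepted, e.g. {4, 2, 1, 0}.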
4414  static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415    unsigned Sz = Order.size();
4416    return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4417      return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418    });
4419  }
4420  
4421  /// Checks if the provided list of pointers \p PointerOps represents strided
4422  /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4423  /// Otherwise, if \p Inst is not specified, an engaged optional value is
4424  /// returned to show that the pointers represent strided pointers. If \p Inst
4425  /// is specified, the runtime stride is materialized before the given \p Inst.
4426  /// \returns std::nullopt if the pointers are not strided with a runtime
4427  /// stride; nullptr or the actual stride value otherwise.
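      /// For example (an illustrative case), pointers {p, p+3*s, p+s, p+2*s} with
      /// a runtime value s and a 1-byte element type have stride s, and
      /// \p SortedIndices is set to {0, 2, 3, 1}.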
4428  static std::optional<Value *>
4429  calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4430                    const DataLayout &DL, ScalarEvolution &SE,
4431                    SmallVectorImpl<unsigned> &SortedIndices,
4432                    Instruction *Inst = nullptr) {
4433    SmallVector<const SCEV *> SCEVs;
4434    const SCEV *PtrSCEVLowest = nullptr;
4435    const SCEV *PtrSCEVHighest = nullptr;
4436    // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4437    // addresses).
4438    for (Value *Ptr : PointerOps) {
4439      const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4440      if (!PtrSCEV)
4441        return std::nullopt;
4442      SCEVs.push_back(PtrSCEV);
4443      if (!PtrSCEVLowest && !PtrSCEVHighest) {
4444        PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4445        continue;
4446      }
4447      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4448      if (isa<SCEVCouldNotCompute>(Diff))
4449        return std::nullopt;
4450      if (Diff->isNonConstantNegative()) {
4451        PtrSCEVLowest = PtrSCEV;
4452        continue;
4453      }
4454      const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4455      if (isa<SCEVCouldNotCompute>(Diff1))
4456        return std::nullopt;
4457      if (Diff1->isNonConstantNegative()) {
4458        PtrSCEVHighest = PtrSCEV;
4459        continue;
4460      }
4461    }
4462    // Dist = PtrSCEVHighest - PtrSCEVLowest;
4463    const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4464    if (isa<SCEVCouldNotCompute>(Dist))
4465      return std::nullopt;
4466    int Size = DL.getTypeStoreSize(ElemTy);
4467    auto TryGetStride = [&](const SCEV *Dist,
4468                            const SCEV *Multiplier) -> const SCEV * {
4469      if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4470        if (M->getOperand(0) == Multiplier)
4471          return M->getOperand(1);
4472        if (M->getOperand(1) == Multiplier)
4473          return M->getOperand(0);
4474        return nullptr;
4475      }
4476      if (Multiplier == Dist)
4477        return SE.getConstant(Dist->getType(), 1);
4478      return SE.getUDivExactExpr(Dist, Multiplier);
4479    };
4480    // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4481    const SCEV *Stride = nullptr;
4482    if (Size != 1 || SCEVs.size() > 2) {
4483      const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4484      Stride = TryGetStride(Dist, Sz);
4485      if (!Stride)
4486        return std::nullopt;
4487    }
4488    if (!Stride || isa<SCEVConstant>(Stride))
4489      return std::nullopt;
4490    // Iterate through all pointers and check if all distances are
4491    // unique multiples of Stride.
4492    using DistOrdPair = std::pair<int64_t, int>;
4493    auto Compare = llvm::less_first();
4494    std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4495    int Cnt = 0;
4496    bool IsConsecutive = true;
4497    for (const SCEV *PtrSCEV : SCEVs) {
4498      unsigned Dist = 0;
4499      if (PtrSCEV != PtrSCEVLowest) {
4500        const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4501        const SCEV *Coeff = TryGetStride(Diff, Stride);
4502        if (!Coeff)
4503          return std::nullopt;
4504        const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4505        if (!SC || isa<SCEVCouldNotCompute>(SC))
4506          return std::nullopt;
4507        if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4508                                                    SE.getMulExpr(Stride, SC)))
4509                 ->isZero())
4510          return std::nullopt;
4511        Dist = SC->getAPInt().getZExtValue();
4512      }
4513      // If the strides are not the same or repeated, we can't vectorize.
4514      if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4515        return std::nullopt;
4516      auto Res = Offsets.emplace(Dist, Cnt);
4517      if (!Res.second)
4518        return std::nullopt;
4519      // Consecutive order if the inserted element is the last one.
4520      IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4521      ++Cnt;
4522    }
4523    if (Offsets.size() != SCEVs.size())
4524      return std::nullopt;
4525    SortedIndices.clear();
4526    if (!IsConsecutive) {
4527      // Fill SortedIndices array only if it is non-consecutive.
4528      SortedIndices.resize(PointerOps.size());
4529      Cnt = 0;
4530      for (const std::pair<int64_t, int> &Pair : Offsets) {
4531        SortedIndices[Cnt] = Pair.second;
4532        ++Cnt;
4533      }
4534    }
4535    if (!Inst)
4536      return nullptr;
4537    SCEVExpander Expander(SE, DL, "strided-load-vec");
4538    return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4539  }
4540  
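      /// Returns the pair (scalar GEP cost, vector GEP cost) for computing the
      /// addresses in \p Ptrs relative to \p BasePtr; defined later in this file.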
4541  static std::pair<InstructionCost, InstructionCost>
4542  getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4543              Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4544              Type *ScalarTy, VectorType *VecTy);
4545  
4546  BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4547      ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4548      SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4549    // Check that a vectorized load would load the same memory as a scalar
4550    // load. For example, we don't want to vectorize loads that are smaller
4551    // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4552    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4553    // from such a struct, we read/write packed bits disagreeing with the
4554    // unvectorized version.
4555    Type *ScalarTy = VL0->getType();
4556  
4557    if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4558      return LoadsState::Gather;
4559  
4560    // Make sure all loads in the bundle are simple - we can't vectorize
4561    // atomic or volatile loads.
4562    PointerOps.clear();
4563    const unsigned Sz = VL.size();
4564    PointerOps.resize(Sz);
4565    auto *POIter = PointerOps.begin();
4566    for (Value *V : VL) {
4567      auto *L = cast<LoadInst>(V);
4568      if (!L->isSimple())
4569        return LoadsState::Gather;
4570      *POIter = L->getPointerOperand();
4571      ++POIter;
4572    }
4573  
4574    Order.clear();
4575    auto *VecTy = getWidenedType(ScalarTy, Sz);
4576    // Check the order of pointer operands or that all pointers are the same.
4577    bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4578    // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4579    if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4580      assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4581                                     "supported with VectorizeNonPowerOf2");
4582      return LoadsState::Gather;
4583    }
4584  
4585    Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4586    if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4587        TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4588        calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4589      return LoadsState::StridedVectorize;
4590    if (IsSorted || all_of(PointerOps, [&](Value *P) {
4591          return arePointersCompatible(P, PointerOps.front(), *TLI);
4592        })) {
4593      if (IsSorted) {
4594        Value *Ptr0;
4595        Value *PtrN;
4596        if (Order.empty()) {
4597          Ptr0 = PointerOps.front();
4598          PtrN = PointerOps.back();
4599        } else {
4600          Ptr0 = PointerOps[Order.front()];
4601          PtrN = PointerOps[Order.back()];
4602        }
4603        std::optional<int> Diff =
4604            getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4605        // Check that the sorted loads are consecutive.
4606        if (static_cast<unsigned>(*Diff) == Sz - 1)
4607          return LoadsState::Vectorize;
4608        // Simple check if not a strided access - clear order.
4609        bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4610        // Try to generate strided load node if:
4611        // 1. Target with strided load support is detected.
4612        // 2. The number of loads is greater than MinProfitableStridedLoads,
4613        // or the potential stride <= MaxProfitableLoadStride and the
4614        // potential stride is power-of-2 (to avoid perf regressions for the very
4615        // small number of loads) and max distance > number of loads, or potential
4616        // stride is -1.
4617        // 3. The loads are ordered, or number of unordered loads <=
4618        // MaxProfitableUnorderedLoads, or loads are in reversed order.
4619        // (this check is to avoid extra costs for very expensive shuffles).
4620        if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4621                                    (static_cast<unsigned>(std::abs(*Diff)) <=
4622                                         MaxProfitableLoadStride * Sz &&
4623                                     isPowerOf2_32(std::abs(*Diff)))) &&
4624                                   static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4625                                  *Diff == -(static_cast<int>(Sz) - 1))) {
4626          int Stride = *Diff / static_cast<int>(Sz - 1);
4627          if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4628            Align Alignment =
4629                cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4630                    ->getAlign();
4631            if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4632              // Iterate through all pointers and check if all distances are
4633              // unique multiple of Dist.
4634              SmallSet<int, 4> Dists;
4635              for (Value *Ptr : PointerOps) {
4636                int Dist = 0;
4637                if (Ptr == PtrN)
4638                  Dist = *Diff;
4639                else if (Ptr != Ptr0)
4640                  Dist =
4641                      *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4642                // If the strides are not the same or repeated, we can't
4643                // vectorize.
4644                if (((Dist / Stride) * Stride) != Dist ||
4645                    !Dists.insert(Dist).second)
4646                  break;
4647              }
4648              if (Dists.size() == Sz)
4649                return LoadsState::StridedVectorize;
4650            }
4651          }
4652        }
4653      }
4654      auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4655        unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4656        unsigned MinVF = getMinVF(Sz);
4657        unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4658        MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4659        for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4660          unsigned VectorizedCnt = 0;
4661          SmallVector<LoadsState> States;
4662          for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4663               Cnt += VF, ++VectorizedCnt) {
4664            ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4665            SmallVector<unsigned> Order;
4666            SmallVector<Value *> PointerOps;
4667            LoadsState LS =
4668                canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4669                                  /*TryRecursiveCheck=*/false);
4670            // Check that the sorted loads are consecutive.
4671            if (LS == LoadsState::Gather)
4672              break;
4673          // If a reorder is needed, treat it as a high-cost masked gather now.
4674            if ((LS == LoadsState::Vectorize ||
4675                 LS == LoadsState::StridedVectorize) &&
4676                !Order.empty() && !isReverseOrder(Order))
4677              LS = LoadsState::ScatterVectorize;
4678            States.push_back(LS);
4679          }
4680          // Can be vectorized later as a series of loads/insertelements.
4681          if (VectorizedCnt == VL.size() / VF) {
4682            // Compare masked gather cost and loads + insertsubvector costs.
4683            TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4684            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4685                TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4686                CostKind, ScalarTy, VecTy);
4687            InstructionCost MaskedGatherCost =
4688                TTI.getGatherScatterOpCost(
4689                    Instruction::Load, VecTy,
4690                    cast<LoadInst>(VL0)->getPointerOperand(),
4691                    /*VariableMask=*/false, CommonAlignment, CostKind) +
4692                VectorGEPCost - ScalarGEPCost;
4693            InstructionCost VecLdCost = 0;
4694            auto *SubVecTy = getWidenedType(ScalarTy, VF);
4695            for (auto [I, LS] : enumerate(States)) {
4696              auto *LI0 = cast<LoadInst>(VL[I * VF]);
4697              switch (LS) {
4698              case LoadsState::Vectorize: {
4699                auto [ScalarGEPCost, VectorGEPCost] =
4700                    getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4701                                LI0->getPointerOperand(), Instruction::Load,
4702                                CostKind, ScalarTy, SubVecTy);
4703                VecLdCost += TTI.getMemoryOpCost(
4704                                 Instruction::Load, SubVecTy, LI0->getAlign(),
4705                                 LI0->getPointerAddressSpace(), CostKind,
4706                                 TTI::OperandValueInfo()) +
4707                             VectorGEPCost - ScalarGEPCost;
4708                break;
4709              }
4710              case LoadsState::StridedVectorize: {
4711                auto [ScalarGEPCost, VectorGEPCost] =
4712                    getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4713                                LI0->getPointerOperand(), Instruction::Load,
4714                                CostKind, ScalarTy, SubVecTy);
4715                VecLdCost +=
4716                    TTI.getStridedMemoryOpCost(
4717                        Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4718                        /*VariableMask=*/false, CommonAlignment, CostKind) +
4719                    VectorGEPCost - ScalarGEPCost;
4720                break;
4721              }
4722              case LoadsState::ScatterVectorize: {
4723                auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4724                    TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4725                    LI0->getPointerOperand(), Instruction::GetElementPtr,
4726                    CostKind, ScalarTy, SubVecTy);
4727                VecLdCost +=
4728                    TTI.getGatherScatterOpCost(
4729                        Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4730                        /*VariableMask=*/false, CommonAlignment, CostKind) +
4731                    VectorGEPCost - ScalarGEPCost;
4732                break;
4733              }
4734              case LoadsState::Gather:
4735                llvm_unreachable(
4736                    "Expected only consecutive, strided or masked gather loads.");
4737              }
4738              SmallVector<int> ShuffleMask(VL.size());
4739              for (int Idx : seq<int>(0, VL.size()))
4740                ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4741              VecLdCost +=
4742                  TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4743                                     CostKind, I * VF, SubVecTy);
4744            }
4745            // If the masked gather cost is higher, it is better to vectorize
4746            // as a series of loads instead, so consider this node a gather
4747            // node. It will be better estimated later.
4748            if (MaskedGatherCost >= VecLdCost)
4749              return true;
4750          }
4751        }
4752        return false;
4753      };
4754      // TODO: need to improve analysis of the pointers; if not all of them are
4755      // GEPs or they have > 2 operands, we end up with a gather node, which just
4756      // increases the cost.
4757      Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4758      bool ProfitableGatherPointers =
4759          L && Sz > 2 &&
4760          static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4761            return L->isLoopInvariant(V);
4762          })) <= Sz / 2;
4763      if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4764            auto *GEP = dyn_cast<GetElementPtrInst>(P);
4765            return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4766                   (GEP && GEP->getNumOperands() == 2 &&
4767                    isa<Constant, Instruction>(GEP->getOperand(1)));
4768          })) {
4769        Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4770        if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4771            !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4772          // Check if potential masked gather can be represented as series
4773          // of loads + insertsubvectors.
4774          if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4775            // If the masked gather cost is higher, it is better to vectorize
4776            // as a series of loads instead, so consider this node a gather
4777            // node. It will be better estimated later.
4778            return LoadsState::Gather;
4779          }
4780          return LoadsState::ScatterVectorize;
4781        }
4782      }
4783    }
4784  
4785    return LoadsState::Gather;
4786  }
4787  
4788  static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789                                     const DataLayout &DL, ScalarEvolution &SE,
4790                                     SmallVectorImpl<unsigned> &SortedIndices) {
4791    assert(llvm::all_of(
4792               VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793           "Expected list of pointer operands.");
4794    // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. Each Ptr is
4795    // inserted into the vector of its base; the vectors are then sorted and the
4796    // sorted indices returned, keeping values from the same base adjacent.
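        // For example (illustrative, assuming A and B have distinct underlying
        // objects), VL == {A, A+1, B, A+2, B+1} with offsets in elements yields
        // SortedIndices == {0, 1, 3, 2, 4}.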
4797    MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798    Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4799  
4800    unsigned Cnt = 1;
4801    for (Value *Ptr : VL.drop_front()) {
4802      bool Found = any_of(Bases, [&](auto &Base) {
4803        std::optional<int> Diff =
4804            getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805                            /*StrictCheck=*/true);
4806        if (!Diff)
4807          return false;
4808  
4809        Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810        return true;
4811      });
4812  
4813      if (!Found) {
4814        // If we haven't found enough to usefully cluster, return early.
4815        if (Bases.size() > VL.size() / 2 - 1)
4816          return false;
4817  
4818        // Not found already - add a new Base
4819        Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4820      }
4821    }
4822  
4823    // For each of the bases sort the pointers by Offset and check if any of the
4824    // bases become consecutively allocated.
4825    bool AnyConsecutive = false;
4826    for (auto &Base : Bases) {
4827      auto &Vec = Base.second;
4828      if (Vec.size() > 1) {
4829        llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4830                                  const std::tuple<Value *, int, unsigned> &Y) {
4831          return std::get<1>(X) < std::get<1>(Y);
4832        });
4833        int InitialOffset = std::get<1>(Vec[0]);
4834        AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4835          return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836        });
4837      }
4838    }
4839  
4840    // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
4841    SortedIndices.clear();
4842    if (!AnyConsecutive)
4843      return false;
4844  
4845    for (auto &Base : Bases) {
4846      for (auto &T : Base.second)
4847        SortedIndices.push_back(std::get<2>(T));
4848    }
4849  
4850    assert(SortedIndices.size() == VL.size() &&
4851           "Expected SortedIndices to be the size of VL");
4852    return true;
4853  }
4854  
4855  std::optional<BoUpSLP::OrdersType>
4856  BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857    assert(TE.isGather() && "Expected gather node only.");
4858    Type *ScalarTy = TE.Scalars[0]->getType();
4859  
4860    SmallVector<Value *> Ptrs;
4861    Ptrs.reserve(TE.Scalars.size());
4862    for (Value *V : TE.Scalars) {
4863      auto *L = dyn_cast<LoadInst>(V);
4864      if (!L || !L->isSimple())
4865        return std::nullopt;
4866      Ptrs.push_back(L->getPointerOperand());
4867    }
4868  
4869    BoUpSLP::OrdersType Order;
4870    if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4871      return std::move(Order);
4872    return std::nullopt;
4873  }
4874  
4875  /// Check if two insertelement instructions are from the same buildvector.
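/// Illustrative IR sketch (hypothetical values): given
///   %b0 = insertelement <2 x float> poison, float %x, i32 0
///   %b1 = insertelement <2 x float> %b0, float %y, i32 1
/// the inserts %b0 and %b1 build the same vector, so walking GetBaseOperand
/// from %b1 reaches %b0 without reusing an element index.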
4876  static bool areTwoInsertFromSameBuildVector(
4877      InsertElementInst *VU, InsertElementInst *V,
4878      function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4879    // Instructions must be from the same basic block.
4880    if (VU->getParent() != V->getParent())
4881      return false;
4882    // Checks if 2 insertelements are from the same buildvector.
4883    if (VU->getType() != V->getType())
4884      return false;
4885    // Inserts with multiple uses are separate nodes.
4886    if (!VU->hasOneUse() && !V->hasOneUse())
4887      return false;
4888    auto *IE1 = VU;
4889    auto *IE2 = V;
4890    std::optional<unsigned> Idx1 = getElementIndex(IE1);
4891    std::optional<unsigned> Idx2 = getElementIndex(IE2);
4892    if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4893      return false;
4894    // Go through the vector operand of insertelement instructions trying to find
4895    // either VU as the original vector for IE2 or V as the original vector for
4896    // IE1.
4897    SmallBitVector ReusedIdx(
4898        cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4899    bool IsReusedIdx = false;
4900    do {
4901      if (IE2 == VU && !IE1)
4902        return VU->hasOneUse();
4903      if (IE1 == V && !IE2)
4904        return V->hasOneUse();
4905      if (IE1 && IE1 != V) {
4906        unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
4907        IsReusedIdx |= ReusedIdx.test(Idx1);
4908        ReusedIdx.set(Idx1);
4909        if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4910          IE1 = nullptr;
4911        else
4912          IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4913      }
4914      if (IE2 && IE2 != VU) {
4915        unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
4916        IsReusedIdx |= ReusedIdx.test(Idx2);
4917        ReusedIdx.set(Idx2);
4918        if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4919          IE2 = nullptr;
4920        else
4921          IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4922      }
4923    } while (!IsReusedIdx && (IE1 || IE2));
4924    return false;
4925  }
4926  
4927  std::optional<BoUpSLP::OrdersType>
4928  BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4929    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4930    if (TE.isNonPowOf2Vec())
4931      return std::nullopt;
4932  
4933    // No need to reorder if we need to shuffle reuses; the node still needs to be
4934    // shuffled.
4935    if (!TE.ReuseShuffleIndices.empty()) {
4936      if (isSplat(TE.Scalars))
4937        return std::nullopt;
4938      // Check if reuse shuffle indices can be improved by reordering.
4939      // For this, check that the reuse mask is "clustered", i.e. each scalar value
4940      // is used once in each submask of size <number_of_scalars>.
4941      // Example: 4 scalar values.
4942      // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4943      //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4944      //                           element 3 is used twice in the second submask.
4945      unsigned Sz = TE.Scalars.size();
4946      if (TE.isGather()) {
4947        if (std::optional<OrdersType> CurrentOrder =
4948                findReusedOrderedScalars(TE)) {
4949          SmallVector<int> Mask;
4950          fixupOrderingIndices(*CurrentOrder);
4951          inversePermutation(*CurrentOrder, Mask);
4952          ::addMask(Mask, TE.ReuseShuffleIndices);
4953          OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4954          unsigned Sz = TE.Scalars.size();
4955          for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4956            for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4957              if (Idx != PoisonMaskElem)
4958                Res[Idx + K * Sz] = I + K * Sz;
4959          }
4960          return std::move(Res);
4961        }
4962      }
4963      if (Sz == 2 && TE.getVectorFactor() == 4 &&
4964          TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
4965                                               2 * TE.getVectorFactor())) == 1)
4966        return std::nullopt;
4967      if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4968                                                       Sz)) {
4969        SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4970        if (TE.ReorderIndices.empty())
4971          std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4972        else
4973          inversePermutation(TE.ReorderIndices, ReorderMask);
4974        ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4975        unsigned VF = ReorderMask.size();
4976        OrdersType ResOrder(VF, VF);
4977        unsigned NumParts = divideCeil(VF, Sz);
4978        SmallBitVector UsedVals(NumParts);
4979        for (unsigned I = 0; I < VF; I += Sz) {
4980          int Val = PoisonMaskElem;
4981          unsigned UndefCnt = 0;
4982          unsigned Limit = std::min(Sz, VF - I);
4983          if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
4984                     [&](int Idx) {
4985                       if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4986                         Val = Idx;
4987                       if (Idx == PoisonMaskElem)
4988                         ++UndefCnt;
4989                       return Idx != PoisonMaskElem && Idx != Val;
4990                     }) ||
4991              Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4992              UndefCnt > Sz / 2)
4993            return std::nullopt;
4994          UsedVals.set(Val);
4995          for (unsigned K = 0; K < NumParts; ++K)
4996            ResOrder[Val + Sz * K] = I + K;
4997        }
4998        return std::move(ResOrder);
4999      }
5000      unsigned VF = TE.getVectorFactor();
5001      // Try build correct order for extractelement instructions.
5002      SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5003                                  TE.ReuseShuffleIndices.end());
5004      if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5005          all_of(TE.Scalars, [Sz](Value *V) {
5006            std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5007            return Idx && *Idx < Sz;
5008          })) {
5009        SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5010        if (TE.ReorderIndices.empty())
5011          std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5012        else
5013          inversePermutation(TE.ReorderIndices, ReorderMask);
5014        for (unsigned I = 0; I < VF; ++I) {
5015          int &Idx = ReusedMask[I];
5016          if (Idx == PoisonMaskElem)
5017            continue;
5018          Value *V = TE.Scalars[ReorderMask[Idx]];
5019          std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5020          Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5021        }
5022      }
5023      // Build an order of VF size; the reuse shuffles need to be reordered, and
5024      // they are always of VF size.
5025      OrdersType ResOrder(VF);
5026      std::iota(ResOrder.begin(), ResOrder.end(), 0);
5027      auto *It = ResOrder.begin();
5028      for (unsigned K = 0; K < VF; K += Sz) {
5029        OrdersType CurrentOrder(TE.ReorderIndices);
5030        SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5031        if (SubMask.front() == PoisonMaskElem)
5032          std::iota(SubMask.begin(), SubMask.end(), 0);
5033        reorderOrder(CurrentOrder, SubMask);
5034        transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5035        std::advance(It, Sz);
5036      }
5037      if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5038            return Data.index() == Data.value();
5039          }))
5040        return std::nullopt; // No need to reorder.
5041      return std::move(ResOrder);
5042    }
5043    if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5044        any_of(TE.UserTreeIndices,
5045               [](const EdgeInfo &EI) {
5046                 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5047               }) &&
5048        (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5049      return std::nullopt;
5050    if ((TE.State == TreeEntry::Vectorize ||
5051         TE.State == TreeEntry::StridedVectorize) &&
5052        (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5053         (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5054        !TE.isAltShuffle())
5055      return TE.ReorderIndices;
5056    if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5057      auto PHICompare = [&](unsigned I1, unsigned I2) {
5058        Value *V1 = TE.Scalars[I1];
5059        Value *V2 = TE.Scalars[I2];
5060        if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5061          return false;
5062        if (V1->getNumUses() < V2->getNumUses())
5063          return true;
5064        if (V1->getNumUses() > V2->getNumUses())
5065          return false;
5066        auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5067        auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5068        if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5069          if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5070            if (!areTwoInsertFromSameBuildVector(
5071                    IE1, IE2,
5072                    [](InsertElementInst *II) { return II->getOperand(0); }))
5073              return I1 < I2;
5074            return getElementIndex(IE1) < getElementIndex(IE2);
5075          }
5076        if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5077          if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5078            if (EE1->getOperand(0) != EE2->getOperand(0))
5079              return I1 < I2;
5080            return getElementIndex(EE1) < getElementIndex(EE2);
5081          }
5082        return I1 < I2;
5083      };
5084      auto IsIdentityOrder = [](const OrdersType &Order) {
5085        for (unsigned Idx : seq<unsigned>(0, Order.size()))
5086          if (Idx != Order[Idx])
5087            return false;
5088        return true;
5089      };
5090      if (!TE.ReorderIndices.empty())
5091        return TE.ReorderIndices;
5092      DenseMap<unsigned, unsigned> PhiToId;
5093      SmallVector<unsigned> Phis(TE.Scalars.size());
5094      std::iota(Phis.begin(), Phis.end(), 0);
5095      OrdersType ResOrder(TE.Scalars.size());
5096      for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5097        PhiToId[Id] = Id;
5098      stable_sort(Phis, PHICompare);
5099      for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5100        ResOrder[Id] = PhiToId[Phis[Id]];
5101      if (IsIdentityOrder(ResOrder))
5102        return std::nullopt; // No need to reorder.
5103      return std::move(ResOrder);
5104    }
5105    if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5106      // TODO: add analysis of other gather nodes with extractelement
5107      // instructions and other values/instructions, not only undefs.
5108      if ((TE.getOpcode() == Instruction::ExtractElement ||
5109           (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5110            any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5111          all_of(TE.Scalars, [](Value *V) {
5112            auto *EE = dyn_cast<ExtractElementInst>(V);
5113            return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5114          })) {
5115        // Check that gather of extractelements can be represented as
5116        // just a shuffle of a single vector.
5117        OrdersType CurrentOrder;
5118        bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5119                                     /*ResizeAllowed=*/true);
5120        if (Reuse || !CurrentOrder.empty())
5121          return std::move(CurrentOrder);
5122      }
5123      // If the gather node is <undef, v, .., poison> and
5124      // insertelement poison, v, 0 [+ permute]
5125      // is cheaper than
5126      // insertelement poison, v, n - try to reorder.
5127      // If rotating the whole graph, exclude the permute cost, the whole graph
5128      // might be transformed.
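    // Illustrative sketch (hypothetical lanes): for Scalars =
    // <undef, undef, v, undef> the non-constant value sits at Idx = 2, so the
    // candidate order maps lane 2 to lane 0 and the costs compared below are
    // "insert at 0 (+ a single-source permute unless TopToBottom)" versus
    // "insert at 2".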
5129      int Sz = TE.Scalars.size();
5130      if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5131          count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5132        const auto *It =
5133            find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5134        if (It == TE.Scalars.begin())
5135          return OrdersType();
5136        auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5137        if (It != TE.Scalars.end()) {
5138          OrdersType Order(Sz, Sz);
5139          unsigned Idx = std::distance(TE.Scalars.begin(), It);
5140          Order[Idx] = 0;
5141          fixupOrderingIndices(Order);
5142          SmallVector<int> Mask;
5143          inversePermutation(Order, Mask);
5144          InstructionCost PermuteCost =
5145              TopToBottom
5146                  ? 0
5147                  : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
5148          InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5149              Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5150              PoisonValue::get(Ty), *It);
5151          InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5152              Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5153              PoisonValue::get(Ty), *It);
5154          if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5155            OrdersType Order(Sz, Sz);
5156            Order[Idx] = 0;
5157            return std::move(Order);
5158          }
5159        }
5160      }
5161      if (isSplat(TE.Scalars))
5162        return std::nullopt;
5163      if (TE.Scalars.size() >= 4)
5164        if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5165          return Order;
5166      if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5167        return CurrentOrder;
5168    }
5169    return std::nullopt;
5170  }
5171  
5172  /// Checks if the given mask is a "clustered" mask with the same clusters of
5173  /// size \p Sz, which are not identity submasks.
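/// Illustrative sketch (hypothetical masks): with Sz = 4, the mask
/// {1, 0, 3, 2, 1, 0, 3, 2} is a repeated non-identity cluster, while
/// {0, 1, 2, 3, 0, 1, 2, 3} is rejected because its first cluster is the
/// identity, and {1, 0, 3, 2, 3, 2, 1, 0} is rejected because its clusters
/// differ.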
5174  static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175                                                 unsigned Sz) {
5176    ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5177    if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5178      return false;
5179    for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180      ArrayRef<int> Cluster = Mask.slice(I, Sz);
5181      if (Cluster != FirstCluster)
5182        return false;
5183    }
5184    return true;
5185  }
5186  
5187  void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188    // Reorder reuses mask.
5189    reorderReuses(TE.ReuseShuffleIndices, Mask);
5190    const unsigned Sz = TE.Scalars.size();
5191    // For vectorized nodes and non-clustered reuses, no need to do anything else.
5192    if (!TE.isGather() ||
5193        !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5194                                                     Sz) ||
5195        !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5196      return;
5197    SmallVector<int> NewMask;
5198    inversePermutation(TE.ReorderIndices, NewMask);
5199    addMask(NewMask, TE.ReuseShuffleIndices);
5200    // Clear reorder since it is going to be applied to the new mask.
5201    TE.ReorderIndices.clear();
5202    // Try to improve gathered nodes with clustered reuses, if possible.
5203    ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5204    SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205    inversePermutation(NewOrder, NewMask);
5206    reorderScalars(TE.Scalars, NewMask);
5207    // Fill the reuses mask with the identity submasks.
5208    for (auto *It = TE.ReuseShuffleIndices.begin(),
5209              *End = TE.ReuseShuffleIndices.end();
5210         It != End; std::advance(It, Sz))
5211      std::iota(It, std::next(It, Sz), 0);
5212  }
5213  
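// combineOrders() fills the unset slots (== Sz) of Order from SecondaryOrder
// (or with the identity when SecondaryOrder is empty), skipping values Order
// already uses. Illustrative sketch (hypothetical orders): with Sz = 4,
// Order = {2, 4, 4, 1} and SecondaryOrder = {0, 3, 2, 1}, slot 1 becomes 3,
// while slot 2 stays unset because the value 2 is already taken by slot 0,
// giving {2, 3, 4, 1}.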
5214  static void combineOrders(MutableArrayRef<unsigned> Order,
5215                            ArrayRef<unsigned> SecondaryOrder) {
5216    assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217           "Expected same size of orders");
5218    unsigned Sz = Order.size();
5219    SmallBitVector UsedIndices(Sz);
5220    for (unsigned Idx : seq<unsigned>(0, Sz)) {
5221      if (Order[Idx] != Sz)
5222        UsedIndices.set(Order[Idx]);
5223    }
5224    if (SecondaryOrder.empty()) {
5225      for (unsigned Idx : seq<unsigned>(0, Sz))
5226        if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227          Order[Idx] = Idx;
5228    } else {
5229      for (unsigned Idx : seq<unsigned>(0, Sz))
5230        if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231            !UsedIndices.test(SecondaryOrder[Idx]))
5232          Order[Idx] = SecondaryOrder[Idx];
5233    }
5234  }
5235  
5236  void BoUpSLP::reorderTopToBottom() {
5237    // Maps VF to the graph nodes.
5238    DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239    // ExtractElement gather nodes which can be vectorized and need to handle
5240    // their ordering.
5241    DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242  
5243    // Phi nodes can have a preferred ordering based on their result users.
5244    DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245  
5246    // AltShuffles can also have a preferred ordering that leads to fewer
5247    // instructions, e.g., the addsub instruction in x86.
5248    DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249  
5250    // Maps a TreeEntry to the reorder indices of external users.
5251    DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252        ExternalUserReorderMap;
5253    // Find all reorderable nodes with the given VF.
5254    // Currently these are vectorized stores, loads, extracts + some gathering of
5255    // extracts.
5256    for_each(VectorizableTree, [&, &TTIRef = *TTI](
5257                                   const std::unique_ptr<TreeEntry> &TE) {
5258      // Look for external users that will probably be vectorized.
5259      SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260          findExternalStoreUsersReorderIndices(TE.get());
5261      if (!ExternalUserReorderIndices.empty()) {
5262        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5263        ExternalUserReorderMap.try_emplace(TE.get(),
5264                                           std::move(ExternalUserReorderIndices));
5265      }
5266  
5267      // Patterns like [fadd,fsub] can be combined into a single instruction in
5268      // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269      // to take into account their order when looking for the most used order.
5270      if (TE->isAltShuffle()) {
5271        VectorType *VecTy =
5272            getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5273        unsigned Opcode0 = TE->getOpcode();
5274        unsigned Opcode1 = TE->getAltOpcode();
5275        SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5276        // If this pattern is supported by the target then we consider the order.
5277        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278          VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5279          AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5280        }
5281        // TODO: Check the reverse order too.
5282      }
5283  
5284      if (std::optional<OrdersType> CurrentOrder =
5285              getReorderingData(*TE, /*TopToBottom=*/true)) {
5286        // Do not include ordering for nodes used in the alt opcode vectorization,
5287        // better to reorder them during the bottom-to-top stage. If we follow the
5288        // order here, it causes reordering of the whole graph, though actually it
5289        // is profitable just to reorder the subgraph that starts from the alternate
5290        // opcode vectorization node. Such nodes already end up with a shuffle
5291        // instruction and it is enough to change this shuffle rather than
5292        // rotate the scalars for the whole graph.
5293        unsigned Cnt = 0;
5294        const TreeEntry *UserTE = TE.get();
5295        while (UserTE && Cnt < RecursionMaxDepth) {
5296          if (UserTE->UserTreeIndices.size() != 1)
5297            break;
5298          if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5299                return EI.UserTE->State == TreeEntry::Vectorize &&
5300                       EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301              }))
5302            return;
5303          UserTE = UserTE->UserTreeIndices.back().UserTE;
5304          ++Cnt;
5305        }
5306        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5307        if (!(TE->State == TreeEntry::Vectorize ||
5308              TE->State == TreeEntry::StridedVectorize) ||
5309            !TE->ReuseShuffleIndices.empty())
5310          GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5311        if (TE->State == TreeEntry::Vectorize &&
5312            TE->getOpcode() == Instruction::PHI)
5313          PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5314      }
5315    });
5316  
5317    // Reorder the graph nodes according to their vectorization factor.
5318    for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319         VF /= 2) {
5320      auto It = VFToOrderedEntries.find(VF);
5321      if (It == VFToOrderedEntries.end())
5322        continue;
5323      // Try to find the most profitable order. We are just looking for the most
5324      // used order and reorder the scalar elements in the nodes according to this
5325      // most used order.
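    // Illustrative sketch (hypothetical counts): if three nodes of this VF
    // request the order {1, 0, 3, 2} and only one requests the identity, the
    // non-identity order wins (3 uses vs. 1) and becomes BestOrder below; on a
    // tie the identity order is preferred.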
5326      ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327      // All operands are reordered and used only in this node - propagate the
5328      // most used order to the user node.
5329      MapVector<OrdersType, unsigned,
5330                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331          OrdersUses;
5332      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5333      for (const TreeEntry *OpTE : OrderedEntries) {
5334        // No need to reorder these nodes; we still need to extend and use a
5335        // shuffle, just merge the reordering shuffle and the reuse shuffle.
5336        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5337          continue;
5338        // Count number of orders uses.
5339        const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340                             &PhisToOrders]() -> const OrdersType & {
5341          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342            auto It = GathersToOrders.find(OpTE);
5343            if (It != GathersToOrders.end())
5344              return It->second;
5345          }
5346          if (OpTE->isAltShuffle()) {
5347            auto It = AltShufflesToOrders.find(OpTE);
5348            if (It != AltShufflesToOrders.end())
5349              return It->second;
5350          }
5351          if (OpTE->State == TreeEntry::Vectorize &&
5352              OpTE->getOpcode() == Instruction::PHI) {
5353            auto It = PhisToOrders.find(OpTE);
5354            if (It != PhisToOrders.end())
5355              return It->second;
5356          }
5357          return OpTE->ReorderIndices;
5358        }();
5359        // First consider the order of the external scalar users.
5360        auto It = ExternalUserReorderMap.find(OpTE);
5361        if (It != ExternalUserReorderMap.end()) {
5362          const auto &ExternalUserReorderIndices = It->second;
5363          // If the OpTE vector factor != number of scalars, use the natural order;
5364          // it is an attempt to reorder a node with reused scalars but with
5365          // external uses.
5366          if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367            OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5368                ExternalUserReorderIndices.size();
5369          } else {
5370            for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371              ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5372          }
5373          // No other useful reorder data in this entry.
5374          if (Order.empty())
5375            continue;
5376        }
5377        // Stores actually store the mask, not the order; we need to invert it.
5378        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380          SmallVector<int> Mask;
5381          inversePermutation(Order, Mask);
5382          unsigned E = Order.size();
5383          OrdersType CurrentOrder(E, E);
5384          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5385            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386          });
5387          fixupOrderingIndices(CurrentOrder);
5388          ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5389        } else {
5390          ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5391        }
5392      }
5393      if (OrdersUses.empty())
5394        continue;
5395      auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396        const unsigned Sz = Order.size();
5397        for (unsigned Idx : seq<unsigned>(0, Sz))
5398          if (Idx != Order[Idx] && Order[Idx] != Sz)
5399            return false;
5400        return true;
5401      };
5402      // Choose the most used order.
5403      unsigned IdentityCnt = 0;
5404      unsigned FilledIdentityCnt = 0;
5405      OrdersType IdentityOrder(VF, VF);
5406      for (auto &Pair : OrdersUses) {
5407        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408          if (!Pair.first.empty())
5409            FilledIdentityCnt += Pair.second;
5410          IdentityCnt += Pair.second;
5411          combineOrders(IdentityOrder, Pair.first);
5412        }
5413      }
5414      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415      unsigned Cnt = IdentityCnt;
5416      for (auto &Pair : OrdersUses) {
5417        // Prefer the identity order. But if a filled identity order (non-empty) is
5418        // found with the same number of uses as the new candidate order, we can
5419        // choose the candidate order.
5420        if (Cnt < Pair.second ||
5421            (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422             Cnt == Pair.second && !BestOrder.empty() &&
5423             IsIdentityOrder(BestOrder))) {
5424          combineOrders(Pair.first, BestOrder);
5425          BestOrder = Pair.first;
5426          Cnt = Pair.second;
5427        } else {
5428          combineOrders(BestOrder, Pair.first);
5429        }
5430      }
5431      // Set order of the user node.
5432      if (IsIdentityOrder(BestOrder))
5433        continue;
5434      fixupOrderingIndices(BestOrder);
5435      SmallVector<int> Mask;
5436      inversePermutation(BestOrder, Mask);
5437      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438      unsigned E = BestOrder.size();
5439      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5440        return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441      });
5442      // Do an actual reordering, if profitable.
5443      for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444        // Just do the reordering for the nodes with the given VF.
5445        if (TE->Scalars.size() != VF) {
5446          if (TE->ReuseShuffleIndices.size() == VF) {
5447            // Need to reorder the reuses masks of the operands with smaller VF to
5448            // be able to find the match between the graph nodes and scalar
5449            // operands of the given node during vectorization/cost estimation.
5450            assert(all_of(TE->UserTreeIndices,
5451                          [VF, &TE](const EdgeInfo &EI) {
5452                            return EI.UserTE->Scalars.size() == VF ||
5453                                   EI.UserTE->Scalars.size() ==
5454                                       TE->Scalars.size();
5455                          }) &&
5456                   "All users must be of VF size.");
5457            // Update ordering of the operands with the smaller VF than the given
5458            // one.
5459            reorderNodeWithReuses(*TE, Mask);
5460          }
5461          continue;
5462        }
5463        if ((TE->State == TreeEntry::Vectorize ||
5464             TE->State == TreeEntry::StridedVectorize) &&
5465            isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5466                InsertElementInst>(TE->getMainOp()) &&
5467            !TE->isAltShuffle()) {
5468          // Build correct orders for extract{element,value}, loads and
5469          // stores.
5470          reorderOrder(TE->ReorderIndices, Mask);
5471          if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5472            TE->reorderOperands(Mask);
5473        } else {
5474          // Reorder the node and its operands.
5475          TE->reorderOperands(Mask);
5476          assert(TE->ReorderIndices.empty() &&
5477                 "Expected empty reorder sequence.");
5478          reorderScalars(TE->Scalars, Mask);
5479        }
5480        if (!TE->ReuseShuffleIndices.empty()) {
5481          // Apply reversed order to keep the original ordering of the reused
5482          // elements to avoid extra reorder indices shuffling.
5483          OrdersType CurrentOrder;
5484          reorderOrder(CurrentOrder, MaskOrder);
5485          SmallVector<int> NewReuses;
5486          inversePermutation(CurrentOrder, NewReuses);
5487          addMask(NewReuses, TE->ReuseShuffleIndices);
5488          TE->ReuseShuffleIndices.swap(NewReuses);
5489        }
5490      }
5491    }
5492  }
5493  
5494  bool BoUpSLP::canReorderOperands(
5495      TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5496      ArrayRef<TreeEntry *> ReorderableGathers,
5497      SmallVectorImpl<TreeEntry *> &GatherOps) {
5498    // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5499    if (UserTE->isNonPowOf2Vec())
5500      return false;
5501  
5502    for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5503      if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5504            return OpData.first == I &&
5505                   (OpData.second->State == TreeEntry::Vectorize ||
5506                    OpData.second->State == TreeEntry::StridedVectorize);
5507          }))
5508        continue;
5509      if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5510        // Do not reorder if operand node is used by many user nodes.
5511        if (any_of(TE->UserTreeIndices,
5512                   [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5513          return false;
5514        // Add the node to the list of the ordered nodes with the identity
5515        // order.
5516        Edges.emplace_back(I, TE);
5517        // Add ScatterVectorize nodes to the list of operands, where just
5518        // reordering of the scalars is required. Similar to the gathers, so
5519        // simply add to the list of gathered ops.
5520        // If there are reused scalars, process this node as a regular vectorize
5521        // node, just reorder reuses mask.
5522        if (TE->State != TreeEntry::Vectorize &&
5523            TE->State != TreeEntry::StridedVectorize &&
5524            TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5525          GatherOps.push_back(TE);
5526        continue;
5527      }
5528      TreeEntry *Gather = nullptr;
5529      if (count_if(ReorderableGathers,
5530                   [&Gather, UserTE, I](TreeEntry *TE) {
5531                     assert(TE->State != TreeEntry::Vectorize &&
5532                            TE->State != TreeEntry::StridedVectorize &&
5533                            "Only non-vectorized nodes are expected.");
5534                     if (any_of(TE->UserTreeIndices,
5535                                [UserTE, I](const EdgeInfo &EI) {
5536                                  return EI.UserTE == UserTE && EI.EdgeIdx == I;
5537                                })) {
5538                       assert(TE->isSame(UserTE->getOperand(I)) &&
5539                              "Operand entry does not match operands.");
5540                       Gather = TE;
5541                       return true;
5542                     }
5543                     return false;
5544                   }) > 1 &&
5545          !allConstant(UserTE->getOperand(I)))
5546        return false;
5547      if (Gather)
5548        GatherOps.push_back(Gather);
5549    }
5550    return true;
5551  }
5552  
5553  void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5554    SetVector<TreeEntry *> OrderedEntries;
5555    DenseSet<const TreeEntry *> GathersToOrders;
5556    // Find all reorderable leaf nodes with the given VF.
5557    // Currently these are vectorized loads, extracts without alternate operands +
5558    // some gathering of extracts.
5559    SmallVector<TreeEntry *> NonVectorized;
5560    for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5561      if (TE->State != TreeEntry::Vectorize &&
5562          TE->State != TreeEntry::StridedVectorize)
5563        NonVectorized.push_back(TE.get());
5564      if (std::optional<OrdersType> CurrentOrder =
5565              getReorderingData(*TE, /*TopToBottom=*/false)) {
5566        OrderedEntries.insert(TE.get());
5567        if (!(TE->State == TreeEntry::Vectorize ||
5568              TE->State == TreeEntry::StridedVectorize) ||
5569            !TE->ReuseShuffleIndices.empty())
5570          GathersToOrders.insert(TE.get());
5571      }
5572    }
5573  
5574    // 1. Propagate order to the graph nodes, which use only reordered nodes.
5575    // I.e., if the node has operands that are reordered, try to put at least one
5576    // operand in the natural order, reorder the others and reorder the user node
5577    // itself.
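  // Illustrative sketch (hypothetical graph): if a vectorized user has two
  // operand nodes that both carry the order {1, 0, 3, 2}, that order is counted
  // as the most used one, the operands end up in the natural order and the
  // single reorder is recorded on the user, so one shuffle replaces two.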
5578    SmallPtrSet<const TreeEntry *, 4> Visited;
5579    while (!OrderedEntries.empty()) {
5580      // 1. Filter out only reordered nodes.
5581      // 2. If the entry has multiple uses - skip it and jump to the next node.
5582      DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5583      SmallVector<TreeEntry *> Filtered;
5584      for (TreeEntry *TE : OrderedEntries) {
5585        if (!(TE->State == TreeEntry::Vectorize ||
5586              TE->State == TreeEntry::StridedVectorize ||
5587              (TE->isGather() && GathersToOrders.contains(TE))) ||
5588            TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5589            !all_of(drop_begin(TE->UserTreeIndices),
5590                    [TE](const EdgeInfo &EI) {
5591                      return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5592                    }) ||
5593            !Visited.insert(TE).second) {
5594          Filtered.push_back(TE);
5595          continue;
5596        }
5597        // Build a map between user nodes and their operand order to speed up the
5598        // search. The graph currently does not provide this dependency directly.
5599        for (EdgeInfo &EI : TE->UserTreeIndices) {
5600          TreeEntry *UserTE = EI.UserTE;
5601          auto It = Users.find(UserTE);
5602          if (It == Users.end())
5603            It = Users.insert({UserTE, {}}).first;
5604          It->second.emplace_back(EI.EdgeIdx, TE);
5605        }
5606      }
5607      // Erase filtered entries.
5608      for (TreeEntry *TE : Filtered)
5609        OrderedEntries.remove(TE);
5610      SmallVector<
5611          std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5612          UsersVec(Users.begin(), Users.end());
5613      sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5614        return Data1.first->Idx > Data2.first->Idx;
5615      });
5616      for (auto &Data : UsersVec) {
5617        // Check that operands are used only in the User node.
5618        SmallVector<TreeEntry *> GatherOps;
5619        if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5620                                GatherOps)) {
5621          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5622            OrderedEntries.remove(Op.second);
5623          continue;
5624        }
5625        // All operands are reordered and used only in this node - propagate the
5626        // most used order to the user node.
5627        MapVector<OrdersType, unsigned,
5628                  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5629            OrdersUses;
5630        // Do the analysis for each tree entry only once, otherwise the order of
5631        // the same node may be considered several times, though it might not be
5632        // profitable.
5633        SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5634        SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5635        for (const auto &Op : Data.second) {
5636          TreeEntry *OpTE = Op.second;
5637          if (!VisitedOps.insert(OpTE).second)
5638            continue;
5639          if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5640            continue;
5641          const auto Order = [&]() -> const OrdersType {
5642            if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5643              return getReorderingData(*OpTE, /*TopToBottom=*/false)
5644                  .value_or(OrdersType(1));
5645            return OpTE->ReorderIndices;
5646          }();
5647          // The order is partially ordered, skip it in favor of fully non-ordered
5648          // orders.
5649          if (Order.size() == 1)
5650            continue;
5651          unsigned NumOps = count_if(
5652              Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5653                return P.second == OpTE;
5654              });
5655          // Stores actually store the mask, not the order; we need to invert it.
5656          if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5657              OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5658            SmallVector<int> Mask;
5659            inversePermutation(Order, Mask);
5660            unsigned E = Order.size();
5661            OrdersType CurrentOrder(E, E);
5662            transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5663              return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5664            });
5665            fixupOrderingIndices(CurrentOrder);
5666            OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5667                NumOps;
5668          } else {
5669            OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5670          }
5671          auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5672          const auto AllowsReordering = [&](const TreeEntry *TE) {
5673            // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5674            if (TE->isNonPowOf2Vec())
5675              return false;
5676            if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5677                (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5678                (IgnoreReorder && TE->Idx == 0))
5679              return true;
5680            if (TE->isGather()) {
5681              if (GathersToOrders.contains(TE))
5682                return !getReorderingData(*TE, /*TopToBottom=*/false)
5683                            .value_or(OrdersType(1))
5684                            .empty();
5685              return true;
5686            }
5687            return false;
5688          };
5689          for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5690            TreeEntry *UserTE = EI.UserTE;
5691            if (!VisitedUsers.insert(UserTE).second)
5692              continue;
5693            // May reorder user node if it requires reordering, has reused
5694            // scalars, is an alternate op vectorize node or its op nodes require
5695            // reordering.
5696            if (AllowsReordering(UserTE))
5697              continue;
5698            // Check if users allow reordering.
5699            // Currently look up just 1 level of operands to avoid increase of
5700            // the compile time.
5701            // It is profitable to reorder if definitely more operands allow
5702            // reordering than require the natural order.
5703            ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5704            if (static_cast<unsigned>(count_if(
5705                    Ops, [UserTE, &AllowsReordering](
5706                             const std::pair<unsigned, TreeEntry *> &Op) {
5707                      return AllowsReordering(Op.second) &&
5708                             all_of(Op.second->UserTreeIndices,
5709                                    [UserTE](const EdgeInfo &EI) {
5710                                      return EI.UserTE == UserTE;
5711                                    });
5712                    })) <= Ops.size() / 2)
5713              ++Res.first->second;
5714          }
5715        }
5716        if (OrdersUses.empty()) {
5717          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5718            OrderedEntries.remove(Op.second);
5719          continue;
5720        }
5721        auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5722          const unsigned Sz = Order.size();
5723          for (unsigned Idx : seq<unsigned>(0, Sz))
5724            if (Idx != Order[Idx] && Order[Idx] != Sz)
5725              return false;
5726          return true;
5727        };
5728        // Choose the most used order.
5729        unsigned IdentityCnt = 0;
5730        unsigned VF = Data.second.front().second->getVectorFactor();
5731        OrdersType IdentityOrder(VF, VF);
5732        for (auto &Pair : OrdersUses) {
5733          if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5734            IdentityCnt += Pair.second;
5735            combineOrders(IdentityOrder, Pair.first);
5736          }
5737        }
5738        MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5739        unsigned Cnt = IdentityCnt;
5740        for (auto &Pair : OrdersUses) {
5741          // Prefer the identity order. But if a filled identity order (non-empty)
5742          // is found with the same number of uses as the new candidate order, we
5743          // can choose the candidate order.
5744          if (Cnt < Pair.second) {
5745            combineOrders(Pair.first, BestOrder);
5746            BestOrder = Pair.first;
5747            Cnt = Pair.second;
5748          } else {
5749            combineOrders(BestOrder, Pair.first);
5750          }
5751        }
5752        // Set order of the user node.
5753        if (IsIdentityOrder(BestOrder)) {
5754          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5755            OrderedEntries.remove(Op.second);
5756          continue;
5757        }
5758        fixupOrderingIndices(BestOrder);
5759        // Erase operands from OrderedEntries list and adjust their orders.
5760        VisitedOps.clear();
5761        SmallVector<int> Mask;
5762        inversePermutation(BestOrder, Mask);
5763        SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5764        unsigned E = BestOrder.size();
5765        transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5766          return I < E ? static_cast<int>(I) : PoisonMaskElem;
5767        });
5768        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5769          TreeEntry *TE = Op.second;
5770          OrderedEntries.remove(TE);
5771          if (!VisitedOps.insert(TE).second)
5772            continue;
5773          if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5774            reorderNodeWithReuses(*TE, Mask);
5775            continue;
5776          }
5777          // Gathers are processed separately.
5778          if (TE->State != TreeEntry::Vectorize &&
5779              TE->State != TreeEntry::StridedVectorize &&
5780              (TE->State != TreeEntry::ScatterVectorize ||
5781               TE->ReorderIndices.empty()))
5782            continue;
5783          assert((BestOrder.size() == TE->ReorderIndices.size() ||
5784                  TE->ReorderIndices.empty()) &&
5785                 "Non-matching sizes of user/operand entries.");
5786          reorderOrder(TE->ReorderIndices, Mask);
5787          if (IgnoreReorder && TE == VectorizableTree.front().get())
5788            IgnoreReorder = false;
5789        }
5790        // For gathers just need to reorder its scalars.
5791        for (TreeEntry *Gather : GatherOps) {
5792          assert(Gather->ReorderIndices.empty() &&
5793                 "Unexpected reordering of gathers.");
5794          if (!Gather->ReuseShuffleIndices.empty()) {
5795            // Just reorder reuses indices.
5796            reorderReuses(Gather->ReuseShuffleIndices, Mask);
5797            continue;
5798          }
5799          reorderScalars(Gather->Scalars, Mask);
5800          OrderedEntries.remove(Gather);
5801        }
5802        // Reorder operands of the user node and set the ordering for the user
5803        // node itself.
5804        if (Data.first->State != TreeEntry::Vectorize ||
5805            !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5806                Data.first->getMainOp()) ||
5807            Data.first->isAltShuffle())
5808          Data.first->reorderOperands(Mask);
5809        if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5810            Data.first->isAltShuffle() ||
5811            Data.first->State == TreeEntry::StridedVectorize) {
5812          reorderScalars(Data.first->Scalars, Mask);
5813          reorderOrder(Data.first->ReorderIndices, MaskOrder,
5814                       /*BottomOrder=*/true);
5815          if (Data.first->ReuseShuffleIndices.empty() &&
5816              !Data.first->ReorderIndices.empty() &&
5817              !Data.first->isAltShuffle()) {
5818            // Insert user node to the list to try to sink reordering deeper in
5819            // the graph.
5820            OrderedEntries.insert(Data.first);
5821          }
5822        } else {
5823          reorderOrder(Data.first->ReorderIndices, Mask);
5824        }
5825      }
5826    }
5827    // If the reordering is unnecessary, just remove the reorder.
5828    if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5829        VectorizableTree.front()->ReuseShuffleIndices.empty())
5830      VectorizableTree.front()->ReorderIndices.clear();
5831  }
5832  
5833  void BoUpSLP::buildExternalUses(
5834      const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5835    DenseMap<Value *, unsigned> ScalarToExtUses;
5836    // Collect the values that we need to extract from the tree.
5837    for (auto &TEPtr : VectorizableTree) {
5838      TreeEntry *Entry = TEPtr.get();
5839  
5840      // No need to handle users of gathered values.
5841      if (Entry->isGather())
5842        continue;
5843  
5844      // For each lane:
5845      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5846        Value *Scalar = Entry->Scalars[Lane];
5847        if (!isa<Instruction>(Scalar))
5848          continue;
5849        // All uses already replaced? No need to do it again.
5850        auto It = ScalarToExtUses.find(Scalar);
5851        if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5852          continue;
5853  
5854        // Check if the scalar is externally used as an extra arg.
5855        const auto *ExtI = ExternallyUsedValues.find(Scalar);
5856        if (ExtI != ExternallyUsedValues.end()) {
5857          int FoundLane = Entry->findLaneForValue(Scalar);
5858          LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5859                            << FoundLane << " from " << *Scalar << ".\n");
5860          ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5861          ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5862          continue;
5863        }
5864        for (User *U : Scalar->users()) {
5865          LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5866  
5867          Instruction *UserInst = dyn_cast<Instruction>(U);
5868          if (!UserInst || isDeleted(UserInst))
5869            continue;
5870  
5871          // Ignore users in the user ignore list.
5872          if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5873            continue;
5874  
5875          // Skip in-tree scalars that become vectors
5876          if (TreeEntry *UseEntry = getTreeEntry(U)) {
5877            // Some in-tree scalars will remain as scalar in vectorized
5878            // instructions. If that is the case, the one in FoundLane will
5879            // be used.
5880            if (UseEntry->State == TreeEntry::ScatterVectorize ||
5881                !doesInTreeUserNeedToExtract(
5882                    Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5883              LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5884                                << ".\n");
5885              assert(!UseEntry->isGather() && "Bad state");
5886              continue;
5887            }
5888            U = nullptr;
5889            if (It != ScalarToExtUses.end()) {
5890              ExternalUses[It->second].User = nullptr;
5891              break;
5892            }
5893          }
5894  
5895          if (U && Scalar->hasNUsesOrMore(UsesLimit))
5896            U = nullptr;
5897          int FoundLane = Entry->findLaneForValue(Scalar);
5898          LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5899                            << " from lane " << FoundLane << " from " << *Scalar
5900                            << ".\n");
5901          It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5902          ExternalUses.emplace_back(Scalar, U, FoundLane);
5903          if (!U)
5904            break;
5905        }
5906      }
5907    }
5908  }
5909  
5910  DenseMap<Value *, SmallVector<StoreInst *>>
5911  BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912    DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913    for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5914      Value *V = TE->Scalars[Lane];
5915      // To save compilation time we don't visit if we have too many users.
5916      if (V->hasNUsesOrMore(UsesLimit))
5917        break;
5918  
5919      // Collect stores per pointer object.
5920      for (User *U : V->users()) {
5921        auto *SI = dyn_cast<StoreInst>(U);
5922        if (SI == nullptr || !SI->isSimple() ||
5923            !isValidElementType(SI->getValueOperand()->getType()))
5924          continue;
5925        // Skip the entry if it is already part of the tree.
5926        if (getTreeEntry(U))
5927          continue;
5928  
5929        Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5930        auto &StoresVec = PtrToStoresMap[Ptr];
5931        // For now just keep one store per pointer object per lane.
5932        // TODO: Extend this to support multiple stores per pointer per lane
5933        if (StoresVec.size() > Lane)
5934          continue;
5935        // Skip if in different BBs.
5936        if (!StoresVec.empty() &&
5937            SI->getParent() != StoresVec.back()->getParent())
5938          continue;
5939        // Make sure that the stores are of the same type.
5940        if (!StoresVec.empty() &&
5941            SI->getValueOperand()->getType() !=
5942                StoresVec.back()->getValueOperand()->getType())
5943          continue;
5944        StoresVec.push_back(SI);
5945      }
5946    }
5947    return PtrToStoresMap;
5948  }
5949  
5950  bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951                              OrdersType &ReorderIndices) const {
5952    // We check whether the stores in StoresVec can form a vector by sorting them
5953    // and checking whether they are consecutive.
5954  
5955    // To avoid calling getPointersDiff() while sorting we create a vector of
5956    // pairs {store, offset from first} and sort this instead.
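  // Illustrative sketch (hypothetical addresses): for stores to p+2, p+0, p+3
  // and p+1 the offsets relative to the first store are {0, -2, 1, -1}; sorted
  // they are {-2, -1, 0, 1}, which is consecutive, and ReorderIndices becomes
  // {2, 0, 3, 1}.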
5957    SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958    StoreInst *S0 = StoresVec[0];
5959    StoreOffsetVec[0] = {S0, 0};
5960    Type *S0Ty = S0->getValueOperand()->getType();
5961    Value *S0Ptr = S0->getPointerOperand();
5962    for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5963      StoreInst *SI = StoresVec[Idx];
5964      std::optional<int> Diff =
5965          getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5966                          SI->getPointerOperand(), *DL, *SE,
5967                          /*StrictCheck=*/true);
5968      // We failed to compare the pointers so just abandon this StoresVec.
5969      if (!Diff)
5970        return false;
5971      StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972    }
5973  
5974    // Sort the vector based on the pointers. We create a copy because we may
5975    // need the original later for calculating the reorder (shuffle) indices.
5976    stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5977                                   const std::pair<StoreInst *, int> &Pair2) {
5978      int Offset1 = Pair1.second;
5979      int Offset2 = Pair2.second;
5980      return Offset1 < Offset2;
5981    });
5982  
5983    // Check if the stores are consecutive by checking if their difference is 1.
5984    for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5985      if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986        return false;
5987  
5988    // Calculate the shuffle indices according to their offset against the sorted
5989    // StoreOffsetVec.
5990    ReorderIndices.reserve(StoresVec.size());
5991    for (StoreInst *SI : StoresVec) {
5992      unsigned Idx = find_if(StoreOffsetVec,
5993                             [SI](const std::pair<StoreInst *, int> &Pair) {
5994                               return Pair.first == SI;
5995                             }) -
5996                     StoreOffsetVec.begin();
5997      ReorderIndices.push_back(Idx);
5998    }
5999    // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000    // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001    // same convention here.
6002    auto IsIdentityOrder = [](const OrdersType &Order) {
6003      for (unsigned Idx : seq<unsigned>(0, Order.size()))
6004        if (Idx != Order[Idx])
6005          return false;
6006      return true;
6007    };
6008    if (IsIdentityOrder(ReorderIndices))
6009      ReorderIndices.clear();
6010  
6011    return true;
6012  }
6013  
6014  #ifndef NDEBUG
6015  LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016    for (unsigned Idx : Order)
6017      dbgs() << Idx << ", ";
6018    dbgs() << "\n";
6019  }
6020  #endif
6021  
6022  SmallVector<BoUpSLP::OrdersType, 1>
6023  BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024    unsigned NumLanes = TE->Scalars.size();
6025  
6026    DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027        collectUserStores(TE);
6028  
6029    // Holds the reorder indices for each candidate store vector that is a user of
6030    // the current TreeEntry.
6031    SmallVector<OrdersType, 1> ExternalReorderIndices;
6032  
6033    // Now inspect the stores collected per pointer and look for vectorization
6034    // candidates. For each candidate calculate the reorder index vector and push
6035    // it into `ExternalReorderIndices`.
6036    for (const auto &Pair : PtrToStoresMap) {
6037      auto &StoresVec = Pair.second;
6038      // Bail out unless the number of stores matches the number of lanes.
6039      if (StoresVec.size() != NumLanes)
6040        continue;
6041  
6042      // If the stores are not consecutive then abandon this StoresVec.
6043      OrdersType ReorderIndices;
6044      if (!canFormVector(StoresVec, ReorderIndices))
6045        continue;
6046  
6047      // We now know that the scalars in StoresVec can form a vector instruction,
6048      // so set the reorder indices.
6049      ExternalReorderIndices.push_back(ReorderIndices);
6050    }
6051    return ExternalReorderIndices;
6052  }
6053  
6054  void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055                          const SmallDenseSet<Value *> &UserIgnoreLst) {
6056    deleteTree();
6057    UserIgnoreList = &UserIgnoreLst;
6058    if (!allSameType(Roots))
6059      return;
6060    buildTree_rec(Roots, 0, EdgeInfo());
6061  }
6062  
6063  void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064    deleteTree();
6065    if (!allSameType(Roots))
6066      return;
6067    buildTree_rec(Roots, 0, EdgeInfo());
6068  }
6069  
6070  /// \return true if the specified list of values has only one instruction that
6071  /// requires scheduling, false otherwise.
6072  #ifndef NDEBUG
6073  static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074    Value *NeedsScheduling = nullptr;
6075    for (Value *V : VL) {
6076      if (doesNotNeedToBeScheduled(V))
6077        continue;
6078      if (!NeedsScheduling) {
6079        NeedsScheduling = V;
6080        continue;
6081      }
6082      return false;
6083    }
6084    return NeedsScheduling;
6085  }
6086  #endif
6087  
6088  /// Generates key/subkey pair for the given value to provide effective sorting
6089  /// of the values and better detection of vectorizable value sequences. The
6090  /// keys/subkeys can be used for better sorting of the values themselves (keys)
6091  /// and within value subgroups (subkeys).
6092  static std::pair<size_t, size_t> generateKeySubkey(
6093      Value *V, const TargetLibraryInfo *TLI,
6094      function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6095      bool AllowAlternate) {
6096    hash_code Key = hash_value(V->getValueID() + 2);
6097    hash_code SubKey = hash_value(0);
6098    // Sort the loads by the distance between the pointers.
6099    if (auto *LI = dyn_cast<LoadInst>(V)) {
6100      Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
6101      if (LI->isSimple())
6102        SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6103      else
6104        Key = SubKey = hash_value(LI);
6105    } else if (isVectorLikeInstWithConstOps(V)) {
6106      // Sort extracts by the vector operands.
6107      if (isa<ExtractElementInst, UndefValue>(V))
6108        Key = hash_value(Value::UndefValueVal + 1);
6109      if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
6110        if (!isUndefVector(EI->getVectorOperand()).all() &&
6111            !isa<UndefValue>(EI->getIndexOperand()))
6112          SubKey = hash_value(EI->getVectorOperand());
6113      }
6114    } else if (auto *I = dyn_cast<Instruction>(V)) {
6115      // Sort other instructions just by their opcodes, except for CmpInst.
6116      // For CmpInst, also sort by the predicate kind.
6117      if ((isa<BinaryOperator, CastInst>(I)) &&
6118          isValidForAlternation(I->getOpcode())) {
6119        if (AllowAlternate)
6120          Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6121        else
6122          Key = hash_combine(hash_value(I->getOpcode()), Key);
6123        SubKey = hash_combine(
6124            hash_value(I->getOpcode()), hash_value(I->getType()),
6125            hash_value(isa<BinaryOperator>(I)
6126                           ? I->getType()
6127                           : cast<CastInst>(I)->getOperand(0)->getType()));
6128        // For casts, look through the only operand to improve compile time.
6129        if (isa<CastInst>(I)) {
6130          std::pair<size_t, size_t> OpVals =
6131              generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6132                                /*AllowAlternate=*/true);
6133          Key = hash_combine(OpVals.first, Key);
6134          SubKey = hash_combine(OpVals.first, SubKey);
6135        }
6136      } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6137        CmpInst::Predicate Pred = CI->getPredicate();
6138        if (CI->isCommutative())
6139          Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6140        CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
6141        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6142                              hash_value(SwapPred),
6143                              hash_value(CI->getOperand(0)->getType()));
6144      } else if (auto *Call = dyn_cast<CallInst>(I)) {
6145        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
6146        if (isTriviallyVectorizable(ID)) {
6147          SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
6148        } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
6149          SubKey = hash_combine(hash_value(I->getOpcode()),
6150                                hash_value(Call->getCalledFunction()));
6151        } else {
6152          Key = hash_combine(hash_value(Call), Key);
6153          SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
6154        }
6155        for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6156          SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
6157                                hash_value(Op.Tag), SubKey);
6158      } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
6159        if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6160          SubKey = hash_value(Gep->getPointerOperand());
6161        else
6162          SubKey = hash_value(Gep);
6163      } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
6164                 !isa<ConstantInt>(I->getOperand(1))) {
6165        // Do not try to vectorize instructions with potentially high cost.
6166        SubKey = hash_value(I);
6167      } else {
6168        SubKey = hash_value(I->getOpcode());
6169      }
6170      Key = hash_combine(hash_value(I->getParent()), Key);
6171    }
6172    return std::make_pair(Key, SubKey);
6173  }
6174  
6175  /// Checks if the specified instruction \p I is an alternate operation for
6176  /// the given \p MainOp and \p AltOp instructions.
6177  static bool isAlternateInstruction(const Instruction *I,
6178                                     const Instruction *MainOp,
6179                                     const Instruction *AltOp,
6180                                     const TargetLibraryInfo &TLI);
6181  
6182  bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183                                         ArrayRef<Value *> VL) const {
6184    unsigned Opcode0 = S.getOpcode();
6185    unsigned Opcode1 = S.getAltOpcode();
6186    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187    // If this pattern is supported by the target then consider it profitable.
6188    if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6189                             Opcode0, Opcode1, OpcodeMask))
6190      return true;
6191    SmallVector<ValueList> Operands;
6192    for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6193      Operands.emplace_back();
6194      // Prepare the operand vector.
6195      for (Value *V : VL)
6196        Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6197    }
6198    if (Operands.size() == 2) {
6199      // Try to find the best operand candidates.
6200      for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
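            // Score three pairings for the adjacent lanes I and I + 1: keep both
            // operands as they are, swap the operands of lane I + 1, or swap the
            // operands of lane I. findBestRootPair() picks the best-scoring
            // pairing and the switch below applies the corresponding swap.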
6201        SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202        Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6203        Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6204        Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6205        std::optional<int> Res = findBestRootPair(Candidates);
6206        switch (Res.value_or(0)) {
6207        case 0:
6208          break;
6209        case 1:
6210          std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6211          break;
6212        case 2:
6213          std::swap(Operands[0][I], Operands[1][I]);
6214          break;
6215        default:
6216          llvm_unreachable("Unexpected index.");
6217        }
6218      }
6219    }
6220    DenseSet<unsigned> UniqueOpcodes;
6221    constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222    unsigned NonInstCnt = 0;
6223    // Estimate number of instructions, required for the vectorized node and for
6224    // the buildvector node.
6225    unsigned UndefCnt = 0;
6226    // Count the number of extra shuffles, required for vector nodes.
6227    unsigned ExtraShuffleInsts = 0;
6228    // Check that operands do not contain the same values and form either a
6229    // perfect diamond match or a shuffled match.
6230    if (Operands.size() == 2) {
6231      // Do not count same operands twice.
6232      if (Operands.front() == Operands.back()) {
6233        Operands.erase(Operands.begin());
6234      } else if (!allConstant(Operands.front()) &&
6235                 all_of(Operands.front(), [&](Value *V) {
6236                   return is_contained(Operands.back(), V);
6237                 })) {
6238        Operands.erase(Operands.begin());
6239        ++ExtraShuffleInsts;
6240      }
6241    }
6242    const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6243    // Vectorize the node if:
6244    // 1. At least one operand is constant or a splat.
6245    // 2. The operands have many loop invariants (while the instructions
6246    // themselves are not loop invariant).
6247    // 3. At least one unique operand is expected to be vectorized.
6248    return none_of(Operands,
6249                   [&](ArrayRef<Value *> Op) {
6250                     if (allConstant(Op) ||
6251                         (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6252                          getSameOpcode(Op, *TLI).MainOp))
6253                       return false;
6254                     DenseMap<Value *, unsigned> Uniques;
6255                     for (Value *V : Op) {
6256                       if (isa<Constant, ExtractElementInst>(V) ||
6257                           getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258                         if (isa<UndefValue>(V))
6259                           ++UndefCnt;
6260                         continue;
6261                       }
6262                       auto Res = Uniques.try_emplace(V, 0);
6263                       // Found first duplicate - need to add shuffle.
6264                       if (!Res.second && Res.first->second == 1)
6265                         ++ExtraShuffleInsts;
6266                       ++Res.first->getSecond();
6267                       if (auto *I = dyn_cast<Instruction>(V))
6268                         UniqueOpcodes.insert(I->getOpcode());
6269                       else if (Res.second)
6270                         ++NonInstCnt;
6271                     }
6272                     return none_of(Uniques, [&](const auto &P) {
6273                       return P.first->hasNUsesOrMore(P.second + 1) &&
6274                              none_of(P.first->users(), [&](User *U) {
6275                                return getTreeEntry(U) || Uniques.contains(U);
6276                              });
6277                     });
6278                   }) ||
6279         // Do not vectorize the node if the estimated number of vector
6280         // instructions exceeds the estimated number of buildvector instructions.
6281         // The number of vector operands is the number of vector instructions plus
6282         // the number of vector instructions for operands (buildvectors). The number
6283         // of buildvector instructions is just number_of_operands * number_of_scalars.
6284           (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6285            (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6286             NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6287  }
6288  
6289  BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6290      InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6291      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6292    assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6293  
6294    unsigned ShuffleOrOp =
6295        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6296    auto *VL0 = cast<Instruction>(S.OpValue);
6297    switch (ShuffleOrOp) {
6298    case Instruction::PHI: {
6299      // Too many operands - gather, most probably won't be vectorized.
6300      if (VL0->getNumOperands() > MaxPHINumOperands)
6301        return TreeEntry::NeedToGather;
6302      // Check for terminator values (e.g. invoke).
6303      for (Value *V : VL)
6304        for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6305          Instruction *Term = dyn_cast<Instruction>(Incoming);
6306          if (Term && Term->isTerminator()) {
6307            LLVM_DEBUG(dbgs()
6308                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
6309            return TreeEntry::NeedToGather;
6310          }
6311        }
6312  
6313      return TreeEntry::Vectorize;
6314    }
6315    case Instruction::ExtractValue:
6316    case Instruction::ExtractElement: {
6317      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6318      // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6319      if (!isPowerOf2_32(VL.size()))
6320        return TreeEntry::NeedToGather;
6321      if (Reuse || !CurrentOrder.empty())
6322        return TreeEntry::Vectorize;
6323      LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6324      return TreeEntry::NeedToGather;
6325    }
6326    case Instruction::InsertElement: {
6327      // Check that we have a buildvector and not a shuffle of 2 or more
6328      // different vectors.
6329      ValueSet SourceVectors;
6330      for (Value *V : VL) {
6331        SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6332        assert(getElementIndex(V) != std::nullopt &&
6333               "Non-constant or undef index?");
6334      }
6335  
6336      if (count_if(VL, [&SourceVectors](Value *V) {
6337            return !SourceVectors.contains(V);
6338          }) >= 2) {
6339        // Found 2nd source vector - cancel.
6340        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6341                             "different source vectors.\n");
6342        return TreeEntry::NeedToGather;
6343      }
6344  
6345      return TreeEntry::Vectorize;
6346    }
6347    case Instruction::Load: {
6348      // Check that a vectorized load would load the same memory as a scalar
6349      // load. For example, we don't want to vectorize loads that are smaller
6350      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6351      // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6352      // from such a struct, we read/write packed bits disagreeing with the
6353      // unvectorized version.
6354      switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6355      case LoadsState::Vectorize:
6356        return TreeEntry::Vectorize;
6357      case LoadsState::ScatterVectorize:
6358        return TreeEntry::ScatterVectorize;
6359      case LoadsState::StridedVectorize:
6360        return TreeEntry::StridedVectorize;
6361      case LoadsState::Gather:
6362  #ifndef NDEBUG
6363        Type *ScalarTy = VL0->getType();
6364        if (DL->getTypeSizeInBits(ScalarTy) !=
6365            DL->getTypeAllocSizeInBits(ScalarTy))
6366          LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6367        else if (any_of(VL,
6368                        [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6369          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6370        else
6371          LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6372  #endif // NDEBUG
6373        return TreeEntry::NeedToGather;
6374      }
6375      llvm_unreachable("Unexpected state of loads");
6376    }
6377    case Instruction::ZExt:
6378    case Instruction::SExt:
6379    case Instruction::FPToUI:
6380    case Instruction::FPToSI:
6381    case Instruction::FPExt:
6382    case Instruction::PtrToInt:
6383    case Instruction::IntToPtr:
6384    case Instruction::SIToFP:
6385    case Instruction::UIToFP:
6386    case Instruction::Trunc:
6387    case Instruction::FPTrunc:
6388    case Instruction::BitCast: {
6389      Type *SrcTy = VL0->getOperand(0)->getType();
6390      for (Value *V : VL) {
6391        Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6392        if (Ty != SrcTy || !isValidElementType(Ty)) {
6393          LLVM_DEBUG(
6394              dbgs() << "SLP: Gathering casts with different src types.\n");
6395          return TreeEntry::NeedToGather;
6396        }
6397      }
6398      return TreeEntry::Vectorize;
6399    }
6400    case Instruction::ICmp:
6401    case Instruction::FCmp: {
6402      // Check that all of the compares have the same predicate.
6403      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6404      CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6405      Type *ComparedTy = VL0->getOperand(0)->getType();
6406      for (Value *V : VL) {
6407        CmpInst *Cmp = cast<CmpInst>(V);
6408        if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6409            Cmp->getOperand(0)->getType() != ComparedTy) {
6410          LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6411          return TreeEntry::NeedToGather;
6412        }
6413      }
6414      return TreeEntry::Vectorize;
6415    }
6416    case Instruction::Select:
6417    case Instruction::FNeg:
6418    case Instruction::Add:
6419    case Instruction::FAdd:
6420    case Instruction::Sub:
6421    case Instruction::FSub:
6422    case Instruction::Mul:
6423    case Instruction::FMul:
6424    case Instruction::UDiv:
6425    case Instruction::SDiv:
6426    case Instruction::FDiv:
6427    case Instruction::URem:
6428    case Instruction::SRem:
6429    case Instruction::FRem:
6430    case Instruction::Shl:
6431    case Instruction::LShr:
6432    case Instruction::AShr:
6433    case Instruction::And:
6434    case Instruction::Or:
6435    case Instruction::Xor:
6436      return TreeEntry::Vectorize;
6437    case Instruction::GetElementPtr: {
6438      // We don't combine GEPs with complicated (nested) indexing.
6439      for (Value *V : VL) {
6440        auto *I = dyn_cast<GetElementPtrInst>(V);
6441        if (!I)
6442          continue;
6443        if (I->getNumOperands() != 2) {
6444          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6445          return TreeEntry::NeedToGather;
6446        }
6447      }
6448  
6449      // We can't combine several GEPs into one vector if they operate on
6450      // different types.
6451      Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6452      for (Value *V : VL) {
6453        auto *GEP = dyn_cast<GEPOperator>(V);
6454        if (!GEP)
6455          continue;
6456        Type *CurTy = GEP->getSourceElementType();
6457        if (Ty0 != CurTy) {
6458          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6459          return TreeEntry::NeedToGather;
6460        }
6461      }
6462  
6463      // We don't combine GEPs with non-constant indexes.
6464      Type *Ty1 = VL0->getOperand(1)->getType();
6465      for (Value *V : VL) {
6466        auto *I = dyn_cast<GetElementPtrInst>(V);
6467        if (!I)
6468          continue;
6469        auto *Op = I->getOperand(1);
6470        if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6471            (Op->getType() != Ty1 &&
6472             ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6473              Op->getType()->getScalarSizeInBits() >
6474                  DL->getIndexSizeInBits(
6475                      V->getType()->getPointerAddressSpace())))) {
6476          LLVM_DEBUG(
6477              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6478          return TreeEntry::NeedToGather;
6479        }
6480      }
6481  
6482      return TreeEntry::Vectorize;
6483    }
6484    case Instruction::Store: {
6485      // Check if the stores are consecutive or if we need to swizzle them.
6486      llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6487      // Avoid types that are padded when being allocated as scalars, while
6488      // being packed together in a vector (such as i1).
6489      if (DL->getTypeSizeInBits(ScalarTy) !=
6490          DL->getTypeAllocSizeInBits(ScalarTy)) {
6491        LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6492        return TreeEntry::NeedToGather;
6493      }
6494      // Make sure all stores in the bundle are simple - we can't vectorize
6495      // atomic or volatile stores.
6496      for (Value *V : VL) {
6497        auto *SI = cast<StoreInst>(V);
6498        if (!SI->isSimple()) {
6499          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6500          return TreeEntry::NeedToGather;
6501        }
6502        PointerOps.push_back(SI->getPointerOperand());
6503      }
6504  
6505      // Check the order of pointer operands.
6506      if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6507        Value *Ptr0;
6508        Value *PtrN;
6509        if (CurrentOrder.empty()) {
6510          Ptr0 = PointerOps.front();
6511          PtrN = PointerOps.back();
6512        } else {
6513          Ptr0 = PointerOps[CurrentOrder.front()];
6514          PtrN = PointerOps[CurrentOrder.back()];
6515        }
6516        std::optional<int> Dist =
6517            getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6518        // Check that the sorted pointer operands are consecutive.
6519        if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6520          return TreeEntry::Vectorize;
6521      }
6522  
6523      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6524      return TreeEntry::NeedToGather;
6525    }
6526    case Instruction::Call: {
6527      // Check if the calls are all to the same vectorizable intrinsic or
6528      // library function.
6529      CallInst *CI = cast<CallInst>(VL0);
6530      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6531  
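          // Query the vector-function database for a library vector variant of
          // this call with a fixed vector length equal to the bundle width.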
6532      VFShape Shape = VFShape::get(
6533          CI->getFunctionType(),
6534          ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6535          false /*HasGlobalPred*/);
6536      Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6537  
6538      if (!VecFunc && !isTriviallyVectorizable(ID)) {
6539        LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6540        return TreeEntry::NeedToGather;
6541      }
6542      Function *F = CI->getCalledFunction();
6543      unsigned NumArgs = CI->arg_size();
6544      SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6545      for (unsigned J = 0; J != NumArgs; ++J)
6546        if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6547          ScalarArgs[J] = CI->getArgOperand(J);
6548      for (Value *V : VL) {
6549        CallInst *CI2 = dyn_cast<CallInst>(V);
6550        if (!CI2 || CI2->getCalledFunction() != F ||
6551            getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6552            (VecFunc &&
6553             VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6554            !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6555          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6556                            << "\n");
6557          return TreeEntry::NeedToGather;
6558        }
6559        // Some intrinsics have scalar arguments, and those must be the same
6560        // across the bundle for the calls to be vectorized.
6561        for (unsigned J = 0; J != NumArgs; ++J) {
6562          if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6563            Value *A1J = CI2->getArgOperand(J);
6564            if (ScalarArgs[J] != A1J) {
6565              LLVM_DEBUG(dbgs()
6566                         << "SLP: mismatched arguments in call:" << *CI
6567                         << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6568              return TreeEntry::NeedToGather;
6569            }
6570          }
6571        }
6572        // Verify that the bundle operands are identical between the two calls.
6573        if (CI->hasOperandBundles() &&
6574            !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6575                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
6576                        CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6577          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6578                            << "!=" << *V << '\n');
6579          return TreeEntry::NeedToGather;
6580        }
6581      }
6582  
6583      return TreeEntry::Vectorize;
6584    }
6585    case Instruction::ShuffleVector: {
6586      // If this is not an alternate sequence of opcode like add-sub
6587      // then do not vectorize this instruction.
6588      if (!S.isAltShuffle()) {
6589        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6590        return TreeEntry::NeedToGather;
6591      }
6592      if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6593        LLVM_DEBUG(
6594            dbgs()
6595            << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6596               "the whole alt sequence is not profitable.\n");
6597        return TreeEntry::NeedToGather;
6598      }
6599  
6600      return TreeEntry::Vectorize;
6601    }
6602    default:
6603      LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6604      return TreeEntry::NeedToGather;
6605    }
6606  }
6607  
6608  namespace {
6609  /// Allows correct handling of the operands of phi nodes, based on the \p Main
6610  /// PHINode's order of incoming basic blocks/values.
6611  class PHIHandler {
6612    DominatorTree &DT;
6613    PHINode *Main = nullptr;
6614    SmallVector<Value *> Phis;
6615    SmallVector<SmallVector<Value *>> Operands;
6616  
6617  public:
6618    PHIHandler() = delete;
6619    PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6620        : DT(DT), Main(Main), Phis(Phis),
6621          Operands(Main->getNumIncomingValues(),
6622                   SmallVector<Value *>(Phis.size(), nullptr)) {}
6623    void buildOperands() {
6624      constexpr unsigned FastLimit = 4;
6625      if (Main->getNumIncomingValues() <= FastLimit) {
6626        for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6627          BasicBlock *InBB = Main->getIncomingBlock(I);
6628          if (!DT.isReachableFromEntry(InBB)) {
6629            Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6630            continue;
6631          }
6632          // Prepare the operand vector.
6633          for (auto [Idx, V] : enumerate(Phis)) {
6634            auto *P = cast<PHINode>(V);
6635            if (P->getIncomingBlock(I) == InBB)
6636              Operands[I][Idx] = P->getIncomingValue(I);
6637            else
6638              Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6639          }
6640        }
6641        return;
6642      }
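          // Slow path for PHIs with many incoming values: group the incoming
          // edge indices by basic block first, so each PHI's incoming values are
          // matched to blocks in a single pass instead of repeatedly searching
          // with getIncomingValueForBlock().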
6643      SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6644      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6645        BasicBlock *InBB = Main->getIncomingBlock(I);
6646        if (!DT.isReachableFromEntry(InBB)) {
6647          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6648          continue;
6649        }
6650        Blocks.try_emplace(InBB).first->second.push_back(I);
6651      }
6652      for (auto [Idx, V] : enumerate(Phis)) {
6653        auto *P = cast<PHINode>(V);
6654        for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6655          BasicBlock *InBB = P->getIncomingBlock(I);
6656          if (InBB == Main->getIncomingBlock(I)) {
6657            if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6658              continue;
6659            Operands[I][Idx] = P->getIncomingValue(I);
6660            continue;
6661          }
6662          auto It = Blocks.find(InBB);
6663          if (It == Blocks.end())
6664            continue;
6665          Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6666        }
6667      }
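          // If several incoming edges come from the same block, only the first
          // such edge was filled above; copy its operand list to the remaining
          // edges from that block.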
6668      for (const auto &P : Blocks) {
6669        if (P.getSecond().size() <= 1)
6670          continue;
6671        unsigned BasicI = P.getSecond().front();
6672        for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6673          assert(all_of(enumerate(Operands[I]),
6674                        [&](const auto &Data) {
6675                          return !Data.value() ||
6676                                 Data.value() == Operands[BasicI][Data.index()];
6677                        }) &&
6678                 "Expected empty operands list.");
6679          Operands[I] = Operands[BasicI];
6680        }
6681      }
6682    }
6683    ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6684  };
6685  } // namespace
6686  
6687  void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688                              const EdgeInfo &UserTreeIdx) {
6689    assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690  
6691    SmallVector<int> ReuseShuffleIndices;
6692    SmallVector<Value *> UniqueValues;
6693    SmallVector<Value *> NonUniqueValueVL;
6694    auto TryToFindDuplicates = [&](const InstructionsState &S,
6695                                   bool DoNotFail = false) {
6696      // Check that every instruction appears once in this bundle.
6697      DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698      for (Value *V : VL) {
6699        if (isConstant(V)) {
6700          ReuseShuffleIndices.emplace_back(
6701              isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6702          UniqueValues.emplace_back(V);
6703          continue;
6704        }
6705        auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6706        ReuseShuffleIndices.emplace_back(Res.first->second);
6707        if (Res.second)
6708          UniqueValues.emplace_back(V);
6709      }
6710      size_t NumUniqueScalarValues = UniqueValues.size();
6711      if (NumUniqueScalarValues == VL.size()) {
6712        ReuseShuffleIndices.clear();
6713      } else {
6714        // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6715        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717                               "for nodes with padding.\n");
6718          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719          return false;
6720        }
6721        LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722        if (NumUniqueScalarValues <= 1 ||
6723            (UniquePositions.size() == 1 && all_of(UniqueValues,
6724                                                   [](Value *V) {
6725                                                     return isa<UndefValue>(V) ||
6726                                                            !isConstant(V);
6727                                                   })) ||
6728            !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6729          if (DoNotFail && UniquePositions.size() > 1 &&
6730              NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731              all_of(UniqueValues, [=](Value *V) {
6732                return isa<ExtractElementInst>(V) ||
6733                       areAllUsersVectorized(cast<Instruction>(V),
6734                                             UserIgnoreList);
6735              })) {
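                // If padding the unique scalars to a power-of-two size would give
                // back the original bundle size, keep VL and drop the reuse
                // indices; otherwise pad the unique scalars by repeating the last
                // one.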
6736            unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6737            if (PWSz == VL.size()) {
6738              ReuseShuffleIndices.clear();
6739            } else {
6740              NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6741              NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6742                                      UniqueValues.back());
6743              VL = NonUniqueValueVL;
6744            }
6745            return true;
6746          }
6747          LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749          return false;
6750        }
6751        VL = UniqueValues;
6752      }
6753      return true;
6754    };
6755  
6756    InstructionsState S = getSameOpcode(VL, *TLI);
6757  
6758    // Don't vectorize ephemeral values.
6759    if (!EphValues.empty()) {
6760      for (Value *V : VL) {
6761        if (EphValues.count(V)) {
6762          LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763                            << ") is ephemeral.\n");
6764          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765          return;
6766        }
6767      }
6768    }
6769  
6770    // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771    // a load), in which case peek through to include it in the tree, without
6772    // ballooning over-budget.
6773    if (Depth >= RecursionMaxDepth &&
6774        !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6775          VL.size() >= 4 &&
6776          (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6777             return match(I,
6778                          m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6779                    cast<Instruction>(I)->getOpcode() ==
6780                        cast<Instruction>(S.MainOp)->getOpcode();
6781           })))) {
6782      LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783      if (TryToFindDuplicates(S))
6784        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785                     ReuseShuffleIndices);
6786      return;
6787    }
6788  
6789    // Don't handle scalable vectors
6790    if (S.getOpcode() == Instruction::ExtractElement &&
6791        isa<ScalableVectorType>(
6792            cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6793      LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794      if (TryToFindDuplicates(S))
6795        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796                     ReuseShuffleIndices);
6797      return;
6798    }
6799  
6800    // Don't handle vectors.
6801    if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802        !isa<InsertElementInst>(S.OpValue)) {
6803      LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805      return;
6806    }
6807  
6808    if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6809      if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810        LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812        return;
6813      }
6814  
6815    // If all of the operands are identical or constant we have a simple solution.
6816    // If we deal with insert/extract instructions, they all must have constant
6817    // indices, otherwise we should gather them, not try to vectorize.
6818    // If this is an alternate-opcode node with 2 elements whose operands are
6819    // gathered - do not vectorize.
6820    auto &&NotProfitableForVectorization = [&S, this,
6821                                            Depth](ArrayRef<Value *> VL) {
6822      if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823        return false;
6824      if (VectorizableTree.size() < MinTreeSize)
6825        return false;
6826      if (Depth >= RecursionMaxDepth - 1)
6827        return true;
6828      // Check if all operands are extracts, are part of a vector node, or can
6829      // build a regular vectorizable node.
6830      SmallVector<unsigned, 2> InstsCount;
6831      for (Value *V : VL) {
6832        auto *I = cast<Instruction>(V);
6833        InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6834          return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6835        }));
6836      }
6837      bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6838      if ((IsCommutative &&
6839           std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6840          (!IsCommutative &&
6841           all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6842        return true;
6843      assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844      SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845      auto *I1 = cast<Instruction>(VL.front());
6846      auto *I2 = cast<Instruction>(VL.back());
6847      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6849                                               I2->getOperand(Op));
6850      if (static_cast<unsigned>(count_if(
6851              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6853              })) >= S.MainOp->getNumOperands() / 2)
6854        return false;
6855      if (S.MainOp->getNumOperands() > 2)
6856        return true;
6857      if (IsCommutative) {
6858        // Check permuted operands.
6859        Candidates.clear();
6860        for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861          Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6862                                                 I2->getOperand((Op + 1) % E));
6863        if (any_of(
6864                Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865                  return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6866                }))
6867          return false;
6868      }
6869      return true;
6870    };
6871    SmallVector<unsigned> SortedIndices;
6872    BasicBlock *BB = nullptr;
6873    bool IsScatterVectorizeUserTE =
6874        UserTreeIdx.UserTE &&
6875        UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876    bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
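        // For a ScatterVectorize user, a bundle of pointer values may still be
        // treated as GEPs if every GEP has exactly two operands and lives in the
        // same basic block (non-GEP values must not need scheduling), and the
        // pointer accesses can be sorted.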
6877    bool AreScatterAllGEPSameBlock =
6878        (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879         VL.size() > 2 &&
6880         all_of(VL,
6881                [&BB](Value *V) {
6882                  auto *I = dyn_cast<GetElementPtrInst>(V);
6883                  if (!I)
6884                    return doesNotNeedToBeScheduled(V);
6885                  if (!BB)
6886                    BB = I->getParent();
6887                  return BB == I->getParent() && I->getNumOperands() == 2;
6888                }) &&
6889         BB &&
6890         sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6891                         SortedIndices));
6892    bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893    if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894        (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895             S.OpValue) &&
6896         !all_of(VL, isVectorLikeInstWithConstOps)) ||
6897        NotProfitableForVectorization(VL)) {
6898      LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899      if (TryToFindDuplicates(S))
6900        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901                     ReuseShuffleIndices);
6902      return;
6903    }
6904  
6905    // We now know that this is a vector of instructions of the same type from
6906    // the same block.
6907  
6908    // Check if this is a duplicate of another entry.
6909    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6910      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911      if (!E->isSame(VL)) {
6912        auto It = MultiNodeScalars.find(S.OpValue);
6913        if (It != MultiNodeScalars.end()) {
6914          auto *TEIt = find_if(It->getSecond(),
6915                               [&](TreeEntry *ME) { return ME->isSame(VL); });
6916          if (TEIt != It->getSecond().end())
6917            E = *TEIt;
6918          else
6919            E = nullptr;
6920        } else {
6921          E = nullptr;
6922        }
6923      }
6924      if (!E) {
6925        if (!doesNotNeedToBeScheduled(S.OpValue)) {
6926          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927          if (TryToFindDuplicates(S))
6928            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929                         ReuseShuffleIndices);
6930          return;
6931        }
6932      } else {
6933        // Record the reuse of the tree node. FIXME: currently this is only used
6934        // to properly draw the graph rather than for the actual vectorization.
6935        E->UserTreeIndices.push_back(UserTreeIdx);
6936        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937                          << ".\n");
6938        return;
6939      }
6940    }
6941  
6942    // Check that none of the instructions in the bundle are already in the tree.
6943    for (Value *V : VL) {
6944      if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6945          doesNotNeedToBeScheduled(V))
6946        continue;
6947      if (getTreeEntry(V)) {
6948        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949                          << ") is already in tree.\n");
6950        if (TryToFindDuplicates(S))
6951          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952                       ReuseShuffleIndices);
6953        return;
6954      }
6955    }
6956  
6957    // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6958    if (UserIgnoreList && !UserIgnoreList->empty()) {
6959      for (Value *V : VL) {
6960        if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961          LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962          if (TryToFindDuplicates(S))
6963            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964                         ReuseShuffleIndices);
6965          return;
6966        }
6967      }
6968    }
6969  
6970    // Special processing for sorted pointers for a ScatterVectorize node with
6971    // constant indices only.
6972    if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973      assert(S.OpValue->getType()->isPointerTy() &&
6974             count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975             "Expected pointers only.");
6976      // Reset S to make it GetElementPtr kind of node.
6977      const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6978      assert(It != VL.end() && "Expected at least one GEP.");
6979      S = getSameOpcode(*It, *TLI);
6980    }
6981  
6982    // Check that all of the users of the scalars that we want to vectorize are
6983    // schedulable.
6984    auto *VL0 = cast<Instruction>(S.OpValue);
6985    BB = VL0->getParent();
6986  
6987    if (!DT->isReachableFromEntry(BB)) {
6988      // Don't go into unreachable blocks. They may contain instructions with
6989      // dependency cycles which confuse the final scheduling.
6990      LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992      return;
6993    }
6994  
6995    // Don't go into catchswitch blocks, which can happen with PHIs.
6996    // Such blocks can only have PHIs and the catchswitch.  There is no
6997    // place to insert a shuffle if we need to, so just avoid that issue.
6998    if (isa<CatchSwitchInst>(BB->getTerminator())) {
6999      LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001      return;
7002    }
7003  
7004    // Check that every instruction appears once in this bundle.
7005    if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006      return;
7007  
7008    // Perform specific checks for each particular instruction kind.
7009    OrdersType CurrentOrder;
7010    SmallVector<Value *> PointerOps;
7011    TreeEntry::EntryState State = getScalarsVectorizationState(
7012        S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013    if (State == TreeEntry::NeedToGather) {
7014      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015                   ReuseShuffleIndices);
7016      return;
7017    }
7018  
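        // Lazily create the scheduling state for this basic block the first time
        // a bundle from it is encountered.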
7019    auto &BSRef = BlocksSchedules[BB];
7020    if (!BSRef)
7021      BSRef = std::make_unique<BlockScheduling>(BB);
7022  
7023    BlockScheduling &BS = *BSRef;
7024  
7025    std::optional<ScheduleData *> Bundle =
7026        BS.tryScheduleBundle(UniqueValues, this, S);
7027  #ifdef EXPENSIVE_CHECKS
7028    // Make sure we didn't break any internal invariants
7029    BS.verify();
7030  #endif
7031    if (!Bundle) {
7032      LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033      assert((!BS.getScheduleData(VL0) ||
7034              !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035             "tryScheduleBundle should cancelScheduling on failure");
7036      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037                   ReuseShuffleIndices);
7038      NonScheduledFirst.insert(VL.front());
7039      return;
7040    }
7041    LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042  
7043    unsigned ShuffleOrOp = S.isAltShuffle() ?
7044                  (unsigned) Instruction::ShuffleVector : S.getOpcode();
7045    switch (ShuffleOrOp) {
7046      case Instruction::PHI: {
7047        auto *PH = cast<PHINode>(VL0);
7048  
7049        TreeEntry *TE =
7050            newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051        LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052  
7053        // Keeps the reordered operands to avoid code duplication.
7054        PHIHandler Handler(*DT, PH, VL);
7055        Handler.buildOperands();
7056        for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7057          TE->setOperand(I, Handler.getOperands(I));
7058        for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7059          buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7060        return;
7061      }
7062      case Instruction::ExtractValue:
7063      case Instruction::ExtractElement: {
7064        if (CurrentOrder.empty()) {
7065          LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066        } else {
7067          LLVM_DEBUG({
7068            dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069                      "with order";
7070            for (unsigned Idx : CurrentOrder)
7071              dbgs() << " " << Idx;
7072            dbgs() << "\n";
7073          });
7074          fixupOrderingIndices(CurrentOrder);
7075        }
7076        // Insert new order with initial value 0, if it does not exist,
7077        // otherwise return the iterator to the existing one.
7078        newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079                     ReuseShuffleIndices, CurrentOrder);
7080        // This is a special case, as it does not gather, but at the same time
7081        // we are not extending buildTree_rec() towards the operands.
7082        ValueList Op0;
7083        Op0.assign(VL.size(), VL0->getOperand(0));
7084        VectorizableTree.back()->setOperand(0, Op0);
7085        return;
7086      }
7087      case Instruction::InsertElement: {
7088        assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089  
7090        auto OrdCompare = [](const std::pair<int, int> &P1,
7091                             const std::pair<int, int> &P2) {
7092          return P1.first > P2.first;
7093        };
7094        PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095                      decltype(OrdCompare)>
7096            Indices(OrdCompare);
7097        for (int I = 0, E = VL.size(); I < E; ++I) {
7098          unsigned Idx = *getElementIndex(VL[I]);
7099          Indices.emplace(Idx, I);
7100        }
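            // Pop the inserts in increasing order of their insertion index; the
            // resulting permutation records, for each lane, its rank in that
            // order. Identity permutations are encoded as an empty order, as
            // elsewhere in this file.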
7101        OrdersType CurrentOrder(VL.size(), VL.size());
7102        bool IsIdentity = true;
7103        for (int I = 0, E = VL.size(); I < E; ++I) {
7104          CurrentOrder[Indices.top().second] = I;
7105          IsIdentity &= Indices.top().second == I;
7106          Indices.pop();
7107        }
7108        if (IsIdentity)
7109          CurrentOrder.clear();
7110        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111                                     std::nullopt, CurrentOrder);
7112        LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113  
7114        TE->setOperandsInOrder();
7115        buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7116        return;
7117      }
7118      case Instruction::Load: {
7119        // Check that a vectorized load would load the same memory as a scalar
7120        // load. For example, we don't want to vectorize loads that are smaller
7121        // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7122        // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7123        // from such a struct, we read/write packed bits disagreeing with the
7124        // unvectorized version.
7125        TreeEntry *TE = nullptr;
7126        fixupOrderingIndices(CurrentOrder);
7127        switch (State) {
7128        case TreeEntry::Vectorize:
7129          TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130                            ReuseShuffleIndices, CurrentOrder);
7131          if (CurrentOrder.empty())
7132            LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133          else
7134            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135          TE->setOperandsInOrder();
7136          break;
7137        case TreeEntry::StridedVectorize:
7138          // Vectorizing non-consecutive loads as strided loads.
7139          TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7140                            UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7141          TE->setOperandsInOrder();
7142          LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143          break;
7144        case TreeEntry::ScatterVectorize:
7145          // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146          TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7147                            UserTreeIdx, ReuseShuffleIndices);
7148          TE->setOperandsInOrder();
7149          buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7150          LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151          break;
7152        case TreeEntry::NeedToGather:
7153          llvm_unreachable("Unexpected loads state.");
7154        }
7155        return;
7156      }
7157      case Instruction::ZExt:
7158      case Instruction::SExt:
7159      case Instruction::FPToUI:
7160      case Instruction::FPToSI:
7161      case Instruction::FPExt:
7162      case Instruction::PtrToInt:
7163      case Instruction::IntToPtr:
7164      case Instruction::SIToFP:
7165      case Instruction::UIToFP:
7166      case Instruction::Trunc:
7167      case Instruction::FPTrunc:
7168      case Instruction::BitCast: {
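            // Track the widest and narrowest integer widths seen across cast
            // nodes (taken from the destination or source type, depending on the
            // cast direction); together with ExtraBitWidthNodes this feeds the
            // later bit-width minimization of the vectorized tree.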
7169        auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170            std::make_pair(std::numeric_limits<unsigned>::min(),
7171                           std::numeric_limits<unsigned>::max()));
7172        if (ShuffleOrOp == Instruction::ZExt ||
7173            ShuffleOrOp == Instruction::SExt) {
7174          CastMaxMinBWSizes = std::make_pair(
7175              std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7176                                 PrevMaxBW),
7177              std::min<unsigned>(
7178                  DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7179                  PrevMinBW));
7180        } else if (ShuffleOrOp == Instruction::Trunc) {
7181          CastMaxMinBWSizes = std::make_pair(
7182              std::max<unsigned>(
7183                  DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7184                  PrevMaxBW),
7185              std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7186                                 PrevMinBW));
7187          ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7188        } else if (ShuffleOrOp == Instruction::SIToFP ||
7189                   ShuffleOrOp == Instruction::UIToFP) {
7190          unsigned NumSignBits =
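              // If the integer source provably needs no more than half of its bit
              // width (judging by known sign bits and demanded bits), remember
              // this node as a candidate for bit-width reduction.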
7191              ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7192          if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7193            APInt Mask = DB->getDemandedBits(OpI);
7194            NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7195          }
7196          if (NumSignBits * 2 >=
7197              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7198            ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7199        }
7200        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201                                     ReuseShuffleIndices);
7202        LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203  
7204        TE->setOperandsInOrder();
7205        for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7206          buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7207        return;
7208      }
7209      case Instruction::ICmp:
7210      case Instruction::FCmp: {
7211        // Check that all of the compares have the same predicate.
7212        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7213        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214                                     ReuseShuffleIndices);
7215        LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216  
7217        ValueList Left, Right;
7218        if (cast<CmpInst>(VL0)->isCommutative()) {
7219          // Commutative predicate - collect + sort operands of the instructions
7220          // so that each side is more likely to have the same opcode.
7221          assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7222                 "Commutative Predicate mismatch");
7223          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224        } else {
7225          // Collect operands - commute if it uses the swapped predicate.
7226          for (Value *V : VL) {
7227            auto *Cmp = cast<CmpInst>(V);
7228            Value *LHS = Cmp->getOperand(0);
7229            Value *RHS = Cmp->getOperand(1);
7230            if (Cmp->getPredicate() != P0)
7231              std::swap(LHS, RHS);
7232            Left.push_back(LHS);
7233            Right.push_back(RHS);
7234          }
7235        }
7236        TE->setOperand(0, Left);
7237        TE->setOperand(1, Right);
7238        buildTree_rec(Left, Depth + 1, {TE, 0});
7239        buildTree_rec(Right, Depth + 1, {TE, 1});
7240        if (ShuffleOrOp == Instruction::ICmp) {
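              // For integer compares, mark operands whose values provably fit in
              // half of their bit width as candidates for bit-width reduction.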
7241          unsigned NumSignBits0 =
7242              ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7243          if (NumSignBits0 * 2 >=
7244              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7245            ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7246          unsigned NumSignBits1 =
7247              ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7248          if (NumSignBits1 * 2 >=
7249              DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7250            ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7251        }
7252        return;
7253      }
7254      case Instruction::Select:
7255      case Instruction::FNeg:
7256      case Instruction::Add:
7257      case Instruction::FAdd:
7258      case Instruction::Sub:
7259      case Instruction::FSub:
7260      case Instruction::Mul:
7261      case Instruction::FMul:
7262      case Instruction::UDiv:
7263      case Instruction::SDiv:
7264      case Instruction::FDiv:
7265      case Instruction::URem:
7266      case Instruction::SRem:
7267      case Instruction::FRem:
7268      case Instruction::Shl:
7269      case Instruction::LShr:
7270      case Instruction::AShr:
7271      case Instruction::And:
7272      case Instruction::Or:
7273      case Instruction::Xor: {
7274        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275                                     ReuseShuffleIndices);
7276        LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277  
7278        // Sort operands of the instructions so that each side is more likely to
7279        // have the same opcode.
7280        if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7281          ValueList Left, Right;
7282          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7283          TE->setOperand(0, Left);
7284          TE->setOperand(1, Right);
7285          buildTree_rec(Left, Depth + 1, {TE, 0});
7286          buildTree_rec(Right, Depth + 1, {TE, 1});
7287          return;
7288        }
7289  
7290        TE->setOperandsInOrder();
7291        for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7292          buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7293        return;
7294      }
7295      case Instruction::GetElementPtr: {
7296        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297                                     ReuseShuffleIndices);
7298        LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299        SmallVector<ValueList, 2> Operands(2);
7300        // Prepare the operand vector for pointer operands.
7301        for (Value *V : VL) {
7302          auto *GEP = dyn_cast<GetElementPtrInst>(V);
7303          if (!GEP) {
7304            Operands.front().push_back(V);
7305            continue;
7306          }
7307          Operands.front().push_back(GEP->getPointerOperand());
7308        }
7309        TE->setOperand(0, Operands.front());
7310        // Need to cast all indices to the same type before vectorization to
7311      // avoid a crash.
7312        // Required to be able to find correct matches between different gather
7313        // nodes and reuse the vectorized values rather than trying to gather them
7314        // again.
7315        int IndexIdx = 1;
7316        Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7317        Type *Ty = all_of(VL,
7318                          [VL0Ty, IndexIdx](Value *V) {
7319                            auto *GEP = dyn_cast<GetElementPtrInst>(V);
7320                            if (!GEP)
7321                              return true;
7322                            return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7323                          })
7324                       ? VL0Ty
7325                       : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7326                                              ->getPointerOperandType()
7327                                              ->getScalarType());
7328        // Prepare the operand vector.
7329        for (Value *V : VL) {
7330          auto *I = dyn_cast<GetElementPtrInst>(V);
7331          if (!I) {
7332            Operands.back().push_back(
7333                ConstantInt::get(Ty, 0, /*isSigned=*/false));
7334            continue;
7335          }
7336          auto *Op = I->getOperand(IndexIdx);
7337          auto *CI = dyn_cast<ConstantInt>(Op);
7338          if (!CI)
7339            Operands.back().push_back(Op);
7340          else
7341            Operands.back().push_back(ConstantFoldIntegerCast(
7342                CI, Ty, CI->getValue().isSignBitSet(), *DL));
7343        }
7344        TE->setOperand(IndexIdx, Operands.back());
7345  
7346        for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347          buildTree_rec(Operands[I], Depth + 1, {TE, I});
7348        return;
7349      }
7350      case Instruction::Store: {
7351        bool Consecutive = CurrentOrder.empty();
7352        if (!Consecutive)
7353          fixupOrderingIndices(CurrentOrder);
7354        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355                                     ReuseShuffleIndices, CurrentOrder);
7356        TE->setOperandsInOrder();
7357        buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7358        if (Consecutive)
7359          LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360        else
7361          LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362        return;
7363      }
7364      case Instruction::Call: {
7365        // Check if the calls are all to the same vectorizable intrinsic or
7366        // library function.
7367        CallInst *CI = cast<CallInst>(VL0);
7368        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369  
7370        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371                                     ReuseShuffleIndices);
7372        // Sort operands of the instructions so that each side is more likely to
7373        // have the same opcode.
7374        if (isCommutative(VL0)) {
7375          ValueList Left, Right;
7376          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7377          TE->setOperand(0, Left);
7378          TE->setOperand(1, Right);
7379          SmallVector<ValueList> Operands;
7380          for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7381            Operands.emplace_back();
7382            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7383              continue;
7384            for (Value *V : VL) {
7385              auto *CI2 = cast<CallInst>(V);
7386              Operands.back().push_back(CI2->getArgOperand(I));
7387            }
7388            TE->setOperand(I, Operands.back());
7389          }
7390          buildTree_rec(Left, Depth + 1, {TE, 0});
7391          buildTree_rec(Right, Depth + 1, {TE, 1});
7392          for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7393            if (Operands[I - 2].empty())
7394              continue;
7395            buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7396          }
7397          return;
7398        }
7399        TE->setOperandsInOrder();
7400        for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7401        // For scalar operands there is no need to create an entry, since they
7402        // do not need to be vectorized.
7403          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7404            continue;
7405          ValueList Operands;
7406          // Prepare the operand vector.
7407          for (Value *V : VL) {
7408            auto *CI2 = cast<CallInst>(V);
7409            Operands.push_back(CI2->getArgOperand(I));
7410          }
7411          buildTree_rec(Operands, Depth + 1, {TE, I});
7412        }
7413        return;
7414      }
7415      case Instruction::ShuffleVector: {
7416        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417                                     ReuseShuffleIndices);
7418        LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419  
7420        // Reorder operands if reordering would enable vectorization.
7421        auto *CI = dyn_cast<CmpInst>(VL0);
7422        if (isa<BinaryOperator>(VL0) || CI) {
7423          ValueList Left, Right;
7424          if (!CI || all_of(VL, [](Value *V) {
7425                return cast<CmpInst>(V)->isCommutative();
7426              })) {
7427            reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7428          } else {
7429            auto *MainCI = cast<CmpInst>(S.MainOp);
7430            auto *AltCI = cast<CmpInst>(S.AltOp);
7431            CmpInst::Predicate MainP = MainCI->getPredicate();
7432            CmpInst::Predicate AltP = AltCI->getPredicate();
7433            assert(MainP != AltP &&
7434                   "Expected different main/alternate predicates.");
7435            // Collect operands - commute if it uses the swapped predicate or
7436            // alternate operation.
7437            for (Value *V : VL) {
7438              auto *Cmp = cast<CmpInst>(V);
7439              Value *LHS = Cmp->getOperand(0);
7440              Value *RHS = Cmp->getOperand(1);
7441  
7442              if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7443                if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7444                  std::swap(LHS, RHS);
7445              } else {
7446                if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7447                  std::swap(LHS, RHS);
7448              }
7449              Left.push_back(LHS);
7450              Right.push_back(RHS);
7451            }
7452          }
7453          TE->setOperand(0, Left);
7454          TE->setOperand(1, Right);
7455          buildTree_rec(Left, Depth + 1, {TE, 0});
7456          buildTree_rec(Right, Depth + 1, {TE, 1});
7457          return;
7458        }
7459  
7460        TE->setOperandsInOrder();
7461        for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7462          buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7463        return;
7464      }
7465      default:
7466        break;
7467    }
7468    llvm_unreachable("Unexpected vectorization of the instructions.");
7469  }
7470  
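      // Attempts to flatten an aggregate type (homogeneous struct, array or
      // fixed vector, possibly nested) into a flat vector of its scalar
      // elements. Returns the number of elements, or 0 if the type cannot be
      // mapped (non-homogeneous struct, invalid element type, or a widened
      // size that violates the min/max vector register size or the store size
      // of the original type).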
7471  unsigned BoUpSLP::canMapToVector(Type *T) const {
7472    unsigned N = 1;
7473    Type *EltTy = T;
7474  
7475    while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7476      if (auto *ST = dyn_cast<StructType>(EltTy)) {
7477        // Check that struct is homogeneous.
7478        for (const auto *Ty : ST->elements())
7479          if (Ty != *ST->element_begin())
7480            return 0;
7481        N *= ST->getNumElements();
7482        EltTy = *ST->element_begin();
7483      } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7484        N *= AT->getNumElements();
7485        EltTy = AT->getElementType();
7486      } else {
7487        auto *VT = cast<FixedVectorType>(EltTy);
7488        N *= VT->getNumElements();
7489        EltTy = VT->getElementType();
7490      }
7491    }
7492  
7493    if (!isValidElementType(EltTy))
7494      return 0;
7495    uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7496    if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497        VTSize != DL->getTypeStoreSizeInBits(T))
7498      return 0;
7499    return N;
7500  }
7501  
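      // Checks whether the extractelement/extractvalue scalars in VL all read
      // from a single source vector (or vector-mappable aggregate). On success
      // CurrentOrder is either empty (the extracts are already in identity
      // order, return value true) or holds the required reordering (return
      // value false); on failure CurrentOrder is cleared and false is returned.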
7502  bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7503                                SmallVectorImpl<unsigned> &CurrentOrder,
7504                                bool ResizeAllowed) const {
7505    const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7506    assert(It != VL.end() && "Expected at least one extract instruction.");
7507    auto *E0 = cast<Instruction>(*It);
7508    assert(
7509        all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7510        "Invalid opcode");
7511    // Check if all of the extracts come from the same vector and from the
7512    // correct offset.
7513    Value *Vec = E0->getOperand(0);
7514  
7515    CurrentOrder.clear();
7516  
7517    // We have to extract from a vector/aggregate with the same number of elements.
7518    unsigned NElts;
7519    if (E0->getOpcode() == Instruction::ExtractValue) {
7520      NElts = canMapToVector(Vec->getType());
7521      if (!NElts)
7522        return false;
7523      // Check if load can be rewritten as load of vector.
7524      LoadInst *LI = dyn_cast<LoadInst>(Vec);
7525      if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7526        return false;
7527    } else {
7528      NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7529    }
7530  
7531    unsigned E = VL.size();
7532    if (!ResizeAllowed && NElts != E)
7533      return false;
7534    SmallVector<int> Indices(E, PoisonMaskElem);
7535    unsigned MinIdx = NElts, MaxIdx = 0;
7536    for (auto [I, V] : enumerate(VL)) {
7537      auto *Inst = dyn_cast<Instruction>(V);
7538      if (!Inst)
7539        continue;
7540      if (Inst->getOperand(0) != Vec)
7541        return false;
7542      if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7543        if (isa<UndefValue>(EE->getIndexOperand()))
7544          continue;
7545      std::optional<unsigned> Idx = getExtractIndex(Inst);
7546      if (!Idx)
7547        return false;
7548      const unsigned ExtIdx = *Idx;
7549      if (ExtIdx >= NElts)
7550        continue;
7551      Indices[I] = ExtIdx;
7552      if (MinIdx > ExtIdx)
7553        MinIdx = ExtIdx;
7554      if (MaxIdx < ExtIdx)
7555        MaxIdx = ExtIdx;
7556    }
7557    if (MaxIdx - MinIdx + 1 > E)
7558      return false;
7559    if (MaxIdx + 1 <= E)
7560      MinIdx = 0;
7561  
7562    // Check that all of the indices extract from the correct offset.
7563    bool ShouldKeepOrder = true;
7564    // Assign to all items the initial value E so we can check if the extract
7565    // instruction index was used already.
7566    // Also, later we can check that all the indices are used and we have a
7567    // consecutive access in the extract instructions, by checking that no
7568    // element of CurrentOrder still has value E.
7569    CurrentOrder.assign(E, E);
7570    for (unsigned I = 0; I < E; ++I) {
7571      if (Indices[I] == PoisonMaskElem)
7572        continue;
7573      const unsigned ExtIdx = Indices[I] - MinIdx;
7574      if (CurrentOrder[ExtIdx] != E) {
7575        CurrentOrder.clear();
7576        return false;
7577      }
7578      ShouldKeepOrder &= ExtIdx == I;
7579      CurrentOrder[ExtIdx] = I;
7580    }
7581    if (ShouldKeepOrder)
7582      CurrentOrder.clear();
7583  
7584    return ShouldKeepOrder;
7585  }
7586  
7587  bool BoUpSLP::areAllUsersVectorized(
7588      Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589    return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7590           all_of(I->users(), [this](User *U) {
7591             return ScalarToTreeEntry.contains(U) ||
7592                    isVectorLikeInstWithConstOps(U) ||
7593                    (isa<ExtractElementInst>(U) && MustGather.contains(U));
7594           });
7595  }
7596  
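      // Returns the {intrinsic cost, library-call cost} pair for a vectorized
      // call. The library cost defaults to the intrinsic cost when no suitable
      // vector library function exists (or the call is marked nobuiltin).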
7597  static std::pair<InstructionCost, InstructionCost>
7598  getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599                     TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600                     ArrayRef<Type *> ArgTys) {
7601    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602  
7603    // Calculate the costs of the vector intrinsic and vector library calls.
7604    FastMathFlags FMF;
7605    if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7606      FMF = FPCI->getFastMathFlags();
7607    SmallVector<const Value *> Arguments(CI->args());
7608    IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609                                      dyn_cast<IntrinsicInst>(CI));
7610    auto IntrinsicCost =
7611      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7612  
7613    auto Shape = VFShape::get(CI->getFunctionType(),
7614                              ElementCount::getFixed(VecTy->getNumElements()),
7615                              false /*HasGlobalPred*/);
7616    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617    auto LibCost = IntrinsicCost;
7618    if (!CI->isNoBuiltin() && VecFunc) {
7619      // Calculate the cost of the vector library call.
7620      // If the corresponding vector call is cheaper, return its cost.
7621      LibCost =
7622          TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7623    }
7624    return {IntrinsicCost, LibCost};
7625  }
7626  
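      // Builds the two-source shuffle mask for an alternate-opcode node: lanes
      // whose scalar matches the alternate operation (per IsAltOp) select from
      // the second source (index Sz + Idx), the remaining lanes select from the
      // first source. The node's reorder and reuse-shuffle indices are applied
      // to the resulting mask.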
7627  void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628      const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629      SmallVectorImpl<Value *> *OpScalars,
7630      SmallVectorImpl<Value *> *AltScalars) const {
7631    unsigned Sz = Scalars.size();
7632    Mask.assign(Sz, PoisonMaskElem);
7633    SmallVector<int> OrderMask;
7634    if (!ReorderIndices.empty())
7635      inversePermutation(ReorderIndices, OrderMask);
7636    for (unsigned I = 0; I < Sz; ++I) {
7637      unsigned Idx = I;
7638      if (!ReorderIndices.empty())
7639        Idx = OrderMask[I];
7640      auto *OpInst = cast<Instruction>(Scalars[Idx]);
7641      if (IsAltOp(OpInst)) {
7642        Mask[I] = Sz + Idx;
7643        if (AltScalars)
7644          AltScalars->push_back(OpInst);
7645      } else {
7646        Mask[I] = Idx;
7647        if (OpScalars)
7648          OpScalars->push_back(OpInst);
7649      }
7650    }
7651    if (!ReuseShuffleIndices.empty()) {
7652      SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653      transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7654        return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655      });
7656      Mask.swap(NewMask);
7657    }
7658  }
7659  
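      // Returns true if I corresponds to the alternate operation AltOp rather
      // than the main operation MainOp. For compares this also accounts for
      // swapped predicates: a compare matching the main predicate or its swap
      // is treated as a main-operation instruction.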
7660  static bool isAlternateInstruction(const Instruction *I,
7661                                     const Instruction *MainOp,
7662                                     const Instruction *AltOp,
7663                                     const TargetLibraryInfo &TLI) {
7664    if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7665      auto *AltCI = cast<CmpInst>(AltOp);
7666      CmpInst::Predicate MainP = MainCI->getPredicate();
7667      CmpInst::Predicate AltP = AltCI->getPredicate();
7668      assert(MainP != AltP && "Expected different main/alternate predicates.");
7669      auto *CI = cast<CmpInst>(I);
7670      if (isCmpSameOrSwapped(MainCI, CI, TLI))
7671        return false;
7672      if (isCmpSameOrSwapped(AltCI, CI, TLI))
7673        return true;
7674      CmpInst::Predicate P = CI->getPredicate();
7675      CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7676  
7677      assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678             "CmpInst expected to match either main or alternate predicate or "
7679             "their swap.");
7680      (void)AltP;
7681      return MainP != P && MainP != SwappedP;
7682    }
7683    return I->getOpcode() == AltOp->getOpcode();
7684  }
7685  
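      // Derives the TTI operand-value kind and properties (uniform / constant /
      // power-of-two operands) for the scalars that will form a single vector
      // operand, so that per-operand cost queries can be more precise.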
7686  TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687    assert(!Ops.empty());
7688    const auto *Op0 = Ops.front();
7689  
7690    const bool IsConstant = all_of(Ops, [](Value *V) {
7691      // TODO: We should allow undef elements here
7692      return isConstant(V) && !isa<UndefValue>(V);
7693    });
7694    const bool IsUniform = all_of(Ops, [=](Value *V) {
7695      // TODO: We should allow undef elements here
7696      return V == Op0;
7697    });
7698    const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7699      // TODO: We should allow undef elements here
7700      if (auto *CI = dyn_cast<ConstantInt>(V))
7701        return CI->getValue().isPowerOf2();
7702      return false;
7703    });
7704    const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7705      // TODO: We should allow undef elements here
7706      if (auto *CI = dyn_cast<ConstantInt>(V))
7707        return CI->getValue().isNegatedPowerOf2();
7708      return false;
7709    });
7710  
7711    TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712    if (IsConstant && IsUniform)
7713      VK = TTI::OK_UniformConstantValue;
7714    else if (IsConstant)
7715      VK = TTI::OK_NonUniformConstantValue;
7716    else if (IsUniform)
7717      VK = TTI::OK_UniformValue;
7718  
7719    TTI::OperandValueProperties VP = TTI::OP_None;
7720    VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721    VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722  
7723    return {VK, VP};
7724  }
7725  
7726  namespace {
7727  /// The base class for shuffle instruction emission and shuffle cost estimation.
7728  class BaseShuffleAnalysis {
7729  protected:
7730    /// Checks if the mask is an identity mask.
7731    /// \param IsStrict if true, the function returns false if the mask size
7732    /// does not match the vector size.
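        /// E.g. <0, 1, poison, 3> is an identity mask for VF 4; with IsStrict
        /// set to false, <0, 1> is also accepted for VF 4 since it extracts the
        /// leading subvector.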
7733    static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734                               bool IsStrict) {
7735      int Limit = Mask.size();
7736      int VF = VecTy->getNumElements();
7737      int Index = -1;
7738      if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7739        return true;
7740      if (!IsStrict) {
7741        // Consider extract subvector starting from index 0.
7742        if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7743            Index == 0)
7744          return true;
7745        // All VF-size submasks are identity (e.g.
7746        // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747        if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7748              ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7749              return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7750                     ShuffleVectorInst::isIdentityMask(Slice, VF);
7751            }))
7752          return true;
7753      }
7754      return false;
7755    }
7756  
7757    /// Tries to combine 2 different masks into a single one.
7758    /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759    /// change the size of the vector, \p LocalVF is the original size of the
7760    /// shuffled vector.
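        /// For example (illustrative), with LocalVF = 4, Mask = <1, 0, 3, 2> and
        /// ExtMask = <2, poison, 0>, the combined mask becomes <3, poison, 1>.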
7761    static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762                             ArrayRef<int> ExtMask) {
7763      unsigned VF = Mask.size();
7764      SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765      for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766        if (ExtMask[I] == PoisonMaskElem)
7767          continue;
7768        int MaskedIdx = Mask[ExtMask[I] % VF];
7769        NewMask[I] =
7770            MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771      }
7772      Mask.swap(NewMask);
7773    }
7774  
7775    /// Looks through shuffles, trying to reduce the final number of shuffles in
7776    /// the code. The function looks through the previously emitted shuffle
7777    /// instructions and properly marks indices in the mask as undef.
7778    /// For example, given the code
7779    /// \code
7780    /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781    /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782    /// \endcode
7783    /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
7784    /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7785    /// <0, 1, 2, 3> for the shuffle.
7786    /// If 2 operands are of different size, the smallest one will be resized and
7787    /// the mask recalculated properly.
7788    /// For example, given the code
7789    /// \code
7790    /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791    /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792    /// \endcode
7793    /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
7794    /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7795    /// <0, 1, 2, 3> for the shuffle.
7796    /// So, it tries to transform permutations to a simple vector merge, if
7797    /// possible.
7798    /// \param V The input vector which must be shuffled using the given \p Mask.
7799    /// If the better candidate is found, \p V is set to this best candidate
7800    /// vector.
7801    /// \param Mask The input mask for the shuffle. If the best candidate is found
7802    /// during looking-through-shuffles attempt, it is updated accordingly.
7803    /// \param SinglePermute true if the shuffle operation is originally a
7804    /// single-value-permutation. In this case the look-through-shuffles procedure
7805    /// may look for resizing shuffles as the best candidates.
7806    /// \return true if the shuffle results in the non-resizing identity shuffle
7807    /// (and thus can be ignored), false - otherwise.
7808    static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809                                    bool SinglePermute) {
7810      Value *Op = V;
7811      ShuffleVectorInst *IdentityOp = nullptr;
7812      SmallVector<int> IdentityMask;
7813      while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7814        // Exit if not a fixed vector type or changing size shuffle.
7815        auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7816        if (!SVTy)
7817          break;
7818        // Remember the identity or broadcast mask, if it is not a resizing
7819        // shuffle. If no better candidates are found, this Op and Mask will be
7820        // used in the final shuffle.
7821        if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7822          if (!IdentityOp || !SinglePermute ||
7823              (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7824               !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7825                                                      IdentityMask.size()))) {
7826            IdentityOp = SV;
7827          // Store the current mask in IdentityMask so that we do not lose this
7828          // info if IdentityOp is selected as the best candidate for the
7829          // permutation.
7830            IdentityMask.assign(Mask);
7831          }
7832        }
7833        // Remember the broadcast mask. If no better candidates are found, this Op
7834        // and Mask will be used in the final shuffle.
7835        // Zero splat can be used as identity too, since it might be used with
7836        // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837        // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
7838        // is expensive, and the analysis finds out that the source vector is just
7839        // a broadcast, this original mask can be transformed to the identity mask
7840        // <0, 1, 2, 3>.
7841        // \code
7842        // %0 = shuffle %v, poison, zeroinitializer
7843        // %res = shuffle %0, poison, <3, 1, 2, 0>
7844        // \endcode
7845        // may be transformed to
7846        // \code
7847        // %0 = shuffle %v, poison, zeroinitializer
7848        // %res = shuffle %0, poison, <0, 1, 2, 3>
7849        // \endcode
7850        if (SV->isZeroEltSplat()) {
7851          IdentityOp = SV;
7852          IdentityMask.assign(Mask);
7853        }
7854        int LocalVF = Mask.size();
7855        if (auto *SVOpTy =
7856                dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7857          LocalVF = SVOpTy->getNumElements();
7858        SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859        for (auto [Idx, I] : enumerate(Mask)) {
7860          if (I == PoisonMaskElem ||
7861              static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862            continue;
7863          ExtMask[Idx] = SV->getMaskValue(I);
7864        }
7865        bool IsOp1Undef =
7866            isUndefVector(SV->getOperand(0),
7867                          buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7868                .all();
7869        bool IsOp2Undef =
7870            isUndefVector(SV->getOperand(1),
7871                          buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7872                .all();
7873        if (!IsOp1Undef && !IsOp2Undef) {
7874          // Update mask and mark undef elems.
7875          for (int &I : Mask) {
7876            if (I == PoisonMaskElem)
7877              continue;
7878            if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7879                PoisonMaskElem)
7880              I = PoisonMaskElem;
7881          }
7882          break;
7883        }
7884        SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7885                                     SV->getShuffleMask().end());
7886        combineMasks(LocalVF, ShuffleMask, Mask);
7887        Mask.swap(ShuffleMask);
7888        if (IsOp2Undef)
7889          Op = SV->getOperand(0);
7890        else
7891          Op = SV->getOperand(1);
7892      }
7893      if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7894          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7895          ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7896        if (IdentityOp) {
7897          V = IdentityOp;
7898          assert(Mask.size() == IdentityMask.size() &&
7899                 "Expected masks of same sizes.");
7900          // Clear known poison elements.
7901          for (auto [I, Idx] : enumerate(Mask))
7902            if (Idx == PoisonMaskElem)
7903              IdentityMask[I] = PoisonMaskElem;
7904          Mask.swap(IdentityMask);
7905          auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7906          return SinglePermute &&
7907                 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7908                                 /*IsStrict=*/true) ||
7909                  (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7910                   Shuffle->isZeroEltSplat() &&
7911                   ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7912        }
7913        V = Op;
7914        return false;
7915      }
7916      V = Op;
7917      return true;
7918    }
7919  
7920    /// Smart shuffle instruction emission, walks through shuffle trees and
7921    /// tries to find the best matching vector for the actual shuffle
7922    /// instruction.
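        /// When both inputs are live, the per-input masks are combined into a
        /// single two-source mask after looking through the feeding shuffles; if
        /// both operands resolve to the same vector and the combined mask is an
        /// identity (or an equivalent zero-element splat), the Builder's
        /// createIdentity hook is used instead of emitting a new shuffle.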
7923    template <typename T, typename ShuffleBuilderTy>
7924    static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7925                           ShuffleBuilderTy &Builder) {
7926      assert(V1 && "Expected at least one vector value.");
7927      if (V2)
7928        Builder.resizeToMatch(V1, V2);
7929      int VF = Mask.size();
7930      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7931        VF = FTy->getNumElements();
7932      if (V2 &&
7933          !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7934        // Peek through shuffles.
7935        Value *Op1 = V1;
7936        Value *Op2 = V2;
7937        int VF =
7938            cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7939        SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7940        SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7941        for (int I = 0, E = Mask.size(); I < E; ++I) {
7942          if (Mask[I] < VF)
7943            CombinedMask1[I] = Mask[I];
7944          else
7945            CombinedMask2[I] = Mask[I] - VF;
7946        }
7947        Value *PrevOp1;
7948        Value *PrevOp2;
7949        do {
7950          PrevOp1 = Op1;
7951          PrevOp2 = Op2;
7952          (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7953          (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7954          // Check if we have 2 resizing shuffles - need to peek through operands
7955          // again.
7956          if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7957            if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7958              SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7959              for (auto [Idx, I] : enumerate(CombinedMask1)) {
7960                if (I == PoisonMaskElem)
7961                  continue;
7962                ExtMask1[Idx] = SV1->getMaskValue(I);
7963              }
7964              SmallBitVector UseMask1 = buildUseMask(
7965                  cast<FixedVectorType>(SV1->getOperand(1)->getType())
7966                      ->getNumElements(),
7967                  ExtMask1, UseMask::SecondArg);
7968              SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7969              for (auto [Idx, I] : enumerate(CombinedMask2)) {
7970                if (I == PoisonMaskElem)
7971                  continue;
7972                ExtMask2[Idx] = SV2->getMaskValue(I);
7973              }
7974              SmallBitVector UseMask2 = buildUseMask(
7975                  cast<FixedVectorType>(SV2->getOperand(1)->getType())
7976                      ->getNumElements(),
7977                  ExtMask2, UseMask::SecondArg);
7978              if (SV1->getOperand(0)->getType() ==
7979                      SV2->getOperand(0)->getType() &&
7980                  SV1->getOperand(0)->getType() != SV1->getType() &&
7981                  isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7982                  isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7983                Op1 = SV1->getOperand(0);
7984                Op2 = SV2->getOperand(0);
7985                SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7986                                              SV1->getShuffleMask().end());
7987                int LocalVF = ShuffleMask1.size();
7988                if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7989                  LocalVF = FTy->getNumElements();
7990                combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7991                CombinedMask1.swap(ShuffleMask1);
7992                SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7993                                              SV2->getShuffleMask().end());
7994                LocalVF = ShuffleMask2.size();
7995                if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7996                  LocalVF = FTy->getNumElements();
7997                combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7998                CombinedMask2.swap(ShuffleMask2);
7999              }
8000            }
8001        } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002        Builder.resizeToMatch(Op1, Op2);
8003        VF = std::max(cast<VectorType>(Op1->getType())
8004                          ->getElementCount()
8005                          .getKnownMinValue(),
8006                      cast<VectorType>(Op2->getType())
8007                          ->getElementCount()
8008                          .getKnownMinValue());
8009        for (int I = 0, E = Mask.size(); I < E; ++I) {
8010          if (CombinedMask2[I] != PoisonMaskElem) {
8011            assert(CombinedMask1[I] == PoisonMaskElem &&
8012                   "Expected undefined mask element");
8013            CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8014          }
8015        }
8016        if (Op1 == Op2 &&
8017            (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
8018             (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
8019              isa<ShuffleVectorInst>(Op1) &&
8020              cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8021                  ArrayRef(CombinedMask1))))
8022          return Builder.createIdentity(Op1);
8023        return Builder.createShuffleVector(
8024            Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
8025            CombinedMask1);
8026      }
8027      if (isa<PoisonValue>(V1))
8028        return Builder.createPoison(
8029            cast<VectorType>(V1->getType())->getElementType(), Mask.size());
8030      SmallVector<int> NewMask(Mask.begin(), Mask.end());
8031      bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
8032      assert(V1 && "Expected non-null value after looking through shuffles.");
8033  
8034      if (!IsIdentity)
8035        return Builder.createShuffleVector(V1, NewMask);
8036      return Builder.createIdentity(V1);
8037    }
8038  };
8039  } // namespace
8040  
8041  /// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042  /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
8043  /// subvector pattern.
8044  static InstructionCost
8045  getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046                 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047                 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048                 int Index = 0, VectorType *SubTp = nullptr,
8049                 ArrayRef<const Value *> Args = std::nullopt) {
8050    if (Kind != TTI::SK_PermuteTwoSrc)
8051      return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052    int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053    int NumSubElts;
8054    if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055                               Mask, NumSrcElts, NumSubElts, Index)) {
8056      if (Index + NumSubElts > NumSrcElts &&
8057          Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058        return TTI.getShuffleCost(
8059            TTI::SK_InsertSubvector,
8060            getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8061            TTI::TCK_RecipThroughput, Index, Tp);
8062    }
8063    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064  }
8065  
8066  /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
8067  static std::pair<InstructionCost, InstructionCost>
8068  getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069              Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070              Type *ScalarTy, VectorType *VecTy) {
8071    InstructionCost ScalarCost = 0;
8072    InstructionCost VecCost = 0;
8073    // Here we differentiate two cases: (1) when Ptrs represent a regular
8074    // vectorization tree node (as they are pointer arguments of scattered
8075    // loads) or (2) when Ptrs are the arguments of loads or stores being
8076    // vectorized as plain wide unit-stride load/store since all the
8077    // loads/stores are known to be from/to adjacent locations.
8078    if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079      // Case 2: estimate costs for pointer related costs when vectorizing to
8080      // a wide load/store.
8081      // Scalar cost is estimated as a set of pointers with known relationship
8082      // between them.
8083      // For vector code we will use BasePtr as argument for the wide load/store
8084      // but we also need to account all the instructions which are going to
8085      // stay in vectorized code due to uses outside of these scalar
8086      // loads/stores.
8087      ScalarCost = TTI.getPointersChainCost(
8088          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8089          CostKind);
8090  
8091      SmallVector<const Value *> PtrsRetainedInVecCode;
8092      for (Value *V : Ptrs) {
8093        if (V == BasePtr) {
8094          PtrsRetainedInVecCode.push_back(V);
8095          continue;
8096        }
8097        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8098        // For simplicity assume Ptr to stay in vectorized code if it's not a
8099        // GEP instruction. We don't care since its cost is considered free.
8100        // TODO: We should check for any uses outside of vectorizable tree
8101        // rather than just single use.
8102        if (!Ptr || !Ptr->hasOneUse())
8103          PtrsRetainedInVecCode.push_back(V);
8104      }
8105  
8106      if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107        // If all pointers stay in vectorized code then we don't have
8108        // any savings on that.
8109        return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8110      }
8111      VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8112                                         TTI::PointersChainInfo::getKnownStride(),
8113                                         VecTy, CostKind);
8114    } else {
8115      // Case 1: Ptrs are the arguments of loads that we are going to transform
8116      // into masked gather load intrinsic.
8117      // All the scalar GEPs will be removed as a result of vectorization.
8118      // For any external uses of some lanes extract element instructions will
8119      // be generated (which cost is estimated separately).
8120      TTI::PointersChainInfo PtrsInfo =
8121          all_of(Ptrs,
8122                 [](const Value *V) {
8123                   auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8124                   return Ptr && !Ptr->hasAllConstantIndices();
8125                 })
8126              ? TTI::PointersChainInfo::getUnknownStride()
8127              : TTI::PointersChainInfo::getKnownStride();
8128  
8129      ScalarCost =
8130          TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8131      auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8132      if (!BaseGEP) {
8133        auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8134        if (It != Ptrs.end())
8135          BaseGEP = cast<GEPOperator>(*It);
8136      }
8137      if (BaseGEP) {
8138        SmallVector<const Value *> Indices(BaseGEP->indices());
8139        VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8140                                 BaseGEP->getPointerOperand(), Indices, VecTy,
8141                                 CostKind);
8142      }
8143    }
8144  
8145    return std::make_pair(ScalarCost, VecCost);
8146  }
8147  
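      // Post-processes the built tree: consecutive load/store nodes whose
      // scalars are in reverse order are turned into strided accesses (stride
      // -1) when the target reports the strided form as cheaper than a wide
      // memory operation plus a reverse shuffle.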
8148  void BoUpSLP::transformNodes() {
8149    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151      TreeEntry &E = *TE;
8152      switch (E.getOpcode()) {
8153      case Instruction::Load: {
8154        // No need to reorder masked gather loads, just reorder the scalar
8155        // operands.
8156        if (E.State != TreeEntry::Vectorize)
8157          break;
8158        Type *ScalarTy = E.getMainOp()->getType();
8159        auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8160        Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8161        // Check if profitable to represent consecutive load + reverse as strided
8162        // load with stride -1.
8163        if (isReverseOrder(E.ReorderIndices) &&
8164            TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8165          SmallVector<int> Mask;
8166          inversePermutation(E.ReorderIndices, Mask);
8167          auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8168          InstructionCost OriginalVecCost =
8169              TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8170                                   BaseLI->getPointerAddressSpace(), CostKind,
8171                                   TTI::OperandValueInfo()) +
8172              ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8173          InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174              Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8175              /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8176          if (StridedCost < OriginalVecCost)
8177            // Strided load is more profitable than consecutive load + reverse -
8178            // transform the node to strided load.
8179            E.State = TreeEntry::StridedVectorize;
8180        }
8181        break;
8182      }
8183      case Instruction::Store: {
8184        Type *ScalarTy =
8185            cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8186        auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8187        Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8188        // Check if profitable to represent consecutive store + reverse as strided
8189        // store with stride -1.
8190        if (isReverseOrder(E.ReorderIndices) &&
8191            TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8192          SmallVector<int> Mask;
8193          inversePermutation(E.ReorderIndices, Mask);
8194          auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8195          InstructionCost OriginalVecCost =
8196              TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8197                                   BaseSI->getPointerAddressSpace(), CostKind,
8198                                   TTI::OperandValueInfo()) +
8199              ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8200          InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201              Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8202              /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8203          if (StridedCost < OriginalVecCost)
8204          // Strided store is more profitable than consecutive store + reverse -
8205          // transform the node to strided store.
8206            E.State = TreeEntry::StridedVectorize;
8207        }
8208        break;
8209      }
8210      default:
8211        break;
8212      }
8213    }
8214  }
8215  
8216  /// Merges shuffle masks and emits final shuffle instruction, if required. It
8217  /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
8218  /// when the actual shuffle instruction is generated only if this is actually
8219  /// required. Otherwise, the shuffle instruction emission is delayed till the
8220  /// end of the process, to reduce the number of emitted instructions and further
8221  /// analysis/transformations.
8222  class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223    bool IsFinalized = false;
8224    SmallVector<int> CommonMask;
8225    SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226    Type *ScalarTy = nullptr;
8227    const TargetTransformInfo &TTI;
8228    InstructionCost Cost = 0;
8229    SmallDenseSet<Value *> VectorizedVals;
8230    BoUpSLP &R;
8231    SmallPtrSetImpl<Value *> &CheckedExtracts;
8232    constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233    /// While set, we are still estimating the cost for the same nodes and can
8234    /// delay the actual cost estimation (virtual shuffle instruction emission).
8235    /// May help to better estimate the cost if the same nodes must be permuted
8236    /// and allows moving most of the long shuffle cost estimation to TTI.
8237    bool SameNodesEstimated = true;
8238  
8239    static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240      if (Ty->getScalarType()->isPointerTy()) {
8241        Constant *Res = ConstantExpr::getIntToPtr(
8242            ConstantInt::getAllOnesValue(
8243                IntegerType::get(Ty->getContext(),
8244                                 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8245            Ty->getScalarType());
8246        if (auto *VTy = dyn_cast<VectorType>(Ty))
8247          Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8248        return Res;
8249      }
8250      return Constant::getAllOnesValue(Ty);
8251    }
8252  
8253    InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254      if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8255        return TTI::TCC_Free;
8256      auto *VecTy = getWidenedType(ScalarTy, VL.size());
8257      InstructionCost GatherCost = 0;
8258      SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259      // Improve gather cost for gather of loads, if we can group some of the
8260      // loads into vector loads.
8261      InstructionsState S = getSameOpcode(VL, *R.TLI);
8262      const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8263      unsigned MinVF = R.getMinVF(2 * Sz);
8264      if (VL.size() > 2 &&
8265          ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8266           (InVectors.empty() &&
8267            any_of(seq<unsigned>(0, VL.size() / MinVF),
8268                   [&](unsigned Idx) {
8269                     ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8270                     InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8271                     return S.getOpcode() == Instruction::Load &&
8272                            !S.isAltShuffle();
8273                   }))) &&
8274          !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8275          !isSplat(Gathers)) {
8276        InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8277        SetVector<Value *> VectorizedLoads;
8278        SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8279        SmallVector<unsigned> ScatterVectorized;
8280        unsigned StartIdx = 0;
8281        unsigned VF = VL.size() / 2;
8282        for (; VF >= MinVF; VF /= 2) {
8283          for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8284               Cnt += VF) {
8285            ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8286            if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8287              InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8288              if (SliceS.getOpcode() != Instruction::Load ||
8289                  SliceS.isAltShuffle())
8290                continue;
8291            }
8292            if (!VectorizedLoads.count(Slice.front()) &&
8293                !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8294              SmallVector<Value *> PointerOps;
8295              OrdersType CurrentOrder;
8296              LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8297                                                  CurrentOrder, PointerOps);
8298              switch (LS) {
8299              case LoadsState::Vectorize:
8300              case LoadsState::ScatterVectorize:
8301              case LoadsState::StridedVectorize:
8302                // Mark the vectorized loads so that we don't vectorize them
8303                // again.
8304                // TODO: better handling of loads with reorders.
8305                if (((LS == LoadsState::Vectorize ||
8306                      LS == LoadsState::StridedVectorize) &&
8307                     CurrentOrder.empty()) ||
8308                    (LS == LoadsState::StridedVectorize &&
8309                     isReverseOrder(CurrentOrder)))
8310                  VectorizedStarts.emplace_back(Cnt, LS);
8311                else
8312                  ScatterVectorized.push_back(Cnt);
8313                VectorizedLoads.insert(Slice.begin(), Slice.end());
8314                // If we vectorized initial block, no need to try to vectorize
8315                // it again.
8316                if (Cnt == StartIdx)
8317                  StartIdx += VF;
8318                break;
8319              case LoadsState::Gather:
8320                break;
8321              }
8322            }
8323          }
8324          // Check if the whole array was vectorized already - exit.
8325          if (StartIdx >= VL.size())
8326            break;
8327          // Found vectorizable parts - exit.
8328          if (!VectorizedLoads.empty())
8329            break;
8330        }
8331        if (!VectorizedLoads.empty()) {
8332          unsigned NumParts = TTI.getNumberOfParts(VecTy);
8333          bool NeedInsertSubvectorAnalysis =
8334              !NumParts || (VL.size() / VF) > NumParts;
8335          // Get the cost for gathered loads.
8336          for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8337            if (VectorizedLoads.contains(VL[I]))
8338              continue;
8339            GatherCost +=
8340                getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8341          }
8342          // Exclude potentially vectorized loads from list of gathered
8343          // scalars.
8344          Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8345          // The cost for vectorized loads.
8346          InstructionCost ScalarsCost = 0;
8347          for (Value *V : VectorizedLoads) {
8348            auto *LI = cast<LoadInst>(V);
8349            ScalarsCost +=
8350                TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8351                                    LI->getAlign(), LI->getPointerAddressSpace(),
8352                                    CostKind, TTI::OperandValueInfo(), LI);
8353          }
8354          auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
8355          for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8356            auto *LI = cast<LoadInst>(VL[P.first]);
8357            Align Alignment = LI->getAlign();
8358            GatherCost +=
8359                P.second == LoadsState::Vectorize
8360                    ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8361                                          LI->getPointerAddressSpace(), CostKind,
8362                                          TTI::OperandValueInfo(), LI)
8363                    : TTI.getStridedMemoryOpCost(
8364                          Instruction::Load, LoadTy, LI->getPointerOperand(),
8365                          /*VariableMask=*/false, Alignment, CostKind, LI);
8366            // Estimate GEP cost.
8367            SmallVector<Value *> PointerOps(VF);
8368            for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8369              PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8370            auto [ScalarGEPCost, VectorGEPCost] =
8371                getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8372                            Instruction::Load, CostKind, LI->getType(), LoadTy);
8373            GatherCost += VectorGEPCost - ScalarGEPCost;
8374          }
8375          for (unsigned P : ScatterVectorized) {
8376            auto *LI0 = cast<LoadInst>(VL[P]);
8377            ArrayRef<Value *> Slice = VL.slice(P, VF);
8378            Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8379            GatherCost += TTI.getGatherScatterOpCost(
8380                Instruction::Load, LoadTy, LI0->getPointerOperand(),
8381                /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8382            // Estimate GEP cost.
8383            SmallVector<Value *> PointerOps(VF);
8384            for (auto [I, V] : enumerate(Slice))
8385              PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8386            OrdersType Order;
8387            if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8388                                Order)) {
8389              // TODO: improve checks if GEPs can be vectorized.
8390              Value *Ptr0 = PointerOps.front();
8391              Type *ScalarTy = Ptr0->getType();
8392              auto *VecTy = getWidenedType(ScalarTy, VF);
8393              auto [ScalarGEPCost, VectorGEPCost] =
8394                  getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8395                              CostKind, ScalarTy, VecTy);
8396              GatherCost += VectorGEPCost - ScalarGEPCost;
8397              if (!Order.empty()) {
8398                SmallVector<int> Mask;
8399                inversePermutation(Order, Mask);
8400                GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8401                                               VecTy, Mask, CostKind);
8402              }
8403            } else {
8404              GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8405                                            PointerOps.front()->getType());
8406            }
8407          }
8408          if (NeedInsertSubvectorAnalysis) {
8409            // Add the cost for the subvectors insert.
8410            SmallVector<int> ShuffleMask(VL.size());
8411            for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8412              for (unsigned Idx : seq<unsigned>(0, E))
8413                ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8414              GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8415                                               ShuffleMask, CostKind, I, LoadTy);
8416            }
8417          }
8418          GatherCost -= ScalarsCost;
8419        }
8420        GatherCost = std::min(BaseCost, GatherCost);
8421      } else if (!Root && isSplat(VL)) {
8422        // Found the broadcasting of the single scalar, calculate the cost as
8423        // the broadcast.
8424        const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8425        assert(It != VL.end() && "Expected at least one non-undef value.");
8426        // Add broadcast for non-identity shuffle only.
8427        bool NeedShuffle =
8428            count(VL, *It) > 1 &&
8429            (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8430        if (!NeedShuffle)
8431          return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8432                                        CostKind, std::distance(VL.begin(), It),
8433                                        PoisonValue::get(VecTy), *It);
8434  
8435        SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8436        transform(VL, ShuffleMask.begin(), [](Value *V) {
8437          return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8438        });
8439        InstructionCost InsertCost =
8440            TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8441                                   PoisonValue::get(VecTy), *It);
8442        return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
8443                                               VecTy, ShuffleMask, CostKind,
8444                                               /*Index=*/0, /*SubTp=*/nullptr,
8445                                               /*Args=*/*It);
8446      }
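        // Add the cost of building the gather sequence itself; gathering only
        // undef values is free.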
8447      return GatherCost +
8448             (all_of(Gathers, IsaPred<UndefValue>)
8449                  ? TTI::TCC_Free
8450                  : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8451                                    ScalarTy));
8452    };
8453  
8454    /// Compute the cost of creating a vector containing the extracted values from
8455    /// \p VL.
8456    InstructionCost
8457    computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8458                       ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8459                       unsigned NumParts) {
8460      assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8461      unsigned NumElts =
8462          std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8463            auto *EE = dyn_cast<ExtractElementInst>(V);
8464            if (!EE)
8465              return Sz;
8466            auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8467            if (!VecTy)
8468              return Sz;
8469            return std::max(Sz, VecTy->getNumElements());
8470          });
8471      // FIXME: this must be moved to TTI for better estimation.
8472      unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
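          // Checks whether the sub-mask for a single destination register can be
          // modeled as a shuffle of at most two source registers. On success the
          // sub-mask is rewritten in terms of register-local element indices, the
          // offsets of the used registers are recorded in Indices and the shuffle
          // kind is returned; otherwise std::nullopt is returned.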
8473      auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8474                                          SmallVectorImpl<unsigned> &Indices)
8475          -> std::optional<TTI::ShuffleKind> {
8476        if (NumElts <= EltsPerVector)
8477          return std::nullopt;
8478        int OffsetReg0 =
8479            alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8480                                      [](int S, int I) {
8481                                        if (I == PoisonMaskElem)
8482                                          return S;
8483                                        return std::min(S, I);
8484                                      }),
8485                      EltsPerVector);
8486        int OffsetReg1 = OffsetReg0;
8487        DenseSet<int> RegIndices;
8488        // Check whether the mask permutes one input vector or two input vectors.
8489        TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8490        int FirstRegId = -1;
8491        Indices.assign(1, OffsetReg0);
8492        for (auto [Pos, I] : enumerate(Mask)) {
8493          if (I == PoisonMaskElem)
8494            continue;
8495          int Idx = I - OffsetReg0;
8496          int RegId =
8497              (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8498          if (FirstRegId < 0)
8499            FirstRegId = RegId;
8500          RegIndices.insert(RegId);
8501          if (RegIndices.size() > 2)
8502            return std::nullopt;
8503          if (RegIndices.size() == 2) {
8504            ShuffleKind = TTI::SK_PermuteTwoSrc;
8505            if (Indices.size() == 1) {
8506              OffsetReg1 = alignDown(
8507                  std::accumulate(
8508                      std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8509                      [&](int S, int I) {
8510                        if (I == PoisonMaskElem)
8511                          return S;
8512                        int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8513                                    ((I - OffsetReg0) % NumElts) / EltsPerVector;
8514                        if (RegId == FirstRegId)
8515                          return S;
8516                        return std::min(S, I);
8517                      }),
8518                  EltsPerVector);
8519              Indices.push_back(OffsetReg1 % NumElts);
8520            }
8521            Idx = I - OffsetReg1;
8522          }
8523          I = (Idx % NumElts) % EltsPerVector +
8524              (RegId == FirstRegId ? 0 : EltsPerVector);
8525        }
8526        return ShuffleKind;
8527      };
8528      InstructionCost Cost = 0;
8529  
8530      // Process extracts in blocks of EltsPerVector to check if the source vector
8531      // operand can be re-used directly. If not, add the cost of creating a
8532      // shuffle to extract the values into a vector register.
8533      for (unsigned Part : seq<unsigned>(NumParts)) {
8534        if (!ShuffleKinds[Part])
8535          continue;
8536        ArrayRef<int> MaskSlice = Mask.slice(
8537            Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8538        SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8539        copy(MaskSlice, SubMask.begin());
8540        SmallVector<unsigned, 2> Indices;
8541        std::optional<TTI::ShuffleKind> RegShuffleKind =
8542            CheckPerRegistersShuffle(SubMask, Indices);
8543        if (!RegShuffleKind) {
8544          if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8545              !ShuffleVectorInst::isIdentityMask(
8546                  MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8547            Cost +=
8548                ::getShuffleCost(TTI, *ShuffleKinds[Part],
8549                                 getWidenedType(ScalarTy, NumElts), MaskSlice);
8550          continue;
8551        }
8552        if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8553            !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8554          Cost +=
8555              ::getShuffleCost(TTI, *RegShuffleKind,
8556                               getWidenedType(ScalarTy, EltsPerVector), SubMask);
8557        }
8558        for (unsigned Idx : Indices) {
8559          assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8560                 "SK_ExtractSubvector index out of range");
8561          Cost += ::getShuffleCost(
8562              TTI, TTI::SK_ExtractSubvector,
8563              getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
8564              std::nullopt, CostKind, Idx,
8565              getWidenedType(ScalarTy, EltsPerVector));
8566        }
8567        // Second attempt: check whether a single permute has a lower estimated
8568        // cost than the subvector extracts.
8569        SubMask.assign(NumElts, PoisonMaskElem);
8570        copy(MaskSlice, SubMask.begin());
8571        InstructionCost OriginalCost = ::getShuffleCost(
8572            TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
8573        if (OriginalCost < Cost)
8574          Cost = OriginalCost;
8575      }
8576      return Cost;
8577    }
8578    /// Transforms mask \p CommonMask according to the given \p Mask so that it
8579    /// properly refers to the result of the emitted shuffle.
8580    static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581                                          ArrayRef<int> Mask) {
8582      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583        if (Mask[Idx] != PoisonMaskElem)
8584          CommonMask[Idx] = Idx;
8585    }
8586    /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
8587    /// given mask \p Mask and the register number \p Part, which covers
8588    /// \p SliceSize elements.
8589    void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8590                                  ArrayRef<int> Mask, unsigned Part,
8591                                  unsigned SliceSize) {
8592      if (SameNodesEstimated) {
8593        // Delay the cost estimation if the same nodes are being reshuffled.
8594        // If the cost of reshuffling E1 and E2 was already requested, there is no
8595        // need to estimate another cost with the sub-Mask; instead, include this
8596        // sub-Mask into the CommonMask so that it is estimated later, avoiding a
8597        // double cost estimation.
8598        if ((InVectors.size() == 2 &&
8599             InVectors.front().get<const TreeEntry *>() == &E1 &&
8600             InVectors.back().get<const TreeEntry *>() == E2) ||
8601            (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8602          unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8603          assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8604                        [](int Idx) { return Idx == PoisonMaskElem; }) &&
8605                 "Expected all poisoned elements.");
8606          ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8607          copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8608          return;
8609        }
8610        // Found non-matching nodes - estimate the cost for the nodes matched so
8611        // far and transform the mask.
8612        Cost += createShuffle(InVectors.front(),
8613                              InVectors.size() == 1 ? nullptr : InVectors.back(),
8614                              CommonMask);
8615        transformMaskAfterShuffle(CommonMask, CommonMask);
8616      }
8617      SameNodesEstimated = false;
8618      if (!E2 && InVectors.size() == 1) {
8619        unsigned VF = E1.getVectorFactor();
8620        if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8621          VF = std::max(VF,
8622                        cast<FixedVectorType>(V1->getType())->getNumElements());
8623        } else {
8624          const auto *E = InVectors.front().get<const TreeEntry *>();
8625          VF = std::max(VF, E->getVectorFactor());
8626        }
8627        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8628          if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8629            CommonMask[Idx] = Mask[Idx] + VF;
8630        Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8631        transformMaskAfterShuffle(CommonMask, CommonMask);
8632      } else {
8633        Cost += createShuffle(&E1, E2, Mask);
8634        transformMaskAfterShuffle(CommonMask, Mask);
8635      }
8636    }
8637  
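        /// Helper passed to BaseShuffleAnalysis::createShuffle() below; it mirrors
        /// the shuffle-builder interface but returns shuffle costs instead of
        /// emitting instructions. Empty and identity masks are treated as free.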
8638    class ShuffleCostBuilder {
8639      const TargetTransformInfo &TTI;
8640  
8641      static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642        int Index = -1;
8643        return Mask.empty() ||
8644               (VF == Mask.size() &&
8645                ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8646               (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8647                Index == 0);
8648      }
8649  
8650    public:
8651      ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652      ~ShuffleCostBuilder() = default;
8653      InstructionCost createShuffleVector(Value *V1, Value *,
8654                                          ArrayRef<int> Mask) const {
8655      // An empty mask or an identity mask is free.
8656        unsigned VF =
8657            cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8658        if (isEmptyOrIdentity(Mask, VF))
8659          return TTI::TCC_Free;
8660        return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8661                                cast<VectorType>(V1->getType()), Mask);
8662      }
8663      InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664      // An empty mask or an identity mask is free.
8665        unsigned VF =
8666            cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8667        if (isEmptyOrIdentity(Mask, VF))
8668          return TTI::TCC_Free;
8669        return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
8670                                  cast<VectorType>(V1->getType()), Mask);
8671      }
8672      InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8673      InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674        return TTI::TCC_Free;
8675      }
8676      void resizeToMatch(Value *&, Value *&) const {}
8677    };
8678  
8679    /// Smart shuffle instruction emission, walks through the shuffle trees and
8680    /// tries to find the best matching vector for the actual shuffle
8681    /// instruction.
8682    InstructionCost
8683    createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8684                  const PointerUnion<Value *, const TreeEntry *> &P2,
8685                  ArrayRef<int> Mask) {
8686      ShuffleCostBuilder Builder(TTI);
8687      SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8688      Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8689      unsigned CommonVF = Mask.size();
8690      InstructionCost ExtraCost = 0;
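          // If minimum bitwidth analysis (or a plain type mismatch) gives the entry
          // an element type different from ScalarTy, account for the cost of the
          // vector cast needed to bring it to the common scalar type.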
8691      auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8692                                          unsigned VF) -> InstructionCost {
8693        if (E.isGather() && allConstant(E.Scalars))
8694          return TTI::TCC_Free;
8695        Type *EScalarTy = E.Scalars.front()->getType();
8696        bool IsSigned = true;
8697        if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8698          EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8699          IsSigned = It->second.second;
8700        }
8701        if (EScalarTy != ScalarTy) {
8702          unsigned CastOpcode = Instruction::Trunc;
8703          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8704          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8705          if (DstSz > SrcSz)
8706            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8707          return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
8708                                      getWidenedType(EScalarTy, VF),
8709                                      TTI::CastContextHint::None, CostKind);
8710        }
8711        return TTI::TCC_Free;
8712      };
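          // Same as above, but for an already materialized vector value rather than
          // a tree entry.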
8713      auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8714        if (isa<Constant>(V))
8715          return TTI::TCC_Free;
8716        auto *VecTy = cast<VectorType>(V->getType());
8717        Type *EScalarTy = VecTy->getElementType();
8718        if (EScalarTy != ScalarTy) {
8719          bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8720          unsigned CastOpcode = Instruction::Trunc;
8721          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8722          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8723          if (DstSz > SrcSz)
8724            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8725          return TTI.getCastInstrCost(
8726              CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8727              VecTy, TTI::CastContextHint::None, CostKind);
8728        }
8729        return TTI::TCC_Free;
8730      };
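          // Classify the two inputs (tree entries vs. materialized vectors), compute
          // the common vector factor, normalize the inputs to placeholder constant
          // vectors of that width and accumulate the extra cast costs caused by
          // minimum bitwidth differences.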
8731      if (!V1 && !V2 && !P2.isNull()) {
8732        // Shuffle 2 entry nodes.
8733        const TreeEntry *E = P1.get<const TreeEntry *>();
8734        unsigned VF = E->getVectorFactor();
8735        const TreeEntry *E2 = P2.get<const TreeEntry *>();
8736        CommonVF = std::max(VF, E2->getVectorFactor());
8737        assert(all_of(Mask,
8738                      [=](int Idx) {
8739                        return Idx < 2 * static_cast<int>(CommonVF);
8740                      }) &&
8741               "All elements in mask must be less than 2 * CommonVF.");
8742        if (E->Scalars.size() == E2->Scalars.size()) {
8743          SmallVector<int> EMask = E->getCommonMask();
8744          SmallVector<int> E2Mask = E2->getCommonMask();
8745          if (!EMask.empty() || !E2Mask.empty()) {
8746            for (int &Idx : CommonMask) {
8747              if (Idx == PoisonMaskElem)
8748                continue;
8749              if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8750                Idx = EMask[Idx];
8751              else if (Idx >= static_cast<int>(CommonVF))
8752                Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8753                      E->Scalars.size();
8754            }
8755          }
8756          CommonVF = E->Scalars.size();
8757          ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8758                       GetNodeMinBWAffectedCost(*E2, CommonVF);
8759        } else {
8760          ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8761                       GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8762        }
8763        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8764        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8765      } else if (!V1 && P2.isNull()) {
8766        // Shuffle single entry node.
8767        const TreeEntry *E = P1.get<const TreeEntry *>();
8768        unsigned VF = E->getVectorFactor();
8769        CommonVF = VF;
8770        assert(
8771            all_of(Mask,
8772                   [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8773            "All elements in mask must be less than CommonVF.");
8774        if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8775          SmallVector<int> EMask = E->getCommonMask();
8776          assert(!EMask.empty() && "Expected non-empty common mask.");
8777          for (int &Idx : CommonMask) {
8778            if (Idx != PoisonMaskElem)
8779              Idx = EMask[Idx];
8780          }
8781          CommonVF = E->Scalars.size();
8782        }
8783        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8784        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8785        // Not identity/broadcast? Try to see if the original vector is better.
8786        if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8787            CommonVF == CommonMask.size() &&
8788            any_of(enumerate(CommonMask),
8789                   [](const auto &&P) {
8790                     return P.value() != PoisonMaskElem &&
8791                            static_cast<unsigned>(P.value()) != P.index();
8792                   }) &&
8793            any_of(CommonMask,
8794                   [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8795          SmallVector<int> ReorderMask;
8796          inversePermutation(E->ReorderIndices, ReorderMask);
8797          ::addMask(CommonMask, ReorderMask);
8798        }
8799      } else if (V1 && P2.isNull()) {
8800        // Shuffle single vector.
8801        ExtraCost += GetValueMinBWAffectedCost(V1);
8802        CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8803        assert(
8804            all_of(Mask,
8805                   [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8806            "All elements in mask must be less than CommonVF.");
8807      } else if (V1 && !V2) {
8808        // Shuffle vector and tree node.
8809        unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8810        const TreeEntry *E2 = P2.get<const TreeEntry *>();
8811        CommonVF = std::max(VF, E2->getVectorFactor());
8812        assert(all_of(Mask,
8813                      [=](int Idx) {
8814                        return Idx < 2 * static_cast<int>(CommonVF);
8815                      }) &&
8816               "All elements in mask must be less than 2 * CommonVF.");
8817        if (E2->Scalars.size() == VF && VF != CommonVF) {
8818          SmallVector<int> E2Mask = E2->getCommonMask();
8819          assert(!E2Mask.empty() && "Expected non-empty common mask.");
8820          for (int &Idx : CommonMask) {
8821            if (Idx == PoisonMaskElem)
8822              continue;
8823            if (Idx >= static_cast<int>(CommonVF))
8824              Idx = E2Mask[Idx - CommonVF] + VF;
8825          }
8826          CommonVF = VF;
8827        }
8828        ExtraCost += GetValueMinBWAffectedCost(V1);
8829        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8830        ExtraCost += GetNodeMinBWAffectedCost(
8831            *E2, std::min(CommonVF, E2->getVectorFactor()));
8832        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8833      } else if (!V1 && V2) {
8834        // Shuffle vector and tree node.
8835        unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8836        const TreeEntry *E1 = P1.get<const TreeEntry *>();
8837        CommonVF = std::max(VF, E1->getVectorFactor());
8838        assert(all_of(Mask,
8839                      [=](int Idx) {
8840                        return Idx < 2 * static_cast<int>(CommonVF);
8841                      }) &&
8842               "All elements in mask must be less than 2 * CommonVF.");
8843        if (E1->Scalars.size() == VF && VF != CommonVF) {
8844          SmallVector<int> E1Mask = E1->getCommonMask();
8845          assert(!E1Mask.empty() && "Expected non-empty common mask.");
8846          for (int &Idx : CommonMask) {
8847            if (Idx == PoisonMaskElem)
8848              continue;
8849            if (Idx >= static_cast<int>(CommonVF))
8850              Idx = E1Mask[Idx - CommonVF] + VF;
8851            else
8852              Idx = E1Mask[Idx];
8853          }
8854          CommonVF = VF;
8855        }
8856        ExtraCost += GetNodeMinBWAffectedCost(
8857            *E1, std::min(CommonVF, E1->getVectorFactor()));
8858        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8859        ExtraCost += GetValueMinBWAffectedCost(V2);
8860        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8861      } else {
8862        assert(V1 && V2 && "Expected both vectors.");
8863        unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8864        CommonVF =
8865            std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8866        assert(all_of(Mask,
8867                      [=](int Idx) {
8868                        return Idx < 2 * static_cast<int>(CommonVF);
8869                      }) &&
8870               "All elements in mask must be less than 2 * CommonVF.");
8871        ExtraCost +=
8872            GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8873        if (V1->getType() != V2->getType()) {
8874          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8875          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8876        } else {
8877          if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8878            V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8879          if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8880            V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8881        }
8882      }
8883      InVectors.front() =
8884          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8885      if (InVectors.size() == 2)
8886        InVectors.pop_back();
8887      return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8888                             V1, V2, CommonMask, Builder);
8889    }
8890  
8891  public:
8892    ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8893                         ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8894                         SmallPtrSetImpl<Value *> &CheckedExtracts)
8895        : ScalarTy(ScalarTy), TTI(TTI),
8896          VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8897          CheckedExtracts(CheckedExtracts) {}
8898    Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8899                          ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8900                          unsigned NumParts, bool &UseVecBaseAsInput) {
8901      UseVecBaseAsInput = false;
8902      if (Mask.empty())
8903        return nullptr;
8904      Value *VecBase = nullptr;
8905      ArrayRef<Value *> VL = E->Scalars;
8906      // If the resulting type is scalarized, do not adjust the cost.
8907      if (NumParts == VL.size())
8908        return nullptr;
8909      // Check if the extracts can be considered reused, i.e. if the same
8910      // extractelements were already vectorized.
8911      bool PrevNodeFound = any_of(
8912          ArrayRef(R.VectorizableTree).take_front(E->Idx),
8913          [&](const std::unique_ptr<TreeEntry> &TE) {
8914            return ((!TE->isAltShuffle() &&
8915                     TE->getOpcode() == Instruction::ExtractElement) ||
8916                    TE->isGather()) &&
8917                   all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8918                     return VL.size() > Data.index() &&
8919                            (Mask[Data.index()] == PoisonMaskElem ||
8920                             isa<UndefValue>(VL[Data.index()]) ||
8921                             Data.value() == VL[Data.index()]);
8922                   });
8923          });
8924      SmallPtrSet<Value *, 4> UniqueBases;
8925      unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
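          // Walk the extracts part by part: record the unique source vectors and
          // take cost credit for extractelement instructions that will become dead
          // after vectorization.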
8926      for (unsigned Part : seq<unsigned>(NumParts)) {
8927        unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
8928        ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
8929        for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
8930          // Ignore non-extractelement scalars.
8931          if (isa<UndefValue>(V) ||
8932              (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8933            continue;
8934          // If all users of the instruction are going to be vectorized and this
8935          // instruction itself is not going to be vectorized, consider this
8936          // instruction as dead and remove its cost from the final cost of the
8937          // vectorized tree.
8938          // Also, avoid adjusting the cost for extractelements with multiple uses
8939          // in different graph entries.
8940          auto *EE = cast<ExtractElementInst>(V);
8941          VecBase = EE->getVectorOperand();
8942          UniqueBases.insert(VecBase);
8943          const TreeEntry *VE = R.getTreeEntry(V);
8944          if (!CheckedExtracts.insert(V).second ||
8945              !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8946              any_of(EE->users(),
8947                     [&](User *U) {
8948                       return isa<GetElementPtrInst>(U) &&
8949                              !R.areAllUsersVectorized(cast<Instruction>(U),
8950                                                       &VectorizedVals);
8951                     }) ||
8952              (VE && VE != E))
8953            continue;
8954          std::optional<unsigned> EEIdx = getExtractIndex(EE);
8955          if (!EEIdx)
8956            continue;
8957          unsigned Idx = *EEIdx;
8958          // Take credit for the instruction that will become dead.
8959          if (EE->hasOneUse() || !PrevNodeFound) {
8960            Instruction *Ext = EE->user_back();
8961            if (isa<SExtInst, ZExtInst>(Ext) &&
8962                all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8963              // Use getExtractWithExtendCost() to calculate the cost of
8964              // extractelement/ext pair.
8965              Cost -=
8966                  TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8967                                               EE->getVectorOperandType(), Idx);
8968              // Add back the cost of s|zext which is subtracted separately.
8969              Cost += TTI.getCastInstrCost(
8970                  Ext->getOpcode(), Ext->getType(), EE->getType(),
8971                  TTI::getCastContextHint(Ext), CostKind, Ext);
8972              continue;
8973            }
8974          }
8975          Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8976                                         CostKind, Idx);
8977        }
8978      }
8979      // Check that the gather of extractelements can be represented as just a
8980      // shuffle of one or two vectors the scalars were extracted from.
8981      // We have found a bunch of extractelement instructions that must be
8982      // gathered into a vector and can be represented as a permutation of the
8983      // elements of a single input vector or of two input vectors.
8984      // This is skipped if the same extractelements were already vectorized.
8985      if (!PrevNodeFound)
8986        Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8987      InVectors.assign(1, E);
8988      CommonMask.assign(Mask.begin(), Mask.end());
8989      transformMaskAfterShuffle(CommonMask, CommonMask);
8990      SameNodesEstimated = false;
8991      if (NumParts != 1 && UniqueBases.size() != 1) {
8992        UseVecBaseAsInput = true;
8993        VecBase =
8994            Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8995      }
8996      return VecBase;
8997    }
8998    /// Checks if the specified entry \p E needs to be delayed because of its
8999    /// dependency nodes.
9000    std::optional<InstructionCost>
9001    needToDelay(const TreeEntry *,
9002                ArrayRef<SmallVector<const TreeEntry *>>) const {
9003      // No need to delay the cost estimation during analysis.
9004      return std::nullopt;
9005    }
9006    void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007      if (&E1 == &E2) {
9008        assert(all_of(Mask,
9009                      [&](int Idx) {
9010                        return Idx < static_cast<int>(E1.getVectorFactor());
9011                      }) &&
9012               "Expected single vector shuffle mask.");
9013        add(E1, Mask);
9014        return;
9015      }
9016      if (InVectors.empty()) {
9017        CommonMask.assign(Mask.begin(), Mask.end());
9018        InVectors.assign({&E1, &E2});
9019        return;
9020      }
9021      assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022      auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9023      unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9024      if (NumParts == 0 || NumParts >= Mask.size())
9025        NumParts = 1;
9026      unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9027      const auto *It =
9028          find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9029      unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9030      estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9031    }
9032    void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033      if (InVectors.empty()) {
9034        CommonMask.assign(Mask.begin(), Mask.end());
9035        InVectors.assign(1, &E1);
9036        return;
9037      }
9038      assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039      auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9040      unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9041      if (NumParts == 0 || NumParts >= Mask.size())
9042        NumParts = 1;
9043      unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9044      const auto *It =
9045          find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9046      unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9047      estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9048      if (!SameNodesEstimated && InVectors.size() == 1)
9049        InVectors.emplace_back(&E1);
9050    }
9051    /// Adds 2 input vectors and the mask for their shuffling.
9052    void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9053      // This may only occur when shuffling 2 vectors with extractelements, which
9054      // is already handled in adjustExtracts.
9055      assert(InVectors.size() == 1 &&
9056             all_of(enumerate(CommonMask),
9057                    [&](auto P) {
9058                      if (P.value() == PoisonMaskElem)
9059                        return Mask[P.index()] == PoisonMaskElem;
9060                      auto *EI =
9061                          cast<ExtractElementInst>(InVectors.front()
9062                                                       .get<const TreeEntry *>()
9063                                                       ->Scalars[P.index()]);
9064                      return EI->getVectorOperand() == V1 ||
9065                             EI->getVectorOperand() == V2;
9066                    }) &&
9067             "Expected extractelement vectors.");
9068    }
9069    /// Adds another one input vector and the mask for the shuffling.
9070    void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9071      if (InVectors.empty()) {
9072        assert(CommonMask.empty() && !ForExtracts &&
9073               "Expected empty input mask/vectors.");
9074        CommonMask.assign(Mask.begin(), Mask.end());
9075        InVectors.assign(1, V1);
9076        return;
9077      }
9078      if (ForExtracts) {
9079        // No need to add vectors here; they were already handled in adjustExtracts.
9080        assert(InVectors.size() == 1 &&
9081               InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9082               all_of(enumerate(CommonMask),
9083                      [&](auto P) {
9084                        Value *Scalar = InVectors.front()
9085                                            .get<const TreeEntry *>()
9086                                            ->Scalars[P.index()];
9087                        if (P.value() == PoisonMaskElem)
9088                          return P.value() == Mask[P.index()] ||
9089                                 isa<UndefValue>(Scalar);
9090                        if (isa<Constant>(V1))
9091                          return true;
9092                        auto *EI = cast<ExtractElementInst>(Scalar);
9093                        return EI->getVectorOperand() == V1;
9094                      }) &&
9095               "Expected only tree entry for extractelement vectors.");
9096        return;
9097      }
9098      assert(!InVectors.empty() && !CommonMask.empty() &&
9099             "Expected only tree entries from extracts/reused buildvectors.");
9100      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
9101      if (InVectors.size() == 2) {
9102        Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
9103        transformMaskAfterShuffle(CommonMask, CommonMask);
9104        VF = std::max<unsigned>(VF, CommonMask.size());
9105      } else if (const auto *InTE =
9106                     InVectors.front().dyn_cast<const TreeEntry *>()) {
9107        VF = std::max(VF, InTE->getVectorFactor());
9108      } else {
9109        VF = std::max(
9110            VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
9111                    ->getNumElements());
9112      }
9113      InVectors.push_back(V1);
9114      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9115        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9116          CommonMask[Idx] = Mask[Idx] + VF;
9117    }
9118    Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119                  Value *Root = nullptr) {
9120      Cost += getBuildVectorCost(VL, Root);
9121      if (!Root) {
9122        // FIXME: Need to find a way to avoid use of getNullValue here.
9123        SmallVector<Constant *> Vals;
9124        unsigned VF = VL.size();
9125        if (MaskVF != 0)
9126          VF = std::min(VF, MaskVF);
9127        for (Value *V : VL.take_front(VF)) {
9128          if (isa<UndefValue>(V)) {
9129            Vals.push_back(cast<Constant>(V));
9130            continue;
9131          }
9132          Vals.push_back(Constant::getNullValue(V->getType()));
9133        }
9134        return ConstantVector::get(Vals);
9135      }
9136      return ConstantVector::getSplat(
9137          ElementCount::getFixed(
9138              cast<FixedVectorType>(Root->getType())->getNumElements()),
9139          getAllOnesValue(*R.DL, ScalarTy));
9140    }
9141    InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
9142    /// Finalize emission of the shuffles.
9143    InstructionCost
9144    finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9145             function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9146      IsFinalized = true;
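          // If a delayed action is provided, first pay for the shuffle of the
          // currently accumulated inputs, reset the common mask to an identity over
          // the produced elements and let the action update the vector and the mask.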
9147      if (Action) {
9148        const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9149        if (InVectors.size() == 2)
9150          Cost += createShuffle(Vec, InVectors.back(), CommonMask);
9151        else
9152          Cost += createShuffle(Vec, nullptr, CommonMask);
9153        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9154          if (CommonMask[Idx] != PoisonMaskElem)
9155            CommonMask[Idx] = Idx;
9156        assert(VF > 0 &&
9157               "Expected vector length for the final value before action.");
9158        Value *V = Vec.get<Value *>();
9159        Action(V, CommonMask);
9160        InVectors.front() = V;
9161      }
9162      ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
9163      if (CommonMask.empty()) {
9164        assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9165        return Cost;
9166      }
9167      return Cost +
9168             createShuffle(InVectors.front(),
9169                           InVectors.size() == 2 ? InVectors.back() : nullptr,
9170                           CommonMask);
9171    }
9172  
9173    ~ShuffleCostEstimator() {
9174      assert((IsFinalized || CommonMask.empty()) &&
9175             "Shuffle construction must be finalized.");
9176    }
9177  };
9178  
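      /// Returns the tree entry that corresponds to operand \p Idx of entry \p E:
      /// first a vectorized entry for the operand value, then one of its
      /// multi-node copies, and finally a gather entry that lists \p E as a user
      /// on edge \p Idx.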
9179  const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9180                                                     unsigned Idx) const {
9181    Value *Op = E->getOperand(Idx).front();
9182    if (const TreeEntry *TE = getTreeEntry(Op)) {
9183      if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9184            return EI.EdgeIdx == Idx && EI.UserTE == E;
9185          }) != TE->UserTreeIndices.end())
9186        return TE;
9187      auto MIt = MultiNodeScalars.find(Op);
9188      if (MIt != MultiNodeScalars.end()) {
9189        for (const TreeEntry *TE : MIt->second) {
9190          if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9191                return EI.EdgeIdx == Idx && EI.UserTE == E;
9192              }) != TE->UserTreeIndices.end())
9193            return TE;
9194        }
9195      }
9196    }
9197    const auto *It =
9198        find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9199          return TE->isGather() &&
9200                 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9201                   return EI.EdgeIdx == Idx && EI.UserTE == E;
9202                 }) != TE->UserTreeIndices.end();
9203        });
9204    assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9205    return It->get();
9206  }
9207  
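      /// Returns the cast context hint for the entry: GatherScatter for
      /// scatter/strided vectorization, Reversed for reverse-ordered vectorized
      /// loads, Normal for vectorized loads without reordering, None otherwise.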
9208  TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9209    if (TE.State == TreeEntry::ScatterVectorize ||
9210        TE.State == TreeEntry::StridedVectorize)
9211      return TTI::CastContextHint::GatherScatter;
9212    if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9213        !TE.isAltShuffle()) {
9214      if (TE.ReorderIndices.empty())
9215        return TTI::CastContextHint::Normal;
9216      SmallVector<int> Mask;
9217      inversePermutation(TE.ReorderIndices, Mask);
9218      if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9219        return TTI::CastContextHint::Reversed;
9220    }
9221    return TTI::CastContextHint::None;
9222  }
9223  
9224  /// Builds the arguments types vector for the given call instruction with the
9225  /// given \p ID for the specified vector factor.
9226  static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9227                                                    const Intrinsic::ID ID,
9228                                                    const unsigned VF,
9229                                                    unsigned MinBW) {
9230    SmallVector<Type *> ArgTys;
9231    for (auto [Idx, Arg] : enumerate(CI->args())) {
9232      if (ID != Intrinsic::not_intrinsic) {
9233        if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9234          ArgTys.push_back(Arg->getType());
9235          continue;
9236        }
9237        if (MinBW > 0) {
9238          ArgTys.push_back(
9239              getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9240          continue;
9241        }
9242      }
9243      ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9244    }
9245    return ArgTys;
9246  }
9247  
9248  InstructionCost
9249  BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9250                        SmallPtrSetImpl<Value *> &CheckedExtracts) {
9251    ArrayRef<Value *> VL = E->Scalars;
9252  
9253    Type *ScalarTy = VL[0]->getType();
9254    if (!E->isGather()) {
9255      if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9256        ScalarTy = SI->getValueOperand()->getType();
9257      else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9258        ScalarTy = CI->getOperand(0)->getType();
9259      else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9260        ScalarTy = IE->getOperand(1)->getType();
9261    }
9262    if (!isValidElementType(ScalarTy))
9263      return InstructionCost::getInvalid();
9264    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9265  
9266    // If we have computed a smaller type for the expression, update VecTy so
9267    // that the costs will be accurate.
9268    auto It = MinBWs.find(E);
9269    Type *OrigScalarTy = ScalarTy;
9270    if (It != MinBWs.end())
9271      ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9272    auto *VecTy = getWidenedType(ScalarTy, VL.size());
9273    unsigned EntryVF = E->getVectorFactor();
9274    auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
9275  
9276    bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9277    if (E->isGather()) {
9278      if (allConstant(VL))
9279        return 0;
9280      if (isa<InsertElementInst>(VL[0]))
9281        return InstructionCost::getInvalid();
9282      return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9283          E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9284    }
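        // Compute the common shuffle cost implied by the node's reorder indices
        // and reuse shuffle indices; it is shared by the per-opcode cost
        // calculations below.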
9285    InstructionCost CommonCost = 0;
9286    SmallVector<int> Mask;
9287    bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9288    if (!E->ReorderIndices.empty() &&
9289        (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9290      SmallVector<int> NewMask;
9291      if (E->getOpcode() == Instruction::Store) {
9292        // For stores the order is actually a mask.
9293        NewMask.resize(E->ReorderIndices.size());
9294        copy(E->ReorderIndices, NewMask.begin());
9295      } else {
9296        inversePermutation(E->ReorderIndices, NewMask);
9297      }
9298      ::addMask(Mask, NewMask);
9299    }
9300    if (NeedToShuffleReuses)
9301      ::addMask(Mask, E->ReuseShuffleIndices);
9302    if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9303      CommonCost =
9304          TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9305    assert((E->State == TreeEntry::Vectorize ||
9306            E->State == TreeEntry::ScatterVectorize ||
9307            E->State == TreeEntry::StridedVectorize) &&
9308           "Unhandled state");
9309    assert(E->getOpcode() &&
9310           ((allSameType(VL) && allSameBlock(VL)) ||
9311            (E->getOpcode() == Instruction::GetElementPtr &&
9312             E->getMainOp()->getType()->isPointerTy())) &&
9313           "Invalid VL");
9314    Instruction *VL0 = E->getMainOp();
9315    unsigned ShuffleOrOp =
9316        E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9317    SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9318    const unsigned Sz = UniqueValues.size();
9319    SmallBitVector UsedScalars(Sz, false);
9320    for (unsigned I = 0; I < Sz; ++I) {
9321      if (getTreeEntry(UniqueValues[I]) == E)
9322        continue;
9323      UsedScalars.set(I);
9324    }
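        // Pick the cast context hint from the operand's tree entry if one exists;
        // otherwise assume a gather/scatter context when the operands are all loads.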
9325    auto GetCastContextHint = [&](Value *V) {
9326      if (const TreeEntry *OpTE = getTreeEntry(V))
9327        return getCastContextHint(*OpTE);
9328      InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9329      if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9330        return TTI::CastContextHint::GatherScatter;
9331      return TTI::CastContextHint::None;
9332    };
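        // Computes (vector cost - scalar cost) for the node from callbacks that
        // return the per-element scalar cost and the vector cost (which receives
        // CommonCost) and, for non-cast, non-root nodes, adds the cost of casting
        // to the user's expected type when minimum bitwidth analysis gives this
        // node a different type.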
9333    auto GetCostDiff =
9334        [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9335            function_ref<InstructionCost(InstructionCost)> VectorCost) {
9336          // Calculate the cost of this instruction.
9337          InstructionCost ScalarCost = 0;
9338          if (isa<CastInst, CallInst>(VL0)) {
9339            // For some instructions there is no need to calculate the cost for
9340            // each particular one; we can use the cost of a single instruction
9341            // multiplied by the total number of scalar instructions.
9342            ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9343          } else {
9344            for (unsigned I = 0; I < Sz; ++I) {
9345              if (UsedScalars.test(I))
9346                continue;
9347              ScalarCost += ScalarEltCost(I);
9348            }
9349          }
9350  
9351          InstructionCost VecCost = VectorCost(CommonCost);
9352          // Check if the current node must be resized when the parent node is
9353          // not resized.
9354          if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9355            const EdgeInfo &EI = E->UserTreeIndices.front();
9356            if ((EI.UserTE->getOpcode() != Instruction::Select ||
9357                 EI.EdgeIdx != 0) &&
9358                It != MinBWs.end()) {
9359              auto UserBWIt = MinBWs.find(EI.UserTE);
9360              Type *UserScalarTy =
9361                  EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9362              if (UserBWIt != MinBWs.end())
9363                UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9364                                                UserBWIt->second.first);
9365              if (ScalarTy != UserScalarTy) {
9366                unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9367                unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9368                unsigned VecOpcode;
9369                auto *UserVecTy =
9370                    getWidenedType(UserScalarTy, E->getVectorFactor());
9371                if (BWSz > SrcBWSz)
9372                  VecOpcode = Instruction::Trunc;
9373                else
9374                  VecOpcode =
9375                      It->second.second ? Instruction::SExt : Instruction::ZExt;
9376                TTI::CastContextHint CCH = GetCastContextHint(VL0);
9377                VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9378                                                 CostKind);
9379              }
9380            }
9381          }
9382          LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9383                                   ScalarCost, "Calculated costs for Tree"));
9384          return VecCost - ScalarCost;
9385        };
9386    // Calculate the cost difference from vectorizing a set of GEPs.
9387    // A negative value means vectorizing is profitable.
9388    auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9389      assert((E->State == TreeEntry::Vectorize ||
9390              E->State == TreeEntry::StridedVectorize) &&
9391             "Entry state expected to be Vectorize or StridedVectorize here.");
9392      InstructionCost ScalarCost = 0;
9393      InstructionCost VecCost = 0;
9394      std::tie(ScalarCost, VecCost) = getGEPCosts(
9395          *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9396      LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9397                               "Calculated GEPs cost for Tree"));
9398  
9399      return VecCost - ScalarCost;
9400    };
9401  
9402    switch (ShuffleOrOp) {
9403    case Instruction::PHI: {
9404      // Count reused scalars.
9405      InstructionCost ScalarCost = 0;
9406      SmallPtrSet<const TreeEntry *, 4> CountedOps;
9407      for (Value *V : UniqueValues) {
9408        auto *PHI = dyn_cast<PHINode>(V);
9409        if (!PHI)
9410          continue;
9411  
9412        ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9413        for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9414          Value *Op = PHI->getIncomingValue(I);
9415          Operands[I] = Op;
9416        }
9417        if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9418          if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9419            if (!OpTE->ReuseShuffleIndices.empty())
9420              ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9421                                              OpTE->Scalars.size());
9422      }
9423  
9424      return CommonCost - ScalarCost;
9425    }
9426    case Instruction::ExtractValue:
9427    case Instruction::ExtractElement: {
9428      auto GetScalarCost = [&](unsigned Idx) {
9429        auto *I = cast<Instruction>(UniqueValues[Idx]);
9430        VectorType *SrcVecTy;
9431        if (ShuffleOrOp == Instruction::ExtractElement) {
9432          auto *EE = cast<ExtractElementInst>(I);
9433          SrcVecTy = EE->getVectorOperandType();
9434        } else {
9435          auto *EV = cast<ExtractValueInst>(I);
9436          Type *AggregateTy = EV->getAggregateOperand()->getType();
9437          unsigned NumElts;
9438          if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9439            NumElts = ATy->getNumElements();
9440          else
9441            NumElts = AggregateTy->getStructNumElements();
9442          SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
9443        }
9444        if (I->hasOneUse()) {
9445          Instruction *Ext = I->user_back();
9446          if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9447              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9448            // Use getExtractWithExtendCost() to calculate the cost of
9449            // extractelement/ext pair.
9450            InstructionCost Cost = TTI->getExtractWithExtendCost(
9451                Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9452            // Subtract the cost of s|zext which is subtracted separately.
9453            Cost -= TTI->getCastInstrCost(
9454                Ext->getOpcode(), Ext->getType(), I->getType(),
9455                TTI::getCastContextHint(Ext), CostKind, Ext);
9456            return Cost;
9457          }
9458        }
9459        return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9460                                       CostKind, *getExtractIndex(I));
9461      };
9462      auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9463      return GetCostDiff(GetScalarCost, GetVectorCost);
9464    }
9465    case Instruction::InsertElement: {
9466      assert(E->ReuseShuffleIndices.empty() &&
9467             "Unique insertelements only are expected.");
9468      auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9469      unsigned const NumElts = SrcVecTy->getNumElements();
9470      unsigned const NumScalars = VL.size();
9471  
9472      unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9473  
9474      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9475      unsigned OffsetBeg = *getElementIndex(VL.front());
9476      unsigned OffsetEnd = OffsetBeg;
9477      InsertMask[OffsetBeg] = 0;
9478      for (auto [I, V] : enumerate(VL.drop_front())) {
9479        unsigned Idx = *getElementIndex(V);
9480        if (OffsetBeg > Idx)
9481          OffsetBeg = Idx;
9482        else if (OffsetEnd < Idx)
9483          OffsetEnd = Idx;
9484        InsertMask[Idx] = I + 1;
9485      }
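          // Estimate the number of elements per register (VecScalarsSz) and, from
          // it, the register-aligned range [Offset, Offset + VecSz) covering all
          // insertion positions, as well as the width of the vector actually built
          // (InsertVecSz).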
9486      unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9487      if (NumOfParts > 0)
9488        VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9489      unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9490                       VecScalarsSz;
9491      unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9492      unsigned InsertVecSz = std::min<unsigned>(
9493          PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9494          ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9495      bool IsWholeSubvector =
9496          OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9497      // Check if we can safely insert a subvector. If it is not possible, just
9498      // generate a whole-sized vector and shuffle the source vector and the new
9499      // subvector.
9500      if (OffsetBeg + InsertVecSz > VecSz) {
9501        // Align OffsetBeg to generate correct mask.
9502        OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9503        InsertVecSz = VecSz;
9504      }
9505  
9506      APInt DemandedElts = APInt::getZero(NumElts);
9507      // TODO: Add support for Instruction::InsertValue.
9508      SmallVector<int> Mask;
9509      if (!E->ReorderIndices.empty()) {
9510        inversePermutation(E->ReorderIndices, Mask);
9511        Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9512      } else {
9513        Mask.assign(VecSz, PoisonMaskElem);
9514        std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9515      }
9516      bool IsIdentity = true;
9517      SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9518      Mask.swap(PrevMask);
9519      for (unsigned I = 0; I < NumScalars; ++I) {
9520        unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
9521        DemandedElts.setBit(InsertIdx);
9522        IsIdentity &= InsertIdx - OffsetBeg == I;
9523        Mask[InsertIdx - OffsetBeg] = I;
9524      }
9525      assert(Offset < NumElts && "Failed to find vector index offset");
9526  
9527      InstructionCost Cost = 0;
9528      Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9529                                            /*Insert*/ true, /*Extract*/ false,
9530                                            CostKind);
9531  
9532      // First cost - resize to the actual vector size if it is not an identity
9533      // shuffle or the vector needs to be shifted.
9534      // Do not calculate the cost if the actual size is the register size and
9535      // we can merge this shuffle with the following SK_Select.
9536      auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
9537      if (!IsIdentity)
9538        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
9539                                    InsertVecTy, Mask);
9540      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9541        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9542      }));
9543      // Second cost - permutation with the subvector, if some elements come
9544      // from the initial vector or we are inserting a subvector.
9545      // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9546      // subvector of ActualVecTy.
9547      SmallBitVector InMask =
9548          isUndefVector(FirstInsert->getOperand(0),
9549                        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9550      if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9551        if (InsertVecSz != VecSz) {
9552          auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
9553          Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9554                                      std::nullopt, CostKind, OffsetBeg - Offset,
9555                                      InsertVecTy);
9556        } else {
9557          for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9558            Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9559          for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9560               I <= End; ++I)
9561            if (Mask[I] != PoisonMaskElem)
9562              Mask[I] = I + VecSz;
9563          for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9564            Mask[I] =
9565                ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9566          Cost +=
9567              ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9568        }
9569      }
9570      return Cost;
9571    }
9572    case Instruction::ZExt:
9573    case Instruction::SExt:
9574    case Instruction::FPToUI:
9575    case Instruction::FPToSI:
9576    case Instruction::FPExt:
9577    case Instruction::PtrToInt:
9578    case Instruction::IntToPtr:
9579    case Instruction::SIToFP:
9580    case Instruction::UIToFP:
9581    case Instruction::Trunc:
9582    case Instruction::FPTrunc:
9583    case Instruction::BitCast: {
9584      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9585      Type *SrcScalarTy = VL0->getOperand(0)->getType();
9586      auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9587      unsigned Opcode = ShuffleOrOp;
9588      unsigned VecOpcode = Opcode;
9589      if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9590          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9591        // Check if the values are candidates to demote.
9592        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9593        if (SrcIt != MinBWs.end()) {
9594          SrcBWSz = SrcIt->second.first;
9595          SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9596          SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9597        }
9598        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9599        if (BWSz == SrcBWSz) {
9600          VecOpcode = Instruction::BitCast;
9601        } else if (BWSz < SrcBWSz) {
9602          VecOpcode = Instruction::Trunc;
9603        } else if (It != MinBWs.end()) {
9604          assert(BWSz > SrcBWSz && "Invalid cast!");
9605          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9606        } else if (SrcIt != MinBWs.end()) {
9607          assert(BWSz > SrcBWSz && "Invalid cast!");
9608          VecOpcode =
9609              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9610        }
9611      } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9612                 !SrcIt->second.second) {
9613        VecOpcode = Instruction::UIToFP;
9614      }
9615      auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9616        auto *VI = cast<Instruction>(UniqueValues[Idx]);
9617        return TTI->getCastInstrCost(Opcode, VL0->getType(),
9618                                     VL0->getOperand(0)->getType(),
9619                                     TTI::getCastContextHint(VI), CostKind, VI);
9620      };
9621      auto GetVectorCost = [=](InstructionCost CommonCost) {
9622        // Do not count cost here if minimum bitwidth is in effect and it is just
9623        // a bitcast (here it is just a noop).
9624        if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9625          return CommonCost;
9626        auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9627        TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9628        return CommonCost +
9629               TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9630                                     VecOpcode == Opcode ? VI : nullptr);
9631      };
9632      return GetCostDiff(GetScalarCost, GetVectorCost);
9633    }
9634    case Instruction::FCmp:
9635    case Instruction::ICmp:
9636    case Instruction::Select: {
9637      CmpInst::Predicate VecPred, SwappedVecPred;
9638      auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9639      if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9640          match(VL0, MatchCmp))
9641        SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9642      else
9643        SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9644                                       ? CmpInst::BAD_FCMP_PREDICATE
9645                                       : CmpInst::BAD_ICMP_PREDICATE;
9646      auto GetScalarCost = [&](unsigned Idx) {
9647        auto *VI = cast<Instruction>(UniqueValues[Idx]);
9648        CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9649                                             ? CmpInst::BAD_FCMP_PREDICATE
9650                                             : CmpInst::BAD_ICMP_PREDICATE;
9651        auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9652        if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9653             !match(VI, MatchCmp)) ||
9654            (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9655          VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9656                                         ? CmpInst::BAD_FCMP_PREDICATE
9657                                         : CmpInst::BAD_ICMP_PREDICATE;
9658  
9659        InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9660            E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9661            CostKind, VI);
9662        auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9663        if (MinMaxID != Intrinsic::not_intrinsic) {
9664          Type *CanonicalType = OrigScalarTy;
9665          if (CanonicalType->isPtrOrPtrVectorTy())
9666            CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9667                CanonicalType->getContext(),
9668                DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9669  
9670          IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9671                                            {CanonicalType, CanonicalType});
9672          InstructionCost IntrinsicCost =
9673              TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9674          // If the selects are the only uses of the compares, they will be
9675          // dead and we can adjust the cost by removing their cost.
9676          if (SelectOnly) {
9677            auto *CI = cast<CmpInst>(VI->getOperand(0));
9678            IntrinsicCost -= TTI->getCmpSelInstrCost(
9679                CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9680                CI->getPredicate(), CostKind, CI);
9681          }
9682          ScalarCost = std::min(ScalarCost, IntrinsicCost);
9683        }
9684  
9685        return ScalarCost;
9686      };
9687      auto GetVectorCost = [&](InstructionCost CommonCost) {
9688        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9689  
9690        InstructionCost VecCost = TTI->getCmpSelInstrCost(
9691            E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9692        // Check if it is possible and profitable to use min/max for selects
9693        // in VL.
9695        auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9696        if (MinMaxID != Intrinsic::not_intrinsic) {
9697          Type *CanonicalType = VecTy;
9698          if (CanonicalType->isPtrOrPtrVectorTy())
9699            CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9700                CanonicalType->getContext(),
9701                DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9702          IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9703                                            {CanonicalType, CanonicalType});
9704          InstructionCost IntrinsicCost =
9705              TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9706          // If the selects are the only uses of the compares, they will be
9707          // dead and we can adjust the cost by removing their cost.
9708          if (SelectOnly) {
9709            auto *CI =
9710                cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9711            IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9712                                                     MaskTy, VecPred, CostKind);
9713          }
9714          VecCost = std::min(VecCost, IntrinsicCost);
9715        }
9716        return VecCost + CommonCost;
9717      };
9718      return GetCostDiff(GetScalarCost, GetVectorCost);
9719    }
9720    case Instruction::FNeg:
9721    case Instruction::Add:
9722    case Instruction::FAdd:
9723    case Instruction::Sub:
9724    case Instruction::FSub:
9725    case Instruction::Mul:
9726    case Instruction::FMul:
9727    case Instruction::UDiv:
9728    case Instruction::SDiv:
9729    case Instruction::FDiv:
9730    case Instruction::URem:
9731    case Instruction::SRem:
9732    case Instruction::FRem:
9733    case Instruction::Shl:
9734    case Instruction::LShr:
9735    case Instruction::AShr:
9736    case Instruction::And:
9737    case Instruction::Or:
9738    case Instruction::Xor: {
9739      auto GetScalarCost = [&](unsigned Idx) {
9740        auto *VI = cast<Instruction>(UniqueValues[Idx]);
9741        unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9742        TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9743        TTI::OperandValueInfo Op2Info =
9744            TTI::getOperandInfo(VI->getOperand(OpIdx));
9745        SmallVector<const Value *> Operands(VI->operand_values());
9746        return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9747                                           Op1Info, Op2Info, Operands, VI);
9748      };
9749      auto GetVectorCost = [=](InstructionCost CommonCost) {
9750        if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
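            // If an operand is all constants whose low bits (at least the
            // minimized bitwidth) are all ones, the 'and' becomes a no-op after
            // demotion and only the common (shuffle) cost remains.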
9751          for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9752            ArrayRef<Value *> Ops = E->getOperand(I);
9753            if (all_of(Ops, [&](Value *Op) {
9754                  auto *CI = dyn_cast<ConstantInt>(Op);
9755                  return CI && CI->getValue().countr_one() >= It->second.first;
9756                }))
9757              return CommonCost;
9758          }
9759        }
9760        unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9761        TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9762        TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9763        return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9764                                           Op2Info, std::nullopt, nullptr, TLI) +
9765               CommonCost;
9766      };
9767      return GetCostDiff(GetScalarCost, GetVectorCost);
9768    }
9769    case Instruction::GetElementPtr: {
9770      return CommonCost + GetGEPCostDiff(VL, VL0);
9771    }
9772    case Instruction::Load: {
9773      auto GetScalarCost = [&](unsigned Idx) {
9774        auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9775        return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9776                                    VI->getAlign(), VI->getPointerAddressSpace(),
9777                                    CostKind, TTI::OperandValueInfo(), VI);
9778      };
9779      auto *LI0 = cast<LoadInst>(VL0);
9780      auto GetVectorCost = [&](InstructionCost CommonCost) {
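            // Cost the load according to how this node will be emitted: a plain
            // wide load, a strided load, or a masked gather.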
9781        InstructionCost VecLdCost;
9782        if (E->State == TreeEntry::Vectorize) {
9783          VecLdCost = TTI->getMemoryOpCost(
9784              Instruction::Load, VecTy, LI0->getAlign(),
9785              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9786        } else if (E->State == TreeEntry::StridedVectorize) {
9787          Align CommonAlignment =
9788              computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9789          VecLdCost = TTI->getStridedMemoryOpCost(
9790              Instruction::Load, VecTy, LI0->getPointerOperand(),
9791              /*VariableMask=*/false, CommonAlignment, CostKind);
9792        } else {
9793          assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9794          Align CommonAlignment =
9795              computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9796          VecLdCost = TTI->getGatherScatterOpCost(
9797              Instruction::Load, VecTy, LI0->getPointerOperand(),
9798              /*VariableMask=*/false, CommonAlignment, CostKind);
9799        }
9800        return VecLdCost + CommonCost;
9801      };
9802  
9803      InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9804      // If this node generates a masked gather load then it is not a terminal
9805      // node. Hence the address operand cost is estimated separately.
9806      if (E->State == TreeEntry::ScatterVectorize)
9807        return Cost;
9808  
9809      // Estimate the cost of the GEPs since this tree node is a terminal node.
9810      SmallVector<Value *> PointerOps(VL.size());
9811      for (auto [I, V] : enumerate(VL))
9812        PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9813      return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9814    }
9815    case Instruction::Store: {
9816      bool IsReorder = !E->ReorderIndices.empty();
9817      auto GetScalarCost = [=](unsigned Idx) {
9818        auto *VI = cast<StoreInst>(VL[Idx]);
9819        TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9820        return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9821                                    VI->getAlign(), VI->getPointerAddressSpace(),
9822                                    CostKind, OpInfo, VI);
9823      };
9824      auto *BaseSI =
9825          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9826      auto GetVectorCost = [=](InstructionCost CommonCost) {
9827        // We know that we can merge the stores. Calculate the cost.
9828        InstructionCost VecStCost;
9829        if (E->State == TreeEntry::StridedVectorize) {
9830          Align CommonAlignment =
9831              computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9832          VecStCost = TTI->getStridedMemoryOpCost(
9833              Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9834              /*VariableMask=*/false, CommonAlignment, CostKind);
9835        } else {
9836          assert(E->State == TreeEntry::Vectorize &&
9837                 "Expected either strided or consecutive stores.");
9838          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9839          VecStCost = TTI->getMemoryOpCost(
9840              Instruction::Store, VecTy, BaseSI->getAlign(),
9841              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9842        }
9843        return VecStCost + CommonCost;
9844      };
9845      SmallVector<Value *> PointerOps(VL.size());
9846      for (auto [I, V] : enumerate(VL)) {
9847        unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9848        PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9849      }
9850  
9851      return GetCostDiff(GetScalarCost, GetVectorCost) +
9852             GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9853    }
9854    case Instruction::Call: {
9855      auto GetScalarCost = [&](unsigned Idx) {
9856        auto *CI = cast<CallInst>(UniqueValues[Idx]);
9857        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9858        if (ID != Intrinsic::not_intrinsic) {
9859          IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9860          return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9861        }
9862        return TTI->getCallInstrCost(CI->getCalledFunction(),
9863                                     CI->getFunctionType()->getReturnType(),
9864                                     CI->getFunctionType()->params(), CostKind);
9865      };
9866      auto GetVectorCost = [=](InstructionCost CommonCost) {
9867        auto *CI = cast<CallInst>(VL0);
9868        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9869        SmallVector<Type *> ArgTys =
9870            buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9871                                   It != MinBWs.end() ? It->second.first : 0);
9872        auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9873        return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9874      };
9875      return GetCostDiff(GetScalarCost, GetVectorCost);
9876    }
9877    case Instruction::ShuffleVector: {
9878      assert(E->isAltShuffle() &&
9879             ((Instruction::isBinaryOp(E->getOpcode()) &&
9880               Instruction::isBinaryOp(E->getAltOpcode())) ||
9881              (Instruction::isCast(E->getOpcode()) &&
9882               Instruction::isCast(E->getAltOpcode())) ||
9883              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9884             "Invalid Shuffle Vector Operand");
9885      // Try to find the previous shuffle node with the same operands and same
9886      // main/alternate ops.
9887      auto TryFindNodeWithEqualOperands = [=]() {
9888        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9889          if (TE.get() == E)
9890            break;
9891          if (TE->isAltShuffle() &&
9892              ((TE->getOpcode() == E->getOpcode() &&
9893                TE->getAltOpcode() == E->getAltOpcode()) ||
9894               (TE->getOpcode() == E->getAltOpcode() &&
9895                TE->getAltOpcode() == E->getOpcode())) &&
9896              TE->hasEqualOperands(*E))
9897            return true;
9898        }
9899        return false;
9900      };
9901      auto GetScalarCost = [&](unsigned Idx) {
9902        auto *VI = cast<Instruction>(UniqueValues[Idx]);
9903        assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9904        (void)E;
9905        return TTI->getInstructionCost(VI, CostKind);
9906      };
9907      // Need to clear CommonCost since the final shuffle cost is included in
9908      // the vector cost.
9909      auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9910        // VecCost is equal to the sum of the cost of creating the 2 vectors
9911        // and the cost of creating the shuffle.
9912        InstructionCost VecCost = 0;
9913        if (TryFindNodeWithEqualOperands()) {
9914          LLVM_DEBUG({
9915            dbgs() << "SLP: diamond match for alternate node found.\n";
9916            E->dump();
9917          });
9918          // No need to add new vector costs here since we're going to reuse
9919          // the same main/alternate vector ops, just do different shuffling.
9920        } else if (Instruction::isBinaryOp(E->getOpcode())) {
9921          VecCost =
9922              TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9923          VecCost +=
9924              TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9925        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9926          auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9927          VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9928                                              CI0->getPredicate(), CostKind, VL0);
9929          VecCost += TTIRef.getCmpSelInstrCost(
9930              E->getOpcode(), VecTy, MaskTy,
9931              cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9932              E->getAltOp());
9933        } else {
9934          Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9935          auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
9936          if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9937            auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9938            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9939            unsigned SrcBWSz =
9940                DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9941            if (SrcIt != MinBWs.end()) {
9942              SrcBWSz = SrcIt->second.first;
9943              SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9944              SrcTy = getWidenedType(SrcSclTy, VL.size());
9945            }
9946            if (BWSz <= SrcBWSz) {
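                  // After bitwidth minimization the result is no wider than the
                  // source, so the two alternate casts degenerate to at most a
                  // single trunc.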
9947              if (BWSz < SrcBWSz)
9948                VecCost =
9949                    TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9950                                            TTI::CastContextHint::None, CostKind);
9951              LLVM_DEBUG({
9952                dbgs()
9953                    << "SLP: alternate extension, which should be truncated.\n";
9954                E->dump();
9955              });
9956              return VecCost;
9957            }
9958          }
9959          VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9960                                            TTI::CastContextHint::None, CostKind);
9961          VecCost +=
9962              TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9963                                      TTI::CastContextHint::None, CostKind);
9964        }
9965        SmallVector<int> Mask;
9966        E->buildAltOpShuffleMask(
9967            [E](Instruction *I) {
9968              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9969              return I->getOpcode() == E->getAltOpcode();
9970            },
9971            Mask);
9972        VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
9973                                    FinalVecTy, Mask);
9974        // Patterns like [fadd,fsub] can be combined into a single instruction
9975        // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9976        // need to take their order into account when looking for the most used
9977        // order.
9978        unsigned Opcode0 = E->getOpcode();
9979        unsigned Opcode1 = E->getAltOpcode();
9980        SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
9981        // If this pattern is supported by the target then we consider the
9982        // order.
9983        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9984          InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9985              VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9986          return AltVecCost < VecCost ? AltVecCost : VecCost;
9987        }
9988        // TODO: Check the reverse order too.
9989        return VecCost;
9990      };
9991      return GetCostDiff(GetScalarCost, GetVectorCost);
9992    }
9993    default:
9994      llvm_unreachable("Unknown instruction");
9995    }
9996  }
9997  
9998  bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999    LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000                      << VectorizableTree.size() << " is fully vectorizable.\n");
10001  
10002    auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003      SmallVector<int> Mask;
10004      return TE->isGather() &&
10005             !any_of(TE->Scalars,
10006                     [this](Value *V) { return EphValues.contains(V); }) &&
10007             (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10008              TE->Scalars.size() < Limit ||
10009              ((TE->getOpcode() == Instruction::ExtractElement ||
10010                all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10011               isFixedVectorShuffle(TE->Scalars, Mask)) ||
10012              (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013               !TE->isAltShuffle()));
10014    };
10015  
10016    // We only handle trees of heights 1 and 2.
10017    if (VectorizableTree.size() == 1 &&
10018        (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019         (ForReduction &&
10020          AreVectorizableGathers(VectorizableTree[0].get(),
10021                                 VectorizableTree[0]->Scalars.size()) &&
10022          VectorizableTree[0]->getVectorFactor() > 2)))
10023      return true;
10024  
10025    if (VectorizableTree.size() != 2)
10026      return false;
10027  
10028    // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029    // whose second node is a gather with fewer scalar operands than the initial
10030    // tree element (it may be profitable to shuffle the second gather) or whose
10031    // scalars are extractelements that form a shuffle.
10032    SmallVector<int> Mask;
10033    if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034        AreVectorizableGathers(VectorizableTree[1].get(),
10035                               VectorizableTree[0]->Scalars.size()))
10036      return true;
10037  
10038    // Gathering cost would be too much for tiny trees.
10039    if (VectorizableTree[0]->isGather() ||
10040        (VectorizableTree[1]->isGather() &&
10041         VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042         VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043      return false;
10044  
10045    return true;
10046  }
10047  
10048  static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049                                         TargetTransformInfo *TTI,
10050                                         bool MustMatchOrInst) {
10051    // Look past the root to find a source value. Arbitrarily follow the
10052    // path through operand 0 of any 'or'. Also, peek through optional
10053    // shift-left-by-multiple-of-8-bits.
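         // For example (schematically), a byte-combining pattern may look like:
         //   %b1 = load i8, ptr %p1
         //   %z1 = zext i8 %b1 to i32
         //   %s1 = shl i32 %z1, 8
         //   %or = or i32 %s1, %z0  ; Root - follow operand 0 through or/shl
         //                          ; down to the zext'ed load.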
10054    Value *ZextLoad = Root;
10055    const APInt *ShAmtC;
10056    bool FoundOr = false;
10057    while (!isa<ConstantExpr>(ZextLoad) &&
10058           (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10059            (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10060             ShAmtC->urem(8) == 0))) {
10061      auto *BinOp = cast<BinaryOperator>(ZextLoad);
10062      ZextLoad = BinOp->getOperand(0);
10063      if (BinOp->getOpcode() == Instruction::Or)
10064        FoundOr = true;
10065    }
10066    // Check if the input is an extended load of the required or/shift expression.
10067    Value *Load;
10068    if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069        !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10070      return false;
10071  
10072    // Require that the total load bit width is a legal integer type.
10073    // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074    // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075    Type *SrcTy = Load->getType();
10076    unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077    if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10078      return false;
10079  
10080    // Everything matched - assume that we can fold the whole sequence using
10081    // load combining.
10082    LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083               << *(cast<Instruction>(Root)) << "\n");
10084  
10085    return true;
10086  }
10087  
10088  bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089    if (RdxKind != RecurKind::Or)
10090      return false;
10091  
10092    unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093    Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094    return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10095                                      /* MatchOr */ false);
10096  }
10097  
10098  bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099    // Peek through a final sequence of stores and check if all operations are
10100    // likely to be load-combined.
10101    unsigned NumElts = Stores.size();
10102    for (Value *Scalar : Stores) {
10103      Value *X;
10104      if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10105          !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10106        return false;
10107    }
10108    return true;
10109  }
10110  
10111  bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112    // No need to vectorize inserts of gathered values.
10113    if (VectorizableTree.size() == 2 &&
10114        isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10115        VectorizableTree[1]->isGather() &&
10116        (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117         !(isSplat(VectorizableTree[1]->Scalars) ||
10118           allConstant(VectorizableTree[1]->Scalars))))
10119      return true;
10120  
10121    // If the graph includes only PHI nodes and gathers, it is definitely not
10122    // profitable for vectorization and we can skip it, provided the cost
10123    // threshold is the default. The cost of vectorized PHI nodes is almost
10124    // always 0 plus the cost of gathers/buildvectors.
10125    constexpr int Limit = 4;
10126    if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127        !VectorizableTree.empty() &&
10128        all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10129          return (TE->isGather() &&
10130                  TE->getOpcode() != Instruction::ExtractElement &&
10131                  count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10132                 TE->getOpcode() == Instruction::PHI;
10133        }))
10134      return true;
10135  
10136    // We can vectorize the tree if its size is greater than or equal to the
10137    // minimum size specified by the MinTreeSize command line option.
10138    if (VectorizableTree.size() >= MinTreeSize)
10139      return false;
10140  
10141    // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142    // can vectorize it if we can prove it fully vectorizable.
10143    if (isFullyVectorizableTinyTree(ForReduction))
10144      return false;
10145  
10146    // Check if any of the gather nodes forms an insertelement buildvector
10147    // somewhere.
10148    bool IsAllowedSingleBVNode =
10149        VectorizableTree.size() > 1 ||
10150        (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151         !VectorizableTree.front()->isAltShuffle() &&
10152         VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153         VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154         allSameBlock(VectorizableTree.front()->Scalars));
10155    if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10156          return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10157                   return isa<ExtractElementInst, UndefValue>(V) ||
10158                          (IsAllowedSingleBVNode &&
10159                           !V->hasNUsesOrMore(UsesLimit) &&
10160                           any_of(V->users(), IsaPred<InsertElementInst>));
10161                 });
10162        }))
10163      return false;
10164  
10165    assert(VectorizableTree.empty()
10166               ? ExternalUses.empty()
10167               : true && "We shouldn't have any external users");
10168  
10169    // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170    // vectorizable.
10171    return true;
10172  }
10173  
10174  InstructionCost BoUpSLP::getSpillCost() const {
10175    // Walk from the bottom of the tree to the top, tracking which values are
10176    // live. When we see a call instruction that is not part of our tree,
10177    // query TTI to see if there is a cost to keeping values live over it
10178    // (for example, if spills and fills are required).
10179    unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180    InstructionCost Cost = 0;
10181  
10182    SmallPtrSet<Instruction *, 4> LiveValues;
10183    Instruction *PrevInst = nullptr;
10184  
10185    // The entries in VectorizableTree are not necessarily ordered by their
10186    // position in basic blocks. Collect them and order them by dominance so later
10187    // instructions are guaranteed to be visited first. For instructions in
10188    // different basic blocks, we only scan to the beginning of the block, so
10189    // their order does not matter, as long as all instructions in a basic block
10190    // are grouped together. Using dominance ensures a deterministic order.
10191    SmallVector<Instruction *, 16> OrderedScalars;
10192    for (const auto &TEPtr : VectorizableTree) {
10193      if (TEPtr->State != TreeEntry::Vectorize)
10194        continue;
10195      Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10196      if (!Inst)
10197        continue;
10198      OrderedScalars.push_back(Inst);
10199    }
10200    llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10201      auto *NodeA = DT->getNode(A->getParent());
10202      auto *NodeB = DT->getNode(B->getParent());
10203      assert(NodeA && "Should only process reachable instructions");
10204      assert(NodeB && "Should only process reachable instructions");
10205      assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206             "Different nodes should have different DFS numbers");
10207      if (NodeA != NodeB)
10208        return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209      return B->comesBefore(A);
10210    });
10211  
10212    for (Instruction *Inst : OrderedScalars) {
10213      if (!PrevInst) {
10214        PrevInst = Inst;
10215        continue;
10216      }
10217  
10218      // Update LiveValues.
10219      LiveValues.erase(PrevInst);
10220      for (auto &J : PrevInst->operands()) {
10221        if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10222          LiveValues.insert(cast<Instruction>(&*J));
10223      }
10224  
10225      LLVM_DEBUG({
10226        dbgs() << "SLP: #LV: " << LiveValues.size();
10227        for (auto *X : LiveValues)
10228          dbgs() << " " << X->getName();
10229        dbgs() << ", Looking at ";
10230        Inst->dump();
10231      });
10232  
10233      // Now find the sequence of instructions between PrevInst and Inst.
10234      unsigned NumCalls = 0;
10235      BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236                                   PrevInstIt =
10237                                       PrevInst->getIterator().getReverse();
10238      while (InstIt != PrevInstIt) {
10239        if (PrevInstIt == PrevInst->getParent()->rend()) {
10240          PrevInstIt = Inst->getParent()->rbegin();
10241          continue;
10242        }
10243  
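             // Returns true if the intrinsic is known not to be lowered to a real
             // call: it is assume-like, or TTI rates the intrinsic form cheaper
             // than the equivalent libcall, so it cannot force spills.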
10244        auto NoCallIntrinsic = [this](Instruction *I) {
10245          if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10246            if (II->isAssumeLikeIntrinsic())
10247              return true;
10248            FastMathFlags FMF;
10249            SmallVector<Type *, 4> Tys;
10250            for (auto &ArgOp : II->args())
10251              Tys.push_back(ArgOp->getType());
10252            if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10253              FMF = FPMO->getFastMathFlags();
10254            IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255                                        FMF);
10256            InstructionCost IntrCost =
10257                TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10258            InstructionCost CallCost = TTI->getCallInstrCost(
10259                nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10260            if (IntrCost < CallCost)
10261              return true;
10262          }
10263          return false;
10264        };
10265  
10266        // Debug info and non-call-like intrinsics do not impact the spill cost.
10267        if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268            &*PrevInstIt != PrevInst)
10269          NumCalls++;
10270  
10271        ++PrevInstIt;
10272      }
10273  
10274      if (NumCalls) {
10275        SmallVector<Type *, 4> V;
10276        for (auto *II : LiveValues) {
10277          auto *ScalarTy = II->getType();
10278          if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10279            ScalarTy = VectorTy->getElementType();
10280          V.push_back(getWidenedType(ScalarTy, BundleWidth));
10281        }
10282        Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10283      }
10284  
10285      PrevInst = Inst;
10286    }
10287  
10288    return Cost;
10289  }
10290  
10291  /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10292  /// the buildvector sequence.
10293  static bool isFirstInsertElement(const InsertElementInst *IE1,
10294                                   const InsertElementInst *IE2) {
10295    if (IE1 == IE2)
10296      return false;
10297    const auto *I1 = IE1;
10298    const auto *I2 = IE2;
10299    const InsertElementInst *PrevI1;
10300    const InsertElementInst *PrevI2;
10301    unsigned Idx1 = *getElementIndex(IE1);
10302    unsigned Idx2 = *getElementIndex(IE2);
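         // Walk both insertelement chains towards their bases. If IE2's chain
         // reaches IE1, then IE1 precedes IE2 in the buildvector sequence, and
         // vice versa.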
10303    do {
10304      if (I2 == IE1)
10305        return true;
10306      if (I1 == IE2)
10307        return false;
10308      PrevI1 = I1;
10309      PrevI2 = I2;
10310      if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311          getElementIndex(I1).value_or(Idx2) != Idx2)
10312        I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10313      if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314          getElementIndex(I2).value_or(Idx1) != Idx1)
10315        I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10316    } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317    llvm_unreachable("Two different buildvectors not expected.");
10318  }
10319  
10320  namespace {
10321  /// Returns incoming Value *, if the requested type is Value * too, or a default
10322  /// value, otherwise.
10323  struct ValueSelect {
10324    template <typename U>
10325    static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326      return V;
10327    }
10328    template <typename U>
10329    static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330      return U();
10331    }
10332  };
10333  } // namespace
10334  
10335  /// Does the analysis of the provided shuffle masks and performs the requested
10336  /// actions on the vectors with the given shuffle masks. It tries to do it in
10337  /// several steps.
10338  /// 1. If the Base vector is not an undef vector, resize the very first mask to
10339  /// have a common VF and perform the action for 2 input vectors (including the
10340  /// non-undef Base). The other shuffle masks are combined with the result of
10341  /// the first stage and processed as a shuffle of 2 elements.
10342  /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10343  /// the action only for 1 vector with the given mask, if it is not the identity
10344  /// mask.
10345  /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10346  /// vectors, combining the masks properly between the steps.
10347  template <typename T>
10348  static T *performExtractsShuffleAction(
10349      MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350      function_ref<unsigned(T *)> GetVF,
10351      function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10352      function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10353    assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354    SmallVector<int> Mask(ShuffleMask.begin()->second);
10355    auto VMIt = std::next(ShuffleMask.begin());
10356    T *Prev = nullptr;
10357    SmallBitVector UseMask =
10358        buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10359    SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10360    if (!IsBaseUndef.all()) {
10361      // Base is not undef, need to combine it with the next subvectors.
10362      std::pair<T *, bool> Res =
10363          ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364      SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10365      for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366        if (Mask[Idx] == PoisonMaskElem)
10367          Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368        else
10369          Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370      }
10371      auto *V = ValueSelect::get<T *>(Base);
10372      (void)V;
10373      assert((!V || GetVF(V) == Mask.size()) &&
10374             "Expected base vector of VF number of elements.");
10375      Prev = Action(Mask, {nullptr, Res.first});
10376    } else if (ShuffleMask.size() == 1) {
10377      // Base is undef and only 1 vector is shuffled - perform the action only for
10378      // single vector, if the mask is not the identity mask.
10379      std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380                                              /*ForSingleMask=*/true);
10381      if (Res.second)
10382        // Identity mask is found.
10383        Prev = Res.first;
10384      else
10385        Prev = Action(Mask, {ShuffleMask.begin()->first});
10386    } else {
10387      // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388      // shuffles step by step, combining shuffle between the steps.
10389      unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390      unsigned Vec2VF = GetVF(VMIt->first);
10391      if (Vec1VF == Vec2VF) {
10392        // No need to resize the input vectors since they are of the same size, we
10393        // can shuffle them directly.
10394        ArrayRef<int> SecMask = VMIt->second;
10395        for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396          if (SecMask[I] != PoisonMaskElem) {
10397            assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398            Mask[I] = SecMask[I] + Vec1VF;
10399          }
10400        }
10401        Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402      } else {
10403        // Vectors of different sizes - resize and reshuffle.
10404        std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405                                                 /*ForSingleMask=*/false);
10406        std::pair<T *, bool> Res2 =
10407            ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408        ArrayRef<int> SecMask = VMIt->second;
10409        for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410          if (Mask[I] != PoisonMaskElem) {
10411            assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412            if (Res1.second)
10413              Mask[I] = I;
10414          } else if (SecMask[I] != PoisonMaskElem) {
10415            assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416            Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417          }
10418        }
10419        Prev = Action(Mask, {Res1.first, Res2.first});
10420      }
10421      VMIt = std::next(VMIt);
10422    }
10423    bool IsBaseNotUndef = !IsBaseUndef.all();
10424    (void)IsBaseNotUndef;
10425    // Perform requested actions for the remaining masks/vectors.
10426    for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427      // Shuffle other input vectors, if any.
10428      std::pair<T *, bool> Res =
10429          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430      ArrayRef<int> SecMask = VMIt->second;
10431      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432        if (SecMask[I] != PoisonMaskElem) {
10433          assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434                 "Multiple uses of scalars.");
10435          Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436        } else if (Mask[I] != PoisonMaskElem) {
10437          Mask[I] = I;
10438        }
10439      }
10440      Prev = Action(Mask, {Prev, Res.first});
10441    }
10442    return Prev;
10443  }
10444  
10445  InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446    InstructionCost Cost = 0;
10447    LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448                      << VectorizableTree.size() << ".\n");
10449  
10450    unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451  
10452    SmallPtrSet<Value *, 4> CheckedExtracts;
10453    for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454      TreeEntry &TE = *VectorizableTree[I];
10455      if (TE.isGather()) {
10456        if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10457            E && E->getVectorFactor() == TE.getVectorFactor() &&
10458            E->isSame(TE.Scalars)) {
10459          // Some gather nodes might be exactly the same as some vectorizable
10460          // nodes after reordering; handle them here.
10461          LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462                            << shortBundleName(TE.Scalars) << ".\n"
10463                            << "SLP: Current total cost = " << Cost << "\n");
10464          continue;
10465        }
10466      }
10467  
10468      InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10469      Cost += C;
10470      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471                        << shortBundleName(TE.Scalars) << ".\n"
10472                        << "SLP: Current total cost = " << Cost << "\n");
10473    }
10474  
10475    SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476    InstructionCost ExtractCost = 0;
10477    SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478    SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479    SmallVector<APInt> DemandedElts;
10480    SmallDenseSet<Value *, 4> UsedInserts;
10481    DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482    std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483    for (ExternalUser &EU : ExternalUses) {
10484      // We only add extract cost once for the same scalar.
10485      if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10486          !ExtractCostCalculated.insert(EU.Scalar).second)
10487        continue;
10488  
10489      // Uses by ephemeral values are free (because the ephemeral value will be
10490      // removed prior to code generation, and so the extraction will be
10491      // removed as well).
10492      if (EphValues.count(EU.User))
10493        continue;
10494  
10495      // No extract cost for vector "scalar"
10496      if (isa<FixedVectorType>(EU.Scalar->getType()))
10497        continue;
10498  
10499    // If the found user is an insertelement, do not calculate the extract cost
10500    // but try to detect it as a final shuffled/identity match.
10501      if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10502          VU && VU->getOperand(1) == EU.Scalar) {
10503        if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10504          if (!UsedInserts.insert(VU).second)
10505            continue;
10506          std::optional<unsigned> InsertIdx = getElementIndex(VU);
10507          if (InsertIdx) {
10508            const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10509            auto *It = find_if(
10510                FirstUsers,
10511                [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512                  return areTwoInsertFromSameBuildVector(
10513                      VU, cast<InsertElementInst>(Pair.first),
10514                      [this](InsertElementInst *II) -> Value * {
10515                        Value *Op0 = II->getOperand(0);
10516                        if (getTreeEntry(II) && !getTreeEntry(Op0))
10517                          return nullptr;
10518                        return Op0;
10519                      });
10520                });
10521            int VecId = -1;
10522            if (It == FirstUsers.end()) {
10523              (void)ShuffleMasks.emplace_back();
10524              SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525              if (Mask.empty())
10526                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10527              // Find the insertelement chain that is vectorized in the tree, if any.
10528              Value *Base = VU;
10529              while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10530                if (IEBase != EU.User &&
10531                    (!IEBase->hasOneUse() ||
10532                     getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10533                  break;
10534                // Build the mask for the vectorized insertelement instructions.
10535                if (const TreeEntry *E = getTreeEntry(IEBase)) {
10536                  VU = IEBase;
10537                  do {
10538                    IEBase = cast<InsertElementInst>(Base);
10539                    int Idx = *getElementIndex(IEBase);
10540                    assert(Mask[Idx] == PoisonMaskElem &&
10541                           "InsertElementInstruction used already.");
10542                    Mask[Idx] = Idx;
10543                    Base = IEBase->getOperand(0);
10544                  } while (E == getTreeEntry(Base));
10545                  break;
10546                }
10547                Base = cast<InsertElementInst>(Base)->getOperand(0);
10548              }
10549              FirstUsers.emplace_back(VU, ScalarTE);
10550              DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10551              VecId = FirstUsers.size() - 1;
10552              auto It = MinBWs.find(ScalarTE);
10553              if (It != MinBWs.end() &&
10554                  VectorCasts
10555                      .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10556                      .second) {
10557                unsigned BWSz = It->second.first;
10558                unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10559                unsigned VecOpcode;
10560                if (DstBWSz < BWSz)
10561                  VecOpcode = Instruction::Trunc;
10562                else
10563                  VecOpcode =
10564                      It->second.second ? Instruction::SExt : Instruction::ZExt;
10565                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10566                InstructionCost C = TTI->getCastInstrCost(
10567                    VecOpcode, FTy,
10568                    getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10569                                   FTy->getNumElements()),
10570                    TTI::CastContextHint::None, CostKind);
10571                LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572                                  << " for extending externally used vector with "
10573                                     "non-equal minimum bitwidth.\n");
10574                Cost += C;
10575              }
10576            } else {
10577              if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10578                It->first = VU;
10579              VecId = std::distance(FirstUsers.begin(), It);
10580            }
10581            int InIdx = *InsertIdx;
10582            SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583            if (Mask.empty())
10584              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10585            Mask[InIdx] = EU.Lane;
10586            DemandedElts[VecId].setBit(InIdx);
10587            continue;
10588          }
10589        }
10590      }
10591    // Leave the GEPs as-is; they are free in most cases, and it is better to
10592    // keep them as GEPs.
10593      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10594      if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10595        if (!ValueToExtUses) {
10596          ValueToExtUses.emplace();
10597          for_each(enumerate(ExternalUses), [&](const auto &P) {
10598            ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599          });
10600        }
10601        // The original GEP can be used if none of its operands is vectorized or
10602        // if they are already marked as externally used.
10603        bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10604          if (!getTreeEntry(V))
10605            return true;
10606          auto It = ValueToExtUses->find(V);
10607          if (It != ValueToExtUses->end()) {
10608            // Replace all uses to avoid compiler crash.
10609            ExternalUses[It->second].User = nullptr;
10610            return true;
10611          }
10612          return false;
10613        });
10614        if (CanBeUsedAsGEP) {
10615          ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10616          ExternalUsesAsGEPs.insert(EU.Scalar);
10617          continue;
10618        }
10619      }
10620  
10621    // If we plan to rewrite the tree in a smaller type, we will need to extend
10622    // the extracted value back to the original type. Here, we account for the
10623    // extract and the added cost of the extension if needed.
10624      auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10625      auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10626      if (It != MinBWs.end()) {
10627        auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10628        unsigned Extend =
10629            It->second.second ? Instruction::SExt : Instruction::ZExt;
10630        VecTy = getWidenedType(MinTy, BundleWidth);
10631        ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10632                                                     VecTy, EU.Lane);
10633      } else {
10634        ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10635                                               CostKind, EU.Lane);
10636      }
10637    }
10638    // Add reduced value cost, if resized.
10639    if (!VectorizedVals.empty()) {
10640      const TreeEntry &Root = *VectorizableTree.front();
10641      auto BWIt = MinBWs.find(&Root);
10642      if (BWIt != MinBWs.end()) {
10643        Type *DstTy = Root.Scalars.front()->getType();
10644        unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10645        unsigned SrcSz =
10646            ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647        if (OriginalSz != SrcSz) {
10648          unsigned Opcode = Instruction::Trunc;
10649          if (OriginalSz > SrcSz)
10650            Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651          Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10652          Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10653                                        TTI::CastContextHint::None,
10654                                        TTI::TCK_RecipThroughput);
10655        }
10656      }
10657    }
10658  
10659    InstructionCost SpillCost = getSpillCost();
10660    Cost += SpillCost + ExtractCost;
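         // If an external insertelement user covers a different number of lanes
         // than the vectorized node provides and the mask is not an identity,
         // account for an extra single-source permute to resize the vectorized
         // value.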
10661    auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662                                      bool) {
10663      InstructionCost C = 0;
10664      unsigned VF = Mask.size();
10665      unsigned VecVF = TE->getVectorFactor();
10666      if (VF != VecVF &&
10667          (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668           !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10669        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10671                  OrigMask.begin());
10672        C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc,
10673                                getWidenedType(TE->getMainOp()->getType(), VecVF),
10674                                OrigMask);
10675        LLVM_DEBUG(
10676            dbgs() << "SLP: Adding cost " << C
10677                   << " for final shuffle of insertelement external users.\n";
10678            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679        Cost += C;
10680        return std::make_pair(TE, true);
10681      }
10682      return std::make_pair(TE, false);
10683    };
10684    // Calculate the cost of the reshuffled vectors, if any.
10685    for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686      Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10687      auto Vector = ShuffleMasks[I].takeVector();
10688      unsigned VF = 0;
10689      auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690                                      ArrayRef<const TreeEntry *> TEs) {
10691        assert((TEs.size() == 1 || TEs.size() == 2) &&
10692               "Expected exactly 1 or 2 tree entries.");
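             // A single entry needs at most one single-source permute (skipped
             // when the mask is an identity); two entries need a two-source
             // permute.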
10693        if (TEs.size() == 1) {
10694          if (VF == 0)
10695            VF = TEs.front()->getVectorFactor();
10696          auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10697          if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10698              !all_of(enumerate(Mask), [=](const auto &Data) {
10699                return Data.value() == PoisonMaskElem ||
10700                       (Data.index() < VF &&
10701                        static_cast<int>(Data.index()) == Data.value());
10702              })) {
10703            InstructionCost C =
10704                TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
10705            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706                              << " for final shuffle of insertelement "
10707                                 "external users.\n";
10708                       TEs.front()->dump();
10709                       dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710            Cost += C;
10711          }
10712        } else {
10713          if (VF == 0) {
10714            if (TEs.front() &&
10715                TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716              VF = TEs.front()->getVectorFactor();
10717            else
10718              VF = Mask.size();
10719          }
10720          auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10721          InstructionCost C =
10722              ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10723          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724                            << " for final shuffle of vector node and external "
10725                               "insertelement users.\n";
10726                     if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728          Cost += C;
10729        }
10730        VF = Mask.size();
10731        return TEs.back();
10732      };
10733      (void)performExtractsShuffleAction<const TreeEntry>(
10734          MutableArrayRef(Vector.data(), Vector.size()), Base,
10735          [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10736          EstimateShufflesCost);
10737      InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738          cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10739          /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10740      Cost -= InsertCost;
10741    }
10742  
10743    // Add the cost for reduced value resize (if required).
10744    if (ReductionBitWidth != 0) {
10745      assert(UserIgnoreList && "Expected reduction tree.");
10746      const TreeEntry &E = *VectorizableTree.front();
10747      auto It = MinBWs.find(&E);
10748      if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749        unsigned SrcSize = It->second.first;
10750        unsigned DstSize = ReductionBitWidth;
10751        unsigned Opcode = Instruction::Trunc;
10752        if (SrcSize < DstSize)
10753          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754        auto *SrcVecTy =
10755            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10756        auto *DstVecTy =
10757            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
10758        TTI::CastContextHint CCH = getCastContextHint(E);
10759        InstructionCost CastCost;
10760        switch (E.getOpcode()) {
10761        case Instruction::SExt:
10762        case Instruction::ZExt:
10763        case Instruction::Trunc: {
10764          const TreeEntry *OpTE = getOperandEntry(&E, 0);
10765          CCH = getCastContextHint(*OpTE);
10766          break;
10767        }
10768        default:
10769          break;
10770        }
10771        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10772                                          TTI::TCK_RecipThroughput);
10773        Cost += CastCost;
10774        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775                          << " for final resize for reduction from " << SrcVecTy
10776                          << " to " << DstVecTy << "\n";
10777                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778      }
10779    }
10780  
10781  #ifndef NDEBUG
10782    SmallString<256> Str;
10783    {
10784      raw_svector_ostream OS(Str);
10785      OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786         << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787         << "SLP: Total Cost = " << Cost << ".\n";
10788    }
10789    LLVM_DEBUG(dbgs() << Str);
10790    if (ViewSLPTree)
10791      ViewGraph(this, "SLP" + F->getName(), false, Str);
10792  #endif
10793  
10794    return Cost;
10795  }
10796  
10797  /// Tries to find extractelement instructions with constant indices from a
10798  /// fixed vector type and gathers such instructions into a bunch, which is
10799  /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10800  /// attempt was successful, the matched scalars are replaced by poison values
10801  /// in \p VL for future analysis.
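       /// A purely illustrative example (not taken from a real test case): given
       /// gathered scalars such as
       /// \code
       /// %e0 = extractelement <4 x i32> %v, i32 1
       /// %e1 = extractelement <4 x i32> %v, i32 0
       /// \endcode
       /// the pair can be recognized as a single-source shuffle
       /// (TTI::SK_PermuteSingleSrc) of %v with mask <1, 0>; the matched scalars
       /// are then replaced by poison in \p VL and the mask is recorded in \p Mask.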
10802  std::optional<TTI::ShuffleKind>
10803  BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804      MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805    // Scan list of gathered scalars for extractelements that can be represented
10806    // as shuffles.
10807    MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808    SmallVector<int> UndefVectorExtracts;
10809    for (int I = 0, E = VL.size(); I < E; ++I) {
10810      auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10811      if (!EI) {
10812        if (isa<UndefValue>(VL[I]))
10813          UndefVectorExtracts.push_back(I);
10814        continue;
10815      }
10816      auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10817      if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10818        continue;
10819      std::optional<unsigned> Idx = getExtractIndex(EI);
10820      // Undefined index.
10821      if (!Idx) {
10822        UndefVectorExtracts.push_back(I);
10823        continue;
10824      }
10825      SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826      ExtractMask.reset(*Idx);
10827      if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10828        UndefVectorExtracts.push_back(I);
10829        continue;
10830      }
10831      VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10832    }
10833    // Sort the vector operands by the maximum number of uses in extractelements.
10834    SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835        VectorOpToIdx.takeVector();
10836    stable_sort(Vectors, [](const auto &P1, const auto &P2) {
10837      return P1.second.size() > P2.second.size();
10838    });
10839    // Find the best pair of the vectors or a single vector.
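         // A worked example with hypothetical operands: if vector %v0 feeds 3 of
         // the extracts, %v1 feeds 2 of them, and there are 2 undef positions, then
         // SingleMax = 3 + 2 = 5 and PairMax = 5 + 2 = 7, so the two-vector shuffle
         // is preferred below.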
10840    const int UndefSz = UndefVectorExtracts.size();
10841    unsigned SingleMax = 0;
10842    unsigned PairMax = 0;
10843    if (!Vectors.empty()) {
10844      SingleMax = Vectors.front().second.size() + UndefSz;
10845      if (Vectors.size() > 1) {
10846        auto *ItNext = std::next(Vectors.begin());
10847        PairMax = SingleMax + ItNext->second.size();
10848      }
10849    }
10850    if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851      return std::nullopt;
10852    // Check if better to perform a shuffle of 2 vectors or just of a single
10853    // vector.
10854    SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855    SmallVector<Value *> GatheredExtracts(
10856        VL.size(), PoisonValue::get(VL.front()->getType()));
10857    if (SingleMax >= PairMax && SingleMax) {
10858      for (int Idx : Vectors.front().second)
10859        std::swap(GatheredExtracts[Idx], VL[Idx]);
10860    } else if (!Vectors.empty()) {
10861      for (unsigned Idx : {0, 1})
10862        for (int Idx : Vectors[Idx].second)
10863          std::swap(GatheredExtracts[Idx], VL[Idx]);
10864    }
10865    // Add extracts from undefs too.
10866    for (int Idx : UndefVectorExtracts)
10867      std::swap(GatheredExtracts[Idx], VL[Idx]);
10868    // Check that the gather of extractelements can be represented as just a
10869    // shuffle of one or two vectors from which the scalars are extracted.
10870    std::optional<TTI::ShuffleKind> Res =
10871        isFixedVectorShuffle(GatheredExtracts, Mask);
10872    if (!Res) {
10873      // TODO: try to check other subsets if possible.
10874      // Restore the original VL if attempt was not successful.
10875      copy(SavedVL, VL.begin());
10876      return std::nullopt;
10877    }
10878    // Restore unused scalars from mask, if some of the extractelements were not
10879    // selected for shuffle.
10880    for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881      if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10882          isa<UndefValue>(GatheredExtracts[I])) {
10883        std::swap(VL[I], GatheredExtracts[I]);
10884        continue;
10885      }
10886      auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10887      if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10888          !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10889          is_contained(UndefVectorExtracts, I))
10890        continue;
10891    }
10892    return Res;
10893  }
10894  
10895  /// Tries to find extractelement instructions with constant indices from a
10896  /// fixed vector type and gathers such instructions into a bunch, which is
10897  /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10898  /// attempt was successful, the matched scalars are replaced by poison values
10899  /// in \p VL for future analysis.
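       /// Unlike the single-register variant above, this overload splits \p VL
       /// into \p NumParts contiguous slices and analyzes each slice on its own.
       /// A hypothetical sketch, assuming 8 scalars and NumParts == 2:
       /// \code
       /// part 0: VL[0..3] -> returned shuffle kind 0
       /// part 1: VL[4..7] -> returned shuffle kind 1
       /// \endcode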
10900  SmallVector<std::optional<TTI::ShuffleKind>>
10901  BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902                                      SmallVectorImpl<int> &Mask,
10903                                      unsigned NumParts) const {
10904    assert(NumParts > 0 && "NumParts expected to be at least 1.");
10905    SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906    Mask.assign(VL.size(), PoisonMaskElem);
10907    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10908    for (unsigned Part : seq<unsigned>(NumParts)) {
10909      // Scan list of gathered scalars for extractelements that can be represented
10910      // as shuffles.
10911      MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912          Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
10913      SmallVector<int> SubMask;
10914      std::optional<TTI::ShuffleKind> Res =
10915          tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10916      ShufflesRes[Part] = Res;
10917      copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10918    }
10919    if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10920          return Res.has_value();
10921        }))
10922      ShufflesRes.clear();
10923    return ShufflesRes;
10924  }
10925  
10926  std::optional<TargetTransformInfo::ShuffleKind>
10927  BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930    Entries.clear();
10931    // TODO: currently checking only for Scalars in the tree entry, need to count
10932    // reused elements too for better cost estimation.
10933    const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934    const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10935    const BasicBlock *TEInsertBlock = nullptr;
10936    // Main node of PHI entries keeps the correct order of operands/incoming
10937    // blocks.
10938    if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10939      TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10940      TEInsertPt = TEInsertBlock->getTerminator();
10941    } else {
10942      TEInsertBlock = TEInsertPt->getParent();
10943    }
10944    if (!DT->isReachableFromEntry(TEInsertBlock))
10945      return std::nullopt;
10946    auto *NodeUI = DT->getNode(TEInsertBlock);
10947    assert(NodeUI && "Should only process reachable instructions");
10948    SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949    auto CheckOrdering = [&](const Instruction *InsertPt) {
10950      // Argument InsertPt is an instruction where vector code for some other
10951      // tree entry (one that shares one or more scalars with TE) is going to be
10952      // generated. This lambda returns true if insertion point of vector code
10953      // for the TE dominates that point (otherwise dependency is the other way
10954      // around). The other node is not limited to be of a gather kind. Gather
10955      // nodes are not scheduled and their vector code is inserted before their
10956      // first user. If user is PHI, that is supposed to be at the end of a
10957      // predecessor block. Otherwise it is the last instruction among scalars of
10958      // the user node. So, instead of checking dependency between instructions
10959      // themselves, we check dependency between their insertion points for vector
10960      // code (since each scalar instruction ends up as a lane of a vector
10961      // instruction).
10962      const BasicBlock *InsertBlock = InsertPt->getParent();
10963      auto *NodeEUI = DT->getNode(InsertBlock);
10964      if (!NodeEUI)
10965        return false;
10966      assert((NodeUI == NodeEUI) ==
10967                 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968             "Different nodes should have different DFS numbers");
10969      // Check the order of the gather nodes' users.
10970      if (TEInsertPt->getParent() != InsertBlock &&
10971          (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10972        return false;
10973      if (TEInsertPt->getParent() == InsertBlock &&
10974          TEInsertPt->comesBefore(InsertPt))
10975        return false;
10976      return true;
10977    };
10978    // Find all tree entries used by the gathered values. If no common entries
10979    // are found, this is not a shuffle.
10980    // Here we build a set of tree nodes for each gathered value and try to
10981    // find the intersection between these sets. If we have at least one common
10982    // tree node for each gathered value, we have just a permutation of a
10983    // single vector. If we have 2 different sets, we're in a situation where we
10984    // have a permutation of 2 input vectors.
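         // Illustrative example (hypothetical values): for VL = {a, b, c, d}, if
         // {a, b} are found in tree entry T1 and {c, d} in tree entry T2, UsedTEs
         // ends up holding the two sets {T1} and {T2} and the gather becomes a
         // permutation of 2 input vectors; if all four values share one common
         // entry, it is a permutation of a single vector.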
10985    SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986    DenseMap<Value *, int> UsedValuesEntry;
10987    for (Value *V : VL) {
10988      if (isConstant(V))
10989        continue;
10990      // Build a list of tree entries where V is used.
10991      SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992      for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10993        if (TEPtr == TE)
10994          continue;
10995        assert(any_of(TEPtr->Scalars,
10996                      [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997               "Must contain at least single gathered value.");
10998        assert(TEPtr->UserTreeIndices.size() == 1 &&
10999               "Expected only single user of a gather node.");
11000        const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001  
11002        PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11003        const Instruction *InsertPt =
11004            UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11005                    : &getLastInstructionInBundle(UseEI.UserTE);
11006        if (TEInsertPt == InsertPt) {
11007          // If 2 gathers are operands of the same entry (regardless of whether
11008          // the user is a PHI or not), compare operand indices and use the
11009          // earlier one as the base.
11010          if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011            continue;
11012          // If the user instruction is used for some reason in different
11013          // vectorized nodes - make it depend on index.
11014          if (TEUseEI.UserTE != UseEI.UserTE &&
11015              TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016            continue;
11017        }
11018  
11019        // Check if the user node of the TE comes after user node of TEPtr,
11020        // otherwise TEPtr depends on TE.
11021        if ((TEInsertBlock != InsertPt->getParent() ||
11022             TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023            !CheckOrdering(InsertPt))
11024          continue;
11025        VToTEs.insert(TEPtr);
11026      }
11027      if (const TreeEntry *VTE = getTreeEntry(V)) {
11028        if (ForOrder) {
11029          if (VTE->State != TreeEntry::Vectorize) {
11030            auto It = MultiNodeScalars.find(V);
11031            if (It == MultiNodeScalars.end())
11032              continue;
11033            VTE = *It->getSecond().begin();
11034            // Iterate through all vectorized nodes.
11035            auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11036              return MTE->State == TreeEntry::Vectorize;
11037            });
11038            if (MIt == It->getSecond().end())
11039              continue;
11040            VTE = *MIt;
11041          }
11042        }
11043        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11044        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045          continue;
11046        VToTEs.insert(VTE);
11047      }
11048      if (VToTEs.empty())
11049        continue;
11050      if (UsedTEs.empty()) {
11051        // First iteration: just insert the list of nodes into the vector.
11052        UsedTEs.push_back(VToTEs);
11053        UsedValuesEntry.try_emplace(V, 0);
11054      } else {
11055        // Need to check if there are any previously used tree nodes which use V.
11056        // If there are no such nodes, consider that we have another input
11057        // vector.
11058        SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059        unsigned Idx = 0;
11060        for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061          // Do we have a non-empty intersection of previously listed tree entries
11062          // and tree entries using current V?
11063          set_intersect(VToTEs, Set);
11064          if (!VToTEs.empty()) {
11065            // Yes, write the new subset and continue analysis for the next
11066            // scalar.
11067            Set.swap(VToTEs);
11068            break;
11069          }
11070          VToTEs = SavedVToTEs;
11071          ++Idx;
11072        }
11073        // No non-empty intersection found - need to add a second set of possible
11074        // source vectors.
11075        if (Idx == UsedTEs.size()) {
11076          // If the number of input vectors is greater than 2, this is not a
11077          // permutation; fall back to the regular gather.
11078          // TODO: support multiple reshuffled nodes.
11079          if (UsedTEs.size() == 2)
11080            continue;
11081          UsedTEs.push_back(SavedVToTEs);
11082          Idx = UsedTEs.size() - 1;
11083        }
11084        UsedValuesEntry.try_emplace(V, Idx);
11085      }
11086    }
11087  
11088    if (UsedTEs.empty()) {
11089      Entries.clear();
11090      return std::nullopt;
11091    }
11092  
11093    unsigned VF = 0;
11094    if (UsedTEs.size() == 1) {
11095      // Keep the order to avoid non-determinism.
11096      SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097                                                  UsedTEs.front().end());
11098      sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099        return TE1->Idx < TE2->Idx;
11100      });
11101    // Try to find a perfect match in another gather node first.
11102      auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11103        return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11104      });
11105      if (It != FirstEntries.end() &&
11106          ((*It)->getVectorFactor() == VL.size() ||
11107           ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108            TE->ReuseShuffleIndices.size() == VL.size() &&
11109            (*It)->isSame(TE->Scalars)))) {
11110        Entries.push_back(*It);
11111        if ((*It)->getVectorFactor() == VL.size()) {
11112          std::iota(std::next(Mask.begin(), Part * VL.size()),
11113                    std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11114        } else {
11115          SmallVector<int> CommonMask = TE->getCommonMask();
11116          copy(CommonMask, Mask.begin());
11117        }
11118        // Clear undef scalars.
11119        for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120          if (isa<PoisonValue>(VL[I]))
11121            Mask[I] = PoisonMaskElem;
11122        return TargetTransformInfo::SK_PermuteSingleSrc;
11123      }
11124      // No perfect match, just shuffle, so choose the first tree node from the
11125      // tree.
11126      Entries.push_back(FirstEntries.front());
11127    } else {
11128      // Try to find nodes with the same vector factor.
11129      assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130      // Keep the order of tree nodes to avoid non-determinism.
11131      DenseMap<int, const TreeEntry *> VFToTE;
11132      for (const TreeEntry *TE : UsedTEs.front()) {
11133        unsigned VF = TE->getVectorFactor();
11134        auto It = VFToTE.find(VF);
11135        if (It != VFToTE.end()) {
11136          if (It->second->Idx > TE->Idx)
11137            It->getSecond() = TE;
11138          continue;
11139        }
11140        VFToTE.try_emplace(VF, TE);
11141      }
11142      // Same, keep the order to avoid non-determinism.
11143      SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144                                                   UsedTEs.back().end());
11145      sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146        return TE1->Idx < TE2->Idx;
11147      });
11148      for (const TreeEntry *TE : SecondEntries) {
11149        auto It = VFToTE.find(TE->getVectorFactor());
11150        if (It != VFToTE.end()) {
11151          VF = It->first;
11152          Entries.push_back(It->second);
11153          Entries.push_back(TE);
11154          break;
11155        }
11156      }
11157      // No 2 source vectors with the same vector factor - just choose 2 with max
11158      // index.
11159      if (Entries.empty()) {
11160        Entries.push_back(*llvm::max_element(
11161            UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162              return TE1->Idx < TE2->Idx;
11163            }));
11164        Entries.push_back(SecondEntries.front());
11165        VF = std::max(Entries.front()->getVectorFactor(),
11166                      Entries.back()->getVectorFactor());
11167      }
11168    }
11169  
11170    bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11171    // Checks if the 2 PHIs are compatible, i.e. have a high probability of
11172    // being vectorized together.
11173    auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174      auto *PHI = cast<PHINode>(V);
11175      auto *PHI1 = cast<PHINode>(V1);
11176      // Check that all incoming values are compatible/from the same parent (if
11177      // they are instructions).
11178      // The incoming values are compatible if they are all constants, or
11179      // instructions with the same/alternate opcodes from the same basic block.
11180      for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181        Value *In = PHI->getIncomingValue(I);
11182        Value *In1 = PHI1->getIncomingValue(I);
11183        if (isConstant(In) && isConstant(In1))
11184          continue;
11185        if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11186          return false;
11187        if (cast<Instruction>(In)->getParent() !=
11188            cast<Instruction>(In1)->getParent())
11189          return false;
11190      }
11191      return true;
11192    };
11193    // Check if the value can be ignored during analysis for shuffled gathers.
11194    // We suppose it is better to ignore instructions which do not form splats,
11195    // are not vectorized/not extractelements (these instructions will be handled
11196    // by extractelements processing) or may form a vector node in the future.
11197    auto MightBeIgnored = [=](Value *V) {
11198      auto *I = dyn_cast<Instruction>(V);
11199      return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11200             !isVectorLikeInstWithConstOps(I) &&
11201             !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11202    };
11203    // Check that the neighbor instruction may form a full vector node with the
11204    // current instruction V. It is possible if they have the same/alternate
11205    // opcode and the same parent basic block.
11206    auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207      Value *V1 = VL[Idx];
11208      bool UsedInSameVTE = false;
11209      auto It = UsedValuesEntry.find(V1);
11210      if (It != UsedValuesEntry.end())
11211        UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11212      return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213             getSameOpcode({V, V1}, *TLI).getOpcode() &&
11214             cast<Instruction>(V)->getParent() ==
11215                 cast<Instruction>(V1)->getParent() &&
11216             (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11217    };
11218    // Build a shuffle mask for better cost estimation and vector emission.
11219    SmallBitVector UsedIdxs(Entries.size());
11220    SmallVector<std::pair<unsigned, int>> EntryLanes;
11221    for (int I = 0, E = VL.size(); I < E; ++I) {
11222      Value *V = VL[I];
11223      auto It = UsedValuesEntry.find(V);
11224      if (It == UsedValuesEntry.end())
11225        continue;
11226      // Do not try to shuffle scalars if they are constants, or instructions
11227      // that can be vectorized later, as a result of vectorizing the vector
11228      // build sequence that follows.
11229      if (isConstant(V) || (MightBeIgnored(V) &&
11230                            ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231                             (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232        continue;
11233      unsigned Idx = It->second;
11234      EntryLanes.emplace_back(Idx, I);
11235      UsedIdxs.set(Idx);
11236    }
11237    // Iterate through all shuffled scalars and select entries, which can be used
11238    // for final shuffle.
11239    SmallVector<const TreeEntry *> TempEntries;
11240    for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241      if (!UsedIdxs.test(I))
11242        continue;
11243      // Fix the entry number for the given scalar. If it is the first entry, set
11244      // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
11245      // These indices are used when calculating final shuffle mask as the vector
11246      // offset.
11247      for (std::pair<unsigned, int> &Pair : EntryLanes)
11248        if (Pair.first == I)
11249          Pair.first = TempEntries.size();
11250      TempEntries.push_back(Entries[I]);
11251    }
11252    Entries.swap(TempEntries);
11253    if (EntryLanes.size() == Entries.size() &&
11254        !VL.equals(ArrayRef(TE->Scalars)
11255                       .slice(Part * VL.size(),
11256                              std::min<int>(VL.size(), TE->Scalars.size())))) {
11257      // We may have only 1 or 2 entries here. If the number of scalars is equal
11258      // to the number of entries, there is no need to do the analysis, it is not
11259      // very profitable. Since VL is not the same as TE->Scalars, it means we
11260      // already have some shuffles before. Cut off the non-profitable case.
11261      Entries.clear();
11262      return std::nullopt;
11263    }
11264    // Build the final mask, check for the identity shuffle, if possible.
11265    bool IsIdentity = Entries.size() == 1;
11266    // Pair.first is the offset to the vector, while Pair.second is the index of
11267    // scalar in the list.
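         // A hypothetical worked example: with VF == 4, a scalar taken from lane 2
         // of the second selected entry (Pair.first == 1) gets the mask value
         // 1 * 4 + 2 == 6, stored at position Part * VL.size() + Pair.second.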
11268    for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269      unsigned Idx = Part * VL.size() + Pair.second;
11270      Mask[Idx] =
11271          Pair.first * VF +
11272          (ForOrder ? std::distance(
11273                          Entries[Pair.first]->Scalars.begin(),
11274                          find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11275                    : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11276      IsIdentity &= Mask[Idx] == Pair.second;
11277    }
11278    switch (Entries.size()) {
11279    case 1:
11280      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281        return TargetTransformInfo::SK_PermuteSingleSrc;
11282      break;
11283    case 2:
11284      if (EntryLanes.size() > 2 || VL.size() <= 2)
11285        return TargetTransformInfo::SK_PermuteTwoSrc;
11286      break;
11287    default:
11288      break;
11289    }
11290    Entries.clear();
11291    // Clear the corresponding mask elements.
11292    std::fill(std::next(Mask.begin(), Part * VL.size()),
11293              std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11294    return std::nullopt;
11295  }
11296  
11297  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11298  BoUpSLP::isGatherShuffledEntry(
11299      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301      bool ForOrder) {
11302    assert(NumParts > 0 && NumParts < VL.size() &&
11303           "Expected positive number of registers.");
11304    Entries.clear();
11305    // No need to check for the topmost gather node.
11306    if (TE == VectorizableTree.front().get())
11307      return {};
11308    // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309    if (TE->isNonPowOf2Vec())
11310      return {};
11311    Mask.assign(VL.size(), PoisonMaskElem);
11312    assert(TE->UserTreeIndices.size() == 1 &&
11313           "Expected only single user of the gather node.");
11314    assert(VL.size() % NumParts == 0 &&
11315           "Number of scalars must be divisible by NumParts.");
11316    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11317    SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318    for (unsigned Part : seq<unsigned>(NumParts)) {
11319      ArrayRef<Value *> SubVL =
11320          VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11321      SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322      std::optional<TTI::ShuffleKind> SubRes =
11323          isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11324                                              ForOrder);
11325      if (!SubRes)
11326        SubEntries.clear();
11327      Res.push_back(SubRes);
11328      if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329          SubEntries.front()->getVectorFactor() == VL.size() &&
11330          (SubEntries.front()->isSame(TE->Scalars) ||
11331           SubEntries.front()->isSame(VL))) {
11332        SmallVector<const TreeEntry *> LocalSubEntries;
11333        LocalSubEntries.swap(SubEntries);
11334        Entries.clear();
11335        Res.clear();
11336        std::iota(Mask.begin(), Mask.end(), 0);
11337        // Clear undef scalars.
11338        for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339          if (isa<PoisonValue>(VL[I]))
11340            Mask[I] = PoisonMaskElem;
11341        Entries.emplace_back(1, LocalSubEntries.front());
11342        Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
11343        return Res;
11344      }
11345    }
11346    if (all_of(Res,
11347               [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348      Entries.clear();
11349      return {};
11350    }
11351    return Res;
11352  }
11353  
11354  InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355                                         Type *ScalarTy) const {
11356    auto *VecTy = getWidenedType(ScalarTy, VL.size());
11357    bool DuplicateNonConst = false;
11358    // Find the cost of inserting/extracting values from the vector.
11359    // Check if the same elements are inserted several times and count them as
11360    // shuffle candidates.
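         // Illustrative example (hypothetical scalars): for VL = {a, b, a, c} the
         // second occurrence of 'a' is not inserted again; element 2 is marked in
         // ShuffledElements and ShuffleMask[2] points back at lane 0, so the
         // duplicate is produced by the final single-source shuffle instead.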
11361    APInt ShuffledElements = APInt::getZero(VL.size());
11362    DenseMap<Value *, unsigned> UniqueElements;
11363    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364    InstructionCost Cost;
11365    auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366      if (V->getType() != ScalarTy) {
11367        Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11368                                      TTI::CastContextHint::None, CostKind);
11369        V = nullptr;
11370      }
11371      if (!ForPoisonSrc)
11372        Cost +=
11373            TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11374                                    I, Constant::getNullValue(VecTy), V);
11375    };
11376    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377    for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378      Value *V = VL[I];
11379      // No need to shuffle duplicates for constants.
11380      if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11381        ShuffledElements.setBit(I);
11382        ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11383        continue;
11384      }
11385  
11386      auto Res = UniqueElements.try_emplace(V, I);
11387      if (Res.second) {
11388        EstimateInsertCost(I, V);
11389        ShuffleMask[I] = I;
11390        continue;
11391      }
11392  
11393      DuplicateNonConst = true;
11394      ShuffledElements.setBit(I);
11395      ShuffleMask[I] = Res.first->second;
11396    }
11397    if (ForPoisonSrc)
11398      Cost =
11399          TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11400                                        /*Extract*/ false, CostKind);
11401    if (DuplicateNonConst)
11402      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11403                                  VecTy, ShuffleMask);
11404    return Cost;
11405  }
11406  
11407  // Perform operand reordering on the instructions in VL and return the reordered
11408  // operands in Left and Right.
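       // A hypothetical example: for VL = { add %a0, %b0 ; add %b1, %a1 } the
       // reordering may swap the operands of the second add, so that
       // Left = {%a0, %a1} and Right = {%b0, %b1}, giving each vectorized operand
       // a consistent set of origins.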
11409  void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410                                               SmallVectorImpl<Value *> &Left,
11411                                               SmallVectorImpl<Value *> &Right,
11412                                               const BoUpSLP &R) {
11413    if (VL.empty())
11414      return;
11415    VLOperands Ops(VL, R);
11416    // Reorder the operands in place.
11417    Ops.reorder();
11418    Left = Ops.getVL(0);
11419    Right = Ops.getVL(1);
11420  }
11421  
11422  Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423    auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11424    if (Res.second)
11425      return *Res.second;
11426    // Get the basic block this bundle is in. All instructions in the bundle
11427    // should be in this block (except for extractelement-like instructions with
11428    // constant indices).
11429    auto *Front = E->getMainOp();
11430    auto *BB = Front->getParent();
11431    assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432      if (E->getOpcode() == Instruction::GetElementPtr &&
11433          !isa<GetElementPtrInst>(V))
11434        return true;
11435      auto *I = cast<Instruction>(V);
11436      return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437             isVectorLikeInstWithConstOps(I);
11438    }));
11439  
11440    auto FindLastInst = [&]() {
11441      Instruction *LastInst = Front;
11442      for (Value *V : E->Scalars) {
11443        auto *I = dyn_cast<Instruction>(V);
11444        if (!I)
11445          continue;
11446        if (LastInst->getParent() == I->getParent()) {
11447          if (LastInst->comesBefore(I))
11448            LastInst = I;
11449          continue;
11450        }
11451        assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452                 !isa<GetElementPtrInst>(I)) ||
11453                (isVectorLikeInstWithConstOps(LastInst) &&
11454                 isVectorLikeInstWithConstOps(I))) &&
11455               "Expected vector-like or non-GEP in GEP node insts only.");
11456        if (!DT->isReachableFromEntry(LastInst->getParent())) {
11457          LastInst = I;
11458          continue;
11459        }
11460        if (!DT->isReachableFromEntry(I->getParent()))
11461          continue;
11462        auto *NodeA = DT->getNode(LastInst->getParent());
11463        auto *NodeB = DT->getNode(I->getParent());
11464        assert(NodeA && "Should only process reachable instructions");
11465        assert(NodeB && "Should only process reachable instructions");
11466        assert((NodeA == NodeB) ==
11467                   (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468               "Different nodes should have different DFS numbers");
11469        if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470          LastInst = I;
11471      }
11472      BB = LastInst->getParent();
11473      return LastInst;
11474    };
11475  
11476    auto FindFirstInst = [&]() {
11477      Instruction *FirstInst = Front;
11478      for (Value *V : E->Scalars) {
11479        auto *I = dyn_cast<Instruction>(V);
11480        if (!I)
11481          continue;
11482        if (FirstInst->getParent() == I->getParent()) {
11483          if (I->comesBefore(FirstInst))
11484            FirstInst = I;
11485          continue;
11486        }
11487        assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488                !isa<GetElementPtrInst>(I)) ||
11489               (isVectorLikeInstWithConstOps(FirstInst) &&
11490                isVectorLikeInstWithConstOps(I))) &&
11491                   "Expected vector-like or non-GEP in GEP node insts only.");
11492        if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11493          FirstInst = I;
11494          continue;
11495        }
11496        if (!DT->isReachableFromEntry(I->getParent()))
11497          continue;
11498        auto *NodeA = DT->getNode(FirstInst->getParent());
11499        auto *NodeB = DT->getNode(I->getParent());
11500        assert(NodeA && "Should only process reachable instructions");
11501        assert(NodeB && "Should only process reachable instructions");
11502        assert((NodeA == NodeB) ==
11503                   (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504               "Different nodes should have different DFS numbers");
11505        if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506          FirstInst = I;
11507      }
11508      return FirstInst;
11509    };
11510  
11511    // Set the insert point to the beginning of the basic block if the entry
11512    // should not be scheduled.
11513    if (doesNotNeedToSchedule(E->Scalars) ||
11514        (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11515      if ((E->getOpcode() == Instruction::GetElementPtr &&
11516           any_of(E->Scalars,
11517                  [](Value *V) {
11518                    return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11519                  })) ||
11520          all_of(E->Scalars,
11521                 [](Value *V) {
11522                   return !isVectorLikeInstWithConstOps(V) &&
11523                          isUsedOutsideBlock(V);
11524                 }) ||
11525          (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11526             return isa<ExtractElementInst, UndefValue>(V) ||
11527                    areAllOperandsNonInsts(V);
11528           })))
11529        Res.second = FindLastInst();
11530      else
11531        Res.second = FindFirstInst();
11532      return *Res.second;
11533    }
11534  
11535    // Find the last instruction. The common case should be that BB has been
11536    // scheduled, and the last instruction is VL.back(). So we start with
11537    // VL.back() and iterate over schedule data until we reach the end of the
11538    // bundle. The end of the bundle is marked by null ScheduleData.
11539    if (BlocksSchedules.count(BB)) {
11540      Value *V = E->isOneOf(E->Scalars.back());
11541      if (doesNotNeedToBeScheduled(V))
11542        V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11543      auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544      if (Bundle && Bundle->isPartOfBundle())
11545        for (; Bundle; Bundle = Bundle->NextInBundle)
11546          if (Bundle->OpValue == Bundle->Inst)
11547            Res.second = Bundle->Inst;
11548    }
11549  
11550    // LastInst can still be null at this point if there's either not an entry
11551    // for BB in BlocksSchedules or there's no ScheduleData available for
11552    // VL.back(). This can be the case if buildTree_rec aborts for various
11553    // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554    // size is reached, etc.). ScheduleData is initialized in the scheduling
11555    // "dry-run".
11556    //
11557    // If this happens, we can still find the last instruction by brute force. We
11558    // iterate forwards from Front (inclusive) until we either see all
11559    // instructions in the bundle or reach the end of the block. If Front is the
11560    // last instruction in program order, LastInst will be set to Front, and we
11561    // will visit all the remaining instructions in the block.
11562    //
11563    // One of the reasons we exit early from buildTree_rec is to place an upper
11564    // bound on compile-time. Thus, taking an additional compile-time hit here is
11565    // not ideal. However, this should be exceedingly rare since it requires that
11566    // we both exit early from buildTree_rec and that the bundle be out-of-order
11567    // (causing us to iterate all the way to the end of the block).
11568    if (!Res.second)
11569      Res.second = FindLastInst();
11570    assert(Res.second && "Failed to find last instruction in bundle");
11571    return *Res.second;
11572  }
11573  
11574  void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575    auto *Front = E->getMainOp();
11576    Instruction *LastInst = &getLastInstructionInBundle(E);
11577    assert(LastInst && "Failed to find last instruction in bundle");
11578    BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579    // If the instruction is PHI, set the insert point after all the PHIs.
11580    bool IsPHI = isa<PHINode>(LastInst);
11581    if (IsPHI)
11582      LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583    if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11584      Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11585    } else {
11586      // Set the insertion point after the last instruction in the bundle. Set the
11587      // debug location to Front.
11588      Builder.SetInsertPoint(
11589          LastInst->getParent(),
11590          LastInst->getNextNonDebugInstruction()->getIterator());
11591    }
11592    Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593  }
11594  
11595  Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596    // List of instructions/lanes from the current block and/or the blocks which
11597    // are part of the current loop. These instructions will be inserted at the
11598    // end to make it possible to optimize loops and hoist invariant instructions
11599    // out of the loop's body with better chances of success.
11600    SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11601    SmallSet<int, 4> PostponedIndices;
11602    Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11603    auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11604      SmallPtrSet<BasicBlock *, 4> Visited;
11605      while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11606        InsertBB = InsertBB->getSinglePredecessor();
11607      return InsertBB && InsertBB == InstBB;
11608    };
11609    for (int I = 0, E = VL.size(); I < E; ++I) {
11610      if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11611        if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612             getTreeEntry(Inst) ||
11613             (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11614            PostponedIndices.insert(I).second)
11615          PostponedInsts.emplace_back(Inst, I);
11616    }
11617  
11618    auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619                                        Type *Ty) {
11620      Value *Scalar = V;
11621      if (Scalar->getType() != Ty) {
11622        assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623               "Expected integer types only.");
11624        Value *V = Scalar;
11625        if (auto *CI = dyn_cast<CastInst>(Scalar);
11626            isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11627          Value *Op = CI->getOperand(0);
11628          if (auto *IOp = dyn_cast<Instruction>(Op);
11629              !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11630            V = Op;
11631        }
11632        Scalar = Builder.CreateIntCast(
11633            V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11634      }
11635  
11636      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11637      auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11638      if (!InsElt)
11639        return Vec;
11640      GatherShuffleExtractSeq.insert(InsElt);
11641      CSEBlocks.insert(InsElt->getParent());
11642      // Add to our 'need-to-extract' list.
11643      if (isa<Instruction>(V)) {
11644        if (TreeEntry *Entry = getTreeEntry(V)) {
11645          // Find which lane we need to extract.
11646          User *UserOp = nullptr;
11647          if (Scalar != V) {
11648            if (auto *SI = dyn_cast<Instruction>(Scalar))
11649              UserOp = SI;
11650          } else {
11651            UserOp = InsElt;
11652          }
11653          if (UserOp) {
11654            unsigned FoundLane = Entry->findLaneForValue(V);
11655            ExternalUses.emplace_back(V, UserOp, FoundLane);
11656          }
11657        }
11658      }
11659      return Vec;
11660    };
11661    auto *VecTy = getWidenedType(ScalarTy, VL.size());
11662    Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11663    SmallVector<int> NonConsts;
11664    // Insert constant values first.
11665    for (int I = 0, E = VL.size(); I < E; ++I) {
11666      if (PostponedIndices.contains(I))
11667        continue;
11668      if (!isConstant(VL[I])) {
11669        NonConsts.push_back(I);
11670        continue;
11671      }
11672      if (Root) {
11673        if (!isa<UndefValue>(VL[I])) {
11674          NonConsts.push_back(I);
11675          continue;
11676        }
11677        if (isa<PoisonValue>(VL[I]))
11678          continue;
11679        if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11680          if (SV->getMaskValue(I) == PoisonMaskElem)
11681            continue;
11682        }
11683      }
11684      Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685    }
11686    // Insert non-constant values.
11687    for (int I : NonConsts)
11688      Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689    // Append instructions which are/may be part of the loop at the end, to make
11690    // it possible to hoist non-loop-based instructions.
11691    for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692      Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693  
11694    return Vec;
11695  }
11696  
11697  /// Merges shuffle masks and emits the final shuffle instruction, if required.
11698  /// It supports shuffling of 2 input vectors. It implements lazy shuffle
11699  /// emission: the actual shuffle instruction is generated only if it is really
11700  /// required. Otherwise, the emission of the shuffle instruction is delayed
11701  /// till the end of the process, to reduce the number of emitted instructions
11702  /// and to ease further analysis/transformations.
11703  /// The class will also look through the previously emitted shuffle instructions
11704  /// and properly mark indices in the mask as undef.
11705  /// For example, given the code
11706  /// \code
11707  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11708  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11709  /// \endcode
11710  /// and if a shuffle of %s1 and %s2 with mask <1, 0, 3, 2> is needed, it will
11711  /// look through %s1 and %s2 and emit
11712  /// \code
11713  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11714  /// \endcode
11715  /// instead.
11716  /// If 2 operands are of different size, the smallest one will be resized and
11717  /// the mask recalculated properly.
11718  /// For example, given the code
11719  /// \code
11720  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11721  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11722  /// \endcode
11723  /// and if a shuffle of %s1 and %s2 with mask <1, 0, 5, 4> is needed, it will
11724  /// look through %s1 and %s2 and emit
11725  /// \code
11726  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11727  /// \endcode
11728  /// instead.
11729  class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730    bool IsFinalized = false;
11731    /// Combined mask for all applied operands and masks. It is built during
11732    /// analysis and actual emission of shuffle vector instructions.
11733    SmallVector<int> CommonMask;
11734    /// List of operands for the shuffle vector instruction. It holds at most 2
11735    /// operands. If a 3rd one is going to be added, the first 2 are combined into
11736    /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11737    /// resulting shuffle and the second operand is set to be the newly added
11738    /// operand. The \p CommonMask is transformed in the proper way after that.
11739    SmallVector<Value *, 2> InVectors;
11740    Type *ScalarTy = nullptr;
11741    IRBuilderBase &Builder;
11742    BoUpSLP &R;
11743  
11744    class ShuffleIRBuilder {
11745      IRBuilderBase &Builder;
11746      /// Holds all of the instructions that we gathered.
11747      SetVector<Instruction *> &GatherShuffleExtractSeq;
11748      /// A list of blocks that we are going to CSE.
11749      DenseSet<BasicBlock *> &CSEBlocks;
11750      /// Data layout.
11751      const DataLayout &DL;
11752  
11753    public:
11754      ShuffleIRBuilder(IRBuilderBase &Builder,
11755                       SetVector<Instruction *> &GatherShuffleExtractSeq,
11756                       DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757          : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758            CSEBlocks(CSEBlocks), DL(DL) {}
11759      ~ShuffleIRBuilder() = default;
11760      /// Creates shufflevector for the 2 operands with the given mask.
11761      Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762        if (V1->getType() != V2->getType()) {
11763          assert(V1->getType()->isIntOrIntVectorTy() &&
11764                 V2->getType()->isIntOrIntVectorTy() &&
11765                 "Expected integer vector types only.");
11766          if (V1->getType() != V2->getType()) {
11767            if (cast<VectorType>(V2->getType())
11768                    ->getElementType()
11769                    ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11770                                                 ->getElementType()
11771                                                 ->getIntegerBitWidth())
11772              V2 = Builder.CreateIntCast(
11773                  V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11774            else
11775              V1 = Builder.CreateIntCast(
11776                  V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11777          }
11778        }
11779        Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780        if (auto *I = dyn_cast<Instruction>(Vec)) {
11781          GatherShuffleExtractSeq.insert(I);
11782          CSEBlocks.insert(I->getParent());
11783        }
11784        return Vec;
11785      }
11786      /// Creates a permutation of the single vector operand with the given mask,
11787      /// if it is not an identity mask.
11788      Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789        if (Mask.empty())
11790          return V1;
11791        unsigned VF = Mask.size();
11792        unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11793        if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11794          return V1;
11795        Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11796        if (auto *I = dyn_cast<Instruction>(Vec)) {
11797          GatherShuffleExtractSeq.insert(I);
11798          CSEBlocks.insert(I->getParent());
11799        }
11800        return Vec;
11801      }
11802      Value *createIdentity(Value *V) { return V; }
11803      Value *createPoison(Type *Ty, unsigned VF) {
11804        return PoisonValue::get(getWidenedType(Ty, VF));
11805      }
11806      /// Resizes 2 input vectors to match their sizes, if they are not equal
11807      /// yet. The smaller vector is resized to the size of the larger vector.
11808      void resizeToMatch(Value *&V1, Value *&V2) {
11809        if (V1->getType() == V2->getType())
11810          return;
11811        int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11812        int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11813        int VF = std::max(V1VF, V2VF);
11814        int MinVF = std::min(V1VF, V2VF);
11815        SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816        std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11817                  0);
11818        Value *&Op = MinVF == V1VF ? V1 : V2;
11819        Op = Builder.CreateShuffleVector(Op, IdentityMask);
11820        if (auto *I = dyn_cast<Instruction>(Op)) {
11821          GatherShuffleExtractSeq.insert(I);
11822          CSEBlocks.insert(I->getParent());
11823        }
11824        if (MinVF == V1VF)
11825          V1 = Op;
11826        else
11827          V2 = Op;
11828      }
11829    };
11830  
11831    /// Smart shuffle instruction emission, walks through shuffle trees and
11832    /// tries to find the best matching vector for the actual shuffle
11833    /// instruction.
11834    Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835      assert(V1 && "Expected at least one vector value.");
11836      ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837                                      R.CSEBlocks, *R.DL);
11838      return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839                                                         ShuffleBuilder);
11840    }
11841  
11842    /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11843    /// shuffle emission.
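         /// For instance (hypothetical mask): with Mask = <0, poison, 3, poison>,
         /// entries 0 and 2 of \p CommonMask become 0 and 2 (they now refer to
         /// lanes of the just-emitted shuffle), while the poisoned positions are
         /// left unchanged.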
11844    static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845                                          ArrayRef<int> Mask) {
11846      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847        if (Mask[Idx] != PoisonMaskElem)
11848          CommonMask[Idx] = Idx;
11849    }
11850  
11851    /// Cast value \p V to the vector type with the same number of elements, but
11852    /// the base type \p ScalarTy.
11853    Value *castToScalarTyElem(Value *V,
11854                              std::optional<bool> IsSigned = std::nullopt) {
11855      auto *VecTy = cast<VectorType>(V->getType());
11856      assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857      if (VecTy->getElementType() == ScalarTy->getScalarType())
11858        return V;
11859      return Builder.CreateIntCast(
11860          V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
11861          IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11862    }
11863  
11864  public:
11865    ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866        : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867  
11868    /// Adjusts extractelements after reusing them.
11869    Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870                          ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871                          unsigned NumParts, bool &UseVecBaseAsInput) {
11872      UseVecBaseAsInput = false;
11873      SmallPtrSet<Value *, 4> UniqueBases;
11874      Value *VecBase = nullptr;
11875      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876        int Idx = Mask[I];
11877        if (Idx == PoisonMaskElem)
11878          continue;
11879        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11880        VecBase = EI->getVectorOperand();
11881        if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11882          VecBase = TE->VectorizedValue;
11883        assert(VecBase && "Expected vectorized value.");
11884        UniqueBases.insert(VecBase);
11885        // If the only use is vectorized, the extractelement itself can be
11886        // deleted.
11887        if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11888            any_of(EI->users(), [&](User *U) {
11889              const TreeEntry *UTE = R.getTreeEntry(U);
11890              return !UTE || R.MultiNodeScalars.contains(U) ||
11891                     (isa<GetElementPtrInst>(U) &&
11892                      !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11893                     count_if(R.VectorizableTree,
11894                              [&](const std::unique_ptr<TreeEntry> &TE) {
11895                                return any_of(TE->UserTreeIndices,
11896                                              [&](const EdgeInfo &Edge) {
11897                                                return Edge.UserTE == UTE;
11898                                              }) &&
11899                                       is_contained(TE->Scalars, EI);
11900                              }) != 1;
11901            }))
11902          continue;
11903        R.eraseInstruction(EI);
11904      }
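           // With a single register part or a single unique source vector, the base
           // vector can be used directly (only adjusting the element type).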
11905      if (NumParts == 1 || UniqueBases.size() == 1) {
11906        assert(VecBase && "Expected vectorized value.");
11907        return castToScalarTyElem(VecBase);
11908      }
11909      UseVecBaseAsInput = true;
11910      auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911        for (auto [I, Idx] : enumerate(Mask))
11912          if (Idx != PoisonMaskElem)
11913            Idx = I;
11914      };
11915      // Perform a multi-register vector shuffle, joining the parts into a single
11916      // virtual long vector.
11917      // Each part is shuffled independently and then all these parts are inserted
11918      // into a long virtual vector register, forming the original vector.
11919      Value *Vec = nullptr;
11920      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
11922      for (unsigned Part : seq<unsigned>(NumParts)) {
11923        unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
11924        ArrayRef<Value *> VL =
11925            ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
11926        MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
11927        constexpr int MaxBases = 2;
11928        SmallVector<Value *, MaxBases> Bases(MaxBases);
11929        auto VLMask = zip(VL, SubMask);
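             // The slice vector factor is the widest (vectorized) source vector among
             // the extracts used in this slice.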
11930        const unsigned VF = std::accumulate(
11931            VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
11932              if (std::get<1>(D) == PoisonMaskElem)
11933                return S;
11934              Value *VecOp =
11935                  cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936              if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11937                VecOp = TE->VectorizedValue;
11938              assert(VecOp && "Expected vectorized value.");
11939              const unsigned Size =
11940                  cast<FixedVectorType>(VecOp->getType())->getNumElements();
11941              return std::max(S, Size);
11942            });
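             // Assign each used element's source vector to one of at most two bases,
             // selected by its mask value divided by VF.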
11943        for (const auto [V, I] : VLMask) {
11944          if (I == PoisonMaskElem)
11945            continue;
11946          Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11947          if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11948            VecOp = TE->VectorizedValue;
11949          assert(VecOp && "Expected vectorized value.");
11950          VecOp = castToScalarTyElem(VecOp);
11951          Bases[I / VF] = VecOp;
11952        }
11953        if (!Bases.front())
11954          continue;
11955        Value *SubVec;
11956        if (Bases.back()) {
11957          SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11958          TransformToIdentity(SubMask);
11959        } else {
11960          SubVec = Bases.front();
11961        }
11962        if (!Vec) {
11963          Vec = SubVec;
11964          assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965                                      [&](unsigned P) {
11966                                        ArrayRef<int> SubMask =
11967                                            Mask.slice(P * SliceSize,
11968                                                       getNumElems(Mask.size(),
11969                                                                   SliceSize, P));
11970                                        return all_of(SubMask, [](int Idx) {
11971                                          return Idx == PoisonMaskElem;
11972                                        });
11973                                      })) &&
11974                 "Expected first part or all previous parts masked.");
11975          copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11976        } else {
11977          unsigned NewVF =
11978              cast<FixedVectorType>(Vec->getType())->getNumElements();
11979          if (Vec->getType() != SubVec->getType()) {
11980            unsigned SubVecVF =
11981                cast<FixedVectorType>(SubVec->getType())->getNumElements();
11982            NewVF = std::max(NewVF, SubVecVF);
11983          }
11984          // Adjust SubMask.
11985          for (int &Idx : SubMask)
11986            if (Idx != PoisonMaskElem)
11987              Idx += NewVF;
11988          copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11989          Vec = createShuffle(Vec, SubVec, VecMask);
11990          TransformToIdentity(VecMask);
11991        }
11992      }
11993      copy(VecMask, Mask.begin());
11994      return Vec;
11995    }
11996    /// Checks if the specified entry \p E needs to be delayed because of its
11997    /// dependency nodes.
11998    std::optional<Value *>
11999    needToDelay(const TreeEntry *E,
12000                ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001      // No need to delay emission if all deps are ready.
12002      if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12003            return all_of(
12004                TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005          }))
12006        return std::nullopt;
12007      // Postpone gather emission; it will be emitted after the end of the
12008      // process to keep the correct order.
12009      auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
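           // Return a placeholder of the widened type (a load from a poison pointer)
           // that stands in for the postponed gather until it is actually emitted.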
12010      return Builder.CreateAlignedLoad(
12011          ResVecTy,
12012          PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12013          MaybeAlign());
12014    }
12015    /// Adds 2 input vectors (in the form of tree entries) and the mask for their
12016    /// shuffling.
12017    void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12018      Value *V1 = E1.VectorizedValue;
12019      if (V1->getType()->isIntOrIntVectorTy())
12020        V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12021                                  return !isKnownNonNegative(
12022                                      V, SimplifyQuery(*R.DL));
12023                                }));
12024      Value *V2 = E2.VectorizedValue;
12025      if (V2->getType()->isIntOrIntVectorTy())
12026        V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12027                                  return !isKnownNonNegative(
12028                                      V, SimplifyQuery(*R.DL));
12029                                }));
12030      add(V1, V2, Mask);
12031    }
12032    /// Adds a single input vector (in the form of a tree entry) and the mask
12033    /// for its shuffling.
12034    void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035      Value *V1 = E1.VectorizedValue;
12036      if (V1->getType()->isIntOrIntVectorTy())
12037        V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12038                                  return !isKnownNonNegative(
12039                                      V, SimplifyQuery(*R.DL));
12040                                }));
12041      add(V1, Mask);
12042    }
12043    /// Adds 2 input vectors and the mask for their shuffling.
12044    void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12045      assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046      V1 = castToScalarTyElem(V1);
12047      V2 = castToScalarTyElem(V2);
12048      if (InVectors.empty()) {
12049        InVectors.push_back(V1);
12050        InVectors.push_back(V2);
12051        CommonMask.assign(Mask.begin(), Mask.end());
12052        return;
12053      }
12054      Value *Vec = InVectors.front();
12055      if (InVectors.size() == 2) {
12056        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12057        transformMaskAfterShuffle(CommonMask, CommonMask);
12058      } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12059                 Mask.size()) {
12060        Vec = createShuffle(Vec, nullptr, CommonMask);
12061        transformMaskAfterShuffle(CommonMask, CommonMask);
12062      }
12063      V1 = createShuffle(V1, V2, Mask);
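           // The freshly built shuffle becomes the second input; its elements are
           // addressed at offset Sz in the common mask.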
12064      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12065        if (Mask[Idx] != PoisonMaskElem)
12066          CommonMask[Idx] = Idx + Sz;
12067      InVectors.front() = Vec;
12068      if (InVectors.size() == 2)
12069        InVectors.back() = V1;
12070      else
12071        InVectors.push_back(V1);
12072    }
12073    /// Adds another input vector and the mask for the shuffling.
12074    void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12075      V1 = castToScalarTyElem(V1);
12076      if (InVectors.empty()) {
12077        if (!isa<FixedVectorType>(V1->getType())) {
12078          V1 = createShuffle(V1, nullptr, CommonMask);
12079          CommonMask.assign(Mask.size(), PoisonMaskElem);
12080          transformMaskAfterShuffle(CommonMask, Mask);
12081        }
12082        InVectors.push_back(V1);
12083        CommonMask.assign(Mask.begin(), Mask.end());
12084        return;
12085      }
12086      const auto *It = find(InVectors, V1);
12087      if (It == InVectors.end()) {
12088        if (InVectors.size() == 2 ||
12089            InVectors.front()->getType() != V1->getType() ||
12090            !isa<FixedVectorType>(V1->getType())) {
12091          Value *V = InVectors.front();
12092          if (InVectors.size() == 2) {
12093            V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12094            transformMaskAfterShuffle(CommonMask, CommonMask);
12095          } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12096                     CommonMask.size()) {
12097            V = createShuffle(InVectors.front(), nullptr, CommonMask);
12098            transformMaskAfterShuffle(CommonMask, CommonMask);
12099          }
12100          for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12101            if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12102              CommonMask[Idx] =
12103                  V->getType() != V1->getType()
12104                      ? Idx + Sz
12105                      : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12106                                        ->getNumElements();
12107          if (V->getType() != V1->getType())
12108            V1 = createShuffle(V1, nullptr, Mask);
12109          InVectors.front() = V;
12110          if (InVectors.size() == 2)
12111            InVectors.back() = V1;
12112          else
12113            InVectors.push_back(V1);
12114          return;
12115        }
12116      // Check if the second vector is required: it is needed only if some used
12117      // element is not already provided by the first one.
12118        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12119          if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12120            InVectors.push_back(V1);
12121            break;
12122          }
12123      }
12124      int VF = CommonMask.size();
12125      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12126        VF = FTy->getNumElements();
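           // Fold the new mask into the common mask; indices that refer to the second
           // input vector are offset by VF.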
12127      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12128        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12129          CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12130    }
12131    /// Adds another input vector and the mask for the shuffling.
12132    void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12133      SmallVector<int> NewMask;
12134      inversePermutation(Order, NewMask);
12135      add(V1, NewMask);
12136    }
12137    Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12138                  Value *Root = nullptr) {
12139      return R.gather(VL, Root, ScalarTy);
12140    }
12141    Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12142    /// Finalize emission of the shuffles.
12143    /// \param Action the action (if any) to be performed before the final
12144    /// application of the \p ExtMask mask.
12145    Value *
12146    finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12147             function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12148      IsFinalized = true;
12149      if (Action) {
12150        Value *Vec = InVectors.front();
12151        if (InVectors.size() == 2) {
12152          Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12153          InVectors.pop_back();
12154        } else {
12155          Vec = createShuffle(Vec, nullptr, CommonMask);
12156        }
12157        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12158          if (CommonMask[Idx] != PoisonMaskElem)
12159            CommonMask[Idx] = Idx;
12160        assert(VF > 0 &&
12161               "Expected vector length for the final value before action.");
12162        unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12163        if (VecVF < VF) {
12164          SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12165          std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12166          Vec = createShuffle(Vec, nullptr, ResizeMask);
12167        }
12168        Action(Vec, CommonMask);
12169        InVectors.front() = Vec;
12170      }
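           // Compose the external mask (e.g. the caller's reuse shuffle indices) with
           // the accumulated common mask.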
12171      if (!ExtMask.empty()) {
12172        if (CommonMask.empty()) {
12173          CommonMask.assign(ExtMask.begin(), ExtMask.end());
12174        } else {
12175          SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12176          for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12177            if (ExtMask[I] == PoisonMaskElem)
12178              continue;
12179            NewMask[I] = CommonMask[ExtMask[I]];
12180          }
12181          CommonMask.swap(NewMask);
12182        }
12183      }
12184      if (CommonMask.empty()) {
12185        assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12186        return InVectors.front();
12187      }
12188      if (InVectors.size() == 2)
12189        return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12190      return createShuffle(InVectors.front(), nullptr, CommonMask);
12191    }
12192  
12193    ~ShuffleInstructionBuilder() {
12194      assert((IsFinalized || CommonMask.empty()) &&
12195             "Shuffle construction must be finalized.");
12196    }
12197  };
12198  
12199  Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12200                                   bool PostponedPHIs) {
12201    ValueList &VL = E->getOperand(NodeIdx);
12202    const unsigned VF = VL.size();
12203    InstructionsState S = getSameOpcode(VL, *TLI);
12204    // Special processing for a GEP bundle, which may include non-GEP values.
12205    if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12206      const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12207      if (It != VL.end())
12208        S = getSameOpcode(*It, *TLI);
12209    }
12210    if (S.getOpcode()) {
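           // Returns true if the tree entry VE produces exactly this operand list and
           // is used as operand NodeIdx of E, either directly or via a matching gather
           // node.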
12211      auto CheckSameVE = [&](const TreeEntry *VE) {
12212        return VE->isSame(VL) &&
12213               (any_of(VE->UserTreeIndices,
12214                       [E, NodeIdx](const EdgeInfo &EI) {
12215                         return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12216                       }) ||
12217                any_of(VectorizableTree,
12218                       [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12219                         return TE->isOperandGatherNode({E, NodeIdx}) &&
12220                                VE->isSame(TE->Scalars);
12221                       }));
12222      };
12223      TreeEntry *VE = getTreeEntry(S.OpValue);
12224      bool IsSameVE = VE && CheckSameVE(VE);
12225      if (!IsSameVE) {
12226        auto It = MultiNodeScalars.find(S.OpValue);
12227        if (It != MultiNodeScalars.end()) {
12228          auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12229            return TE != VE && CheckSameVE(TE);
12230          });
12231          if (I != It->getSecond().end()) {
12232            VE = *I;
12233            IsSameVE = true;
12234          }
12235        }
12236      }
12237      if (IsSameVE) {
12238        auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12239          ShuffleInstructionBuilder ShuffleBuilder(
12240              cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12241          ShuffleBuilder.add(V, Mask);
12242          return ShuffleBuilder.finalize(std::nullopt);
12243        };
12244        Value *V = vectorizeTree(VE, PostponedPHIs);
12245        if (VF * getNumElements(VL[0]->getType()) !=
12246            cast<FixedVectorType>(V->getType())->getNumElements()) {
12247          if (!VE->ReuseShuffleIndices.empty()) {
12248            // Reshuffle to get only unique values.
12249            // If some of the scalars are duplicated in the vectorization
12250            // tree entry, we do not vectorize them but instead generate a
12251            // mask for the reuses. But if there are several users of the
12252            // same entry, they may have different vectorization factors.
12253            // This is especially important for PHI nodes. In this case, we
12254            // need to adapt the resulting instruction for the user
12255            // vectorization factor and have to reshuffle it again to take
12256            // only unique elements of the vector. Without this code the
12257            // function would incorrectly return a reduced vector instruction
12258            // with repeated elements instead of the unique ones.
12259  
12260            // block:
12261            // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12262            // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12263            // ... (use %2)
12264            // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12265            // br %block
12266            SmallVector<int> Mask(VF, PoisonMaskElem);
12267            for (auto [I, V] : enumerate(VL)) {
12268              if (isa<PoisonValue>(V))
12269                continue;
12270              Mask[I] = VE->findLaneForValue(V);
12271            }
12272            V = FinalShuffle(V, Mask);
12273          } else {
12274            assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12275                   "Expected vectorization factor less "
12276                   "than original vector size.");
12277            SmallVector<int> UniformMask(VF, 0);
12278            std::iota(UniformMask.begin(), UniformMask.end(), 0);
12279            V = FinalShuffle(V, UniformMask);
12280          }
12281        }
12282      // Need to update the operand gather node if the operand is actually not a
12283      // vectorized node but a buildvector/gather node that matches one of the
12284      // vectorized nodes.
12285        if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12286              return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12287            }) == VE->UserTreeIndices.end()) {
12288          auto *It = find_if(
12289              VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12290                return TE->isGather() &&
12291                       TE->UserTreeIndices.front().UserTE == E &&
12292                       TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12293              });
12294          assert(It != VectorizableTree.end() && "Expected gather node operand.");
12295          (*It)->VectorizedValue = V;
12296        }
12297        return V;
12298      }
12299    }
12300  
12301  // Find the corresponding gather entry and vectorize it.
12302  // This allows being more accurate with tree/graph transformations and checks
12303  // the correctness of the transformations in many cases.
12304    auto *I = find_if(VectorizableTree,
12305                      [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12306                        return TE->isOperandGatherNode({E, NodeIdx});
12307                      });
12308    assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12309    assert(I->get()->UserTreeIndices.size() == 1 &&
12310           "Expected only single user for the gather node.");
12311    assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12312    return vectorizeTree(I->get(), PostponedPHIs);
12313  }
12314  
12315  template <typename BVTy, typename ResTy, typename... Args>
12316  ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12317                                    Args &...Params) {
12318    assert(E->isGather() && "Expected gather node.");
12319    unsigned VF = E->getVectorFactor();
12320  
12321    bool NeedFreeze = false;
12322    SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12323                                         E->ReuseShuffleIndices.end());
12324    SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12325    // Build a mask out of the reorder indices and reorder scalars per this
12326    // mask.
12327    SmallVector<int> ReorderMask;
12328    inversePermutation(E->ReorderIndices, ReorderMask);
12329    if (!ReorderMask.empty())
12330      reorderScalars(GatheredScalars, ReorderMask);
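         // For a gathered splat that contains undefs and whose user has another
         // vectorized operand, rewrites the given mask slice to an identity or a
         // single repeated lane so the existing vector can be reused for the splat;
         // returns false if the pattern does not apply.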
12331    auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12332                               unsigned I, unsigned SliceSize) {
12333      if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12334            return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12335          }))
12336        return false;
12337      TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12338      unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12339      if (UserTE->getNumOperands() != 2)
12340        return false;
12341      auto *It =
12342          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12343            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12344                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12345                   }) != TE->UserTreeIndices.end();
12346          });
12347      if (It == VectorizableTree.end())
12348        return false;
12349      int Idx;
12350      if ((Mask.size() < InputVF &&
12351           ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12352           Idx == 0) ||
12353          (Mask.size() == InputVF &&
12354           ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12355        std::iota(
12356            std::next(Mask.begin(), I * SliceSize),
12357            std::next(Mask.begin(),
12358                      I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12359            0);
12360      } else {
12361        unsigned IVal =
12362            *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12363        std::fill(
12364            std::next(Mask.begin(), I * SliceSize),
12365            std::next(Mask.begin(),
12366                      I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12367            IVal);
12368      }
12369      return true;
12370    };
12371    BVTy ShuffleBuilder(ScalarTy, Params...);
12372    ResTy Res = ResTy();
12373    SmallVector<int> Mask;
12374    SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12375    SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12376    Value *ExtractVecBase = nullptr;
12377    bool UseVecBaseAsInput = false;
12378    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
12379    SmallVector<SmallVector<const TreeEntry *>> Entries;
12380    Type *OrigScalarTy = GatheredScalars.front()->getType();
12381    auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12382    unsigned NumParts = TTI->getNumberOfParts(VecTy);
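         // Fall back to a single part if the target reports no parts or more parts
         // than there are scalars.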
12383    if (NumParts == 0 || NumParts >= GatheredScalars.size())
12384      NumParts = 1;
12385    if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12386      // Check for gathered extracts.
12387      bool Resized = false;
12388      ExtractShuffles =
12389          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12390      if (!ExtractShuffles.empty()) {
12391        SmallVector<const TreeEntry *> ExtractEntries;
12392        for (auto [Idx, I] : enumerate(ExtractMask)) {
12393          if (I == PoisonMaskElem)
12394            continue;
12395          if (const auto *TE = getTreeEntry(
12396                  cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12397            ExtractEntries.push_back(TE);
12398        }
12399        if (std::optional<ResTy> Delayed =
12400                ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12401          // Delay emission of gathers which are not ready yet.
12402          PostponedGathers.insert(E);
12403        // Postpone gather emission; it will be emitted after the end of the
12404        // process to keep the correct order.
12405          return *Delayed;
12406        }
12407        if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12408                E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12409          ExtractVecBase = VecBase;
12410          if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12411            if (VF == VecBaseTy->getNumElements() &&
12412                GatheredScalars.size() != VF) {
12413              Resized = true;
12414              GatheredScalars.append(VF - GatheredScalars.size(),
12415                                     PoisonValue::get(OrigScalarTy));
12416            }
12417        }
12418      }
12419      // Gather extracts only after we have checked for fully matched gathers.
12420      if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12421          E->isAltShuffle() ||
12422          all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12423          isSplat(E->Scalars) ||
12424          (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12425        GatherShuffles =
12426            isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12427      }
12428      if (!GatherShuffles.empty()) {
12429        if (std::optional<ResTy> Delayed =
12430                ShuffleBuilder.needToDelay(E, Entries)) {
12431          // Delay emission of gathers which are not ready yet.
12432          PostponedGathers.insert(E);
12433        // Postpone gather emission; it will be emitted after the end of the
12434        // process to keep the correct order.
12435          return *Delayed;
12436        }
12437        if (GatherShuffles.size() == 1 &&
12438            *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12439            Entries.front().front()->isSame(E->Scalars)) {
12440          // Perfect match in the graph, will reuse the previously vectorized
12441          // node. Cost is 0.
12442          LLVM_DEBUG(
12443              dbgs()
12444              << "SLP: perfect diamond match for gather bundle "
12445              << shortBundleName(E->Scalars) << ".\n");
12446          // Restore the mask for previous partially matched values.
12447          Mask.resize(E->Scalars.size());
12448          const TreeEntry *FrontTE = Entries.front().front();
12449          if (FrontTE->ReorderIndices.empty() &&
12450              ((FrontTE->ReuseShuffleIndices.empty() &&
12451                E->Scalars.size() == FrontTE->Scalars.size()) ||
12452               (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12453            std::iota(Mask.begin(), Mask.end(), 0);
12454          } else {
12455            for (auto [I, V] : enumerate(E->Scalars)) {
12456              if (isa<PoisonValue>(V)) {
12457                Mask[I] = PoisonMaskElem;
12458                continue;
12459              }
12460              Mask[I] = FrontTE->findLaneForValue(V);
12461            }
12462          }
12463          ShuffleBuilder.add(*FrontTE, Mask);
12464          Res = ShuffleBuilder.finalize(E->getCommonMask());
12465          return Res;
12466        }
12467        if (!Resized) {
12468          if (GatheredScalars.size() != VF &&
12469              any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12470                return any_of(TEs, [&](const TreeEntry *TE) {
12471                  return TE->getVectorFactor() == VF;
12472                });
12473              }))
12474            GatheredScalars.append(VF - GatheredScalars.size(),
12475                                   PoisonValue::get(OrigScalarTy));
12476        }
12477      // Remove shuffled elements from the list of gathers.
12478        for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12479          if (Mask[I] != PoisonMaskElem)
12480            GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12481        }
12482      }
12483    }
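         // Packs the gathered scalars into a compact buildvector: constants stay in
         // place, repeated non-constant values are deduplicated through the reuse
         // mask, splats become a broadcast, and undefs are either mapped onto a
         // non-poisonous lane or turned into poison with a freeze requested later.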
12484    auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12485                              SmallVectorImpl<int> &ReuseMask,
12486                              bool IsRootPoison) {
12487      // For splats we can emit broadcasts instead of gathers, so try to find
12488      // such sequences.
12489      bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12490                     (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12491      Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12492      SmallVector<int> UndefPos;
12493      DenseMap<Value *, unsigned> UniquePositions;
12494      // Gather unique non-const values and all constant values.
12495      // For repeated values, just shuffle them.
12496      int NumNonConsts = 0;
12497      int SinglePos = 0;
12498      for (auto [I, V] : enumerate(Scalars)) {
12499        if (isa<UndefValue>(V)) {
12500          if (!isa<PoisonValue>(V)) {
12501            ReuseMask[I] = I;
12502            UndefPos.push_back(I);
12503          }
12504          continue;
12505        }
12506        if (isConstant(V)) {
12507          ReuseMask[I] = I;
12508          continue;
12509        }
12510        ++NumNonConsts;
12511        SinglePos = I;
12512        Value *OrigV = V;
12513        Scalars[I] = PoisonValue::get(OrigScalarTy);
12514        if (IsSplat) {
12515          Scalars.front() = OrigV;
12516          ReuseMask[I] = 0;
12517        } else {
12518          const auto Res = UniquePositions.try_emplace(OrigV, I);
12519          Scalars[Res.first->second] = OrigV;
12520          ReuseMask[I] = Res.first->second;
12521        }
12522      }
12523      if (NumNonConsts == 1) {
12524        // Restore single insert element.
12525        if (IsSplat) {
12526          ReuseMask.assign(VF, PoisonMaskElem);
12527          std::swap(Scalars.front(), Scalars[SinglePos]);
12528          if (!UndefPos.empty() && UndefPos.front() == 0)
12529            Scalars.front() = UndefValue::get(OrigScalarTy);
12530        }
12531        ReuseMask[SinglePos] = SinglePos;
12532      } else if (!UndefPos.empty() && IsSplat) {
12533        // For undef values, try to replace them with the simple broadcast.
12534        // We can do it if the broadcasted value is guaranteed to be
12535        // non-poisonous, or by freezing the incoming scalar value first.
12536        auto *It = find_if(Scalars, [this, E](Value *V) {
12537          return !isa<UndefValue>(V) &&
12538                 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12539                  (E->UserTreeIndices.size() == 1 &&
12540                   any_of(V->uses(), [E](const Use &U) {
12541                   // Check if the value is already used in the same operation in
12542                   // one of the nodes.
12543                     return E->UserTreeIndices.front().EdgeIdx !=
12544                                U.getOperandNo() &&
12545                            is_contained(
12546                                E->UserTreeIndices.front().UserTE->Scalars,
12547                                U.getUser());
12548                   })));
12549        });
12550        if (It != Scalars.end()) {
12551          // Replace undefs by the non-poisoned scalars and emit broadcast.
12552          int Pos = std::distance(Scalars.begin(), It);
12553          for (int I : UndefPos) {
12554            // Set the undef position to the non-poisoned scalar.
12555            ReuseMask[I] = Pos;
12556            // Replace the undef by poison; in the mask it has already been
12557            // replaced by the non-poisoned scalar.
12558            if (I != Pos)
12559              Scalars[I] = PoisonValue::get(OrigScalarTy);
12560          }
12561        } else {
12562          // Replace undefs by poisons, emit the broadcast and then emit a
12563          // freeze.
12564          for (int I : UndefPos) {
12565            ReuseMask[I] = PoisonMaskElem;
12566            if (isa<UndefValue>(Scalars[I]))
12567              Scalars[I] = PoisonValue::get(OrigScalarTy);
12568          }
12569          NeedFreeze = true;
12570        }
12571      }
12572    };
12573    if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12574      bool IsNonPoisoned = true;
12575      bool IsUsedInExpr = true;
12576      Value *Vec1 = nullptr;
12577      if (!ExtractShuffles.empty()) {
12578        // A gather of extractelements can be represented as just a shuffle of
12579        // the one or two vectors the scalars are extracted from.
12580        // Find the input vectors.
12581        Value *Vec2 = nullptr;
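             // Elements already provided by the gather shuffles are dropped from the
             // extract mask.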
12582        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12583          if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12584            ExtractMask[I] = PoisonMaskElem;
12585        }
12586        if (UseVecBaseAsInput) {
12587          Vec1 = ExtractVecBase;
12588        } else {
12589          for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12590            if (ExtractMask[I] == PoisonMaskElem)
12591              continue;
12592            if (isa<UndefValue>(E->Scalars[I]))
12593              continue;
12594            auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12595            Value *VecOp = EI->getVectorOperand();
12596            if (const auto *TE = getTreeEntry(VecOp))
12597              if (TE->VectorizedValue)
12598                VecOp = TE->VectorizedValue;
12599            if (!Vec1) {
12600              Vec1 = VecOp;
12601            } else if (Vec1 != VecOp) {
12602              assert((!Vec2 || Vec2 == VecOp) &&
12603                     "Expected only 1 or 2 vectors shuffle.");
12604              Vec2 = VecOp;
12605            }
12606          }
12607        }
12608        if (Vec2) {
12609          IsUsedInExpr = false;
12610          IsNonPoisoned &=
12611              isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12612          ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12613        } else if (Vec1) {
12614          IsUsedInExpr &= FindReusedSplat(
12615              ExtractMask,
12616              cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12617              ExtractMask.size());
12618          ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12619          IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12620        } else {
12621          IsUsedInExpr = false;
12622          ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12623                             /*ForExtracts=*/true);
12624        }
12625      }
12626      if (!GatherShuffles.empty()) {
12627        unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12628        SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12629        for (const auto [I, TEs] : enumerate(Entries)) {
12630          if (TEs.empty()) {
12631            assert(!GatherShuffles[I] &&
12632                   "No shuffles with empty entries list expected.");
12633            continue;
12634          }
12635          assert((TEs.size() == 1 || TEs.size() == 2) &&
12636                 "Expected shuffle of 1 or 2 entries.");
12637          unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12638          auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12639          VecMask.assign(VecMask.size(), PoisonMaskElem);
12640          copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12641          if (TEs.size() == 1) {
12642            IsUsedInExpr &= FindReusedSplat(
12643                VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12644            ShuffleBuilder.add(*TEs.front(), VecMask);
12645            if (TEs.front()->VectorizedValue)
12646              IsNonPoisoned &=
12647                  isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12648          } else {
12649            IsUsedInExpr = false;
12650            ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12651            if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12652              IsNonPoisoned &=
12653                  isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12654                  isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12655          }
12656        }
12657      }
12658      // Try to figure out the best way to combine values: build a shuffle and
12659      // insert elements, or just build several shuffles.
12660      // Insert non-constant scalars.
12661      SmallVector<Value *> NonConstants(GatheredScalars);
12662      int EMSz = ExtractMask.size();
12663      int MSz = Mask.size();
12664      // Try to build a constant vector and shuffle with it only if we currently
12665      // have a single permutation and more than one scalar constant.
12666      bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12667      bool IsIdentityShuffle =
12668          ((UseVecBaseAsInput ||
12669            all_of(ExtractShuffles,
12670                   [](const std::optional<TTI::ShuffleKind> &SK) {
12671                     return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12672                            TTI::SK_PermuteSingleSrc;
12673                   })) &&
12674           none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12675           ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12676          (!GatherShuffles.empty() &&
12677           all_of(GatherShuffles,
12678                  [](const std::optional<TTI::ShuffleKind> &SK) {
12679                    return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12680                           TTI::SK_PermuteSingleSrc;
12681                  }) &&
12682           none_of(Mask, [&](int I) { return I >= MSz; }) &&
12683           ShuffleVectorInst::isIdentityMask(Mask, MSz));
12684      bool EnoughConstsForShuffle =
12685          IsSingleShuffle &&
12686          (none_of(GatheredScalars,
12687                   [](Value *V) {
12688                     return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12689                   }) ||
12690           any_of(GatheredScalars,
12691                  [](Value *V) {
12692                    return isa<Constant>(V) && !isa<UndefValue>(V);
12693                  })) &&
12694          (!IsIdentityShuffle ||
12695           (GatheredScalars.size() == 2 &&
12696            any_of(GatheredScalars,
12697                   [](Value *V) { return !isa<UndefValue>(V); })) ||
12698           count_if(GatheredScalars, [](Value *V) {
12699             return isa<Constant>(V) && !isa<PoisonValue>(V);
12700           }) > 1);
12701      // NonConstants contains just the non-constant values; GatheredScalars
12702      // contains only the constants used to build the final vector, which is
12703      // then shuffled.
12703      for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12704        if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12705          NonConstants[I] = PoisonValue::get(OrigScalarTy);
12706        else
12707          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12708      }
12709      // Generate constants for final shuffle and build a mask for them.
12710      if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12711        SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12712        TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12713        Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12714        ShuffleBuilder.add(BV, BVMask);
12715      }
12716      if (all_of(NonConstants, [=](Value *V) {
12717            return isa<PoisonValue>(V) ||
12718                   (IsSingleShuffle && ((IsIdentityShuffle &&
12719                    IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12720          }))
12721        Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12722      else
12723        Res = ShuffleBuilder.finalize(
12724            E->ReuseShuffleIndices, E->Scalars.size(),
12725            [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12726              TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12727              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12728            });
12729    } else if (!allConstant(GatheredScalars)) {
12730      // Gather unique scalars and all constants.
12731      SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12732      TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12733      Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12734      ShuffleBuilder.add(BV, ReuseMask);
12735      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12736    } else {
12737      // Gather all constants.
12738      SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12739      for (auto [I, V] : enumerate(E->Scalars)) {
12740        if (!isa<PoisonValue>(V))
12741          Mask[I] = I;
12742      }
12743      Value *BV = ShuffleBuilder.gather(E->Scalars);
12744      ShuffleBuilder.add(BV, Mask);
12745      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12746    }
12747  
12748    if (NeedFreeze)
12749      Res = ShuffleBuilder.createFreeze(Res);
12750    return Res;
12751  }
12752  
12753  Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12754    return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12755                                                                  Builder, *this);
12756  }
12757  
12758  Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12759    IRBuilderBase::InsertPointGuard Guard(Builder);
12760  
12761    if (E->VectorizedValue &&
12762        (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12763         E->isAltShuffle())) {
12764      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12765      return E->VectorizedValue;
12766    }
12767  
12768    Value *V = E->Scalars.front();
12769    Type *ScalarTy = V->getType();
12770    if (auto *Store = dyn_cast<StoreInst>(V))
12771      ScalarTy = Store->getValueOperand()->getType();
12772    else if (auto *IE = dyn_cast<InsertElementInst>(V))
12773      ScalarTy = IE->getOperand(1)->getType();
12774    auto It = MinBWs.find(E);
12775    if (It != MinBWs.end())
12776      ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12777    auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
12778    if (E->isGather()) {
12779      // Set insert point for non-reduction initial nodes.
12780      if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12781        setInsertPointAfterBundle(E);
12782      Value *Vec = createBuildVector(E, ScalarTy);
12783      E->VectorizedValue = Vec;
12784      return Vec;
12785    }
12786  
12787    bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12788    auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12789      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12790      if (E->getOpcode() == Instruction::Store &&
12791          E->State == TreeEntry::Vectorize) {
12792        ArrayRef<int> Mask =
12793            ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12794                     E->ReorderIndices.size());
12795        ShuffleBuilder.add(V, Mask);
12796      } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12797        ShuffleBuilder.addOrdered(V, std::nullopt);
12798      } else {
12799        ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12800      }
12801      return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12802    };
12803  
12804    assert((E->State == TreeEntry::Vectorize ||
12805            E->State == TreeEntry::ScatterVectorize ||
12806            E->State == TreeEntry::StridedVectorize) &&
12807           "Unhandled state");
12808    unsigned ShuffleOrOp =
12809        E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12810    Instruction *VL0 = E->getMainOp();
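         // An operand is treated as signed if MinBWs says so or if any of its scalars
         // is not known to be non-negative.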
12811    auto GetOperandSignedness = [&](unsigned Idx) {
12812      const TreeEntry *OpE = getOperandEntry(E, Idx);
12813      bool IsSigned = false;
12814      auto It = MinBWs.find(OpE);
12815      if (It != MinBWs.end())
12816        IsSigned = It->second.second;
12817      else
12818        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12819          return !isKnownNonNegative(R, SimplifyQuery(*DL));
12820        });
12821      return IsSigned;
12822    };
12823    switch (ShuffleOrOp) {
12824      case Instruction::PHI: {
12825        assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12826                E != VectorizableTree.front().get() ||
12827                !E->UserTreeIndices.empty()) &&
12828               "PHI reordering is free.");
12829        if (PostponedPHIs && E->VectorizedValue)
12830          return E->VectorizedValue;
12831        auto *PH = cast<PHINode>(VL0);
12832        Builder.SetInsertPoint(PH->getParent(),
12833                               PH->getParent()->getFirstNonPHIIt());
12834        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12835        if (PostponedPHIs || !E->VectorizedValue) {
12836          PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12837          E->PHI = NewPhi;
12838          Value *V = NewPhi;
12839  
12840          // Adjust the insertion point once all PHIs have been generated.
12841          Builder.SetInsertPoint(PH->getParent(),
12842                                 PH->getParent()->getFirstInsertionPt());
12843          Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12844  
12845          V = FinalShuffle(V, E, VecTy);
12846  
12847          E->VectorizedValue = V;
12848          if (PostponedPHIs)
12849            return V;
12850        }
12851        PHINode *NewPhi = cast<PHINode>(E->PHI);
12852        // If the phi node is fully emitted, exit.
12853        if (NewPhi->getNumIncomingValues() != 0)
12854          return NewPhi;
12855  
12856        // PHINodes may have multiple entries from the same block. We want to
12857        // visit every block once.
12858        SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12859  
12860        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12861          ValueList Operands;
12862          BasicBlock *IBB = PH->getIncomingBlock(I);
12863  
12864          // Stop emission if all incoming values are generated.
12865          if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12866            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867            return NewPhi;
12868          }
12869  
12870          if (!VisitedBBs.insert(IBB).second) {
12871            NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12872            continue;
12873          }
12874  
12875          Builder.SetInsertPoint(IBB->getTerminator());
12876          Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12877          Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12878          if (VecTy != Vec->getType()) {
12879            assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12880                    MinBWs.contains(getOperandEntry(E, I))) &&
12881                   "Expected item in MinBWs.");
12882            Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12883          }
12884          NewPhi->addIncoming(Vec, IBB);
12885        }
12886  
12887        assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12888               "Invalid number of incoming values");
12889        return NewPhi;
12890      }
12891  
12892      case Instruction::ExtractElement: {
12893        Value *V = E->getSingleOperand(0);
12894        if (const TreeEntry *TE = getTreeEntry(V))
12895          V = TE->VectorizedValue;
12896        setInsertPointAfterBundle(E);
12897        V = FinalShuffle(V, E, VecTy);
12898        E->VectorizedValue = V;
12899        return V;
12900      }
12901      case Instruction::ExtractValue: {
12902        auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12903        Builder.SetInsertPoint(LI);
12904        Value *Ptr = LI->getPointerOperand();
12905        LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12906        Value *NewV = propagateMetadata(V, E->Scalars);
12907        NewV = FinalShuffle(NewV, E, VecTy);
12908        E->VectorizedValue = NewV;
12909        return NewV;
12910      }
12911      case Instruction::InsertElement: {
12912        assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12913        Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12914        Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12915        ArrayRef<Value *> Op = E->getOperand(1);
12916        Type *ScalarTy = Op.front()->getType();
12917        if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12918          assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12919          std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12920          assert(Res.first > 0 && "Expected item in MinBWs.");
12921          V = Builder.CreateIntCast(
12922              V,
12923              getWidenedType(
12924                  ScalarTy,
12925                  cast<FixedVectorType>(V->getType())->getNumElements()),
12926              Res.second);
12927        }
12928  
12929        // Create InsertVector shuffle if necessary
12930        auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12931          return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12932        }));
12933        const unsigned NumElts =
12934            cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12935        const unsigned NumScalars = E->Scalars.size();
12936  
12937        unsigned Offset = *getElementIndex(VL0);
12938        assert(Offset < NumElts && "Failed to find vector index offset");
12939  
12940        // Create shuffle to resize vector
12941        SmallVector<int> Mask;
12942        if (!E->ReorderIndices.empty()) {
12943          inversePermutation(E->ReorderIndices, Mask);
12944          Mask.append(NumElts - NumScalars, PoisonMaskElem);
12945        } else {
12946          Mask.assign(NumElts, PoisonMaskElem);
12947          std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12948        }
12949        // Create InsertVector shuffle if necessary
12950        bool IsIdentity = true;
12951        SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12952        Mask.swap(PrevMask);
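             // Compute each scalar's insert position relative to Offset and check
             // whether the scalars already form an identity layout.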
12953        for (unsigned I = 0; I < NumScalars; ++I) {
12954          Value *Scalar = E->Scalars[PrevMask[I]];
12955          unsigned InsertIdx = *getElementIndex(Scalar);
12956          IsIdentity &= InsertIdx - Offset == I;
12957          Mask[InsertIdx - Offset] = I;
12958        }
12959        if (!IsIdentity || NumElts != NumScalars) {
12960          Value *V2 = nullptr;
12961          bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12962          SmallVector<int> InsertMask(Mask);
12963          if (NumElts != NumScalars && Offset == 0) {
12964            // Follow all insert element instructions from the current buildvector
12965            // sequence.
12966            InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12967            do {
12968              std::optional<unsigned> InsertIdx = getElementIndex(Ins);
12969              if (!InsertIdx)
12970                break;
12971              if (InsertMask[*InsertIdx] == PoisonMaskElem)
12972                InsertMask[*InsertIdx] = *InsertIdx;
12973              if (!Ins->hasOneUse())
12974                break;
12975              Ins = dyn_cast_or_null<InsertElementInst>(
12976                  Ins->getUniqueUndroppableUser());
12977            } while (Ins);
12978            SmallBitVector UseMask =
12979                buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12980            SmallBitVector IsFirstPoison =
12981                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12982            SmallBitVector IsFirstUndef =
12983                isUndefVector(FirstInsert->getOperand(0), UseMask);
12984            if (!IsFirstPoison.all()) {
12985              unsigned Idx = 0;
12986              for (unsigned I = 0; I < NumElts; I++) {
12987                if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12988                    IsFirstUndef.test(I)) {
12989                  if (IsVNonPoisonous) {
12990                    InsertMask[I] = I < NumScalars ? I : 0;
12991                    continue;
12992                  }
12993                  if (!V2)
12994                    V2 = UndefValue::get(V->getType());
12995                  if (Idx >= NumScalars)
12996                    Idx = NumScalars - 1;
12997                  InsertMask[I] = NumScalars + Idx;
12998                  ++Idx;
12999                } else if (InsertMask[I] != PoisonMaskElem &&
13000                           Mask[I] == PoisonMaskElem) {
13001                  InsertMask[I] = PoisonMaskElem;
13002                }
13003              }
13004            } else {
13005              InsertMask = Mask;
13006            }
13007          }
13008          if (!V2)
13009            V2 = PoisonValue::get(V->getType());
13010          V = Builder.CreateShuffleVector(V, V2, InsertMask);
13011          if (auto *I = dyn_cast<Instruction>(V)) {
13012            GatherShuffleExtractSeq.insert(I);
13013            CSEBlocks.insert(I->getParent());
13014          }
13015        }
13016  
13017        SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13018        for (unsigned I = 0; I < NumElts; I++) {
13019          if (Mask[I] != PoisonMaskElem)
13020            InsertMask[Offset + I] = I;
13021        }
13022        SmallBitVector UseMask =
13023            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13024        SmallBitVector IsFirstUndef =
13025            isUndefVector(FirstInsert->getOperand(0), UseMask);
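      // If the scalars do not fill the whole destination vector and this is not a
      // plain identity insert at offset 0 into an undef base, blend V with the
      // original buildvector base (FirstInsert's first operand): a fully undef
      // base needs only a single-source shuffle of V, otherwise a two-source
      // shuffle takes the untouched lanes from the base and the inserted lanes
      // from V.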
13026        if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13027            NumElts != NumScalars) {
13028          if (IsFirstUndef.all()) {
13029            if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13030              SmallBitVector IsFirstPoison =
13031                  isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13032              if (!IsFirstPoison.all()) {
13033                for (unsigned I = 0; I < NumElts; I++) {
13034                  if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13035                    InsertMask[I] = I + NumElts;
13036                }
13037              }
13038              V = Builder.CreateShuffleVector(
13039                  V,
13040                  IsFirstPoison.all() ? PoisonValue::get(V->getType())
13041                                      : FirstInsert->getOperand(0),
13042                  InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13043              if (auto *I = dyn_cast<Instruction>(V)) {
13044                GatherShuffleExtractSeq.insert(I);
13045                CSEBlocks.insert(I->getParent());
13046              }
13047            }
13048          } else {
13049            SmallBitVector IsFirstPoison =
13050                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13051            for (unsigned I = 0; I < NumElts; I++) {
13052              if (InsertMask[I] == PoisonMaskElem)
13053                InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13054              else
13055                InsertMask[I] += NumElts;
13056            }
13057            V = Builder.CreateShuffleVector(
13058                FirstInsert->getOperand(0), V, InsertMask,
13059                cast<Instruction>(E->Scalars.back())->getName());
13060            if (auto *I = dyn_cast<Instruction>(V)) {
13061              GatherShuffleExtractSeq.insert(I);
13062              CSEBlocks.insert(I->getParent());
13063            }
13064          }
13065        }
13066  
13067        ++NumVectorInstructions;
13068        E->VectorizedValue = V;
13069        return V;
13070      }
13071      case Instruction::ZExt:
13072      case Instruction::SExt:
13073      case Instruction::FPToUI:
13074      case Instruction::FPToSI:
13075      case Instruction::FPExt:
13076      case Instruction::PtrToInt:
13077      case Instruction::IntToPtr:
13078      case Instruction::SIToFP:
13079      case Instruction::UIToFP:
13080      case Instruction::Trunc:
13081      case Instruction::FPTrunc:
13082      case Instruction::BitCast: {
13083        setInsertPointAfterBundle(E);
13084  
13085        Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13086        if (E->VectorizedValue) {
13087          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13088          return E->VectorizedValue;
13089        }
13090  
13091        auto *CI = cast<CastInst>(VL0);
13092        Instruction::CastOps VecOpcode = CI->getOpcode();
13093        Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13094        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
13095        if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13096            (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13097             SrcScalarTy != CI->getOperand(0)->getType())) {
13098          // Check if the values are candidates to demote.
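        // With the (possibly) demoted bit widths the cast opcode may change: equal
        // widths degenerate to a bitcast, a narrower destination becomes a trunc,
        // and a wider destination becomes a sext/zext according to the signedness
        // recorded in MinBWs.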
13099          unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13100          if (SrcIt != MinBWs.end())
13101            SrcBWSz = SrcIt->second.first;
13102          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13103          if (BWSz == SrcBWSz) {
13104            VecOpcode = Instruction::BitCast;
13105          } else if (BWSz < SrcBWSz) {
13106            VecOpcode = Instruction::Trunc;
13107          } else if (It != MinBWs.end()) {
13108            assert(BWSz > SrcBWSz && "Invalid cast!");
13109            VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13110          } else if (SrcIt != MinBWs.end()) {
13111            assert(BWSz > SrcBWSz && "Invalid cast!");
13112            VecOpcode =
13113                SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13114          }
13115        } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13116                   !SrcIt->second.second) {
13117          VecOpcode = Instruction::UIToFP;
13118        }
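      // If the opcode degenerated to a bitcast of the demoted vector (and the
      // original scalar opcode was not already a bitcast), the input vector is
      // reused directly instead of emitting a cast.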
13119        Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13120                       ? InVec
13121                       : Builder.CreateCast(VecOpcode, InVec, VecTy);
13122        V = FinalShuffle(V, E, VecTy);
13123  
13124        E->VectorizedValue = V;
13125        ++NumVectorInstructions;
13126        return V;
13127      }
13128      case Instruction::FCmp:
13129      case Instruction::ICmp: {
13130        setInsertPointAfterBundle(E);
13131  
13132        Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13133        if (E->VectorizedValue) {
13134          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13135          return E->VectorizedValue;
13136        }
13137        Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13138        if (E->VectorizedValue) {
13139          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13140          return E->VectorizedValue;
13141        }
13142        if (L->getType() != R->getType()) {
13143          assert((getOperandEntry(E, 0)->isGather() ||
13144                  getOperandEntry(E, 1)->isGather() ||
13145                  MinBWs.contains(getOperandEntry(E, 0)) ||
13146                  MinBWs.contains(getOperandEntry(E, 1))) &&
13147                 "Expected item in MinBWs.");
13148          if (cast<VectorType>(L->getType())
13149                  ->getElementType()
13150                  ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13151                                               ->getElementType()
13152                                               ->getIntegerBitWidth()) {
13153            Type *CastTy = R->getType();
13154            L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13155          } else {
13156            Type *CastTy = L->getType();
13157            R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13158          }
13159        }
13160  
13161        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13162        Value *V = Builder.CreateCmp(P0, L, R);
13163        propagateIRFlags(V, E->Scalars, VL0);
13164        // Do not cast for cmps.
13165        VecTy = cast<FixedVectorType>(V->getType());
13166        V = FinalShuffle(V, E, VecTy);
13167  
13168        E->VectorizedValue = V;
13169        ++NumVectorInstructions;
13170        return V;
13171      }
13172      case Instruction::Select: {
13173        setInsertPointAfterBundle(E);
13174  
13175        Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13176        if (E->VectorizedValue) {
13177          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13178          return E->VectorizedValue;
13179        }
13180        Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13181        if (E->VectorizedValue) {
13182          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13183          return E->VectorizedValue;
13184        }
13185        Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13186        if (E->VectorizedValue) {
13187          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188          return E->VectorizedValue;
13189        }
13190        if (True->getType() != VecTy || False->getType() != VecTy) {
13191          assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13192                  getOperandEntry(E, 2)->isGather() ||
13193                  MinBWs.contains(getOperandEntry(E, 1)) ||
13194                  MinBWs.contains(getOperandEntry(E, 2))) &&
13195                 "Expected item in MinBWs.");
13196          if (True->getType() != VecTy)
13197            True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13198          if (False->getType() != VecTy)
13199            False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13200        }
13201  
13202        Value *V = Builder.CreateSelect(Cond, True, False);
13203        V = FinalShuffle(V, E, VecTy);
13204  
13205        E->VectorizedValue = V;
13206        ++NumVectorInstructions;
13207        return V;
13208      }
13209      case Instruction::FNeg: {
13210        setInsertPointAfterBundle(E);
13211  
13212        Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13213  
13214        if (E->VectorizedValue) {
13215          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13216          return E->VectorizedValue;
13217        }
13218  
13219        Value *V = Builder.CreateUnOp(
13220            static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13221        propagateIRFlags(V, E->Scalars, VL0);
13222        if (auto *I = dyn_cast<Instruction>(V))
13223          V = propagateMetadata(I, E->Scalars);
13224  
13225        V = FinalShuffle(V, E, VecTy);
13226  
13227        E->VectorizedValue = V;
13228        ++NumVectorInstructions;
13229  
13230        return V;
13231      }
13232      case Instruction::Add:
13233      case Instruction::FAdd:
13234      case Instruction::Sub:
13235      case Instruction::FSub:
13236      case Instruction::Mul:
13237      case Instruction::FMul:
13238      case Instruction::UDiv:
13239      case Instruction::SDiv:
13240      case Instruction::FDiv:
13241      case Instruction::URem:
13242      case Instruction::SRem:
13243      case Instruction::FRem:
13244      case Instruction::Shl:
13245      case Instruction::LShr:
13246      case Instruction::AShr:
13247      case Instruction::And:
13248      case Instruction::Or:
13249      case Instruction::Xor: {
13250        setInsertPointAfterBundle(E);
13251  
13252        Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13253        if (E->VectorizedValue) {
13254          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13255          return E->VectorizedValue;
13256        }
13257        Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13258        if (E->VectorizedValue) {
13259          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260          return E->VectorizedValue;
13261        }
13262        if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13263          for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13264            ArrayRef<Value *> Ops = E->getOperand(I);
13265            if (all_of(Ops, [&](Value *Op) {
13266                  auto *CI = dyn_cast<ConstantInt>(Op);
13267                  return CI && CI->getValue().countr_one() >= It->second.first;
13268                })) {
13269              V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13270              E->VectorizedValue = V;
13271              ++NumVectorInstructions;
13272              return V;
13273            }
13274          }
13275        }
13276        if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13277          assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13278                  getOperandEntry(E, 1)->isGather() ||
13279                  MinBWs.contains(getOperandEntry(E, 0)) ||
13280                  MinBWs.contains(getOperandEntry(E, 1))) &&
13281                 "Expected item in MinBWs.");
13282          if (LHS->getType() != VecTy)
13283            LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13284          if (RHS->getType() != VecTy)
13285            RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13286        }
13287  
13288        Value *V = Builder.CreateBinOp(
13289            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13290            RHS);
13291        propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13292        if (auto *I = dyn_cast<Instruction>(V)) {
13293          V = propagateMetadata(I, E->Scalars);
13294          // Drop nuw flags for abs(sub(commutative), true).
13295          if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13296              any_of(E->Scalars, [](Value *V) {
13297                return isCommutative(cast<Instruction>(V));
13298              }))
13299            I->setHasNoUnsignedWrap(/*b=*/false);
13300        }
13301  
13302        V = FinalShuffle(V, E, VecTy);
13303  
13304        E->VectorizedValue = V;
13305        ++NumVectorInstructions;
13306  
13307        return V;
13308      }
13309      case Instruction::Load: {
13310        // Loads are inserted at the head of the tree because we don't want to
13311        // sink them all the way down past store instructions.
13312        setInsertPointAfterBundle(E);
13313  
13314        LoadInst *LI = cast<LoadInst>(VL0);
13315        Instruction *NewLI;
13316        Value *PO = LI->getPointerOperand();
13317        if (E->State == TreeEntry::Vectorize) {
13318          NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13319        } else if (E->State == TreeEntry::StridedVectorize) {
13320          Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13321          Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13322          PO = IsReverseOrder ? PtrN : Ptr0;
13323          std::optional<int> Diff = getPointersDiff(
13324              VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13325          Type *StrideTy = DL->getIndexType(PO->getType());
13326          Value *StrideVal;
13327          if (Diff) {
13328            int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13329            StrideVal =
13330                ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13331                                               DL->getTypeAllocSize(ScalarTy));
13332          } else {
13333            SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13334            transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13335              return cast<LoadInst>(V)->getPointerOperand();
13336            });
13337            OrdersType Order;
13338            std::optional<Value *> Stride =
13339                calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13340                                  &*Builder.GetInsertPoint());
13341            Value *NewStride =
13342                Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13343            StrideVal = Builder.CreateMul(
13344                NewStride,
13345                ConstantInt::get(
13346                    StrideTy,
13347                    (IsReverseOrder ? -1 : 1) *
13348                        static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13349          }
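        // A minimal sketch of the emitted IR (type mangling omitted):
        //   %l = call <N x Ty> @llvm.experimental.vp.strided.load(
        //            ptr %PO, iM %StrideVal, <N x i1> <all ones>, i32 N)
        // where N is the number of scalars; the common alignment is attached to
        // the pointer argument below.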
13350          Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13351          auto *Inst = Builder.CreateIntrinsic(
13352              Intrinsic::experimental_vp_strided_load,
13353              {VecTy, PO->getType(), StrideTy},
13354              {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13355               Builder.getInt32(E->Scalars.size())});
13356          Inst->addParamAttr(
13357              /*ArgNo=*/0,
13358              Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13359          NewLI = Inst;
13360        } else {
13361          assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13362          Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13363          if (E->VectorizedValue) {
13364            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13365            return E->VectorizedValue;
13366          }
13367          // Use the minimum alignment of the gathered loads.
13368          Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13369          NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13370        }
13371        Value *V = propagateMetadata(NewLI, E->Scalars);
13372  
13373        V = FinalShuffle(V, E, VecTy);
13374        E->VectorizedValue = V;
13375        ++NumVectorInstructions;
13376        return V;
13377      }
13378      case Instruction::Store: {
13379        auto *SI = cast<StoreInst>(VL0);
13380  
13381        setInsertPointAfterBundle(E);
13382  
13383        Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13384        if (VecValue->getType() != VecTy)
13385          VecValue =
13386              Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13387        VecValue = FinalShuffle(VecValue, E, VecTy);
13388  
13389        Value *Ptr = SI->getPointerOperand();
13390        Instruction *ST;
13391        if (E->State == TreeEntry::Vectorize) {
13392          ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13393        } else {
13394          assert(E->State == TreeEntry::StridedVectorize &&
13395                 "Expected either strided or consecutive stores.");
13396          if (!E->ReorderIndices.empty()) {
13397            SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13398            Ptr = SI->getPointerOperand();
13399          }
13400          Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13401          Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
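        // A minimal sketch of the emitted IR (type mangling omitted):
        //   call void @llvm.experimental.vp.strided.store(
        //            <N x Ty> %VecValue, ptr %Ptr, iM -sizeof(Ty),
        //            <N x i1> <all ones>, i32 N)
        // i.e. the constant stride is the negated element size; the common
        // alignment is attached to the pointer argument below.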
13402          auto *Inst = Builder.CreateIntrinsic(
13403              Intrinsic::experimental_vp_strided_store,
13404              {VecTy, Ptr->getType(), StrideTy},
13405              {VecValue, Ptr,
13406               ConstantInt::get(
13407                   StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13408               Builder.getAllOnesMask(VecTy->getElementCount()),
13409               Builder.getInt32(E->Scalars.size())});
13410          Inst->addParamAttr(
13411              /*ArgNo=*/1,
13412              Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13413          ST = Inst;
13414        }
13415  
13416        Value *V = propagateMetadata(ST, E->Scalars);
13417  
13418        E->VectorizedValue = V;
13419        ++NumVectorInstructions;
13420        return V;
13421      }
13422      case Instruction::GetElementPtr: {
13423        auto *GEP0 = cast<GetElementPtrInst>(VL0);
13424        setInsertPointAfterBundle(E);
13425  
13426        Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13427        if (E->VectorizedValue) {
13428          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13429          return E->VectorizedValue;
13430        }
13431  
13432        SmallVector<Value *> OpVecs;
13433        for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13434          Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13435          if (E->VectorizedValue) {
13436            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13437            return E->VectorizedValue;
13438          }
13439          OpVecs.push_back(OpVec);
13440        }
13441  
13442        Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13443        if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13444          SmallVector<Value *> GEPs;
13445          for (Value *V : E->Scalars) {
13446            if (isa<GetElementPtrInst>(V))
13447              GEPs.push_back(V);
13448          }
13449          V = propagateMetadata(I, GEPs);
13450        }
13451  
13452        V = FinalShuffle(V, E, VecTy);
13453  
13454        E->VectorizedValue = V;
13455        ++NumVectorInstructions;
13456  
13457        return V;
13458      }
13459      case Instruction::Call: {
13460        CallInst *CI = cast<CallInst>(VL0);
13461        setInsertPointAfterBundle(E);
13462  
13463        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13464  
13465        SmallVector<Type *> ArgTys =
13466            buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13467                                   It != MinBWs.end() ? It->second.first : 0);
13468        auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13469        bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13470                            VecCallCosts.first <= VecCallCosts.second;
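      // Based on the two costs computed above, the call is lowered either to the
      // matching vector intrinsic or to a vector library function looked up
      // through VFDatabase further below.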
13471  
13472        Value *ScalarArg = nullptr;
13473        SmallVector<Value *> OpVecs;
13474        SmallVector<Type *, 2> TysForDecl;
13475        // Add return type if intrinsic is overloaded on it.
13476        if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13477          TysForDecl.push_back(VecTy);
13478        auto *CEI = cast<CallInst>(VL0);
13479        for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13480          ValueList OpVL;
13481          // Some intrinsics have scalar arguments. Such arguments should not be
13482          // vectorized.
13483          if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13484            ScalarArg = CEI->getArgOperand(I);
13485            // If the bitwidth of the abs intrinsic was reduced, its second argument
13486            // must be set to false (do not return poison if the value is signed min).
13487            if (ID == Intrinsic::abs && It != MinBWs.end() &&
13488                It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13489              ScalarArg = Builder.getFalse();
13490            OpVecs.push_back(ScalarArg);
13491            if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13492              TysForDecl.push_back(ScalarArg->getType());
13493            continue;
13494          }
13495  
13496          Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13497          if (E->VectorizedValue) {
13498            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13499            return E->VectorizedValue;
13500          }
13501          ScalarArg = CEI->getArgOperand(I);
13502          if (cast<VectorType>(OpVec->getType())->getElementType() !=
13503                  ScalarArg->getType()->getScalarType() &&
13504              It == MinBWs.end()) {
13505            auto *CastTy =
13506                getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13507            OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13508          } else if (It != MinBWs.end()) {
13509            OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13510          }
13511          LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13512          OpVecs.push_back(OpVec);
13513          if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13514            TysForDecl.push_back(OpVec->getType());
13515        }
13516  
13517        Function *CF;
13518        if (!UseIntrinsic) {
13519          VFShape Shape =
13520              VFShape::get(CI->getFunctionType(),
13521                           ElementCount::getFixed(
13522                               static_cast<unsigned>(VecTy->getNumElements())),
13523                           false /*HasGlobalPred*/);
13524          CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13525        } else {
13526          CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13527        }
13528  
13529        SmallVector<OperandBundleDef, 1> OpBundles;
13530        CI->getOperandBundlesAsDefs(OpBundles);
13531        Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13532  
13533        propagateIRFlags(V, E->Scalars, VL0);
13534        V = FinalShuffle(V, E, VecTy);
13535  
13536        E->VectorizedValue = V;
13537        ++NumVectorInstructions;
13538        return V;
13539      }
13540      case Instruction::ShuffleVector: {
13541        assert(E->isAltShuffle() &&
13542               ((Instruction::isBinaryOp(E->getOpcode()) &&
13543                 Instruction::isBinaryOp(E->getAltOpcode())) ||
13544                (Instruction::isCast(E->getOpcode()) &&
13545                 Instruction::isCast(E->getAltOpcode())) ||
13546                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13547               "Invalid Shuffle Vector Operand");
13548  
13549        Value *LHS = nullptr, *RHS = nullptr;
13550        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13551          setInsertPointAfterBundle(E);
13552          LHS = vectorizeOperand(E, 0, PostponedPHIs);
13553          if (E->VectorizedValue) {
13554            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13555            return E->VectorizedValue;
13556          }
13557          RHS = vectorizeOperand(E, 1, PostponedPHIs);
13558        } else {
13559          setInsertPointAfterBundle(E);
13560          LHS = vectorizeOperand(E, 0, PostponedPHIs);
13561        }
13562        if (E->VectorizedValue) {
13563          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13564          return E->VectorizedValue;
13565        }
13566        if (LHS && RHS &&
13567            ((Instruction::isBinaryOp(E->getOpcode()) &&
13568              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13569             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13570          assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13571                  getOperandEntry(E, 1)->isGather() ||
13572                  MinBWs.contains(getOperandEntry(E, 0)) ||
13573                  MinBWs.contains(getOperandEntry(E, 1))) &&
13574                 "Expected item in MinBWs.");
13575          Type *CastTy = VecTy;
13576          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13577            if (cast<VectorType>(LHS->getType())
13578                    ->getElementType()
13579                    ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13580                                                 ->getElementType()
13581                                                 ->getIntegerBitWidth())
13582              CastTy = RHS->getType();
13583            else
13584              CastTy = LHS->getType();
13585          }
13586          if (LHS->getType() != CastTy)
13587            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13588          if (RHS->getType() != CastTy)
13589            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13590        }
13591  
13592        Value *V0, *V1;
13593        if (Instruction::isBinaryOp(E->getOpcode())) {
13594          V0 = Builder.CreateBinOp(
13595              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13596          V1 = Builder.CreateBinOp(
13597              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13598        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13599          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13600          auto *AltCI = cast<CmpInst>(E->getAltOp());
13601          CmpInst::Predicate AltPred = AltCI->getPredicate();
13602          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13603        } else {
13604          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13605            unsigned SrcBWSz = DL->getTypeSizeInBits(
13606                cast<VectorType>(LHS->getType())->getElementType());
13607            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13608            if (BWSz <= SrcBWSz) {
13609              if (BWSz < SrcBWSz)
13610                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13611              assert(LHS->getType() == VecTy && "Expected same type as operand.");
13612              if (auto *I = dyn_cast<Instruction>(LHS))
13613                LHS = propagateMetadata(I, E->Scalars);
13614              E->VectorizedValue = LHS;
13615              ++NumVectorInstructions;
13616              return LHS;
13617            }
13618          }
13619          V0 = Builder.CreateCast(
13620              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13621          V1 = Builder.CreateCast(
13622              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13623        }
13624        // Add V0 and V1 to later analysis to try to find and remove matching
13625        // instruction, if any.
13626        for (Value *V : {V0, V1}) {
13627          if (auto *I = dyn_cast<Instruction>(V)) {
13628            GatherShuffleExtractSeq.insert(I);
13629            CSEBlocks.insert(I->getParent());
13630          }
13631        }
13632  
13633        // Create shuffle to take alternate operations from the vector.
13634        // Also, gather up main and alt scalar ops to propagate IR flags to
13635        // each vector operation.
13636        ValueList OpScalars, AltScalars;
13637        SmallVector<int> Mask;
13638        E->buildAltOpShuffleMask(
13639            [E, this](Instruction *I) {
13640              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13641              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13642                                            *TLI);
13643            },
13644            Mask, &OpScalars, &AltScalars);
13645  
13646        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13647        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13648        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13649          // Drop nuw flags for abs(sub(commutative), true).
13650          if (auto *I = dyn_cast<Instruction>(Vec);
13651              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13652              any_of(E->Scalars, [](Value *V) {
13653                auto *IV = cast<Instruction>(V);
13654                return IV->getOpcode() == Instruction::Sub &&
13655                       isCommutative(cast<Instruction>(IV));
13656              }))
13657            I->setHasNoUnsignedWrap(/*b=*/false);
13658        };
13659        DropNuwFlag(V0, E->getOpcode());
13660        DropNuwFlag(V1, E->getAltOpcode());
13661  
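      // The blend mask selects, per lane, either the main-opcode result V0
      // (indices < VF) or the alternate-opcode result V1 (indices >= VF); e.g. for
      // a 4-wide <add, sub, add, sub> pattern the mask is roughly <0, 5, 2, 7>.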
13662        Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13663        if (auto *I = dyn_cast<Instruction>(V)) {
13664          V = propagateMetadata(I, E->Scalars);
13665          GatherShuffleExtractSeq.insert(I);
13666          CSEBlocks.insert(I->getParent());
13667        }
13668  
13669        E->VectorizedValue = V;
13670        ++NumVectorInstructions;
13671  
13672        return V;
13673      }
13674      default:
13675        llvm_unreachable("unknown inst");
13676    }
13677    return nullptr;
13678  }
13679  
13680  Value *BoUpSLP::vectorizeTree() {
13681    ExtraValueToDebugLocsMap ExternallyUsedValues;
13682    SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13683    return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13684  }
13685  
13686  namespace {
13687  /// Data type for handling buildvector sequences with the reused scalars from
13688  /// other tree entries.
13689  struct ShuffledInsertData {
13690    /// List of insertelements to be replaced by shuffles.
13691    SmallVector<InsertElementInst *> InsertElements;
13692    /// The parent vectors and shuffle mask for the given list of inserts.
13693    MapVector<Value *, SmallVector<int>> ValueMasks;
13694  };
13695  } // namespace
13696  
13697  Value *BoUpSLP::vectorizeTree(
13698      const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13699      SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13700      Instruction *ReductionRoot) {
13701    // All blocks must be scheduled before any instructions are inserted.
13702    for (auto &BSIter : BlocksSchedules) {
13703      scheduleBlock(BSIter.second.get());
13704    }
13705    // Clear the Entry-to-LastInstruction table; it can be affected by scheduling,
13706    // so it needs to be rebuilt.
13707    EntryToLastInstruction.clear();
13708  
13709    if (ReductionRoot)
13710      Builder.SetInsertPoint(ReductionRoot->getParent(),
13711                             ReductionRoot->getIterator());
13712    else
13713      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13714  
13715    // Postpone emission of PHI operands to avoid cyclic dependency issues.
13716    (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13717    for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13718      if (TE->State == TreeEntry::Vectorize &&
13719          TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13720          TE->VectorizedValue)
13721        (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13722    // Run through the list of postponed gathers and emit them, replacing the
13723    // temporarily emitted allocas with actual vector instructions.
13724    ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13725    DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13726    for (const TreeEntry *E : PostponedNodes) {
13727      auto *TE = const_cast<TreeEntry *>(E);
13728      if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13729        if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13730                TE->UserTreeIndices.front().EdgeIdx)) &&
13731            VecTE->isSame(TE->Scalars))
13732          // Found a gather node which is exactly the same as one of the
13733          // vectorized nodes. This may happen after reordering.
13734          continue;
13735      auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13736      TE->VectorizedValue = nullptr;
13737      auto *UserI =
13738          cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13739      // If the user is a PHI node, its vector code has to be inserted right before
13740      // the block terminator. Since the node was delayed, there were some
13741      // unresolved dependencies at the moment when the stub instruction was
13742      // emitted. If any of these dependencies turn out to be an operand of another
13743      // PHI coming from this same block, the position of the stub instruction
13744      // becomes invalid. This is because the source vector that is supposed to feed
13745      // this gather node was inserted at the end of the block [after the stub
13746      // instruction]. So we need to adjust the insertion point to the end of block.
13747      if (isa<PHINode>(UserI)) {
13748        // Insert before all users.
13749        Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13750        for (User *U : PrevVec->users()) {
13751          if (U == UserI)
13752            continue;
13753          auto *UI = dyn_cast<Instruction>(U);
13754          if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13755            continue;
13756          if (UI->comesBefore(InsertPt))
13757            InsertPt = UI;
13758        }
13759        Builder.SetInsertPoint(InsertPt);
13760      } else {
13761        Builder.SetInsertPoint(PrevVec);
13762      }
13763      Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13764      Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13765      if (Vec->getType() != PrevVec->getType()) {
13766        assert(Vec->getType()->isIntOrIntVectorTy() &&
13767               PrevVec->getType()->isIntOrIntVectorTy() &&
13768               "Expected integer vector types only.");
13769        std::optional<bool> IsSigned;
13770        for (Value *V : TE->Scalars) {
13771          if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13772            auto It = MinBWs.find(BaseTE);
13773            if (It != MinBWs.end()) {
13774              IsSigned = IsSigned.value_or(false) || It->second.second;
13775              if (*IsSigned)
13776                break;
13777            }
13778            for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13779              auto It = MinBWs.find(MNTE);
13780              if (It != MinBWs.end()) {
13781                IsSigned = IsSigned.value_or(false) || It->second.second;
13782                if (*IsSigned)
13783                  break;
13784              }
13785            }
13786            if (IsSigned.value_or(false))
13787              break;
13788            // Scan through gather nodes.
13789            for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13790              auto It = MinBWs.find(BVE);
13791              if (It != MinBWs.end()) {
13792                IsSigned = IsSigned.value_or(false) || It->second.second;
13793                if (*IsSigned)
13794                  break;
13795              }
13796            }
13797            if (IsSigned.value_or(false))
13798              break;
13799            if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13800              IsSigned =
13801                  IsSigned.value_or(false) ||
13802                  !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13803              continue;
13804            }
13805            if (IsSigned.value_or(false))
13806              break;
13807          }
13808        }
13809        if (IsSigned.value_or(false)) {
13810          // Final attempt - check user node.
13811          auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13812          if (It != MinBWs.end())
13813            IsSigned = It->second.second;
13814        }
13815        assert(IsSigned &&
13816               "Expected user node or perfect diamond match in MinBWs.");
13817        Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13818      }
13819      PrevVec->replaceAllUsesWith(Vec);
13820      PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13821      // Replace the stub vector node if it was already used for one of the
13822      // buildvector nodes.
13823      auto It = PostponedValues.find(PrevVec);
13824      if (It != PostponedValues.end()) {
13825        for (TreeEntry *VTE : It->getSecond())
13826          VTE->VectorizedValue = Vec;
13827      }
13828      eraseInstruction(PrevVec);
13829    }
13830  
13831    LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13832                      << " values.\n");
13833  
13834    SmallVector<ShuffledInsertData> ShuffledInserts;
13835    // Maps vector instruction to original insertelement instruction
13836    DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13837    // Maps extract Scalar to the corresponding extractelement instruction in the
13838    // basic block. Only one extractelement per block should be emitted.
13839    DenseMap<Value *,
13840             DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13841        ScalarToEEs;
13842    SmallDenseSet<Value *, 4> UsedInserts;
13843    DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13844    SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13845    // Extract all of the elements with the external uses.
13846    for (const auto &ExternalUse : ExternalUses) {
13847      Value *Scalar = ExternalUse.Scalar;
13848      llvm::User *User = ExternalUse.User;
13849  
13850      // Skip users that we have already RAUWed. This happens when one instruction
13851      // has multiple uses of the same value.
13852      if (User && !is_contained(Scalar->users(), User))
13853        continue;
13854      TreeEntry *E = getTreeEntry(Scalar);
13855      assert(E && "Invalid scalar");
13856      assert(!E->isGather() && "Extracting from a gather list");
13857      // Non-instruction pointers are not deleted, just skip them.
13858      if (E->getOpcode() == Instruction::GetElementPtr &&
13859          !isa<GetElementPtrInst>(Scalar))
13860        continue;
13861  
13862      Value *Vec = E->VectorizedValue;
13863      assert(Vec && "Can't find vectorizable value");
13864  
13865      Value *Lane = Builder.getInt32(ExternalUse.Lane);
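    // Helper that extracts Scalar's lane from Vec (reusing an extract already
    // emitted in the current block, or a cloned GEP, when possible) and, if the
    // tree entry was demoted in MinBWs, casts the extracted value back to the
    // original scalar type. Vector-typed scalars (insertelements) are returned
    // as the vector itself.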
13866      auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13867        if (Scalar->getType() != Vec->getType()) {
13868          Value *Ex = nullptr;
13869          Value *ExV = nullptr;
13870          auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13871          bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13872          auto It = ScalarToEEs.find(Scalar);
13873          if (It != ScalarToEEs.end()) {
13874            // No need to emit multiple extracts, just move the single one already
13875            // emitted in the current block.
13876            auto EEIt = It->second.find(Builder.GetInsertBlock());
13877            if (EEIt != It->second.end()) {
13878              Instruction *I = EEIt->second.first;
13879              if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13880                  Builder.GetInsertPoint()->comesBefore(I)) {
13881                I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13882                              Builder.GetInsertPoint());
13883                if (auto *CI = EEIt->second.second)
13884                  CI->moveAfter(I);
13885              }
13886              Ex = I;
13887              ExV = EEIt->second.second ? EEIt->second.second : Ex;
13888            }
13889          }
13890          if (!Ex) {
13891            // "Reuse" the existing extract to improve final codegen.
13892            if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13893              Value *V = ES->getVectorOperand();
13894              if (const TreeEntry *ETE = getTreeEntry(V))
13895                V = ETE->VectorizedValue;
13896              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13897            } else if (ReplaceGEP) {
13898              // Leave the GEPs as is, they are free in most cases and better to
13899              // keep them as GEPs.
13900              auto *CloneGEP = GEP->clone();
13901              if (isa<Instruction>(Vec))
13902                CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13903                                       Builder.GetInsertPoint());
13904              else
13905                CloneGEP->insertBefore(GEP);
13906              if (GEP->hasName())
13907                CloneGEP->takeName(GEP);
13908              Ex = CloneGEP;
13909            } else {
13910              Ex = Builder.CreateExtractElement(Vec, Lane);
13911            }
13912            // If necessary, sign-extend or zero-extend ScalarRoot
13913            // to the larger type.
13914            ExV = Ex;
13915            if (Scalar->getType() != Ex->getType())
13916              ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13917                                          MinBWs.find(E)->second.second);
13918            if (auto *I = dyn_cast<Instruction>(Ex))
13919              ScalarToEEs[Scalar].try_emplace(
13920                  Builder.GetInsertBlock(),
13921                  std::make_pair(I, cast<Instruction>(ExV)));
13922          }
13923          // The then-branch of the previous if may produce constants, since
13924          // operand 0 might be a constant.
13925          if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13926            GatherShuffleExtractSeq.insert(ExI);
13927            CSEBlocks.insert(ExI->getParent());
13928          }
13929          return ExV;
13930        }
13931        assert(isa<FixedVectorType>(Scalar->getType()) &&
13932               isa<InsertElementInst>(Scalar) &&
13933               "In-tree scalar of vector type is not insertelement?");
13934        auto *IE = cast<InsertElementInst>(Scalar);
13935        VectorToInsertElement.try_emplace(Vec, IE);
13936        return Vec;
13937      };
13938      // If User == nullptr, the Scalar remains as scalar in vectorized
13939      // instructions or is used as extra arg. Generate ExtractElement instruction
13940      // and update the record for this scalar in ExternallyUsedValues.
13941      if (!User) {
13942        if (!ScalarsWithNullptrUser.insert(Scalar).second)
13943          continue;
13944        assert((ExternallyUsedValues.count(Scalar) ||
13945                Scalar->hasNUsesOrMore(UsesLimit) ||
13946                any_of(Scalar->users(),
13947                       [&](llvm::User *U) {
13948                         if (ExternalUsesAsGEPs.contains(U))
13949                           return true;
13950                         TreeEntry *UseEntry = getTreeEntry(U);
13951                         return UseEntry &&
13952                                (UseEntry->State == TreeEntry::Vectorize ||
13953                                 UseEntry->State ==
13954                                     TreeEntry::StridedVectorize) &&
13955                                (E->State == TreeEntry::Vectorize ||
13956                                 E->State == TreeEntry::StridedVectorize) &&
13957                                doesInTreeUserNeedToExtract(
13958                                    Scalar,
13959                                    cast<Instruction>(UseEntry->Scalars.front()),
13960                                    TLI);
13961                       })) &&
13962               "Scalar with nullptr User must be registered in "
13963               "ExternallyUsedValues map or remain as scalar in vectorized "
13964               "instructions");
13965        if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13966          if (auto *PHI = dyn_cast<PHINode>(VecI))
13967            Builder.SetInsertPoint(PHI->getParent(),
13968                                   PHI->getParent()->getFirstNonPHIIt());
13969          else
13970            Builder.SetInsertPoint(VecI->getParent(),
13971                                   std::next(VecI->getIterator()));
13972        } else {
13973          Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13974        }
13975        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13976        // Required to update internally referenced instructions.
13977        Scalar->replaceAllUsesWith(NewInst);
13978        ReplacedExternals.emplace_back(Scalar, NewInst);
13979        continue;
13980      }
13981  
13982      if (auto *VU = dyn_cast<InsertElementInst>(User);
13983          VU && VU->getOperand(1) == Scalar) {
13984        // Skip if the scalar is another vector op or Vec is not an instruction.
13985        if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13986          if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13987            if (!UsedInserts.insert(VU).second)
13988              continue;
13989            // Need to use original vector, if the root is truncated.
13990            auto BWIt = MinBWs.find(E);
13991            if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13992              auto *ScalarTy = FTy->getElementType();
13993              auto Key = std::make_pair(Vec, ScalarTy);
13994              auto VecIt = VectorCasts.find(Key);
13995              if (VecIt == VectorCasts.end()) {
13996                IRBuilderBase::InsertPointGuard Guard(Builder);
13997                if (auto *IVec = dyn_cast<PHINode>(Vec))
13998                  Builder.SetInsertPoint(
13999                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14000                else if (auto *IVec = dyn_cast<Instruction>(Vec))
14001                  Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14002                Vec = Builder.CreateIntCast(
14003                    Vec,
14004                    getWidenedType(
14005                        ScalarTy,
14006                        cast<FixedVectorType>(Vec->getType())->getNumElements()),
14007                    BWIt->second.second);
14008                VectorCasts.try_emplace(Key, Vec);
14009              } else {
14010                Vec = VecIt->second;
14011              }
14012            }
14013  
14014            std::optional<unsigned> InsertIdx = getElementIndex(VU);
14015            if (InsertIdx) {
14016              auto *It =
14017                  find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14018                    // Checks if 2 insertelements are from the same buildvector.
14019                    InsertElementInst *VecInsert = Data.InsertElements.front();
14020                    return areTwoInsertFromSameBuildVector(
14021                        VU, VecInsert,
14022                        [](InsertElementInst *II) { return II->getOperand(0); });
14023                  });
14024              unsigned Idx = *InsertIdx;
14025              if (It == ShuffledInserts.end()) {
14026                (void)ShuffledInserts.emplace_back();
14027                It = std::next(ShuffledInserts.begin(),
14028                               ShuffledInserts.size() - 1);
14029                SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14030                if (Mask.empty())
14031                  Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14032                // Find the insertvector vectorized in the tree, if any.
14033                Value *Base = VU;
14034                while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14035                  if (IEBase != User &&
14036                      (!IEBase->hasOneUse() ||
14037                       getElementIndex(IEBase).value_or(Idx) == Idx))
14038                    break;
14039                  // Build the mask for the vectorized insertelement instructions.
14040                  if (const TreeEntry *E = getTreeEntry(IEBase)) {
14041                    do {
14042                      IEBase = cast<InsertElementInst>(Base);
14043                      int IEIdx = *getElementIndex(IEBase);
14044                      assert(Mask[IEIdx] == PoisonMaskElem &&
14045                             "InsertElementInstruction used already.");
14046                      Mask[IEIdx] = IEIdx;
14047                      Base = IEBase->getOperand(0);
14048                    } while (E == getTreeEntry(Base));
14049                    break;
14050                  }
14051                  Base = cast<InsertElementInst>(Base)->getOperand(0);
14052                  // After vectorization the def-use chain has changed, so we need
14053                  // to look through the original insertelement instructions if they
14054                  // were replaced by vector instructions.
14055                  auto It = VectorToInsertElement.find(Base);
14056                  if (It != VectorToInsertElement.end())
14057                    Base = It->second;
14058                }
14059              }
14060              SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14061              if (Mask.empty())
14062                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14063              Mask[Idx] = ExternalUse.Lane;
14064              It->InsertElements.push_back(cast<InsertElementInst>(User));
14065              continue;
14066            }
14067          }
14068        }
14069      }
14070  
14071      // Generate extracts for out-of-tree users.
14072      // Find the insertion point for the extractelement lane.
14073      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14074        if (PHINode *PH = dyn_cast<PHINode>(User)) {
14075          for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14076            if (PH->getIncomingValue(I) == Scalar) {
14077              Instruction *IncomingTerminator =
14078                  PH->getIncomingBlock(I)->getTerminator();
14079              if (isa<CatchSwitchInst>(IncomingTerminator)) {
14080                Builder.SetInsertPoint(VecI->getParent(),
14081                                       std::next(VecI->getIterator()));
14082              } else {
14083                Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14084              }
14085              Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14086              PH->setOperand(I, NewInst);
14087            }
14088          }
14089        } else {
14090          Builder.SetInsertPoint(cast<Instruction>(User));
14091          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14092          User->replaceUsesOfWith(Scalar, NewInst);
14093        }
14094      } else {
14095        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14096        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14097        User->replaceUsesOfWith(Scalar, NewInst);
14098      }
14099  
14100      LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14101    }
14102  
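  // Helper that splits a combined two-source mask (indices >= VF refer to V2)
  // into per-source masks and lets ShuffleInstructionBuilder emit the resulting
  // shuffle(s).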
14103    auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14104      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14105      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14106      int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14107      for (int I = 0, E = Mask.size(); I < E; ++I) {
14108        if (Mask[I] < VF)
14109          CombinedMask1[I] = Mask[I];
14110        else
14111          CombinedMask2[I] = Mask[I] - VF;
14112      }
14113      ShuffleInstructionBuilder ShuffleBuilder(
14114          cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14115      ShuffleBuilder.add(V1, CombinedMask1);
14116      if (V2)
14117        ShuffleBuilder.add(V2, CombinedMask2);
14118      return ShuffleBuilder.finalize(std::nullopt);
14119    };
14120  
14121    auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14122                                         bool ForSingleMask) {
14123      unsigned VF = Mask.size();
14124      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14125      if (VF != VecVF) {
14126        if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14127          Vec = CreateShuffle(Vec, nullptr, Mask);
14128          return std::make_pair(Vec, true);
14129        }
14130        if (!ForSingleMask) {
14131          SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14132          for (unsigned I = 0; I < VF; ++I) {
14133            if (Mask[I] != PoisonMaskElem)
14134              ResizeMask[Mask[I]] = Mask[I];
14135          }
14136          Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14137        }
14138      }
14139  
14140      return std::make_pair(Vec, false);
14141    };
14142    // Perform shuffling of the vectorized tree entries for better handling of
14143    // external extracts.
14144    for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14145      // Find the first and the last instruction in the list of insertelements.
14146      sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14147      InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14148      InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14149      Builder.SetInsertPoint(LastInsert);
14150      auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14151      Value *NewInst = performExtractsShuffleAction<Value>(
14152          MutableArrayRef(Vector.data(), Vector.size()),
14153          FirstInsert->getOperand(0),
14154          [](Value *Vec) {
14155            return cast<VectorType>(Vec->getType())
14156                ->getElementCount()
14157                .getKnownMinValue();
14158          },
14159          ResizeToVF,
14160          [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14161                                        ArrayRef<Value *> Vals) {
14162            assert((Vals.size() == 1 || Vals.size() == 2) &&
14163                   "Expected exactly 1 or 2 input values.");
14164            if (Vals.size() == 1) {
14165              // Do not create shuffle if the mask is a simple identity
14166              // non-resizing mask.
14167              if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14168                                     ->getNumElements() ||
14169                  !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14170                return CreateShuffle(Vals.front(), nullptr, Mask);
14171              return Vals.front();
14172            }
14173            return CreateShuffle(Vals.front() ? Vals.front()
14174                                              : FirstInsert->getOperand(0),
14175                                 Vals.back(), Mask);
14176          });
14177      auto It = ShuffledInserts[I].InsertElements.rbegin();
14178      // Rebuild buildvector chain.
14179      InsertElementInst *II = nullptr;
14180      if (It != ShuffledInserts[I].InsertElements.rend())
14181        II = *It;
14182      SmallVector<Instruction *> Inserts;
14183      while (It != ShuffledInserts[I].InsertElements.rend()) {
14184        assert(II && "Must be an insertelement instruction.");
14185        if (*It == II)
14186          ++It;
14187        else
14188          Inserts.push_back(cast<Instruction>(II));
14189        II = dyn_cast<InsertElementInst>(II->getOperand(0));
14190      }
14191      for (Instruction *II : reverse(Inserts)) {
14192        II->replaceUsesOfWith(II->getOperand(0), NewInst);
14193        if (auto *NewI = dyn_cast<Instruction>(NewInst))
14194          if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14195            II->moveAfter(NewI);
14196        NewInst = II;
14197      }
14198      LastInsert->replaceAllUsesWith(NewInst);
14199      for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14200        IE->replaceUsesOfWith(IE->getOperand(0),
14201                              PoisonValue::get(IE->getOperand(0)->getType()));
14202        IE->replaceUsesOfWith(IE->getOperand(1),
14203                              PoisonValue::get(IE->getOperand(1)->getType()));
14204        eraseInstruction(IE);
14205      }
14206      CSEBlocks.insert(LastInsert->getParent());
14207    }
14208  
14209    SmallVector<Instruction *> RemovedInsts;
14210    // For each vectorized value:
14211    for (auto &TEPtr : VectorizableTree) {
14212      TreeEntry *Entry = TEPtr.get();
14213  
14214      // No need to handle users of gathered values.
14215      if (Entry->isGather())
14216        continue;
14217  
14218      assert(Entry->VectorizedValue && "Can't find vectorizable value");
14219  
14220      // For each lane:
14221      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14222        Value *Scalar = Entry->Scalars[Lane];
14223  
14224        if (Entry->getOpcode() == Instruction::GetElementPtr &&
14225            !isa<GetElementPtrInst>(Scalar))
14226          continue;
14227  #ifndef NDEBUG
14228        Type *Ty = Scalar->getType();
14229        if (!Ty->isVoidTy()) {
14230          for (User *U : Scalar->users()) {
14231            LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14232  
14233            // It is legal to delete users in the ignorelist.
14234            assert((getTreeEntry(U) ||
14235                    (UserIgnoreList && UserIgnoreList->contains(U)) ||
14236                    (isa_and_nonnull<Instruction>(U) &&
14237                     isDeleted(cast<Instruction>(U)))) &&
14238                   "Deleting out-of-tree value");
14239          }
14240        }
14241  #endif
14242        LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14243        auto *I = cast<Instruction>(Scalar);
14244        RemovedInsts.push_back(I);
14245      }
14246    }
14247  
14248    // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14249    // new vector instruction.
14250    if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14251      V->mergeDIAssignID(RemovedInsts);
14252  
14253    // Clear up reduction references, if any.
14254    if (UserIgnoreList) {
14255      for (Instruction *I : RemovedInsts) {
14256        if (getTreeEntry(I)->Idx != 0)
14257          continue;
14258        SmallVector<SelectInst *> LogicalOpSelects;
14259        I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14260          // Do not replace the condition of a logical op in select <cond> form.
14261          bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14262                                      (match(U.getUser(), m_LogicalAnd()) ||
14263                                       match(U.getUser(), m_LogicalOr())) &&
14264                                      U.getOperandNo() == 0;
14265          if (IsPoisoningLogicalOp) {
14266            LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14267            return false;
14268          }
14269          return UserIgnoreList->contains(U.getUser());
14270        });
14271        // Replace conditions of the poisoning logical ops with the non-poison
14272        // constant value.
14273        for (SelectInst *SI : LogicalOpSelects)
14274          SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14275      }
14276    }
14277    // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14278    // cache correctness.
14279    // NOTE: removeInstructionAndOperands only marks the instruction for deletion
14280    // - instructions are not deleted until later.
14281    removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14282  
14283    Builder.ClearInsertionPoint();
14284    InstrElementSize.clear();
14285  
14286    const TreeEntry &RootTE = *VectorizableTree.front();
14287    Value *Vec = RootTE.VectorizedValue;
14288    if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14289                                        It != MinBWs.end() &&
14290                                        ReductionBitWidth != It->second.first) {
14291      IRBuilder<>::InsertPointGuard Guard(Builder);
14292      Builder.SetInsertPoint(ReductionRoot->getParent(),
14293                             ReductionRoot->getIterator());
14294      Vec = Builder.CreateIntCast(
14295          Vec,
14296          VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14297                          cast<VectorType>(Vec->getType())->getElementCount()),
14298          It->second.second);
14299    }
14300    return Vec;
14301  }
14302  
14303  void BoUpSLP::optimizeGatherSequence() {
14304    LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14305                    << " gather sequence instructions.\n");
14306    // LICM InsertElementInst sequences.
14307    for (Instruction *I : GatherShuffleExtractSeq) {
14308      if (isDeleted(I))
14309        continue;
14310  
14311      // Check if this block is inside a loop.
14312      Loop *L = LI->getLoopFor(I->getParent());
14313      if (!L)
14314        continue;
14315  
14316      // Check if it has a preheader.
14317      BasicBlock *PreHeader = L->getLoopPreheader();
14318      if (!PreHeader)
14319        continue;
14320  
14321    // If the vector or the element that we insert into it is an instruction
14322    // defined inside the loop, then we can't hoist the gather sequence out of
14323    // the loop.
14324      if (any_of(I->operands(), [L](Value *V) {
14325            auto *OpI = dyn_cast<Instruction>(V);
14326            return OpI && L->contains(OpI);
14327          }))
14328        continue;
14329  
14330      // We can hoist this instruction. Move it to the pre-header.
14331      I->moveBefore(PreHeader->getTerminator());
14332      CSEBlocks.insert(PreHeader);
14333    }
14334  
14335    // Make a list of all reachable blocks in our CSE queue.
14336    SmallVector<const DomTreeNode *, 8> CSEWorkList;
14337    CSEWorkList.reserve(CSEBlocks.size());
14338    for (BasicBlock *BB : CSEBlocks)
14339      if (DomTreeNode *N = DT->getNode(BB)) {
14340        assert(DT->isReachableFromEntry(N));
14341        CSEWorkList.push_back(N);
14342      }
14343  
14344    // Sort blocks by domination. This ensures we visit a block after all blocks
14345    // dominating it are visited.
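         // Dominator-tree DFS-in numbers increase along every tree path from the
         // root, so ordering by DFSNumIn guarantees that a dominating block is
         // visited before any block it dominates.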
14346    llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14347      assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14348             "Different nodes should have different DFS numbers");
14349      return A->getDFSNumIn() < B->getDFSNumIn();
14350    });
14351  
14352    // Less defined shuffles can be replaced by their more defined copies.
14353    // Between two shuffles, one is less defined if it has the same vector
14354    // operands and each of its mask indices is either the same as in the other
14355    // one or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined
14356    // than shuffle %0, poison, <0, 0, 0, 0>.
14357    auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14358                                             SmallVectorImpl<int> &NewMask) {
14359      if (I1->getType() != I2->getType())
14360        return false;
14361      auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14362      auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14363      if (!SI1 || !SI2)
14364        return I1->isIdenticalTo(I2);
14365      if (SI1->isIdenticalTo(SI2))
14366        return true;
14367      for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14368        if (SI1->getOperand(I) != SI2->getOperand(I))
14369          return false;
14370      // Check if the second instruction is more defined than the first one.
14371      NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14372      ArrayRef<int> SM1 = SI1->getShuffleMask();
14373      // Count trailing undefs in the mask to check the final number of used
14374      // registers.
14375      unsigned LastUndefsCnt = 0;
14376      for (int I = 0, E = NewMask.size(); I < E; ++I) {
14377        if (SM1[I] == PoisonMaskElem)
14378          ++LastUndefsCnt;
14379        else
14380          LastUndefsCnt = 0;
14381        if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14382            NewMask[I] != SM1[I])
14383          return false;
14384        if (NewMask[I] == PoisonMaskElem)
14385          NewMask[I] = SM1[I];
14386      }
14387      // Check if the last undefs actually change the final number of used vector
14388      // registers.
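           // E.g. with 128-bit vector registers, an <8 x i32> shuffle whose last
           // four mask elements are poison occupies a single register, while a
           // fully defined 8-element mask would need two, so it is not treated as
           // "less defined" here.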
14389      return SM1.size() - LastUndefsCnt > 1 &&
14390             TTI->getNumberOfParts(SI1->getType()) ==
14391                 TTI->getNumberOfParts(
14392                     getWidenedType(SI1->getType()->getElementType(),
14393                                    SM1.size() - LastUndefsCnt));
14394    };
14395    // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14396    // instructions. TODO: We can further optimize this scan if we split the
14397    // instructions into different buckets based on the insert lane.
14398    SmallVector<Instruction *, 16> Visited;
14399    for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14400      assert(*I &&
14401             (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14402             "Worklist not sorted properly!");
14403      BasicBlock *BB = (*I)->getBlock();
14404      // For all instructions in blocks containing gather sequences:
14405      for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14406        if (isDeleted(&In))
14407          continue;
14408        if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14409            !GatherShuffleExtractSeq.contains(&In))
14410          continue;
14411  
14412        // Check if we can replace this instruction with any of the
14413        // visited instructions.
14414        bool Replaced = false;
14415        for (Instruction *&V : Visited) {
14416          SmallVector<int> NewMask;
14417          if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14418              DT->dominates(V->getParent(), In.getParent())) {
14419            In.replaceAllUsesWith(V);
14420            eraseInstruction(&In);
14421            if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14422              if (!NewMask.empty())
14423                SI->setShuffleMask(NewMask);
14424            Replaced = true;
14425            break;
14426          }
14427          if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14428              GatherShuffleExtractSeq.contains(V) &&
14429              IsIdenticalOrLessDefined(V, &In, NewMask) &&
14430              DT->dominates(In.getParent(), V->getParent())) {
14431            In.moveAfter(V);
14432            V->replaceAllUsesWith(&In);
14433            eraseInstruction(V);
14434            if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14435              if (!NewMask.empty())
14436                SI->setShuffleMask(NewMask);
14437            V = &In;
14438            Replaced = true;
14439            break;
14440          }
14441        }
14442        if (!Replaced) {
14443          assert(!is_contained(Visited, &In));
14444          Visited.push_back(&In);
14445        }
14446      }
14447    }
14448    CSEBlocks.clear();
14449    GatherShuffleExtractSeq.clear();
14450  }
14451  
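       // Chains the ScheduleData of all schedulable values in VL into a single
       // bundle; the first such member becomes the scheduling entity
       // (FirstInBundle) for the whole bundle.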
14452  BoUpSLP::ScheduleData *
14453  BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14454    ScheduleData *Bundle = nullptr;
14455    ScheduleData *PrevInBundle = nullptr;
14456    for (Value *V : VL) {
14457      if (doesNotNeedToBeScheduled(V))
14458        continue;
14459      ScheduleData *BundleMember = getScheduleData(V);
14460      assert(BundleMember &&
14461             "no ScheduleData for bundle member "
14462             "(maybe not in same basic block)");
14463      assert(BundleMember->isSchedulingEntity() &&
14464             "bundle member already part of other bundle");
14465      if (PrevInBundle) {
14466        PrevInBundle->NextInBundle = BundleMember;
14467      } else {
14468        Bundle = BundleMember;
14469      }
14470  
14471      // Group the instructions into a bundle.
14472      BundleMember->FirstInBundle = Bundle;
14473      PrevInBundle = BundleMember;
14474    }
14475    assert(Bundle && "Failed to find schedule bundle");
14476    return Bundle;
14477  }
14478  
14479  // Groups the instructions into a bundle (which is then a single scheduling
14480  // entity) and schedules instructions until the bundle gets ready.
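       // For example, four consecutive scalar loads that form one vector-load
       // candidate are grouped into a single bundle and are accepted or rejected
       // for scheduling as one unit.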
14481  std::optional<BoUpSLP::ScheduleData *>
14482  BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14483                                              const InstructionsState &S) {
14484    // No need to schedule PHIs, insertelement, extractelement and extractvalue
14485    // instructions.
14486    if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14487        doesNotNeedToSchedule(VL))
14488      return nullptr;
14489  
14490    // Initialize the instruction bundle.
14491    Instruction *OldScheduleEnd = ScheduleEnd;
14492    LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");
14493  
14494    auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14495                                                           ScheduleData *Bundle) {
14496      // The scheduling region got new instructions at the lower end (or it is a
14497      // new region for the first bundle). This makes it necessary to
14498      // recalculate all dependencies.
14499      // It is seldom that this needs to be done a second time after adding the
14500      // initial bundle to the region.
14501      if (ScheduleEnd != OldScheduleEnd) {
14502        for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14503          doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14504        ReSchedule = true;
14505      }
14506      if (Bundle) {
14507        LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14508                          << " in block " << BB->getName() << "\n");
14509        calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14510      }
14511  
14512      if (ReSchedule) {
14513        resetSchedule();
14514        initialFillReadyList(ReadyInsts);
14515      }
14516  
14517      // Now try to schedule the new bundle or (if no bundle) just calculate
14518      // dependencies. As soon as the bundle is "ready" it means that there are no
14519      // cyclic dependencies and we can schedule it. Note that it's important
14520      // that we don't "schedule" the bundle yet (see cancelScheduling).
14521      while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14522             !ReadyInsts.empty()) {
14523        ScheduleData *Picked = ReadyInsts.pop_back_val();
14524        assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14525               "must be ready to schedule");
14526        schedule(Picked, ReadyInsts);
14527      }
14528    };
14529  
14530    // Make sure that the scheduling region contains all
14531    // instructions of the bundle.
14532    for (Value *V : VL) {
14533      if (doesNotNeedToBeScheduled(V))
14534        continue;
14535      if (!extendSchedulingRegion(V, S)) {
14536        // The scheduling region may have gotten new instructions at the lower
14537        // end (or it is a new region for the first bundle), which makes it
14538        // necessary to recalculate all dependencies.
14539        // Otherwise the compiler may crash trying to calculate dependencies
14540        // incorrectly and emit instructions in the wrong order during the
14541        // actual scheduling.
14542        TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14543        return std::nullopt;
14544      }
14545    }
14546  
14547    bool ReSchedule = false;
14548    for (Value *V : VL) {
14549      if (doesNotNeedToBeScheduled(V))
14550        continue;
14551      ScheduleData *BundleMember = getScheduleData(V);
14552      assert(BundleMember &&
14553             "no ScheduleData for bundle member (maybe not in same basic block)");
14554  
14555      // Make sure we don't leave the pieces of the bundle in the ready list when
14556      // the whole bundle might not be ready.
14557      ReadyInsts.remove(BundleMember);
14558  
14559      if (!BundleMember->IsScheduled)
14560        continue;
14561      // A bundle member was scheduled as a single instruction before and now
14562      // needs to be scheduled as part of the bundle. We just get rid of the
14563      // existing schedule.
14564      LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
14565                        << " was already scheduled\n");
14566      ReSchedule = true;
14567    }
14568  
14569    auto *Bundle = buildBundle(VL);
14570    TryScheduleBundleImpl(ReSchedule, Bundle);
14571    if (!Bundle->isReady()) {
14572      cancelScheduling(VL, S.OpValue);
14573      return std::nullopt;
14574    }
14575    return Bundle;
14576  }
14577  
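       // Undoes a (partially) formed bundle: splits it back into
       // single-instruction scheduling entities and re-inserts members that have
       // no unscheduled dependencies into the ready list.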
14578  void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14579                                                  Value *OpValue) {
14580    if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14581        doesNotNeedToSchedule(VL))
14582      return;
14583  
14584    if (doesNotNeedToBeScheduled(OpValue))
14585      OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14586    ScheduleData *Bundle = getScheduleData(OpValue);
14587    LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
14588    assert(!Bundle->IsScheduled &&
14589           "Can't cancel bundle which is already scheduled");
14590    assert(Bundle->isSchedulingEntity() &&
14591           (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14592           "tried to unbundle something which is not a bundle");
14593  
14594    // Remove the bundle from the ready list.
14595    if (Bundle->isReady())
14596      ReadyInsts.remove(Bundle);
14597  
14598    // Un-bundle: make single instructions out of the bundle.
14599    ScheduleData *BundleMember = Bundle;
14600    while (BundleMember) {
14601      assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14602      BundleMember->FirstInBundle = BundleMember;
14603      ScheduleData *Next = BundleMember->NextInBundle;
14604      BundleMember->NextInBundle = nullptr;
14605      BundleMember->TE = nullptr;
14606      if (BundleMember->unscheduledDepsInBundle() == 0) {
14607        ReadyInsts.insert(BundleMember);
14608      }
14609      BundleMember = Next;
14610    }
14611  }
14612  
14613  BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14614    // Allocate a new ScheduleData for the instruction.
14615    if (ChunkPos >= ChunkSize) {
14616      ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14617      ChunkPos = 0;
14618    }
14619    return &(ScheduleDataChunks.back()[ChunkPos++]);
14620  }
14621  
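       // Tries to extend the scheduling region so that it contains V, growing it
       // upwards or downwards as needed; gives up if the region would exceed
       // ScheduleRegionSizeLimit.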
14622  bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14623                                                        const InstructionsState &S) {
14624    if (getScheduleData(V, isOneOf(S, V)))
14625      return true;
14626    Instruction *I = dyn_cast<Instruction>(V);
14627    assert(I && "bundle member must be an instruction");
14628    assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14629           !doesNotNeedToBeScheduled(I) &&
14630           "phi nodes/insertelements/extractelements/extractvalues don't need to "
14631           "be scheduled");
14632    auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14633      ScheduleData *ISD = getScheduleData(I);
14634      if (!ISD)
14635        return false;
14636      assert(isInSchedulingRegion(ISD) &&
14637             "ScheduleData not in scheduling region");
14638      ScheduleData *SD = allocateScheduleDataChunks();
14639      SD->Inst = I;
14640      SD->init(SchedulingRegionID, S.OpValue);
14641      ExtraScheduleDataMap[I][S.OpValue] = SD;
14642      return true;
14643    };
14644    if (CheckScheduleForI(I))
14645      return true;
14646    if (!ScheduleStart) {
14647      // It's the first instruction in the new region.
14648      initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14649      ScheduleStart = I;
14650      ScheduleEnd = I->getNextNode();
14651      if (isOneOf(S, I) != I)
14652        CheckScheduleForI(I);
14653      assert(ScheduleEnd && "tried to vectorize a terminator?");
14654      LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
14655      return true;
14656    }
14657    // Search up and down at the same time, because we don't know if the new
14658    // instruction is above or below the existing scheduling region.
14659    // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
14660    // counted against the budget. Otherwise debug info could affect codegen.
14661    BasicBlock::reverse_iterator UpIter =
14662        ++ScheduleStart->getIterator().getReverse();
14663    BasicBlock::reverse_iterator UpperEnd = BB->rend();
14664    BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14665    BasicBlock::iterator LowerEnd = BB->end();
14666    auto IsAssumeLikeIntr = [](const Instruction &I) {
14667      if (auto *II = dyn_cast<IntrinsicInst>(&I))
14668        return II->isAssumeLikeIntrinsic();
14669      return false;
14670    };
14671    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14672    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14673    while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14674           &*DownIter != I) {
14675      if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14676        LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
14677        return false;
14678      }
14679  
14680      ++UpIter;
14681      ++DownIter;
14682  
14683      UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14684      DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14685    }
14686    if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14687      assert(I->getParent() == ScheduleStart->getParent() &&
14688             "Instruction is in wrong basic block.");
14689      initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14690      ScheduleStart = I;
14691      if (isOneOf(S, I) != I)
14692        CheckScheduleForI(I);
14693      LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
14694                        << "\n");
14695      return true;
14696    }
14697    assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14698           "Expected to reach top of the basic block or instruction down the "
14699           "lower end.");
14700    assert(I->getParent() == ScheduleEnd->getParent() &&
14701           "Instruction is in wrong basic block.");
14702    initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14703                     nullptr);
14704    ScheduleEnd = I->getNextNode();
14705    if (isOneOf(S, I) != I)
14706      CheckScheduleForI(I);
14707    assert(ScheduleEnd && "tried to vectorize a terminator?");
14708    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
14709    return true;
14710  }
14711  
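       // Creates (or re-initializes) ScheduleData for every schedulable
       // instruction in [FromI, ToI) and links the memory-accessing ones into the
       // region's load/store chain between PrevLoadStore and NextLoadStore.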
14712  void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14713                                                  Instruction *ToI,
14714                                                  ScheduleData *PrevLoadStore,
14715                                                  ScheduleData *NextLoadStore) {
14716    ScheduleData *CurrentLoadStore = PrevLoadStore;
14717    for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14718      // No need to allocate data for non-schedulable instructions.
14719      if (doesNotNeedToBeScheduled(I))
14720        continue;
14721      ScheduleData *SD = ScheduleDataMap.lookup(I);
14722      if (!SD) {
14723        SD = allocateScheduleDataChunks();
14724        ScheduleDataMap[I] = SD;
14725        SD->Inst = I;
14726      }
14727      assert(!isInSchedulingRegion(SD) &&
14728             "new ScheduleData already in scheduling region");
14729      SD->init(SchedulingRegionID, I);
14730  
14731      if (I->mayReadOrWriteMemory() &&
14732          (!isa<IntrinsicInst>(I) ||
14733           (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734            cast<IntrinsicInst>(I)->getIntrinsicID() !=
14735                Intrinsic::pseudoprobe))) {
14736        // Update the linked list of memory accessing instructions.
14737        if (CurrentLoadStore) {
14738          CurrentLoadStore->NextLoadStore = SD;
14739        } else {
14740          FirstLoadStoreInRegion = SD;
14741        }
14742        CurrentLoadStore = SD;
14743      }
14744  
14745      if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14746          match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14747        RegionHasStackSave = true;
14748    }
14749    if (NextLoadStore) {
14750      if (CurrentLoadStore)
14751        CurrentLoadStore->NextLoadStore = NextLoadStore;
14752    } else {
14753      LastLoadStoreInRegion = CurrentLoadStore;
14754    }
14755  }
14756  
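       // (Re)computes def-use, control, and memory dependencies for every member
       // of the bundle rooted at SD, transitively visiting dependent bundles whose
       // dependencies are not valid yet; if InsertInReadyList is set, bundles that
       // become ready are inserted into the ready list.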
14757  void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14758                                                       bool InsertInReadyList,
14759                                                       BoUpSLP *SLP) {
14760    assert(SD->isSchedulingEntity());
14761  
14762    SmallVector<ScheduleData *, 10> WorkList;
14763    WorkList.push_back(SD);
14764  
14765    while (!WorkList.empty()) {
14766      ScheduleData *SD = WorkList.pop_back_val();
14767      for (ScheduleData *BundleMember = SD; BundleMember;
14768           BundleMember = BundleMember->NextInBundle) {
14769        assert(isInSchedulingRegion(BundleMember));
14770        if (BundleMember->hasValidDependencies())
14771          continue;
14772  
14773        LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
14774                   << "\n");
14775        BundleMember->Dependencies = 0;
14776        BundleMember->resetUnscheduledDeps();
14777  
14778        // Handle def-use chain dependencies.
14779        if (BundleMember->OpValue != BundleMember->Inst) {
14780          if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14781            BundleMember->Dependencies++;
14782            ScheduleData *DestBundle = UseSD->FirstInBundle;
14783            if (!DestBundle->IsScheduled)
14784              BundleMember->incrementUnscheduledDeps(1);
14785            if (!DestBundle->hasValidDependencies())
14786              WorkList.push_back(DestBundle);
14787          }
14788        } else {
14789          for (User *U : BundleMember->Inst->users()) {
14790            if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14791              BundleMember->Dependencies++;
14792              ScheduleData *DestBundle = UseSD->FirstInBundle;
14793              if (!DestBundle->IsScheduled)
14794                BundleMember->incrementUnscheduledDeps(1);
14795              if (!DestBundle->hasValidDependencies())
14796                WorkList.push_back(DestBundle);
14797            }
14798          }
14799        }
14800  
14801        auto MakeControlDependent = [&](Instruction *I) {
14802          auto *DepDest = getScheduleData(I);
14803          assert(DepDest && "must be in schedule window");
14804          DepDest->ControlDependencies.push_back(BundleMember);
14805          BundleMember->Dependencies++;
14806          ScheduleData *DestBundle = DepDest->FirstInBundle;
14807          if (!DestBundle->IsScheduled)
14808            BundleMember->incrementUnscheduledDeps(1);
14809          if (!DestBundle->hasValidDependencies())
14810            WorkList.push_back(DestBundle);
14811        };
14812  
14813        // Any instruction which isn't safe to speculate at the beginning of the
14814        // block is control dependent on any early exit or non-willreturn call
14815        // which precedes it.
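             // E.g. a potentially trapping udiv that appears after a call which
             // might not return must stay below that call; the control dependence
             // added below prevents the scheduler from hoisting it above the call.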
14816        if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14817          for (Instruction *I = BundleMember->Inst->getNextNode();
14818               I != ScheduleEnd; I = I->getNextNode()) {
14819            if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14820              continue;
14821  
14822            // Add the dependency
14823            MakeControlDependent(I);
14824  
14825            if (!isGuaranteedToTransferExecutionToSuccessor(I))
14826              // Everything past here must be control dependent on I.
14827              break;
14828          }
14829        }
14830  
14831        if (RegionHasStackSave) {
14832          // If we have an inalloca alloca instruction, it needs to be scheduled
14833          // after any preceding stacksave.  We also need to prevent any alloca
14834          // from reordering above a preceding stackrestore.
14835          if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14836              match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14837            for (Instruction *I = BundleMember->Inst->getNextNode();
14838                 I != ScheduleEnd; I = I->getNextNode()) {
14839              if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14840                  match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14841                // Any allocas past here must be control dependent on I, and I
14842                // must be memory dependent on BundleMember->Inst.
14843                break;
14844  
14845              if (!isa<AllocaInst>(I))
14846                continue;
14847  
14848              // Add the dependency
14849              MakeControlDependent(I);
14850            }
14851          }
14852  
14853          // In addition to the cases handled just above, we need to prevent
14854          // allocas and loads/stores from moving below a stacksave or a
14855          // stackrestore. Avoiding moving allocas below a stackrestore is
14856          // currently thought to be merely conservative. Moving loads/stores
14857          // below a stackrestore can lead to incorrect code.
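               // E.g. a store into memory allocated by a dynamic alloca must not
               // sink past the stackrestore that releases that stack memory.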
14858          if (isa<AllocaInst>(BundleMember->Inst) ||
14859              BundleMember->Inst->mayReadOrWriteMemory()) {
14860            for (Instruction *I = BundleMember->Inst->getNextNode();
14861                 I != ScheduleEnd; I = I->getNextNode()) {
14862              if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14863                  !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14864                continue;
14865  
14866              // Add the dependency
14867              MakeControlDependent(I);
14868              break;
14869            }
14870          }
14871        }
14872  
14873        // Handle the memory dependencies (if any).
14874        ScheduleData *DepDest = BundleMember->NextLoadStore;
14875        if (!DepDest)
14876          continue;
14877        Instruction *SrcInst = BundleMember->Inst;
14878        assert(SrcInst->mayReadOrWriteMemory() &&
14879               "NextLoadStore list for a non-memory-affecting bundle?");
14880        MemoryLocation SrcLoc = getLocation(SrcInst);
14881        bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14882        unsigned NumAliased = 0;
14883        unsigned DistToSrc = 1;
14884  
14885        for (; DepDest; DepDest = DepDest->NextLoadStore) {
14886          assert(isInSchedulingRegion(DepDest));
14887  
14888          // We have two limits to reduce the complexity:
14889          // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14890          //    SLP->isAliased (which is the expensive part in this loop).
14891          // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14892          //    the whole loop (even if the loop is fast, it's quadratic).
14893          //    It's important for the loop break condition (see below) to
14894          //    check this limit even between two read-only instructions.
14895          if (DistToSrc >= MaxMemDepDistance ||
14896              ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14897               (NumAliased >= AliasedCheckLimit ||
14898                SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14899  
14900            // We increment the counter only if the locations are aliased
14901            // (instead of counting all alias checks). This gives a better
14902            // balance between reduced runtime and accurate dependencies.
14903            NumAliased++;
14904  
14905            DepDest->MemoryDependencies.push_back(BundleMember);
14906            BundleMember->Dependencies++;
14907            ScheduleData *DestBundle = DepDest->FirstInBundle;
14908            if (!DestBundle->IsScheduled) {
14909              BundleMember->incrementUnscheduledDeps(1);
14910            }
14911            if (!DestBundle->hasValidDependencies()) {
14912              WorkList.push_back(DestBundle);
14913            }
14914          }
14915  
14916          // Example, explaining the loop break condition: Let's assume our
14917          // starting instruction is i0 and MaxMemDepDistance = 3.
14918          //
14919          //                      +--------v--v--v
14920          //             i0,i1,i2,i3,i4,i5,i6,i7,i8
14921          //             +--------^--^--^
14922          //
14923          // MaxMemDepDistance lets us stop alias-checking at i3 and we add
14924          // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14925          // Previously we already added dependencies from i3 to i6,i7,i8
14926          // (because of MaxMemDepDistance). As we added a dependency from
14927          // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14928          // and we can abort this loop at i6.
14929          if (DistToSrc >= 2 * MaxMemDepDistance)
14930            break;
14931          DistToSrc++;
14932        }
14933      }
14934      if (InsertInReadyList && SD->isReady()) {
14935        ReadyInsts.insert(SD);
14936        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
14937                          << "\n");
14938      }
14939    }
14940  }
14941  
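       // Marks all ScheduleData in the region as unscheduled again, resets their
       // unscheduled-dependency counters, and clears the ready list so the block
       // can be (re)scheduled from scratch.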
14942  void BoUpSLP::BlockScheduling::resetSchedule() {
14943    assert(ScheduleStart &&
14944           "tried to reset schedule on block which has not been scheduled");
14945    for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14946      doForAllOpcodes(I, [&](ScheduleData *SD) {
14947        assert(isInSchedulingRegion(SD) &&
14948               "ScheduleData not in scheduling region");
14949        SD->IsScheduled = false;
14950        SD->resetUnscheduledDeps();
14951      });
14952    }
14953    ReadyInsts.clear();
14954  }
14955  
14956  void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14957    if (!BS->ScheduleStart)
14958      return;
14959  
14960    LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14961  
14962    // A key point - if we got here, pre-scheduling was able to find a valid
14963    // scheduling of the sub-graph of the scheduling window which consists
14964    // of all vector bundles and their transitive users.  As such, we do not
14965    // need to reschedule anything *outside of* that subgraph.
14966  
14967    BS->resetSchedule();
14968  
14969    // For the real scheduling we use a more sophisticated ready-list: it is
14970    // sorted by the original instruction location. This lets the final schedule
14971    // be as close as possible to the original instruction order.
14972    // WARNING: If changing this order causes a correctness issue, that means
14973    // there is some missing dependence edge in the schedule data graph.
14974    struct ScheduleDataCompare {
14975      bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14976        return SD2->SchedulingPriority < SD1->SchedulingPriority;
14977      }
14978    };
14979    std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14980  
14981    // Ensure that all dependency data is updated (for nodes in the sub-graph)
14982    // and fill the ready-list with initial instructions.
14983    int Idx = 0;
14984    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14985         I = I->getNextNode()) {
14986      BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14987        TreeEntry *SDTE = getTreeEntry(SD->Inst);
14988        (void)SDTE;
14989        assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14990                SD->isPartOfBundle() ==
14991                    (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14992               "scheduler and vectorizer bundle mismatch");
14993        SD->FirstInBundle->SchedulingPriority = Idx++;
14994  
14995        if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14996          BS->calculateDependencies(SD, false, this);
14997      });
14998    }
14999    BS->initialFillReadyList(ReadyInsts);
15000  
15001    Instruction *LastScheduledInst = BS->ScheduleEnd;
15002  
15003    // Do the "real" scheduling.
15004    while (!ReadyInsts.empty()) {
15005      ScheduleData *Picked = *ReadyInsts.begin();
15006      ReadyInsts.erase(ReadyInsts.begin());
15007  
15008      // Move the scheduled instruction(s) to their dedicated places, if not
15009      // there yet.
15010      for (ScheduleData *BundleMember = Picked; BundleMember;
15011           BundleMember = BundleMember->NextInBundle) {
15012        Instruction *PickedInst = BundleMember->Inst;
15013        if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15014          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
15015        LastScheduledInst = PickedInst;
15016      }
15017  
15018      BS->schedule(Picked, ReadyInsts);
15019    }
15020  
15021    // Check that we didn't break any of our invariants.
15022  #ifdef EXPENSIVE_CHECKS
15023    BS->verify();
15024  #endif
15025  
15026  #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15027    // Check that all schedulable entities got scheduled
15028    for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
15029      BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15030        if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15031          assert(SD->IsScheduled && "must be scheduled at this point");
15032        }
15033      });
15034    }
15035  #endif
15036  
15037    // Avoid duplicate scheduling of the block.
15038    BS->ScheduleStart = nullptr;
15039  }
15040  
15041  unsigned BoUpSLP::getVectorElementSize(Value *V) {
15042    // If V is a store, just return the width of the stored value (or value
15043    // truncated just before storing) without traversing the expression tree.
15044    // This is the common case.
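         // E.g. for 'store i16 %t, ptr %p' the element size is 16 regardless of
         // how %t was computed.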
15045    if (auto *Store = dyn_cast<StoreInst>(V))
15046      return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
15047  
15048    if (auto *IEI = dyn_cast<InsertElementInst>(V))
15049      return getVectorElementSize(IEI->getOperand(1));
15050  
15051    auto E = InstrElementSize.find(V);
15052    if (E != InstrElementSize.end())
15053      return E->second;
15054  
15055    // If V is not a store, we can traverse the expression tree to find loads
15056    // that feed it. The type of the loaded value may indicate a more suitable
15057    // width than V's type. We want to base the vector element size on the width
15058    // of memory operations where possible.
15059    SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15060    SmallPtrSet<Instruction *, 16> Visited;
15061    if (auto *I = dyn_cast<Instruction>(V)) {
15062      Worklist.emplace_back(I, I->getParent(), 0);
15063      Visited.insert(I);
15064    }
15065  
15066    // Traverse the expression tree in bottom-up order looking for loads. If we
15067    // encounter an instruction we don't yet handle, we give up.
15068    auto Width = 0u;
15069    Value *FirstNonBool = nullptr;
15070    while (!Worklist.empty()) {
15071      auto [I, Parent, Level] = Worklist.pop_back_val();
15072  
15073      // We should only be looking at scalar instructions here. If the current
15074      // instruction has a vector type, skip.
15075      auto *Ty = I->getType();
15076      if (isa<VectorType>(Ty))
15077        continue;
15078      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15079        FirstNonBool = I;
15080      if (Level > RecursionMaxDepth)
15081        continue;
15082  
15083      // If the current instruction is a load (or extract), update Width to
15084      // reflect the width of the loaded/extracted value.
15085      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15086        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15087  
15088      // Otherwise, we need to visit the operands of the instruction. We only
15089      // handle the interesting cases from buildTree here. If an operand is an
15090      // instruction we haven't visited yet, and the user is a PHI node or the
15091      // operand is defined in the same basic block, we add it to the worklist.
15092      else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15093                   BinaryOperator, UnaryOperator>(I)) {
15094        for (Use &U : I->operands()) {
15095          if (auto *J = dyn_cast<Instruction>(U.get()))
15096            if (Visited.insert(J).second &&
15097                (isa<PHINode>(I) || J->getParent() == Parent)) {
15098              Worklist.emplace_back(J, J->getParent(), Level + 1);
15099              continue;
15100            }
15101          if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15102            FirstNonBool = U.get();
15103        }
15104      } else {
15105        break;
15106      }
15107    }
15108  
15109    // If we didn't encounter a memory access in the expression tree, or if we
15110    // gave up for some reason, just return the width of V. Otherwise, return the
15111    // maximum width we found.
15112    if (!Width) {
15113      if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15114        V = FirstNonBool;
15115      Width = DL->getTypeSizeInBits(V->getType());
15116    }
15117  
15118    for (Instruction *I : Visited)
15119      InstrElementSize[I] = Width;
15120  
15121    return Width;
15122  }
15123  
15124  bool BoUpSLP::collectValuesToDemote(
15125      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15126      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15127      unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15128      bool IsTruncRoot) const {
15129    // We can always demote constants.
15130    if (all_of(E.Scalars, IsaPred<Constant>))
15131      return true;
15132  
15133    unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
15134    if (OrigBitWidth == BitWidth) {
15135      MaxDepthLevel = 1;
15136      return true;
15137    }
15138  
15139    // If the value is not a vectorized instruction in the expression and not used
15140    // by the insertelement instruction and not used in multiple vector nodes, it
15141    // cannot be demoted.
15142    bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15143      return !isKnownNonNegative(R, SimplifyQuery(*DL));
15144    });
15145    auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15146      if (MultiNodeScalars.contains(V))
15147        return false;
15148      // For the last shuffle of sext/zext with many uses, we need to check the
15149      // extra bit for unsigned values; otherwise we may end up with incorrect
15150      // casts for reused scalars.
15151      bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
15152      if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15153        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15154        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15155          return true;
15156      }
15157      unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15158      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15159      if (IsSignedNode)
15160        ++BitWidth1;
15161      if (auto *I = dyn_cast<Instruction>(V)) {
15162        APInt Mask = DB->getDemandedBits(I);
15163        unsigned BitWidth2 =
15164            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
15165        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15166          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
15167          if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15168            break;
15169          BitWidth2 *= 2;
15170        }
15171        BitWidth1 = std::min(BitWidth1, BitWidth2);
15172      }
15173      BitWidth = std::max(BitWidth, BitWidth1);
15174      return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15175    };
15176    using namespace std::placeholders;
15177    auto FinalAnalysis = [&]() {
15178      if (!IsProfitableToDemote)
15179        return false;
15180      bool Res = all_of(
15181          E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
15182      // Demote gathers.
15183      if (Res && E.isGather()) {
15184        // Check the possible extractelement instruction bases and the final
15185        // vector length.
15186        SmallPtrSet<Value *, 4> UniqueBases;
15187        for (Value *V : E.Scalars) {
15188          auto *EE = dyn_cast<ExtractElementInst>(V);
15189          if (!EE)
15190            continue;
15191          UniqueBases.insert(EE->getVectorOperand());
15192        }
15193        const unsigned VF = E.Scalars.size();
15194        Type *OrigScalarTy = E.Scalars.front()->getType();
15195        if (UniqueBases.size() <= 2 ||
15196            TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
15197                TTI->getNumberOfParts(getWidenedType(
15198                    IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
15199          ToDemote.push_back(E.Idx);
15200      }
15201      return Res;
15202    };
15203    if (E.isGather() || !Visited.insert(&E).second ||
15204        any_of(E.Scalars, [&](Value *V) {
15205          return all_of(V->users(), [&](User *U) {
15206            return isa<InsertElementInst>(U) && !getTreeEntry(U);
15207          });
15208        }))
15209      return FinalAnalysis();
15210  
15211    if (any_of(E.Scalars, [&](Value *V) {
15212          return !all_of(V->users(), [=](User *U) {
15213            return getTreeEntry(U) ||
15214                   (E.Idx == 0 && UserIgnoreList &&
15215                    UserIgnoreList->contains(U)) ||
15216                   (!isa<CmpInst>(U) && U->getType()->isSized() &&
15217                    !U->getType()->isScalableTy() &&
15218                    DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15219          }) && !IsPotentiallyTruncated(V, BitWidth);
15220        }))
15221      return false;
15222  
15223    auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15224                               bool &NeedToExit) {
15225      NeedToExit = false;
15226      unsigned InitLevel = MaxDepthLevel;
15227      for (const TreeEntry *Op : Operands) {
15228        unsigned Level = InitLevel;
15229        if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15230                                   ToDemote, Visited, Level, IsProfitableToDemote,
15231                                   IsTruncRoot)) {
15232          if (!IsProfitableToDemote)
15233            return false;
15234          NeedToExit = true;
15235          if (!FinalAnalysis())
15236            return false;
15237          continue;
15238        }
15239        MaxDepthLevel = std::max(MaxDepthLevel, Level);
15240      }
15241      return true;
15242    };
15243    auto AttemptCheckBitwidth =
15244        [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15245          // Try all bitwidths < OrigBitWidth.
15246          NeedToExit = false;
15247          unsigned BestFailBitwidth = 0;
15248          for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15249            if (Checker(BitWidth, OrigBitWidth))
15250              return true;
15251            if (BestFailBitwidth == 0 && FinalAnalysis())
15252              BestFailBitwidth = BitWidth;
15253          }
15254          if (BitWidth >= OrigBitWidth) {
15255            if (BestFailBitwidth == 0) {
15256              BitWidth = OrigBitWidth;
15257              return false;
15258            }
15259            MaxDepthLevel = 1;
15260            BitWidth = BestFailBitwidth;
15261            NeedToExit = true;
15262            return true;
15263          }
15264          return false;
15265        };
15266    auto TryProcessInstruction =
15267        [&](unsigned &BitWidth,
15268            ArrayRef<const TreeEntry *> Operands = std::nullopt,
15269            function_ref<bool(unsigned, unsigned)> Checker = {}) {
15270          if (Operands.empty()) {
15271            if (!IsTruncRoot)
15272              MaxDepthLevel = 1;
15273            (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15274                                                std::ref(BitWidth)));
15275          } else {
15276            // Several vectorized uses? Check if we can truncate it; otherwise,
15277            // exit.
15278            if (E.UserTreeIndices.size() > 1 &&
15279                !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15280                                             std::ref(BitWidth))))
15281              return false;
15282            bool NeedToExit = false;
15283            if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15284              return false;
15285            if (NeedToExit)
15286              return true;
15287            if (!ProcessOperands(Operands, NeedToExit))
15288              return false;
15289            if (NeedToExit)
15290              return true;
15291          }
15292  
15293          ++MaxDepthLevel;
15294          // Record the entry that we can demote.
15295          ToDemote.push_back(E.Idx);
15296          return IsProfitableToDemote;
15297        };
15298    switch (E.getOpcode()) {
15299  
15300    // We can always demote truncations and extensions. Since truncations can
15301    // seed additional demotion, we save the truncated value.
15302    case Instruction::Trunc:
15303      if (IsProfitableToDemoteRoot)
15304        IsProfitableToDemote = true;
15305      return TryProcessInstruction(BitWidth);
15306    case Instruction::ZExt:
15307    case Instruction::SExt:
15308      IsProfitableToDemote = true;
15309      return TryProcessInstruction(BitWidth);
15310  
15311    // We can demote certain binary operations if we can demote both of their
15312    // operands.
15313    case Instruction::Add:
15314    case Instruction::Sub:
15315    case Instruction::Mul:
15316    case Instruction::And:
15317    case Instruction::Or:
15318    case Instruction::Xor: {
15319      return TryProcessInstruction(
15320          BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15321    }
15322    case Instruction::Shl: {
15323      // If we are truncating the result of this SHL, and if it's a shift of an
15324      // in-range amount, we can always perform a SHL in a smaller type.
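           // E.g. an i32 shl can be performed in i16 as long as the shift amount
           // is known to be less than 16.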
15325      auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15326        return all_of(E.Scalars, [&](Value *V) {
15327          auto *I = cast<Instruction>(V);
15328          KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15329          return AmtKnownBits.getMaxValue().ult(BitWidth);
15330        });
15331      };
15332      return TryProcessInstruction(
15333          BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15334    }
15335    case Instruction::LShr: {
15336      // If this is a truncate of a logical shr, we can truncate it to a smaller
15337      // lshr iff we know that the bits we would otherwise be shifting in are
15338      // already zeros.
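           // E.g. an i32 lshr can be demoted to i16 when the shift amount is known
           // to be less than 16 and bits 16..31 of the shifted value are known to
           // be zero.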
15339      auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15340        return all_of(E.Scalars, [&](Value *V) {
15341          auto *I = cast<Instruction>(V);
15342          KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15343          APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15344          return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15345                 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15346                                   SimplifyQuery(*DL));
15347        });
15348      };
15349      return TryProcessInstruction(
15350          BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15351          LShrChecker);
15352    }
15353    case Instruction::AShr: {
15354      // If this is a truncate of an arithmetic shr, we can truncate it to a
15355      // smaller ashr iff we know that all the bits between the sign bit of the
15356      // original type and the sign bit of the truncated type are the same.
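           // E.g. an i32 ashr can be demoted to i16 when the shift amount is known
           // to be less than 16 and the shifted value has at least 17 sign bits.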
15357      auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15358        return all_of(E.Scalars, [&](Value *V) {
15359          auto *I = cast<Instruction>(V);
15360          KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15361          unsigned ShiftedBits = OrigBitWidth - BitWidth;
15362          return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15363                 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15364                                                  nullptr, DT);
15365        });
15366      };
15367      return TryProcessInstruction(
15368          BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15369          AShrChecker);
15370    }
15371    case Instruction::UDiv:
15372    case Instruction::URem: {
15373      // UDiv and URem can be truncated if all the truncated bits are zero.
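           // E.g. an i32 udiv can be performed in i16 when bits 16..31 of both
           // operands are known to be zero.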
15374      auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15375        assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15376        return all_of(E.Scalars, [&](Value *V) {
15377          auto *I = cast<Instruction>(V);
15378          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15379          return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15380                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15381        });
15382      };
15383      return TryProcessInstruction(
15384          BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15385    }
15386  
15387    // We can demote selects if we can demote their true and false values.
15388    case Instruction::Select: {
15389      return TryProcessInstruction(
15390          BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15391    }
15392  
15393    // We can demote phis if we can demote all their incoming operands. Note that
15394    // we don't need to worry about cycles since we ensure single use above.
15395    case Instruction::PHI: {
15396      const unsigned NumOps = E.getNumOperands();
15397      SmallVector<const TreeEntry *> Ops(NumOps);
15398      transform(seq<unsigned>(0, NumOps), Ops.begin(),
15399                std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15400  
15401      return TryProcessInstruction(BitWidth, Ops);
15402    }
15403  
15404    case Instruction::Call: {
15405      auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15406      if (!IC)
15407        break;
15408      Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
15409      if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15410          ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15411        break;
15412      SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15413      function_ref<bool(unsigned, unsigned)> CallChecker;
15414      auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15415        assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15416        return all_of(E.Scalars, [&](Value *V) {
15417          auto *I = cast<Instruction>(V);
15418          if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15419            APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15420            return MaskedValueIsZero(I->getOperand(0), Mask,
15421                                     SimplifyQuery(*DL)) &&
15422                   MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15423          }
15424          assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15425                 "Expected min/max intrinsics only.");
15426          unsigned SignBits = OrigBitWidth - BitWidth;
15427          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15428          unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15429                                                nullptr, DT);
15430          unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15431                                                nullptr, DT);
15432          return SignBits <= Op0SignBits &&
15433                 ((SignBits != Op0SignBits &&
15434                   !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15435                  MaskedValueIsZero(I->getOperand(0), Mask,
15436                                    SimplifyQuery(*DL))) &&
15437                 SignBits <= Op1SignBits &&
15438                 ((SignBits != Op1SignBits &&
15439                   !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15440                  MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15441        });
15442      };
15443      auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15444        assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15445        return all_of(E.Scalars, [&](Value *V) {
15446          auto *I = cast<Instruction>(V);
15447          unsigned SignBits = OrigBitWidth - BitWidth;
15448          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15449          unsigned Op0SignBits =
15450              ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
15451          return SignBits <= Op0SignBits &&
15452                 ((SignBits != Op0SignBits &&
15453                   !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15454                  MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
15455        });
15456      };
15457      if (ID != Intrinsic::abs) {
15458        Operands.push_back(getOperandEntry(&E, 1));
15459        CallChecker = CompChecker;
15460      } else {
15461        CallChecker = AbsChecker;
15462      }
15463      InstructionCost BestCost =
15464          std::numeric_limits<InstructionCost::CostType>::max();
15465      unsigned BestBitWidth = BitWidth;
15466      unsigned VF = E.Scalars.size();
15467      // Choose the best bitwidth based on cost estimations.
15468      auto Checker = [&](unsigned BitWidth, unsigned) {
15469        unsigned MinBW = PowerOf2Ceil(BitWidth);
15470        SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15471        auto VecCallCosts = getVectorCallCosts(
15472            IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
15473            TTI, TLI, ArgTys);
15474        InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15475        if (Cost < BestCost) {
15476          BestCost = Cost;
15477          BestBitWidth = BitWidth;
15478        }
15479        return false;
15480      };
15481      [[maybe_unused]] bool NeedToExit;
15482      (void)AttemptCheckBitwidth(Checker, NeedToExit);
15483      BitWidth = BestBitWidth;
15484      return TryProcessInstruction(BitWidth, Operands, CallChecker);
15485    }
15486  
15487    // Otherwise, conservatively give up.
15488    default:
15489      break;
15490    }
15491    MaxDepthLevel = 1;
15492    return FinalAnalysis();
15493  }
15494  
15495  static RecurKind getRdxKind(Value *V);
15496  
15497  void BoUpSLP::computeMinimumValueSizes() {
15498    // We only attempt to truncate integer expressions.
15499    bool IsStoreOrInsertElt =
15500        VectorizableTree.front()->getOpcode() == Instruction::Store ||
15501        VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15502    if ((IsStoreOrInsertElt || UserIgnoreList) &&
15503        ExtraBitWidthNodes.size() <= 1 &&
15504        (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15505         CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15506      return;
15507  
15508    unsigned NodeIdx = 0;
15509    if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15510      NodeIdx = 1;
15511  
15512    // Ensure the roots of the vectorizable tree don't form a cycle.
15513    if (VectorizableTree[NodeIdx]->isGather() ||
15514        (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15515        (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15516                                [NodeIdx](const EdgeInfo &EI) {
15517                                  return EI.UserTE->Idx >
15518                                         static_cast<int>(NodeIdx);
15519                                })))
15520      return;
15521  
15522    // If the first value node for store/insertelement is sext/zext/trunc, skip
15523    // it and resize to the final type.
15524    bool IsTruncRoot = false;
15525    bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15526    SmallVector<unsigned> RootDemotes;
15527    if (NodeIdx != 0 &&
15528        VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15529        VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15530      assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15531      IsTruncRoot = true;
15532      RootDemotes.push_back(NodeIdx);
15533      IsProfitableToDemoteRoot = true;
15534      ++NodeIdx;
15535    }
15536  
15537    // The reduction has already been analyzed and is not profitable - exit.
15538    if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15539      return;
15540  
15541    SmallVector<unsigned> ToDemote;
15542    auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15543                                  bool IsProfitableToDemoteRoot, unsigned Opcode,
15544                                  unsigned Limit, bool IsTruncRoot,
15545                                  bool IsSignedCmp) -> unsigned {
15546      ToDemote.clear();
15547      // If the root is a trunc and the next node is a gather/buildvector, keep
15548      // the trunc in scalars, which is free in most cases.
15549      if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15550          E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15551          all_of(E.Scalars, [&](Value *V) {
15552            return V->hasOneUse() || isa<Constant>(V) ||
15553                   (!V->hasNUsesOrMore(UsesLimit) &&
15554                    none_of(V->users(), [&](User *U) {
15555                      const TreeEntry *TE = getTreeEntry(U);
15556                      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15557                      if (TE == UserTE || !TE)
15558                        return false;
15559                      if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15560                               SelectInst>(U) ||
15561                          !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15562                               SelectInst>(UserTE->getMainOp()))
15563                        return true;
15564                      unsigned UserTESz = DL->getTypeSizeInBits(
15565                          UserTE->Scalars.front()->getType());
15566                      auto It = MinBWs.find(TE);
15567                      if (It != MinBWs.end() && It->second.first > UserTESz)
15568                        return true;
15569                      return DL->getTypeSizeInBits(U->getType()) > UserTESz;
15570                    }));
15571          })) {
15572        ToDemote.push_back(E.Idx);
15573        const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15574        auto It = MinBWs.find(UserTE);
15575        if (It != MinBWs.end())
15576          return It->second.first;
15577        unsigned MaxBitWidth =
15578            DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
15579        MaxBitWidth = bit_ceil(MaxBitWidth);
15580        if (MaxBitWidth < 8 && MaxBitWidth > 1)
15581          MaxBitWidth = 8;
15582        return MaxBitWidth;
15583      }
15584  
15585      unsigned VF = E.getVectorFactor();
15586      auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15587      if (!TreeRootIT || !Opcode)
15588        return 0u;
15589  
15590      if (any_of(E.Scalars,
15591                 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15592        return 0u;
15593  
15594      unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
15595  
15596      // The maximum bit width required to represent all the values that can be
15597      // demoted without loss of precision. It would be safe to truncate the roots
15598      // of the expression to this width.
15599      unsigned MaxBitWidth = 1u;
15600  
15601      // True if the roots can be zero-extended back to their original type,
15602      // rather than sign-extended. We know that if the sign bit of every root
15603      // is known to be zero, we can safely zero-extend, so IsKnownPositive is
15604      // set to true only when this is not a signed comparison and all the
15605      // roots are known to be non-negative; otherwise it is false and the
15606      // roots will be sign-extended.
15607      bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15608        KnownBits Known = computeKnownBits(R, *DL);
15609        return Known.isNonNegative();
15610      });
15611  
15612      // We first check if all the bits of the roots are demanded. If they're not,
15613      // we can truncate the roots to this narrower type.
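           // For example, an i32 root with 24 known sign bits needs only 32 - 24 = 8
           // bits; if IsKnownPositive is false, one extra bit is kept (BitWidth1 = 9)
           // so that sign-extension reproduces the original value. If only the low 16
           // bits are demanded, BitWidth2 = 16, and the smaller of the two bounds
           // contributes to MaxBitWidth.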
15614      for (Value *Root : E.Scalars) {
15615        unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15616        TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15617        unsigned BitWidth1 = NumTypeBits - NumSignBits;
15618        // If we can't prove that the sign bit is zero, we must add one to the
15619        // maximum bit width to account for the unknown sign bit. This preserves
15620        // the existing sign bit so we can safely sign-extend the root back to the
15621        // original type. Otherwise, if we know the sign bit is zero, we will
15622        // zero-extend the root instead.
15623        //
15624        // FIXME: This is somewhat suboptimal, as there will be cases where adding
15625        //        one to the maximum bit width will yield a larger-than-necessary
15626        //        type. In general, we need to add an extra bit only if we can't
15627        //        prove that the upper bit of the original type is equal to the
15628        //        upper bit of the proposed smaller type. If these two bits are
15629        //        the same (either zero or one) we know that sign-extending from
15630        //        the smaller type will result in the same value. Here, since we
15631        //        can't yet prove this, we are just making the proposed smaller
15632        //        type larger to ensure correctness.
15633        if (!IsKnownPositive)
15634          ++BitWidth1;
15635  
15636        APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15637        unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15638        MaxBitWidth =
15639            std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15640      }
15641  
15642      if (MaxBitWidth < 8 && MaxBitWidth > 1)
15643        MaxBitWidth = 8;
15644  
15645      // If the original type is large but the reduced type does not improve the
15646      // register usage, ignore it.
15647      if (NumParts > 1 &&
15648          NumParts ==
15649              TTI->getNumberOfParts(getWidenedType(
15650                  IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15651        return 0u;
15652  
15653      bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15654                                  Opcode == Instruction::SExt ||
15655                                  Opcode == Instruction::ZExt || NumParts > 1;
15656      // Conservatively determine if we can actually truncate the roots of the
15657      // expression. Collect the values that can be demoted in ToDemote and track
15658      // the depth of the analysis in MaxDepthLevel.
15659      DenseSet<const TreeEntry *> Visited;
15660      unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15661      bool NeedToDemote = IsProfitableToDemote;
15662  
15663      if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15664                                 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15665                                 IsTruncRoot) ||
15666          (MaxDepthLevel <= Limit &&
15667           !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15668              (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15669               DL->getTypeSizeInBits(TreeRootIT) /
15670                       DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15671                                                 ->getOperand(0)
15672                                                 ->getType()) >
15673                   2)))))
15674        return 0u;
15675      // Round MaxBitWidth up to the next power-of-two.
15676      MaxBitWidth = bit_ceil(MaxBitWidth);
15677  
15678      return MaxBitWidth;
15679    };
15680  
15681    // If we can truncate the root, we must collect additional values that might
15682    // be demoted as a result. That is, those seeded by truncations we will
15683    // modify.
15684    // Add reduction ops sizes, if any.
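         // For example, an i32 min/max reduction whose values have 24 known sign bits
         // and are not known to be non-negative gives BitWidth1 = BitWidth2 = 9, which
         // is rounded up to a ReductionBitWidth of 16 below.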
15685    if (UserIgnoreList &&
15686        isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15687      for (Value *V : *UserIgnoreList) {
15688        auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15689        auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15690        unsigned BitWidth1 = NumTypeBits - NumSignBits;
15691        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15692          ++BitWidth1;
15693        unsigned BitWidth2 = BitWidth1;
15694        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15695          auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15696          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15697        }
15698        ReductionBitWidth =
15699            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15700      }
15701      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15702        ReductionBitWidth = 8;
15703  
15704      ReductionBitWidth = bit_ceil(ReductionBitWidth);
15705    }
15706    bool IsTopRoot = NodeIdx == 0;
15707    while (NodeIdx < VectorizableTree.size() &&
15708           VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15709           VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15710      RootDemotes.push_back(NodeIdx);
15711      ++NodeIdx;
15712      IsTruncRoot = true;
15713    }
15714    bool IsSignedCmp = false;
15715    while (NodeIdx < VectorizableTree.size()) {
15716      ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15717      unsigned Limit = 2;
15718      unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15719      if (IsTopRoot &&
15720          ReductionBitWidth ==
15721              DL->getTypeSizeInBits(
15722                  VectorizableTree.front()->Scalars.front()->getType()))
15723        Limit = 3;
15724      unsigned MaxBitWidth = ComputeMaxBitWidth(
15725          *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15726          Limit, IsTruncRoot, IsSignedCmp);
15727      if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15728        if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15729          ReductionBitWidth = bit_ceil(MaxBitWidth);
15730        else if (MaxBitWidth == 0)
15731          ReductionBitWidth = 0;
15732      }
15733  
15734      for (unsigned Idx : RootDemotes) {
15735        if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15736              uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15737              if (OrigBitWidth > MaxBitWidth) {
15738                APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15739                return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15740              }
15741              return false;
15742            }))
15743          ToDemote.push_back(Idx);
15744      }
15745      RootDemotes.clear();
15746      IsTopRoot = false;
15747      IsProfitableToDemoteRoot = true;
15748  
15749      if (ExtraBitWidthNodes.empty()) {
15750        NodeIdx = VectorizableTree.size();
15751      } else {
15752        unsigned NewIdx = 0;
15753        do {
15754          NewIdx = *ExtraBitWidthNodes.begin();
15755          ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15756        } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15757        NodeIdx = NewIdx;
15758        IsTruncRoot =
15759            NodeIdx < VectorizableTree.size() &&
15760            any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15761                   [](const EdgeInfo &EI) {
15762                     return EI.EdgeIdx == 0 &&
15763                            EI.UserTE->getOpcode() == Instruction::Trunc &&
15764                            !EI.UserTE->isAltShuffle();
15765                   });
15766        IsSignedCmp =
15767            NodeIdx < VectorizableTree.size() &&
15768            any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15769                   [&](const EdgeInfo &EI) {
15770                     return EI.UserTE->getOpcode() == Instruction::ICmp &&
15771                            any_of(EI.UserTE->Scalars, [&](Value *V) {
15772                              auto *IC = dyn_cast<ICmpInst>(V);
15773                              return IC &&
15774                                     (IC->isSigned() ||
15775                                      !isKnownNonNegative(IC->getOperand(0),
15776                                                          SimplifyQuery(*DL)) ||
15777                                      !isKnownNonNegative(IC->getOperand(1),
15778                                                          SimplifyQuery(*DL)));
15779                            });
15780                   });
15781      }
15782  
15783    // If the maximum bit width we compute is less than the width of the roots'
15784      // type, we can proceed with the narrowing. Otherwise, do nothing.
15785      if (MaxBitWidth == 0 ||
15786          MaxBitWidth >=
15787              cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15788        if (UserIgnoreList)
15789          AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15790        continue;
15791      }
15792  
15793    // Finally, map the values we can demote to the maximum bit width we
15794      // computed.
15795      for (unsigned Idx : ToDemote) {
15796        TreeEntry *TE = VectorizableTree[Idx].get();
15797        if (MinBWs.contains(TE))
15798          continue;
15799        bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15800                          return !isKnownNonNegative(R, SimplifyQuery(*DL));
15801                        });
15802        MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15803      }
15804    }
15805  }
15806  
15807  PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15808    auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15809    auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15810    auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15811    auto *AA = &AM.getResult<AAManager>(F);
15812    auto *LI = &AM.getResult<LoopAnalysis>(F);
15813    auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15814    auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15815    auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15816    auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15817  
15818    bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15819    if (!Changed)
15820      return PreservedAnalyses::all();
15821  
15822    PreservedAnalyses PA;
15823    PA.preserveSet<CFGAnalyses>();
15824    return PA;
15825  }
15826  
15827  bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15828                                  TargetTransformInfo *TTI_,
15829                                  TargetLibraryInfo *TLI_, AAResults *AA_,
15830                                  LoopInfo *LI_, DominatorTree *DT_,
15831                                  AssumptionCache *AC_, DemandedBits *DB_,
15832                                  OptimizationRemarkEmitter *ORE_) {
15833    if (!RunSLPVectorization)
15834      return false;
15835    SE = SE_;
15836    TTI = TTI_;
15837    TLI = TLI_;
15838    AA = AA_;
15839    LI = LI_;
15840    DT = DT_;
15841    AC = AC_;
15842    DB = DB_;
15843    DL = &F.getDataLayout();
15844  
15845    Stores.clear();
15846    GEPs.clear();
15847    bool Changed = false;
15848  
15849    // If the target claims to have no vector registers, don't attempt
15850    // vectorization.
15851    if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15852      LLVM_DEBUG(
15853          dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15854      return false;
15855    }
15856  
15857    // Don't vectorize when the attribute NoImplicitFloat is used.
15858    if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15859      return false;
15860  
15861    LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15862  
15863    // Use the bottom-up SLP vectorizer to construct chains that start with
15864    // store instructions.
15865    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15866  
15867    // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15868    // delete instructions.
15869  
15870    // Update DFS numbers now so that we can use them for ordering.
15871    DT->updateDFSNumbers();
15872  
15873    // Scan the blocks in the function in post order.
15874    for (auto *BB : post_order(&F.getEntryBlock())) {
15875      // Start new block - clear the list of reduction roots.
15876      R.clearReductionData();
15877      collectSeedInstructions(BB);
15878  
15879      // Vectorize trees that end at stores.
15880      if (!Stores.empty()) {
15881        LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15882                          << " underlying objects.\n");
15883        Changed |= vectorizeStoreChains(R);
15884      }
15885  
15886      // Vectorize trees that end at reductions.
15887      Changed |= vectorizeChainsInBlock(BB, R);
15888  
15889      // Vectorize the index computations of getelementptr instructions. This
15890      // is primarily intended to catch gather-like idioms ending at
15891      // non-consecutive loads.
15892      if (!GEPs.empty()) {
15893        LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15894                          << " underlying objects.\n");
15895        Changed |= vectorizeGEPIndices(BB, R);
15896      }
15897    }
15898  
15899    if (Changed) {
15900      R.optimizeGatherSequence();
15901      LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15902    }
15903    return Changed;
15904  }
15905  
15906  std::optional<bool>
15907  SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15908                                         unsigned Idx, unsigned MinVF,
15909                                         unsigned &Size) {
15910    Size = 0;
15911    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15912                      << "\n");
15913    const unsigned Sz = R.getVectorElementSize(Chain[0]);
15914    unsigned VF = Chain.size();
15915  
15916    if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15917      // Check if vectorizing with a non-power-of-2 VF should be considered. At
15918      // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15919      // all vector lanes are used.
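           // For example, with non-power-of-2 vectorization enabled, VF = 3 is still
           // analyzed when MinVF = 4, since VF + 1 == MinVF and all but one vector
           // lane would be used.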
15920      if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15921        return false;
15922    }
15923  
15924    LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15925                      << "\n");
15926  
15927    SetVector<Value *> ValOps;
15928    for (Value *V : Chain)
15929      ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15930    // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
15931    InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15932    if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15933      DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15934      bool IsPowerOf2 =
15935          isPowerOf2_32(ValOps.size()) ||
15936          (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15937      if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15938           (!S.MainOp->isSafeToRemove() ||
15939            any_of(ValOps.getArrayRef(),
15940                   [&](Value *V) {
15941                     return !isa<ExtractElementInst>(V) &&
15942                            (V->getNumUses() > Chain.size() ||
15943                             any_of(V->users(), [&](User *U) {
15944                               return !Stores.contains(U);
15945                             }));
15946                   }))) ||
15947          (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15948        Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15949        return false;
15950      }
15951    }
15952    if (R.isLoadCombineCandidate(Chain))
15953      return true;
15954    R.buildTree(Chain);
15955    // Check if the tree is tiny and the store or its value is not vectorized.
15956    if (R.isTreeTinyAndNotFullyVectorizable()) {
15957      if (R.isGathered(Chain.front()) ||
15958          R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15959        return std::nullopt;
15960      Size = R.getTreeSize();
15961      return false;
15962    }
15963    R.reorderTopToBottom();
15964    R.reorderBottomToTop();
15965    R.buildExternalUses();
15966  
15967    R.computeMinimumValueSizes();
15968    R.transformNodes();
15969  
15970    Size = R.getTreeSize();
15971    if (S.getOpcode() == Instruction::Load)
15972      Size = 2; // cut off masked gather small trees
15973    InstructionCost Cost = R.getTreeCost();
15974  
15975    LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15976    if (Cost < -SLPCostThreshold) {
15977      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15978  
15979      using namespace ore;
15980  
15981      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15982                                          cast<StoreInst>(Chain[0]))
15983                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15984                       << " and with tree size "
15985                       << NV("TreeSize", R.getTreeSize()));
15986  
15987      R.vectorizeTree();
15988      return true;
15989    }
15990  
15991    return false;
15992  }
15993  
15994  /// Checks if the quadratic mean deviation is less than 90% of the mean size.
15995  static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15996                             bool First) {
15997    unsigned Num = 0;
15998    uint64_t Sum = std::accumulate(
15999        Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16000        [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16001          unsigned Size = First ? Val.first : Val.second;
16002          if (Size == 1)
16003            return V;
16004          ++Num;
16005          return V + Size;
16006        });
16007    if (Num == 0)
16008      return true;
16009    uint64_t Mean = Sum / Num;
16010    if (Mean == 0)
16011      return true;
16012    uint64_t Dev = std::accumulate(
16013                       Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16014                       [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16015                         unsigned P = First ? Val.first : Val.second;
16016                         if (P == 1)
16017                           return V;
16018                         return V + (P - Mean) * (P - Mean);
16019                       }) /
16020                   Num;
16021    return Dev * 81 / (Mean * Mean) == 0;
16022  }
16023  
16024  bool SLPVectorizerPass::vectorizeStores(
16025      ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16026      DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16027          &Visited) {
16028    // We may run into multiple chains that merge into a single chain. We mark the
16029    // stores that we vectorized so that we don't visit the same store twice.
16030    BoUpSLP::ValueSet VectorizedStores;
16031    bool Changed = false;
16032  
16033    struct StoreDistCompare {
16034      bool operator()(const std::pair<unsigned, int> &Op1,
16035                      const std::pair<unsigned, int> &Op2) const {
16036        return Op1.second < Op2.second;
16037      }
16038    };
16039    // A set of pairs (index of the store in the Stores array ref, distance of
16040    // the store address relative to the base store address in units).
16041    using StoreIndexToDistSet =
16042        std::set<std::pair<unsigned, int>, StoreDistCompare>;
16043    auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16044      int PrevDist = -1;
16045      BoUpSLP::ValueList Operands;
16046      // Collect the chain into a list.
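           // A chain is extended while the distance to the previous store grows by
           // exactly 1 (e.g. distances 0, 1, 2); a gap in the distances ends the
           // chain, which is then tried for vectorization below.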
16047      for (auto [Idx, Data] : enumerate(Set)) {
16048        if (Operands.empty() || Data.second - PrevDist == 1) {
16049          Operands.push_back(Stores[Data.first]);
16050          PrevDist = Data.second;
16051          if (Idx != Set.size() - 1)
16052            continue;
16053        }
16054        auto E = make_scope_exit([&, &DataVar = Data]() {
16055          Operands.clear();
16056          Operands.push_back(Stores[DataVar.first]);
16057          PrevDist = DataVar.second;
16058        });
16059  
16060        if (Operands.size() <= 1 ||
16061            !Visited
16062                 .insert({Operands.front(),
16063                          cast<StoreInst>(Operands.front())->getValueOperand(),
16064                          Operands.back(),
16065                          cast<StoreInst>(Operands.back())->getValueOperand(),
16066                          Operands.size()})
16067                 .second)
16068          continue;
16069  
16070        unsigned MaxVecRegSize = R.getMaxVecRegSize();
16071        unsigned EltSize = R.getVectorElementSize(Operands[0]);
16072        unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16073  
16074        unsigned MaxVF =
16075            std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16076        unsigned MaxRegVF = MaxVF;
16077        auto *Store = cast<StoreInst>(Operands[0]);
16078        Type *StoreTy = Store->getValueOperand()->getType();
16079        Type *ValueTy = StoreTy;
16080        if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16081          ValueTy = Trunc->getSrcTy();
16082        if (ValueTy == StoreTy &&
16083            R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16084          MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16085        unsigned MinVF = std::max<unsigned>(
16086            2, PowerOf2Ceil(TTI->getStoreMinimumVF(
16087                   R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16088                   ValueTy)));
16089  
16090        if (MaxVF < MinVF) {
16091          LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16092                            << ") < "
16093                            << "MinVF (" << MinVF << ")\n");
16094          continue;
16095        }
16096  
16097        unsigned NonPowerOf2VF = 0;
16098        if (VectorizeNonPowerOf2) {
16099          // First try vectorizing with a non-power-of-2 VF. At the moment, only
16100          // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16101          // lanes are used.
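               // For example, 7 collected stores give CandVF = 7; since 7 + 1 is a
               // power of two and 7 does not exceed MaxRegVF, 7 is recorded as the
               // non-power-of-2 candidate VF.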
16102          unsigned CandVF = Operands.size();
16103          if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16104            NonPowerOf2VF = CandVF;
16105        }
16106  
16107        unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16108        SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16109        unsigned Size = MinVF;
16110        for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16111          VF = Size > MaxVF ? NonPowerOf2VF : Size;
16112          Size *= 2;
16113        });
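             // For example, MinVF = 2 and MaxVF = 16 produce CandidateVFs =
             // {16, 8, 4, 2} (with a non-power-of-2 candidate, if any, placed in
             // front), so the VFs are tried from the largest to the smallest.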
16114        unsigned End = Operands.size();
16115        unsigned Repeat = 0;
16116        constexpr unsigned MaxAttempts = 4;
16117        OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16118        for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16119          P.first = P.second = 1;
16120        });
16121        DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16122        auto IsNotVectorized = [](bool First,
16123                                  const std::pair<unsigned, unsigned> &P) {
16124          return First ? P.first > 0 : P.second > 0;
16125        };
16126        auto IsVectorized = [](bool First,
16127                               const std::pair<unsigned, unsigned> &P) {
16128          return First ? P.first == 0 : P.second == 0;
16129        };
16130        auto VFIsProfitable = [](bool First, unsigned Size,
16131                                 const std::pair<unsigned, unsigned> &P) {
16132          return First ? Size >= P.first : Size >= P.second;
16133        };
16134        auto FirstSizeSame = [](unsigned Size,
16135                                const std::pair<unsigned, unsigned> &P) {
16136          return Size == P.first;
16137        };
16138        while (true) {
16139          ++Repeat;
16140          bool RepeatChanged = false;
16141          bool AnyProfitableGraph = false;
16142          for (unsigned Size : CandidateVFs) {
16143            AnyProfitableGraph = false;
16144            unsigned StartIdx = std::distance(
16145                RangeSizes.begin(),
16146                find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16147                                              std::placeholders::_1)));
16148            while (StartIdx < End) {
16149              unsigned EndIdx =
16150                  std::distance(RangeSizes.begin(),
16151                                find_if(RangeSizes.drop_front(StartIdx),
16152                                        std::bind(IsVectorized, Size >= MaxRegVF,
16153                                                  std::placeholders::_1)));
16154              unsigned Sz = EndIdx >= End ? End : EndIdx;
16155              for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16156                if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16157                                    Size >= MaxRegVF)) {
16158                  ++Cnt;
16159                  continue;
16160                }
16161                ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
16162                assert(all_of(Slice,
16163                              [&](Value *V) {
16164                                return cast<StoreInst>(V)
16165                                           ->getValueOperand()
16166                                           ->getType() ==
16167                                       cast<StoreInst>(Slice.front())
16168                                           ->getValueOperand()
16169                                           ->getType();
16170                              }) &&
16171                       "Expected all operands of same type.");
16172                if (!NonSchedulable.empty()) {
16173                  auto [NonSchedSizeMax, NonSchedSizeMin] =
16174                      NonSchedulable.lookup(Slice.front());
16175                  if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16176                    Cnt += NonSchedSizeMax;
16177                    continue;
16178                  }
16179                }
16180                unsigned TreeSize;
16181                std::optional<bool> Res =
16182                    vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16183                if (!Res) {
16184                  NonSchedulable
16185                      .try_emplace(Slice.front(), std::make_pair(Size, Size))
16186                      .first->getSecond()
16187                      .second = Size;
16188                } else if (*Res) {
16189                  // Mark the vectorized stores so that we don't vectorize them
16190                  // again.
16191                  VectorizedStores.insert(Slice.begin(), Slice.end());
16194                  AnyProfitableGraph = RepeatChanged = Changed = true;
16195                  // If we vectorized the initial block, there is no need to try
16196                  // to vectorize it again.
16197                  for_each(RangeSizes.slice(Cnt, Size),
16198                           [](std::pair<unsigned, unsigned> &P) {
16199                             P.first = P.second = 0;
16200                           });
16201                  if (Cnt < StartIdx + MinVF) {
16202                    for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16203                             [](std::pair<unsigned, unsigned> &P) {
16204                               P.first = P.second = 0;
16205                             });
16206                    StartIdx = Cnt + Size;
16207                  }
16208                  if (Cnt > Sz - Size - MinVF) {
16209                    for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16210                             [](std::pair<unsigned, unsigned> &P) {
16211                               P.first = P.second = 0;
16212                             });
16213                    if (Sz == End)
16214                      End = Cnt;
16215                    Sz = Cnt;
16216                  }
16217                  Cnt += Size;
16218                  continue;
16219                }
16220                if (Size > 2 && Res &&
16221                    !all_of(RangeSizes.slice(Cnt, Size),
16222                            std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16223                                      std::placeholders::_1))) {
16224                  Cnt += Size;
16225                  continue;
16226                }
16227                // For very big VFs, check that we are not rebuilding the same
16228                // trees, just with a larger number of elements.
16229                if (Size > MaxRegVF && TreeSize > 1 &&
16230                    all_of(RangeSizes.slice(Cnt, Size),
16231                           std::bind(FirstSizeSame, TreeSize,
16232                                     std::placeholders::_1))) {
16233                  Cnt += Size;
16234                  while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16235                    ++Cnt;
16236                  continue;
16237                }
16238                if (TreeSize > 1)
16239                  for_each(RangeSizes.slice(Cnt, Size),
16240                           [&](std::pair<unsigned, unsigned> &P) {
16241                             if (Size >= MaxRegVF)
16242                               P.second = std::max(P.second, TreeSize);
16243                             else
16244                               P.first = std::max(P.first, TreeSize);
16245                           });
16246                ++Cnt;
16247                AnyProfitableGraph = true;
16248              }
16249              if (StartIdx >= End)
16250                break;
16251              if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16252                AnyProfitableGraph = true;
16253              StartIdx = std::distance(
16254                  RangeSizes.begin(),
16255                  find_if(RangeSizes.drop_front(Sz),
16256                          std::bind(IsNotVectorized, Size >= MaxRegVF,
16257                                    std::placeholders::_1)));
16258            }
16259            if (!AnyProfitableGraph && Size >= MaxRegVF)
16260              break;
16261          }
16262          // All values vectorized - exit.
16263          if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16264                return P.first == 0 && P.second == 0;
16265              }))
16266            break;
16267          // Check if we have tried all attempts or there is no need for further attempts.
16268          if (Repeat >= MaxAttempts ||
16269              (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16270            break;
16271          constexpr unsigned StoresLimit = 64;
16272          const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16273              Operands.size(),
16274              static_cast<unsigned>(
16275                  End -
16276                  std::distance(
16277                      RangeSizes.begin(),
16278                      find_if(RangeSizes, std::bind(IsNotVectorized, true,
16279                                                    std::placeholders::_1))) +
16280                  1)));
16281          unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16282          if (VF > MaxTotalNum || VF >= StoresLimit)
16283            break;
16284          for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16285            if (P.first != 0)
16286              P.first = std::max(P.second, P.first);
16287          });
16288          // Make a last attempt to vectorize the maximum number of elements, if
16289          // all previous attempts were unsuccessful because of cost issues.
16290          CandidateVFs.clear();
16291          CandidateVFs.push_back(VF);
16292        }
16293      }
16294    };
16295  
16296    // Stores a pair (first: index of the store in the Stores array ref whose
16297    // address is taken as the base; second: a sorted set of pairs {index, dist},
16298    // which are the indices of stores in the set and their store location
16299    // distances relative to the base address).
16300  
16301    // Need to store the index of the very first store separately, since the set
16302    // may be reordered after the insertion and the first store may be moved. This
16303    // container reduces the number of calls to the getPointersDiff() function.
16304    SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16305    // Inserts the specified store SI with the given index Idx into the set of
16306    // stores. If a store with the same distance has already been found, stop the
16307    // insertion and try to vectorize the stores found so far. If some stores from
16308    // this sequence were not vectorized, try to vectorize them together with the
16309    // new store later. But this logic is applied only to the stores that come
16310    // before the previous store with the same distance.
16311    // Example:
16312    // 1. store x, %p
16313    // 2. store y, %p+1
16314    // 3. store z, %p+2
16315    // 4. store a, %p
16316    // 5. store b, %p+3
16317    // - Scan this from the last to first store. The very first bunch of stores is
16318    // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16319    // vector).
16320    // - The next store in the list - #1 - has the same distance from store #5 as
16321    // the store #4.
16322    // - Try to vectorize sequence of stores 4,2,3,5.
16323    // - If all these stores are vectorized - just drop them.
16324    // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16325    // - Start new stores sequence.
16326    // The new bunch of stores is {1, {1, 0}}.
16327    // - Add the stores from previous sequence, that were not vectorized.
16328    // Here we consider the stores in reversed order, rather than their order in
16329    // the IR (Stores is already reversed, see the vectorizeStoreChains() function).
16330    // Store #3 can be added -> comes after store #4 with the same distance as
16331    // store #1.
16332    // Store #5 cannot be added - comes before store #4.
16333    // This logic improves compile time: we assume that the stores after the
16334    // previous store with the same distance most likely have memory dependencies,
16335    // so there is no need to waste compile time trying to vectorize them.
16336    // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16337    auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16338      for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16339        std::optional<int> Diff = getPointersDiff(
16340            Stores[Set.first]->getValueOperand()->getType(),
16341            Stores[Set.first]->getPointerOperand(),
16342            SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16343            /*StrictCheck=*/true);
16344        if (!Diff)
16345          continue;
16346        auto It = Set.second.find(std::make_pair(Idx, *Diff));
16347        if (It == Set.second.end()) {
16348          Set.second.emplace(Idx, *Diff);
16349          return;
16350        }
16351        // Try to vectorize the first found set to avoid duplicate analysis.
16352        TryToVectorize(Set.second);
16353        StoreIndexToDistSet PrevSet;
16354        PrevSet.swap(Set.second);
16355        Set.first = Idx;
16356        Set.second.emplace(Idx, 0);
16357        // Insert the stores that followed the previous match to try to vectorize
16358        // them with this store.
16359        unsigned StartIdx = It->first + 1;
16360        SmallBitVector UsedStores(Idx - StartIdx);
16361        // Distances to the previously found duplicate store (or this store, since
16362        // they store to the same address).
16363        SmallVector<int> Dists(Idx - StartIdx, 0);
16364        for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16365          // Do not try to vectorize sequences we have already tried.
16366          if (Pair.first <= It->first ||
16367              VectorizedStores.contains(Stores[Pair.first]))
16368            break;
16369          unsigned BI = Pair.first - StartIdx;
16370          UsedStores.set(BI);
16371          Dists[BI] = Pair.second - It->second;
16372        }
16373        for (unsigned I = StartIdx; I < Idx; ++I) {
16374          unsigned BI = I - StartIdx;
16375          if (UsedStores.test(BI))
16376            Set.second.emplace(I, Dists[BI]);
16377        }
16378        return;
16379      }
16380      auto &Res = SortedStores.emplace_back();
16381      Res.first = Idx;
16382      Res.second.emplace(Idx, 0);
16383    };
16384    Type *PrevValTy = nullptr;
16385    for (auto [I, SI] : enumerate(Stores)) {
16386      if (R.isDeleted(SI))
16387        continue;
16388      if (!PrevValTy)
16389        PrevValTy = SI->getValueOperand()->getType();
16390      // Check that we do not try to vectorize stores of different types.
16391      if (PrevValTy != SI->getValueOperand()->getType()) {
16392        for (auto &Set : SortedStores)
16393          TryToVectorize(Set.second);
16394        SortedStores.clear();
16395        PrevValTy = SI->getValueOperand()->getType();
16396      }
16397      FillStoresSet(I, SI);
16398    }
16399  
16400    // Final vectorization attempt.
16401    for (auto &Set : SortedStores)
16402      TryToVectorize(Set.second);
16403  
16404    return Changed;
16405  }
16406  
16407  void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16408    // Initialize the collections. We will make a single pass over the block.
16409    Stores.clear();
16410    GEPs.clear();
16411  
16412    // Visit the store and getelementptr instructions in BB and organize them in
16413    // Stores and GEPs according to the underlying objects of their pointer
16414    // operands.
16415    for (Instruction &I : *BB) {
16416      // Ignore store instructions that are volatile or have a pointer operand
16417      // that doesn't point to a scalar type.
16418      if (auto *SI = dyn_cast<StoreInst>(&I)) {
16419        if (!SI->isSimple())
16420          continue;
16421        if (!isValidElementType(SI->getValueOperand()->getType()))
16422          continue;
16423        Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16424      }
16425  
16426      // Ignore getelementptr instructions that have more than one index, a
16427      // constant index, or a pointer operand that doesn't point to a scalar
16428      // type.
16429      else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16430        if (GEP->getNumIndices() != 1)
16431          continue;
16432        Value *Idx = GEP->idx_begin()->get();
16433        if (isa<Constant>(Idx))
16434          continue;
16435        if (!isValidElementType(Idx->getType()))
16436          continue;
16437        if (GEP->getType()->isVectorTy())
16438          continue;
16439        GEPs[GEP->getPointerOperand()].push_back(GEP);
16440      }
16441    }
16442  }
16443  
16444  bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16445                                             bool MaxVFOnly) {
16446    if (VL.size() < 2)
16447      return false;
16448  
16449    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16450                      << VL.size() << ".\n");
16451  
16452    // Check that all of the parts are instructions of the same type;
16453    // we permit an alternate opcode via InstructionsState.
16454    InstructionsState S = getSameOpcode(VL, *TLI);
16455    if (!S.getOpcode())
16456      return false;
16457  
16458    Instruction *I0 = cast<Instruction>(S.OpValue);
16459    // Make sure invalid types (including vector type) are rejected before
16460    // determining vectorization factor for scalar instructions.
16461    for (Value *V : VL) {
16462      Type *Ty = V->getType();
16463      if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16464        // NOTE: the following will give the user an internal LLVM type name,
16465        // which may not be useful.
16466        R.getORE()->emit([&]() {
16467          std::string TypeStr;
16468          llvm::raw_string_ostream rso(TypeStr);
16469          Ty->print(rso);
16470          return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16471                 << "Cannot SLP vectorize list: type "
16472                 << TypeStr + " is unsupported by vectorizer";
16473        });
16474        return false;
16475      }
16476    }
16477  
16478    unsigned Sz = R.getVectorElementSize(I0);
16479    unsigned MinVF = R.getMinVF(Sz);
16480    unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16481    MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16482    if (MaxVF < 2) {
16483      R.getORE()->emit([&]() {
16484        return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16485               << "Cannot SLP vectorize list: vectorization factor "
16486               << "less than 2 is not supported";
16487      });
16488      return false;
16489    }
16490  
16491    bool Changed = false;
16492    bool CandidateFound = false;
16493    InstructionCost MinCost = SLPCostThreshold.getValue();
16494    Type *ScalarTy = VL[0]->getType();
16495    if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16496      ScalarTy = IE->getOperand(1)->getType();
16497  
16498    unsigned NextInst = 0, MaxInst = VL.size();
16499    for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16500      // No actual vectorization should happen if the number of parts is the same
16501      // as the provided vectorization factor (i.e. the scalar type is used for
16502      // vector code during codegen).
16503      auto *VecTy = getWidenedType(ScalarTy, VF);
16504      if (TTI->getNumberOfParts(VecTy) == VF)
16505        continue;
16506      for (unsigned I = NextInst; I < MaxInst; ++I) {
16507        unsigned ActualVF = std::min(MaxInst - I, VF);
16508  
16509        if (!isPowerOf2_32(ActualVF))
16510          continue;
16511  
16512        if (MaxVFOnly && ActualVF < MaxVF)
16513          break;
16514        if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16515          break;
16516  
16517        ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16518        // Check that a previous iteration of this loop did not delete the Value.
16519        if (llvm::any_of(Ops, [&R](Value *V) {
16520              auto *I = dyn_cast<Instruction>(V);
16521              return I && R.isDeleted(I);
16522            }))
16523          continue;
16524  
16525        LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16526                          << "\n");
16527  
16528        R.buildTree(Ops);
16529        if (R.isTreeTinyAndNotFullyVectorizable())
16530          continue;
16531        R.reorderTopToBottom();
16532        R.reorderBottomToTop(
16533            /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16534            !R.doesRootHaveInTreeUses());
16535        R.buildExternalUses();
16536  
16537        R.computeMinimumValueSizes();
16538        R.transformNodes();
16539        InstructionCost Cost = R.getTreeCost();
16540        CandidateFound = true;
16541        MinCost = std::min(MinCost, Cost);
16542  
16543        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16544                          << " for VF=" << ActualVF << "\n");
16545        if (Cost < -SLPCostThreshold) {
16546          LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16547          R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16548                                                      cast<Instruction>(Ops[0]))
16549                                   << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16550                                   << " and with tree size "
16551                                   << ore::NV("TreeSize", R.getTreeSize()));
16552  
16553          R.vectorizeTree();
16554          // Move to the next bundle.
16555          I += VF - 1;
16556          NextInst = I + 1;
16557          Changed = true;
16558        }
16559      }
16560    }
16561  
16562    if (!Changed && CandidateFound) {
16563      R.getORE()->emit([&]() {
16564        return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16565               << "List vectorization was possible but not beneficial with cost "
16566               << ore::NV("Cost", MinCost) << " >= "
16567               << ore::NV("Threshold", -SLPCostThreshold);
16568      });
16569    } else if (!Changed) {
16570      R.getORE()->emit([&]() {
16571        return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16572               << "Cannot SLP vectorize list: vectorization was impossible"
16573               << " with available vectorization factors";
16574      });
16575    }
16576    return Changed;
16577  }
16578  
16579  bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16580    if (!I)
16581      return false;
16582  
16583    if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16584      return false;
16585  
16586    Value *P = I->getParent();
16587  
16588    // Vectorize in current basic block only.
16589    auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16590    auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16591    if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16592      return false;
16593  
16594    // First collect all possible candidates
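         // for vectorization: the immediate pair (Op0, Op1) and, when one
         // operand is a single-use binary operator, pairs formed with that
         // operand's own operands, so the best root pair can be chosen by cost.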
16595    SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16596    Candidates.emplace_back(Op0, Op1);
16597  
16598    auto *A = dyn_cast<BinaryOperator>(Op0);
16599    auto *B = dyn_cast<BinaryOperator>(Op1);
16600    // Try to skip B.
16601    if (A && B && B->hasOneUse()) {
16602      auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16603      auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16604      if (B0 && B0->getParent() == P)
16605        Candidates.emplace_back(A, B0);
16606      if (B1 && B1->getParent() == P)
16607        Candidates.emplace_back(A, B1);
16608    }
16609    // Try to skip A.
16610    if (B && A && A->hasOneUse()) {
16611      auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16612      auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16613      if (A0 && A0->getParent() == P)
16614        Candidates.emplace_back(A0, B);
16615      if (A1 && A1->getParent() == P)
16616        Candidates.emplace_back(A1, B);
16617    }
16618  
16619    if (Candidates.size() == 1)
16620      return tryToVectorizeList({Op0, Op1}, R);
16621  
16622    // We have multiple options. Try to pick the single best.
16623    std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16624    if (!BestCandidate)
16625      return false;
16626    return tryToVectorizeList(
16627        {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16628  }
16629  
16630  namespace {
16631  
16632  /// Model horizontal reductions.
16633  ///
16634  /// A horizontal reduction is a tree of reduction instructions that has values
16635  /// that can be put into a vector as its leaves. For example:
16636  ///
16637  /// mul mul mul mul
16638  ///  \  /    \  /
16639  ///   +       +
16640  ///    \     /
16641  ///       +
16642  /// This tree has "mul" as its leaf values and "+" as its reduction
16643  /// instructions. A reduction can feed into a store or a binary operation
16644  /// feeding a phi.
16645  ///    ...
16646  ///    \  /
16647  ///     +
16648  ///     |
16649  ///  phi +=
16650  ///
16651  ///  Or:
16652  ///    ...
16653  ///    \  /
16654  ///     +
16655  ///     |
16656  ///   *p =
16657  ///
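       /// For illustration, a sum such as
       ///   s = a[0] + a[1] + a[2] + a[3]
       /// forms a reduction tree whose leaves are the loads of a[i]; if the
       /// leaves can be vectorized, the scalar adds are replaced by a single
       /// vector reduction of the loaded vector.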
16658  class HorizontalReduction {
16659    using ReductionOpsType = SmallVector<Value *, 16>;
16660    using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16661    ReductionOpsListType ReductionOps;
16662    /// List of possibly reduced values.
16663    SmallVector<SmallVector<Value *>> ReducedVals;
16664    /// Maps reduced value to the corresponding reduction operation.
16665    DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16666    // Use map vector to make stable output.
16667    MapVector<Instruction *, Value *> ExtraArgs;
16668    WeakTrackingVH ReductionRoot;
16669    /// The type of reduction operation.
16670    RecurKind RdxKind;
16671    /// Checks if the optimization of original scalar identity operations on
16672    /// matched horizontal reductions is enabled and allowed.
16673    bool IsSupportedHorRdxIdentityOp = false;
16674  
16675    static bool isCmpSelMinMax(Instruction *I) {
16676      return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16677             RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16678    }
16679  
16680    // And/or are potentially poison-safe logical patterns like:
16681    // select x, y, false
16682    // select x, true, y
16683    static bool isBoolLogicOp(Instruction *I) {
16684      return isa<SelectInst>(I) &&
16685             (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16686    }
16687  
16688    /// Checks if instruction is associative and can be vectorized.
16689    static bool isVectorizable(RecurKind Kind, Instruction *I) {
16690      if (Kind == RecurKind::None)
16691        return false;
16692  
16693      // Integer ops that map to select instructions or intrinsics are fine.
16694      if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16695          isBoolLogicOp(I))
16696        return true;
16697  
16698      if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16699        // FP min/max are associative except for NaN and -0.0. We do not
16700        // have to rule out -0.0 here because the intrinsic semantics do not
16701        // specify a fixed result for it.
16702        return I->getFastMathFlags().noNaNs();
16703      }
16704  
16705      if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16706        return true;
16707  
16708      return I->isAssociative();
16709    }
16710  
16711    static Value *getRdxOperand(Instruction *I, unsigned Index) {
16712      // Poison-safe 'or' takes the form: select X, true, Y
16713      // To make that work with the normal operand processing, we skip the
16714      // true value operand.
16715      // TODO: Change the code and data structures to handle this without a hack.
16716      if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16717        return I->getOperand(2);
16718      return I->getOperand(Index);
16719    }
16720  
16721    /// Creates reduction operation with the current opcode.
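         /// For example, for RecurKind::SMax with UseSelect set this emits an
         /// icmp sgt plus a select, while without UseSelect it emits a call to
         /// the llvm.smax intrinsic.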
16722    static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16723                           Value *RHS, const Twine &Name, bool UseSelect) {
16724      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16725      switch (Kind) {
16726      case RecurKind::Or:
16727        if (UseSelect &&
16728            LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16729          return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16730        return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16731                                   Name);
16732      case RecurKind::And:
16733        if (UseSelect &&
16734            LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16735          return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16736        return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16737                                   Name);
16738      case RecurKind::Add:
16739      case RecurKind::Mul:
16740      case RecurKind::Xor:
16741      case RecurKind::FAdd:
16742      case RecurKind::FMul:
16743        return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16744                                   Name);
16745      case RecurKind::FMax:
16746        return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16747      case RecurKind::FMin:
16748        return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16749      case RecurKind::FMaximum:
16750        return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16751      case RecurKind::FMinimum:
16752        return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16753      case RecurKind::SMax:
16754        if (UseSelect) {
16755          Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16756          return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16757        }
16758        return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16759      case RecurKind::SMin:
16760        if (UseSelect) {
16761          Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16762          return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16763        }
16764        return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16765      case RecurKind::UMax:
16766        if (UseSelect) {
16767          Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16768          return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16769        }
16770        return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16771      case RecurKind::UMin:
16772        if (UseSelect) {
16773          Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16774          return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16775        }
16776        return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16777      default:
16778        llvm_unreachable("Unknown reduction operation.");
16779      }
16780    }
16781  
16782    /// Creates reduction operation with the current opcode with the IR flags
16783    /// from \p ReductionOps, dropping nuw/nsw flags.
16784    static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16785                           Value *RHS, const Twine &Name,
16786                           const ReductionOpsListType &ReductionOps) {
16787      bool UseSelect = ReductionOps.size() == 2 ||
16788                       // Logical or/and.
16789                       (ReductionOps.size() == 1 &&
16790                        any_of(ReductionOps.front(), IsaPred<SelectInst>));
16791      assert((!UseSelect || ReductionOps.size() != 2 ||
16792              isa<SelectInst>(ReductionOps[1][0])) &&
16793             "Expected cmp + select pairs for reduction");
16794      Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16795      if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16796        if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16797          propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16798                           /*IncludeWrapFlags=*/false);
16799          propagateIRFlags(Op, ReductionOps[1], nullptr,
16800                           /*IncludeWrapFlags=*/false);
16801          return Op;
16802        }
16803      }
16804      propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16805      return Op;
16806    }
16807  
16808  public:
16809    static RecurKind getRdxKind(Value *V) {
16810      auto *I = dyn_cast<Instruction>(V);
16811      if (!I)
16812        return RecurKind::None;
16813      if (match(I, m_Add(m_Value(), m_Value())))
16814        return RecurKind::Add;
16815      if (match(I, m_Mul(m_Value(), m_Value())))
16816        return RecurKind::Mul;
16817      if (match(I, m_And(m_Value(), m_Value())) ||
16818          match(I, m_LogicalAnd(m_Value(), m_Value())))
16819        return RecurKind::And;
16820      if (match(I, m_Or(m_Value(), m_Value())) ||
16821          match(I, m_LogicalOr(m_Value(), m_Value())))
16822        return RecurKind::Or;
16823      if (match(I, m_Xor(m_Value(), m_Value())))
16824        return RecurKind::Xor;
16825      if (match(I, m_FAdd(m_Value(), m_Value())))
16826        return RecurKind::FAdd;
16827      if (match(I, m_FMul(m_Value(), m_Value())))
16828        return RecurKind::FMul;
16829  
16830      if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16831        return RecurKind::FMax;
16832      if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16833        return RecurKind::FMin;
16834  
16835      if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16836        return RecurKind::FMaximum;
16837      if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16838        return RecurKind::FMinimum;
16839      // This matches either cmp+select or intrinsics. SLP is expected to handle
16840      // either form.
16841      // TODO: If we are canonicalizing to intrinsics, we can remove several
16842      //       special-case paths that deal with selects.
16843      if (match(I, m_SMax(m_Value(), m_Value())))
16844        return RecurKind::SMax;
16845      if (match(I, m_SMin(m_Value(), m_Value())))
16846        return RecurKind::SMin;
16847      if (match(I, m_UMax(m_Value(), m_Value())))
16848        return RecurKind::UMax;
16849      if (match(I, m_UMin(m_Value(), m_Value())))
16850        return RecurKind::UMin;
16851  
16852      if (auto *Select = dyn_cast<SelectInst>(I)) {
16853        // Try harder: look for min/max pattern based on instructions producing
16854        // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16855        // During the intermediate stages of SLP, it's very common to have a
16856        // pattern like this (since optimizeGatherSequence is run only once
16857        // at the end):
16858        // %1 = extractelement <2 x i32> %a, i32 0
16859        // %2 = extractelement <2 x i32> %a, i32 1
16860        // %cond = icmp sgt i32 %1, %2
16861        // %3 = extractelement <2 x i32> %a, i32 0
16862        // %4 = extractelement <2 x i32> %a, i32 1
16863        // %select = select i1 %cond, i32 %3, i32 %4
16864        CmpInst::Predicate Pred;
16865        Instruction *L1;
16866        Instruction *L2;
16867  
16868        Value *LHS = Select->getTrueValue();
16869        Value *RHS = Select->getFalseValue();
16870        Value *Cond = Select->getCondition();
16871  
16872        // TODO: Support inverse predicates.
16873        if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16874          if (!isa<ExtractElementInst>(RHS) ||
16875              !L2->isIdenticalTo(cast<Instruction>(RHS)))
16876            return RecurKind::None;
16877        } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16878          if (!isa<ExtractElementInst>(LHS) ||
16879              !L1->isIdenticalTo(cast<Instruction>(LHS)))
16880            return RecurKind::None;
16881        } else {
16882          if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16883            return RecurKind::None;
16884          if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16885              !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16886              !L2->isIdenticalTo(cast<Instruction>(RHS)))
16887            return RecurKind::None;
16888        }
16889  
16890        switch (Pred) {
16891        default:
16892          return RecurKind::None;
16893        case CmpInst::ICMP_SGT:
16894        case CmpInst::ICMP_SGE:
16895          return RecurKind::SMax;
16896        case CmpInst::ICMP_SLT:
16897        case CmpInst::ICMP_SLE:
16898          return RecurKind::SMin;
16899        case CmpInst::ICMP_UGT:
16900        case CmpInst::ICMP_UGE:
16901          return RecurKind::UMax;
16902        case CmpInst::ICMP_ULT:
16903        case CmpInst::ICMP_ULE:
16904          return RecurKind::UMin;
16905        }
16906      }
16907      return RecurKind::None;
16908    }
16909  
16910    /// Get the index of the first operand.
16911    static unsigned getFirstOperandIndex(Instruction *I) {
16912      return isCmpSelMinMax(I) ? 1 : 0;
16913    }
16914  
16915  private:
16916    /// Total number of operands in the reduction operation.
16917    static unsigned getNumberOfOperands(Instruction *I) {
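         /// A cmp+select min/max reduction counts the select's three operands
         /// (condition, true value, false value); plain reduction ops have two.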
16918      return isCmpSelMinMax(I) ? 3 : 2;
16919    }
16920  
16921    /// Checks if the instruction is in basic block \p BB.
16922    /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16923    static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16924      if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16925        auto *Sel = cast<SelectInst>(I);
16926        auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16927        return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16928      }
16929      return I->getParent() == BB;
16930    }
16931  
16932    /// Expected number of uses for reduction operations/reduced values.
16933    static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16934      if (IsCmpSelMinMax) {
16935        // SelectInst must be used twice while the condition op must have a
16936        // single use only.
16937        if (auto *Sel = dyn_cast<SelectInst>(I))
16938          return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16939        return I->hasNUses(2);
16940      }
16941  
16942      // Arithmetic reduction operation must be used once only.
16943      return I->hasOneUse();
16944    }
16945  
16946    /// Initializes the list of reduction operations.
16947    void initReductionOps(Instruction *I) {
16948      if (isCmpSelMinMax(I))
16949        ReductionOps.assign(2, ReductionOpsType());
16950      else
16951        ReductionOps.assign(1, ReductionOpsType());
16952    }
16953  
16954    /// Add all reduction operations for the reduction instruction \p I.
16955    void addReductionOps(Instruction *I) {
16956      if (isCmpSelMinMax(I)) {
16957        ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16958        ReductionOps[1].emplace_back(I);
16959      } else {
16960        ReductionOps[0].emplace_back(I);
16961      }
16962    }
16963  
16964    static bool isGoodForReduction(ArrayRef<Value *> Data) {
16965      int Sz = Data.size();
16966      auto *I = dyn_cast<Instruction>(Data.front());
16967      return Sz > 1 || isConstant(Data.front()) ||
16968             (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16969    }
16970  
16971  public:
16972    HorizontalReduction() = default;
16973  
16974    /// Try to find a reduction tree.
16975    bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16976                                   ScalarEvolution &SE, const DataLayout &DL,
16977                                   const TargetLibraryInfo &TLI) {
16978      RdxKind = HorizontalReduction::getRdxKind(Root);
16979      if (!isVectorizable(RdxKind, Root))
16980        return false;
16981  
16982      // Analyze "regular" integer/FP types for reductions - no target-specific
16983      // types or pointers.
16984      Type *Ty = Root->getType();
16985      if (!isValidElementType(Ty) || Ty->isPointerTy())
16986        return false;
16987  
16988      // Though the ultimate reduction may have multiple uses, its condition must
16989      // have only a single use.
16990      if (auto *Sel = dyn_cast<SelectInst>(Root))
16991        if (!Sel->getCondition()->hasOneUse())
16992          return false;
16993  
16994      ReductionRoot = Root;
16995  
16996      // Iterate through all the operands of the possible reduction tree and
16997      // gather all the reduced values, sorting them by their value id.
16998      BasicBlock *BB = Root->getParent();
16999      bool IsCmpSelMinMax = isCmpSelMinMax(Root);
17000      SmallVector<Instruction *> Worklist(1, Root);
17001      // Checks if the operands of the \p TreeN instruction are also reduction
17002      // operations or should be treated as reduced values or an extra argument,
17003      // which is not part of the reduction.
17004      auto CheckOperands = [&](Instruction *TreeN,
17005                               SmallVectorImpl<Value *> &ExtraArgs,
17006                               SmallVectorImpl<Value *> &PossibleReducedVals,
17007                               SmallVectorImpl<Instruction *> &ReductionOps) {
17008        for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
17009                                      getNumberOfOperands(TreeN)))) {
17010          Value *EdgeVal = getRdxOperand(TreeN, I);
17011          ReducedValsToOps[EdgeVal].push_back(TreeN);
17012          auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
17013          // Edge has wrong parent - mark as an extra argument.
17014          if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
17015              !hasSameParent(EdgeInst, BB)) {
17016            ExtraArgs.push_back(EdgeVal);
17017            continue;
17018          }
17019          // If the edge is not an instruction, differs from the main reduction
17020          // opcode, or has too many uses, treat it as a possible reduced value.
17021          // Also, do not try to reduce constant values if the operation is not
17022          // foldable.
17023          if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
17024              IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
17025              !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
17026              !isVectorizable(RdxKind, EdgeInst) ||
17027              (R.isAnalyzedReductionRoot(EdgeInst) &&
17028               all_of(EdgeInst->operands(), IsaPred<Constant>))) {
17029            PossibleReducedVals.push_back(EdgeVal);
17030            continue;
17031          }
17032          ReductionOps.push_back(EdgeInst);
17033        }
17034      };
17035      // Try to regroup the reduced values so that reducing them becomes more
17036      // profitable. Values are grouped by their value ids, instructions by their
17037      // opcode and/or alternate opcode, with extra analysis for loads (grouped by
17038      // the distance between pointers) and cmp instructions (grouped by the
17039      // predicate).
17040      MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17041          PossibleReducedVals;
17042      initReductionOps(Root);
17043      DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17044      SmallSet<size_t, 2> LoadKeyUsed;
17045  
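           // Assigns a hash subkey to each load so that loads that are likely to
           // form a vectorizable group (same underlying object and a constant
           // pointer distance, or otherwise compatible pointers) land in the
           // same bucket of PossibleReducedVals.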
17046      auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17047        Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
17048        if (LoadKeyUsed.contains(Key)) {
17049          auto LIt = LoadsMap.find(Ptr);
17050          if (LIt != LoadsMap.end()) {
17051            for (LoadInst *RLI : LIt->second) {
17052              if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
17053                                  LI->getType(), LI->getPointerOperand(), DL, SE,
17054                                  /*StrictCheck=*/true))
17055                return hash_value(RLI->getPointerOperand());
17056            }
17057            for (LoadInst *RLI : LIt->second) {
17058              if (arePointersCompatible(RLI->getPointerOperand(),
17059                                        LI->getPointerOperand(), TLI)) {
17060                hash_code SubKey = hash_value(RLI->getPointerOperand());
17061                return SubKey;
17062              }
17063            }
17064            if (LIt->second.size() > 2) {
17065              hash_code SubKey =
17066                  hash_value(LIt->second.back()->getPointerOperand());
17067              return SubKey;
17068            }
17069          }
17070        }
17071        LoadKeyUsed.insert(Key);
17072        LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
17073        return hash_value(LI->getPointerOperand());
17074      };
17075  
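           // Walk the reduction tree: each popped instruction either becomes a
           // reduction operation (with its operand reduction ops pushed onto the
           // worklist) or, when it has too many extra arguments, is recorded as a
           // possible reduced value itself.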
17076      while (!Worklist.empty()) {
17077        Instruction *TreeN = Worklist.pop_back_val();
17078        SmallVector<Value *> Args;
17079        SmallVector<Value *> PossibleRedVals;
17080        SmallVector<Instruction *> PossibleReductionOps;
17081        CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
17082        // If there are too many extra args, mark the instruction itself as a
17083        // reduction value rather than a reduction operation.
17084        if (Args.size() < 2) {
17085          addReductionOps(TreeN);
17086          // Add extra args.
17087          if (!Args.empty()) {
17088            assert(Args.size() == 1 && "Expected only single argument.");
17089            ExtraArgs[TreeN] = Args.front();
17090          }
17091          // Add reduction values. The values are sorted for better vectorization
17092          // results.
17093          for (Value *V : PossibleRedVals) {
17094            size_t Key, Idx;
17095            std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17096                                                   /*AllowAlternate=*/false);
17097            ++PossibleReducedVals[Key][Idx]
17098                  .insert(std::make_pair(V, 0))
17099                  .first->second;
17100          }
17101          Worklist.append(PossibleReductionOps.rbegin(),
17102                          PossibleReductionOps.rend());
17103        } else {
17104          size_t Key, Idx;
17105          std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
17106                                                 /*AllowAlternate=*/false);
17107          ++PossibleReducedVals[Key][Idx]
17108                .insert(std::make_pair(TreeN, 0))
17109                .first->second;
17110        }
17111      }
17112      auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
17113      // Sort values by the total number of value kinds so that the reduction
17114      // starts from the longest possible sequences of reduced values.
17115      for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17116        auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17117        SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17118        for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17119             It != E; ++It) {
17120          PossibleRedValsVect.emplace_back();
17121          auto RedValsVect = It->second.takeVector();
17122          stable_sort(RedValsVect, llvm::less_second());
17123          for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17124            PossibleRedValsVect.back().append(Data.second, Data.first);
17125        }
17126        stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
17127          return P1.size() > P2.size();
17128        });
17129        int NewIdx = -1;
17130        for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17131          if (NewIdx < 0 ||
17132              (!isGoodForReduction(Data) &&
17133               (!isa<LoadInst>(Data.front()) ||
17134                !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
17135                getUnderlyingObject(
17136                    cast<LoadInst>(Data.front())->getPointerOperand()) !=
17137                    getUnderlyingObject(
17138                        cast<LoadInst>(ReducedVals[NewIdx].front())
17139                            ->getPointerOperand())))) {
17140            NewIdx = ReducedVals.size();
17141            ReducedVals.emplace_back();
17142          }
17143          ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
17144        }
17145      }
17146      // Sort the reduced values by the number of values with the same/alternate
17147      // opcode and/or the same pointer operand.
17148      stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17149        return P1.size() > P2.size();
17150      });
17151      return true;
17152    }
17153  
17154    /// Attempt to vectorize the tree found by matchAssociativeReduction.
17155    Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17156                       const TargetLibraryInfo &TLI) {
17157      constexpr int ReductionLimit = 4;
17158      constexpr unsigned RegMaxNumber = 4;
17159      constexpr unsigned RedValsMaxNumber = 128;
17160      // If there are a sufficient number of reduction values, reduce
17161      // to a nearby power-of-2. We can safely generate oversized
17162      // vectors and rely on the backend to split them to legal sizes.
17163      unsigned NumReducedVals =
17164          std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17165                          [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17166                            if (!isGoodForReduction(Vals))
17167                              return Num;
17168                            return Num + Vals.size();
17169                          });
17170      if (NumReducedVals < ReductionLimit &&
17171          (!AllowHorRdxIdenityOptimization ||
17172           all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
17173             return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
17174           }))) {
17175        for (ReductionOpsType &RdxOps : ReductionOps)
17176          for (Value *RdxOp : RdxOps)
17177            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17178        return nullptr;
17179      }
17180  
17181      IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17182                                      TargetFolder(DL));
17183      Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
17184  
17185      // Track the reduced values in case they are replaced by extractelement
17186      // instructions because of the vectorization.
17187      DenseMap<Value *, WeakTrackingVH> TrackedVals(
17188          ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17189      BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17190      SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17191      ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
17192      // The same extra argument may be used several times, so log each attempt
17193      // to use it.
17194      for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17195        assert(Pair.first && "DebugLoc must be set.");
17196        ExternallyUsedValues[Pair.second].push_back(Pair.first);
17197        TrackedVals.try_emplace(Pair.second, Pair.second);
17198      }
17199  
17200      // The compare instruction of a min/max is the insertion point for new
17201      // instructions and may be replaced with a new compare instruction.
17202      auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17203        assert(isa<SelectInst>(RdxRootInst) &&
17204               "Expected min/max reduction to have select root instruction");
17205        Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17206        assert(isa<Instruction>(ScalarCond) &&
17207               "Expected min/max reduction to have compare condition");
17208        return cast<Instruction>(ScalarCond);
17209      };
17210  
17211      // Return new VectorizedTree, based on previous value.
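           // When a previous value exists, it is combined with the new result via
           // the reduction opcode; for boolean logic ops the operands may be
           // swapped so that a value known not to be poison ends up on the left.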
17212      auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17213        if (VectorizedTree) {
17214          // Update the final value in the reduction.
17215          Builder.SetCurrentDebugLocation(
17216              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17217          if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17218              (isGuaranteedNotToBePoison(Res) &&
17219               !isGuaranteedNotToBePoison(VectorizedTree))) {
17220            auto It = ReducedValsToOps.find(Res);
17221            if (It != ReducedValsToOps.end() &&
17222                any_of(It->getSecond(),
17223                       [](Instruction *I) { return isBoolLogicOp(I); }))
17224              std::swap(VectorizedTree, Res);
17225          }
17226  
17227          return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17228                          ReductionOps);
17229        }
17230        // Initialize the final value in the reduction.
17231        return Res;
17232      };
17233      bool AnyBoolLogicOp =
17234          any_of(ReductionOps.back(), [](Value *V) {
17235            return isBoolLogicOp(cast<Instruction>(V));
17236          });
17237      // The reduction root is used as the insertion point for new instructions,
17238      // so set it as externally used to prevent it from being deleted.
17239      ExternallyUsedValues[ReductionRoot];
17240      SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17241                                        ReductionOps.front().size());
17242      for (ReductionOpsType &RdxOps : ReductionOps)
17243        for (Value *RdxOp : RdxOps) {
17244          if (!RdxOp)
17245            continue;
17246          IgnoreList.insert(RdxOp);
17247        }
17248      // Intersect the fast-math-flags from all reduction operations.
17249      FastMathFlags RdxFMF;
17250      RdxFMF.set();
17251      for (Value *U : IgnoreList)
17252        if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17253          RdxFMF &= FPMO->getFastMathFlags();
17254      bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17255  
17256      // Need to track reduced vals, they may be changed during vectorization of
17257      // subvectors.
17258      for (ArrayRef<Value *> Candidates : ReducedVals)
17259        for (Value *V : Candidates)
17260          TrackedVals.try_emplace(V, V);
17261  
17262      DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17263      // List of the values that were reduced in other trees as part of gather
17264      // nodes and thus require an extract if fully vectorized in other trees.
17265      SmallPtrSet<Value *, 4> RequiredExtract;
17266      Value *VectorizedTree = nullptr;
17267      bool CheckForReusedReductionOps = false;
17268      // Try to vectorize elements based on their type.
17269      SmallVector<InstructionsState> States;
17270      for (ArrayRef<Value *> RV : ReducedVals)
17271        States.push_back(getSameOpcode(RV, TLI));
17272      for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17273        ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17274        InstructionsState S = States[I];
17275        SmallVector<Value *> Candidates;
17276        Candidates.reserve(2 * OrigReducedVals.size());
17277        DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17278        for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17279          Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17280        // Check whether the reduction value was overridden by the extractelement
17281        // instruction because of the vectorization and exclude it if it is not
17282        // compatible with the other values.
17283        // Also check if the instruction was folded to a constant/other value.
17284          auto *Inst = dyn_cast<Instruction>(RdxVal);
17285          if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17286               (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17287              (S.getOpcode() && !Inst))
17288            continue;
17289          Candidates.push_back(RdxVal);
17290          TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17291        }
17292        bool ShuffledExtracts = false;
17293        // Try to handle shuffled extractelements.
17294        if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17295            I + 1 < E) {
17296          InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
17297          if (NextS.getOpcode() == Instruction::ExtractElement &&
17298              !NextS.isAltShuffle()) {
17299            SmallVector<Value *> CommonCandidates(Candidates);
17300            for (Value *RV : ReducedVals[I + 1]) {
17301              Value *RdxVal = TrackedVals.find(RV)->second;
17302              // Check whether the reduction value was overridden by the
17303              // extractelement instruction because of the vectorization and
17304              // exclude it if it is not compatible with the other values.
17305              if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17306                if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17307                  continue;
17308              CommonCandidates.push_back(RdxVal);
17309              TrackedToOrig.try_emplace(RdxVal, RV);
17310            }
17311            SmallVector<int> Mask;
17312            if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17313              ++I;
17314              Candidates.swap(CommonCandidates);
17315              ShuffledExtracts = true;
17316            }
17317          }
17318        }
17319  
17320        // Emit code for constant values.
17321        if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17322            allConstant(Candidates)) {
17323          Value *Res = Candidates.front();
17324          ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17325          for (Value *VC : ArrayRef(Candidates).drop_front()) {
17326            Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17327            ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17328            if (auto *ResI = dyn_cast<Instruction>(Res))
17329              V.analyzedReductionRoot(ResI);
17330          }
17331          VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17332          continue;
17333        }
17334  
17335        unsigned NumReducedVals = Candidates.size();
17336        if (NumReducedVals < ReductionLimit &&
17337            (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17338             !isSplat(Candidates)))
17339          continue;
17340  
17341        // Check if we support repeated scalar values processing (optimization of
17342        // original scalar identity operations on matched horizontal reductions).
17343        IsSupportedHorRdxIdentityOp =
17344            AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17345            RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17346        // Gather same values.
17347        MapVector<Value *, unsigned> SameValuesCounter;
17348        if (IsSupportedHorRdxIdentityOp)
17349          for (Value *V : Candidates)
17350            ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17351        // Used to check if the reduced values are used the same number of times.
17352        // In this case the compiler may produce better code. E.g. if the reduced
17353        // values are aabbccdd (8 x values), then the first node of the tree will
17354        // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17355        // Plus, the final reduction will be performed on <8 x aabbccdd>.
17356        // Instead, the compiler may build a <4 x abcd> tree immediately and
17357        // compute the reduction as (reduction of 4 x abcd) * 2.
17358        // Currently this only handles add/fadd/xor; and/or/min/max do not require
17359        // this analysis, and other operations may require an extra estimation of
17360        // the profitability.
17361        bool SameScaleFactor = false;
17362        bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17363                                SameValuesCounter.size() != Candidates.size();
17364        if (OptReusedScalars) {
17365          SameScaleFactor =
17366              (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17367               RdxKind == RecurKind::Xor) &&
17368              all_of(drop_begin(SameValuesCounter),
17369                     [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17370                       return P.second == SameValuesCounter.front().second;
17371                     });
17372          Candidates.resize(SameValuesCounter.size());
17373          transform(SameValuesCounter, Candidates.begin(),
17374                    [](const auto &P) { return P.first; });
17375          NumReducedVals = Candidates.size();
17376          // Have a reduction of the same element.
17377          if (NumReducedVals == 1) {
17378            Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17379            unsigned Cnt = SameValuesCounter.lookup(OrigV);
17380            Value *RedVal =
17381                emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17382            VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17383            VectorizedVals.try_emplace(OrigV, Cnt);
17384            continue;
17385          }
17386        }
17387  
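             // Choose the initial reduction width: the largest power of two not
             // exceeding the number of candidates, capped by how many elements of
             // this type fit into the available vector registers.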
17388        unsigned MaxVecRegSize = V.getMaxVecRegSize();
17389        unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17390        unsigned MaxElts =
17391            RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17392  
17393        unsigned ReduxWidth = std::min<unsigned>(
17394            llvm::bit_floor(NumReducedVals),
17395            std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17396                                 RegMaxNumber * RedValsMaxNumber));
17397        unsigned Start = 0;
17398        unsigned Pos = Start;
17399        // Restarts vectorization attempt with lower vector factor.
17400        unsigned PrevReduxWidth = ReduxWidth;
17401        bool CheckForReusedReductionOpsLocal = false;
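             // Advances the window of candidates; once every position for the
             // current width has been tried, resets to the start and halves
             // ReduxWidth. Returns whether any reduction op was gathered in the
             // failed attempt.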
17402        auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17403                                    &CheckForReusedReductionOpsLocal,
17404                                    &PrevReduxWidth, &V,
17405                                    &IgnoreList](bool IgnoreVL = false) {
17406          bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17407          if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17408            // Check if any of the reduction ops are gathered. If so, it is worth
17409            // trying again with a smaller number of reduction ops.
17410            CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17411          }
17412          ++Pos;
17413          if (Pos < NumReducedVals - ReduxWidth + 1)
17414            return IsAnyRedOpGathered;
17415          Pos = Start;
17416          ReduxWidth /= 2;
17417          return IsAnyRedOpGathered;
17418        };
17419        bool AnyVectorized = false;
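             // Slide a window of ReduxWidth candidates over the reduced values and
             // try to build a profitable vectorizable tree for each window; failed
             // windows are advanced or narrowed via AdjustReducedVals.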
17420        while (Pos < NumReducedVals - ReduxWidth + 1 &&
17421               ReduxWidth >= ReductionLimit) {
17422          // Dependency in the tree of the reduction ops - drop this attempt and
17423          // try later.
17424          if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17425              Start == 0) {
17426            CheckForReusedReductionOps = true;
17427            break;
17428          }
17429          PrevReduxWidth = ReduxWidth;
17430          ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17431          // Already being analyzed - skip.
17432          if (V.areAnalyzedReductionVals(VL)) {
17433            (void)AdjustReducedVals(/*IgnoreVL=*/true);
17434            continue;
17435          }
17436          // Early exit if any of the reduction values were deleted during
17437          // previous vectorization attempts.
17438          if (any_of(VL, [&V](Value *RedVal) {
17439                auto *RedValI = dyn_cast<Instruction>(RedVal);
17440                if (!RedValI)
17441                  return false;
17442                return V.isDeleted(RedValI);
17443              }))
17444            break;
17445          V.buildTree(VL, IgnoreList);
17446          if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17447            if (!AdjustReducedVals())
17448              V.analyzedReductionVals(VL);
17449            continue;
17450          }
17451          if (V.isLoadCombineReductionCandidate(RdxKind)) {
17452            if (!AdjustReducedVals())
17453              V.analyzedReductionVals(VL);
17454            continue;
17455          }
17456          V.reorderTopToBottom();
17457          // No need to reorder the root node at all.
17458          V.reorderBottomToTop(/*IgnoreReorder=*/true);
17459          // Keep extracted other reduction values, if they are used in the
17460          // vectorization trees.
17461          BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17462              ExternallyUsedValues);
17463          for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17464            if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17465              continue;
17466            for (Value *V : ReducedVals[Cnt])
17467              if (isa<Instruction>(V))
17468                LocalExternallyUsedValues[TrackedVals[V]];
17469          }
17470          if (!IsSupportedHorRdxIdentityOp) {
17471            // Number of uses of the candidates in the vector of values.
17472            assert(SameValuesCounter.empty() &&
17473                   "Reused values counter map is not empty");
17474            for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17475              if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17476                continue;
17477              Value *V = Candidates[Cnt];
17478              Value *OrigV = TrackedToOrig.find(V)->second;
17479              ++SameValuesCounter[OrigV];
17480            }
17481          }
17482          SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17483          // Gather externally used values.
17484          SmallPtrSet<Value *, 4> Visited;
17485          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17486            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17487              continue;
17488            Value *RdxVal = Candidates[Cnt];
17489            if (!Visited.insert(RdxVal).second)
17490              continue;
17491            // Check if the scalar was vectorized as part of the vectorization
17492            // tree but not the top node.
17493            if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17494              LocalExternallyUsedValues[RdxVal];
17495              continue;
17496            }
17497            Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17498            unsigned NumOps =
17499                VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17500            if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17501              LocalExternallyUsedValues[RdxVal];
17502          }
17503          // The list of reused scalars is no longer needed in regular mode.
17504          if (!IsSupportedHorRdxIdentityOp)
17505            SameValuesCounter.clear();
17506          for (Value *RdxVal : VL)
17507            if (RequiredExtract.contains(RdxVal))
17508              LocalExternallyUsedValues[RdxVal];
17509          // Update LocalExternallyUsedValues for the scalar, replaced by
17510          // extractelement instructions.
17511          DenseMap<Value *, Value *> ReplacementToExternal;
17512          for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17513            ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17514          for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17515            Value *Ext = Pair.first;
17516            auto RIt = ReplacementToExternal.find(Ext);
17517            while (RIt != ReplacementToExternal.end()) {
17518              Ext = RIt->second;
17519              RIt = ReplacementToExternal.find(Ext);
17520            }
17521            auto *It = ExternallyUsedValues.find(Ext);
17522            if (It == ExternallyUsedValues.end())
17523              continue;
17524            LocalExternallyUsedValues[Pair.second].append(It->second);
17525          }
17526          V.buildExternalUses(LocalExternallyUsedValues);
17527  
17528          V.computeMinimumValueSizes();
17529          V.transformNodes();
17530  
17531          // Estimate cost.
17532          InstructionCost TreeCost = V.getTreeCost(VL);
17533          InstructionCost ReductionCost =
17534              getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17535          InstructionCost Cost = TreeCost + ReductionCost;
17536          LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17537                            << " for reduction\n");
17538          if (!Cost.isValid())
17539            break;
17540          if (Cost >= -SLPCostThreshold) {
17541            V.getORE()->emit([&]() {
17542              return OptimizationRemarkMissed(
17543                         SV_NAME, "HorSLPNotBeneficial",
17544                         ReducedValsToOps.find(VL[0])->second.front())
17545                     << "Vectorizing horizontal reduction is possible "
17546                     << "but not beneficial with cost " << ore::NV("Cost", Cost)
17547                     << " and threshold "
17548                     << ore::NV("Threshold", -SLPCostThreshold);
17549            });
17550            if (!AdjustReducedVals())
17551              V.analyzedReductionVals(VL);
17552            continue;
17553          }
17554  
17555          LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17556                            << Cost << ". (HorRdx)\n");
17557          V.getORE()->emit([&]() {
17558            return OptimizationRemark(
17559                       SV_NAME, "VectorizedHorizontalReduction",
17560                       ReducedValsToOps.find(VL[0])->second.front())
17561                   << "Vectorized horizontal reduction with cost "
17562                   << ore::NV("Cost", Cost) << " and with tree size "
17563                   << ore::NV("TreeSize", V.getTreeSize());
17564          });
17565  
17566          Builder.setFastMathFlags(RdxFMF);
17567  
17568          // Emit a reduction. If the root is a select (min/max idiom), the insert
17569          // point is the compare condition of that select.
17570          Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17571          Instruction *InsertPt = RdxRootInst;
17572          if (IsCmpSelMinMax)
17573            InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17574  
17575          // Vectorize a tree.
17576          Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17577                                                  ReplacedExternals, InsertPt);
17578  
17579          Builder.SetInsertPoint(InsertPt);
17580  
17581          // To prevent poison from leaking across what used to be sequential,
17582          // safe, scalar boolean logic operations, the reduction operand must be
17583          // frozen.
17584          if ((isBoolLogicOp(RdxRootInst) ||
17585               (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17586              !isGuaranteedNotToBePoison(VectorizedRoot))
17587            VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17588  
17589          // Emit code to correctly handle reused reduced values, if required.
17590          if (OptReusedScalars && !SameScaleFactor) {
17591            VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17592                                           SameValuesCounter, TrackedToOrig);
17593          }
17594  
17595          Value *ReducedSubTree =
17596              emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17597          if (ReducedSubTree->getType() != VL.front()->getType()) {
17598            assert(ReducedSubTree->getType() != VL.front()->getType() &&
17599                   "Expected different reduction type.");
17600            ReducedSubTree =
17601                Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17602                                      V.isSignedMinBitwidthRootNode());
17603          }
17604  
17605          // Improved analysis for add/fadd/xor reductions with same scale factor
17606          // for all operands of reductions. We can emit scalar ops for them
17607          // instead.
17608          if (OptReusedScalars && SameScaleFactor)
17609            ReducedSubTree = emitScaleForReusedOps(
17610                ReducedSubTree, Builder, SameValuesCounter.front().second);
17611  
17612          VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17613          // Count vectorized reduced values to exclude them from final reduction.
17614          for (Value *RdxVal : VL) {
17615            Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17616            if (IsSupportedHorRdxIdentityOp) {
17617              VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17618              continue;
17619            }
17620            ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17621            if (!V.isVectorized(RdxVal))
17622              RequiredExtract.insert(RdxVal);
17623          }
17624          Pos += ReduxWidth;
17625          Start = Pos;
17626          ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17627          AnyVectorized = true;
17628        }
17629        if (OptReusedScalars && !AnyVectorized) {
17630          for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17631            Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17632            VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17633            Value *OrigV = TrackedToOrig.find(P.first)->second;
17634            VectorizedVals.try_emplace(OrigV, P.second);
17635          }
17636          continue;
17637        }
17638      }
17639      if (VectorizedTree) {
17640        // Reorder operands of bool logical op in the natural order to avoid
17641        // possible problem with poison propagation. If not possible to reorder
17642        // (both operands are originally RHS), emit an extra freeze instruction
17643        // for the LHS operand.
17644        // I.e., if we have original code like this:
17645        // RedOp1 = select i1 ?, i1 LHS, i1 false
17646        // RedOp2 = select i1 RHS, i1 ?, i1 false
17647  
17648        // Then, we swap LHS/RHS to create a new op that matches the poison
17649        // semantics of the original code.
17650  
17651        // If we have original code like this and both values could be poison:
17652        // RedOp1 = select i1 ?, i1 LHS, i1 false
17653        // RedOp2 = select i1 ?, i1 RHS, i1 false
17654  
17655        // Then, we must freeze LHS in the new op.
17656        auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17657                                                     Instruction *RedOp1,
17658                                                     Instruction *RedOp2,
17659                                                     bool InitStep) {
17660          if (!AnyBoolLogicOp)
17661            return;
17662          if (isBoolLogicOp(RedOp1) &&
17663              ((!InitStep && LHS == VectorizedTree) ||
17664               getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17665            return;
17666          if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17667                                        getRdxOperand(RedOp2, 0) == RHS ||
17668                                        isGuaranteedNotToBePoison(RHS))) {
17669            std::swap(LHS, RHS);
17670            return;
17671          }
17672          if (LHS != VectorizedTree)
17673            LHS = Builder.CreateFreeze(LHS);
17674        };
17675        // Finish the reduction.
17676        // Need to add extra arguments and not vectorized possible reduction
17677        // values.
17678        // Try to avoid dependencies between the scalar remainders after
17679        // reductions.
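             // FinalGen reduces the remaining (instruction, value) pairs two at a
             // time, halving the list on each pass; e.g. for an add reduction
             // [a, b, c, d, e] becomes [a+b, c+d, e] and then [(a+b)+(c+d), e].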
17680        auto FinalGen =
17681            [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17682                bool InitStep) {
17683              unsigned Sz = InstVals.size();
17684              SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17685                                                                       Sz % 2);
17686              for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17687                Instruction *RedOp = InstVals[I + 1].first;
17688                Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17689                Value *RdxVal1 = InstVals[I].second;
17690                Value *StableRdxVal1 = RdxVal1;
17691                auto It1 = TrackedVals.find(RdxVal1);
17692                if (It1 != TrackedVals.end())
17693                  StableRdxVal1 = It1->second;
17694                Value *RdxVal2 = InstVals[I + 1].second;
17695                Value *StableRdxVal2 = RdxVal2;
17696                auto It2 = TrackedVals.find(RdxVal2);
17697                if (It2 != TrackedVals.end())
17698                  StableRdxVal2 = It2->second;
17699                // To prevent poison from leaking across what used to be
17700                // sequential, safe, scalar boolean logic operations, the
17701                // reduction operand must be frozen.
17702                FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17703                                  RedOp, InitStep);
17704                Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17705                                           StableRdxVal2, "op.rdx", ReductionOps);
17706                ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17707              }
17708              if (Sz % 2 == 1)
17709                ExtraReds[Sz / 2] = InstVals.back();
17710              return ExtraReds;
17711            };
17712        SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17713        ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17714                                     VectorizedTree);
17715        SmallPtrSet<Value *, 8> Visited;
17716        for (ArrayRef<Value *> Candidates : ReducedVals) {
17717          for (Value *RdxVal : Candidates) {
17718            if (!Visited.insert(RdxVal).second)
17719              continue;
17720            unsigned NumOps = VectorizedVals.lookup(RdxVal);
17721            for (Instruction *RedOp :
17722                 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17723                     .drop_back(NumOps))
17724              ExtraReductions.emplace_back(RedOp, RdxVal);
17725          }
17726        }
17727        for (auto &Pair : ExternallyUsedValues) {
17728          // Add each externally used value to the final reduction.
17729          for (auto *I : Pair.second)
17730            ExtraReductions.emplace_back(I, Pair.first);
17731        }
17732        // Iterate through all not-vectorized reduction values/extra arguments.
17733        bool InitStep = true;
17734        while (ExtraReductions.size() > 1) {
17735          SmallVector<std::pair<Instruction *, Value *>> NewReds =
17736              FinalGen(ExtraReductions, InitStep);
17737          ExtraReductions.swap(NewReds);
17738          InitStep = false;
17739        }
17740        VectorizedTree = ExtraReductions.front().second;
17741  
17742        ReductionRoot->replaceAllUsesWith(VectorizedTree);
17743  
17744        // The original scalar reduction is expected to have no remaining
17745        // uses outside the reduction tree itself.  Assert that we got this
17746        // correct, replace internal uses with undef, and mark for eventual
17747        // deletion.
17748  #ifndef NDEBUG
17749        SmallSet<Value *, 4> IgnoreSet;
17750        for (ArrayRef<Value *> RdxOps : ReductionOps)
17751          IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17752  #endif
17753        for (ArrayRef<Value *> RdxOps : ReductionOps) {
17754          for (Value *Ignore : RdxOps) {
17755            if (!Ignore)
17756              continue;
17757  #ifndef NDEBUG
17758            for (auto *U : Ignore->users()) {
17759              assert(IgnoreSet.count(U) &&
17760                   "All users must be in the reduction ops list.");
17761            }
17762  #endif
17763            if (!Ignore->use_empty()) {
17764              Value *P = PoisonValue::get(Ignore->getType());
17765              Ignore->replaceAllUsesWith(P);
17766            }
17767          }
17768          V.removeInstructionsAndOperands(RdxOps);
17769        }
17770      } else if (!CheckForReusedReductionOps) {
17771        for (ReductionOpsType &RdxOps : ReductionOps)
17772          for (Value *RdxOp : RdxOps)
17773            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17774      }
17775      return VectorizedTree;
17776    }
17777  
17778  private:
17779    /// Calculate the cost of a reduction.
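         /// The result is the cost of the vector reduction minus the cost of
         /// the scalar reduction chain it replaces, so a negative value means
         /// vectorization is expected to be profitable.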
17780    InstructionCost getReductionCost(TargetTransformInfo *TTI,
17781                                     ArrayRef<Value *> ReducedVals,
17782                                     bool IsCmpSelMinMax, unsigned ReduxWidth,
17783                                     FastMathFlags FMF) {
17784      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17785      Type *ScalarTy = ReducedVals.front()->getType();
17786      FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
17787      InstructionCost VectorCost = 0, ScalarCost;
17788      // If all of the reduced values are constant, the vector cost is 0, since
17789      // the reduction value can be calculated at compile time.
17790      bool AllConsts = allConstant(ReducedVals);
17791      auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17792        InstructionCost Cost = 0;
17793        // Scalar cost is repeated for N-1 elements.
17794        int Cnt = ReducedVals.size();
17795        for (Value *RdxVal : ReducedVals) {
17796          if (Cnt == 1)
17797            break;
17798          --Cnt;
17799          if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17800            Cost += GenCostFn();
17801            continue;
17802          }
17803          InstructionCost ScalarCost = 0;
17804          for (User *U : RdxVal->users()) {
17805            auto *RdxOp = cast<Instruction>(U);
17806            if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17807              ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17808              continue;
17809            }
17810            ScalarCost = InstructionCost::getInvalid();
17811            break;
17812          }
17813          if (ScalarCost.isValid())
17814            Cost += ScalarCost;
17815          else
17816            Cost += GenCostFn();
17817        }
17818        return Cost;
17819      };
17820      switch (RdxKind) {
17821      case RecurKind::Add:
17822      case RecurKind::Mul:
17823      case RecurKind::Or:
17824      case RecurKind::And:
17825      case RecurKind::Xor:
17826      case RecurKind::FAdd:
17827      case RecurKind::FMul: {
17828        unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17829        if (!AllConsts)
17830          VectorCost =
17831              TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17832        ScalarCost = EvaluateScalarCost([&]() {
17833          return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17834        });
17835        break;
17836      }
17837      case RecurKind::FMax:
17838      case RecurKind::FMin:
17839      case RecurKind::FMaximum:
17840      case RecurKind::FMinimum:
17841      case RecurKind::SMax:
17842      case RecurKind::SMin:
17843      case RecurKind::UMax:
17844      case RecurKind::UMin: {
17845        Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17846        if (!AllConsts)
17847          VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17848        ScalarCost = EvaluateScalarCost([&]() {
17849          IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17850          return TTI->getIntrinsicInstrCost(ICA, CostKind);
17851        });
17852        break;
17853      }
17854      default:
17855        llvm_unreachable("Expected arithmetic or min/max reduction operation");
17856      }
17857  
17858      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17859                        << " for reduction of " << shortBundleName(ReducedVals)
17860                        << " (It is a splitting reduction)\n");
17861      return VectorCost - ScalarCost;
17862    }
17863  
17864    /// Emit a horizontal reduction of the vectorized value.
17865    Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17866                         unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17867      assert(VectorizedValue && "Need to have a vectorized tree node");
17868      assert(isPowerOf2_32(ReduxWidth) &&
17869             "We only handle power-of-two reductions for now");
17870      assert(RdxKind != RecurKind::FMulAdd &&
17871             "A call to the llvm.fmuladd intrinsic is not handled yet");
17872  
17873      ++NumVectorInstructions;
17874      return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17875    }
17876  
17877    /// Emits optimized code for a unique scalar value reused \p Cnt times.
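         /// E.g. an add reduction that sums the same value \p Cnt times is
         /// rewritten as a single multiplication of that value by \p Cnt, and
         /// a xor reduction of an even number of repeats folds to zero.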
17878    Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17879                                 unsigned Cnt) {
17880      assert(IsSupportedHorRdxIdentityOp &&
17881             "The optimization of matched scalar identity horizontal reductions "
17882             "must be supported.");
17883      switch (RdxKind) {
17884      case RecurKind::Add: {
17885        // res = mul vv, n
17886        Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17887        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
17888                          << VectorizedValue << ". (HorRdx)\n");
17889        return Builder.CreateMul(VectorizedValue, Scale);
17890      }
17891      case RecurKind::Xor: {
17892        // res = n % 2 ? 0 : vv
17893        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
17894                          << ". (HorRdx)\n");
17895        if (Cnt % 2 == 0)
17896          return Constant::getNullValue(VectorizedValue->getType());
17897        return VectorizedValue;
17898      }
17899      case RecurKind::FAdd: {
17900        // res = fmul v, n
17901        Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17902        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
17903                          << VectorizedValue << ". (HorRdx)\n");
17904        return Builder.CreateFMul(VectorizedValue, Scale);
17905      }
17906      case RecurKind::And:
17907      case RecurKind::Or:
17908      case RecurKind::SMax:
17909      case RecurKind::SMin:
17910      case RecurKind::UMax:
17911      case RecurKind::UMin:
17912      case RecurKind::FMax:
17913      case RecurKind::FMin:
17914      case RecurKind::FMaximum:
17915      case RecurKind::FMinimum:
17916        // res = vv
17917        return VectorizedValue;
17918      case RecurKind::Mul:
17919      case RecurKind::FMul:
17920      case RecurKind::FMulAdd:
17921      case RecurKind::IAnyOf:
17922      case RecurKind::FAnyOf:
17923      case RecurKind::None:
17924        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17925      }
17926      return nullptr;
17927    }
17928  
17929    /// Emits actual operation for the scalar identity values, found during
17930    /// horizontal reduction analysis.
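         /// Unlike emitScaleForReusedOps, the repeat count may differ per
         /// lane, so the scaling is applied with per-lane constants (mul/fmul)
         /// or with a shuffle that zeroes lanes with an even repeat count
         /// (xor).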
17931    Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17932                         BoUpSLP &R,
17933                         const MapVector<Value *, unsigned> &SameValuesCounter,
17934                         const DenseMap<Value *, Value *> &TrackedToOrig) {
17935      assert(IsSupportedHorRdxIdentityOp &&
17936             "The optimization of matched scalar identity horizontal reductions "
17937             "must be supported.");
17938      ArrayRef<Value *> VL = R.getRootNodeScalars();
17939      auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17940      if (VTy->getElementType() != VL.front()->getType()) {
17941        VectorizedValue = Builder.CreateIntCast(
17942            VectorizedValue,
17943            getWidenedType(VL.front()->getType(), VTy->getNumElements()),
17944            R.isSignedMinBitwidthRootNode());
17945      }
17946      switch (RdxKind) {
17947      case RecurKind::Add: {
17948        // root = mul prev_root, <1, 1, n, 1>
17949        SmallVector<Constant *> Vals;
17950        for (Value *V : VL) {
17951          unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17952          Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17953        }
17954        auto *Scale = ConstantVector::get(Vals);
17955        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
17956                          << VectorizedValue << ". (HorRdx)\n");
17957        return Builder.CreateMul(VectorizedValue, Scale);
17958      }
17959      case RecurKind::And:
17960      case RecurKind::Or:
17961        // No need for multiple or/and(s).
17962        LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17963                          << ". (HorRdx)\n");
17964        return VectorizedValue;
17965      case RecurKind::SMax:
17966      case RecurKind::SMin:
17967      case RecurKind::UMax:
17968      case RecurKind::UMin:
17969      case RecurKind::FMax:
17970      case RecurKind::FMin:
17971      case RecurKind::FMaximum:
17972      case RecurKind::FMinimum:
17973        // No need for multiple min/max(s) of the same value.
17974        LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17975                          << ". (HorRdx)\n");
17976        return VectorizedValue;
17977      case RecurKind::Xor: {
17978        // Replace values with an even number of repeats with 0, since
17979        // x xor x = 0.
17980        // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17981        // 7>, if the 4th and 6th elements have an even number of repeats.
17982        SmallVector<int> Mask(
17983            cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17984            PoisonMaskElem);
17985        std::iota(Mask.begin(), Mask.end(), 0);
17986        bool NeedShuffle = false;
17987        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17988          Value *V = VL[I];
17989          unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17990          if (Cnt % 2 == 0) {
17991            Mask[I] = VF;
17992            NeedShuffle = true;
17993          }
17994        }
17995        LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17996                                                : Mask) dbgs()
17997                                           << I << " ";
17998                   dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17999        if (NeedShuffle)
18000          VectorizedValue = Builder.CreateShuffleVector(
18001              VectorizedValue,
18002              ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
18003        return VectorizedValue;
18004      }
18005      case RecurKind::FAdd: {
18006        // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
18007        SmallVector<Constant *> Vals;
18008        for (Value *V : VL) {
18009          unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18010          Vals.push_back(ConstantFP::get(V->getType(), Cnt));
18011        }
18012        auto *Scale = ConstantVector::get(Vals);
18013        return Builder.CreateFMul(VectorizedValue, Scale);
18014      }
18015      case RecurKind::Mul:
18016      case RecurKind::FMul:
18017      case RecurKind::FMulAdd:
18018      case RecurKind::IAnyOf:
18019      case RecurKind::FAnyOf:
18020      case RecurKind::None:
18021        llvm_unreachable("Unexpected reduction kind for reused scalars.");
18022      }
18023      return nullptr;
18024    }
18025  };
18026  } // end anonymous namespace
18027  
18028  /// Gets recurrence kind from the specified value.
18029  static RecurKind getRdxKind(Value *V) {
18030    return HorizontalReduction::getRdxKind(V);
18031  }
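
       /// Returns the total number of scalar elements in the (possibly nested)
       /// homogeneous aggregate built by \p InsertInst, e.g.
       /// {{float, float}, {float, float}} flattens to 4, or std::nullopt if
       /// the aggregate is not homogeneous.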
18032  static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18033    if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18034      return cast<FixedVectorType>(IE->getType())->getNumElements();
18035  
18036    unsigned AggregateSize = 1;
18037    auto *IV = cast<InsertValueInst>(InsertInst);
18038    Type *CurrentType = IV->getType();
18039    do {
18040      if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18041        for (auto *Elt : ST->elements())
18042          if (Elt != ST->getElementType(0)) // check homogeneity
18043            return std::nullopt;
18044        AggregateSize *= ST->getNumElements();
18045        CurrentType = ST->getElementType(0);
18046      } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18047        AggregateSize *= AT->getNumElements();
18048        CurrentType = AT->getElementType();
18049      } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18050        AggregateSize *= VT->getNumElements();
18051        return AggregateSize;
18052      } else if (CurrentType->isSingleValueType()) {
18053        return AggregateSize;
18054      } else {
18055        return std::nullopt;
18056      }
18057    } while (true);
18058  }
18059  
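       /// Helper for findBuildAggregate: walks the chain of insertelement /
       /// insertvalue instructions bottom-up, recording each inserted scalar
       /// in \p BuildVectorOpds and the corresponding insert instruction in
       /// \p InsertElts at its flattened aggregate index (offset by
       /// \p OperandOffset).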
18060  static void findBuildAggregate_rec(Instruction *LastInsertInst,
18061                                     TargetTransformInfo *TTI,
18062                                     SmallVectorImpl<Value *> &BuildVectorOpds,
18063                                     SmallVectorImpl<Value *> &InsertElts,
18064                                     unsigned OperandOffset) {
18065    do {
18066      Value *InsertedOperand = LastInsertInst->getOperand(1);
18067      std::optional<unsigned> OperandIndex =
18068          getElementIndex(LastInsertInst, OperandOffset);
18069      if (!OperandIndex)
18070        return;
18071      if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18072        findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18073                               BuildVectorOpds, InsertElts, *OperandIndex);
18074  
18075      } else {
18076        BuildVectorOpds[*OperandIndex] = InsertedOperand;
18077        InsertElts[*OperandIndex] = LastInsertInst;
18078      }
18079      LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18080    } while (LastInsertInst != nullptr &&
18081             isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18082             LastInsertInst->hasOneUse());
18083  }
18084  
18085  /// Recognize construction of vectors like
18086  ///  %ra = insertelement <4 x float> poison, float %s0, i32 0
18087  ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
18088  ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
18089  ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
18090  ///  starting from the last insertelement or insertvalue instruction.
18091  ///
18092  /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18093  /// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18094  /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18095  ///
18096  /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18097  ///
18098  /// \return true if it matches.
18099  static bool findBuildAggregate(Instruction *LastInsertInst,
18100                                 TargetTransformInfo *TTI,
18101                                 SmallVectorImpl<Value *> &BuildVectorOpds,
18102                                 SmallVectorImpl<Value *> &InsertElts) {
18103  
18104    assert((isa<InsertElementInst>(LastInsertInst) ||
18105            isa<InsertValueInst>(LastInsertInst)) &&
18106           "Expected insertelement or insertvalue instruction!");
18107  
18108    assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18109           "Expected empty result vectors!");
18110  
18111    std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18112    if (!AggregateSize)
18113      return false;
18114    BuildVectorOpds.resize(*AggregateSize);
18115    InsertElts.resize(*AggregateSize);
18116  
18117    findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18118    llvm::erase(BuildVectorOpds, nullptr);
18119    llvm::erase(InsertElts, nullptr);
18120    if (BuildVectorOpds.size() >= 2)
18121      return true;
18122  
18123    return false;
18124  }
18125  
18126  /// Try and get a reduction instruction from a phi node.
18127  ///
18128  /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18129  /// if they come from either \p ParentBB or a containing loop latch.
18130  ///
18131  /// \returns A candidate reduction value if possible, or \code nullptr \endcode
18132  /// if not possible.
18133  static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18134                                        BasicBlock *ParentBB, LoopInfo *LI) {
18135    // There are situations where the reduction value is not dominated by the
18136    // reduction phi. Vectorizing such cases has been reported to cause
18137    // miscompiles. See PR25787.
18138    auto DominatedReduxValue = [&](Value *R) {
18139      return isa<Instruction>(R) &&
18140             DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18141    };
18142  
18143    Instruction *Rdx = nullptr;
18144  
18145    // Return the incoming value if it comes from the same BB as the phi node.
18146    if (P->getIncomingBlock(0) == ParentBB) {
18147      Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18148    } else if (P->getIncomingBlock(1) == ParentBB) {
18149      Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18150    }
18151  
18152    if (Rdx && DominatedReduxValue(Rdx))
18153      return Rdx;
18154  
18155    // Otherwise, check whether we have a loop latch to look at.
18156    Loop *BBL = LI->getLoopFor(ParentBB);
18157    if (!BBL)
18158      return nullptr;
18159    BasicBlock *BBLatch = BBL->getLoopLatch();
18160    if (!BBLatch)
18161      return nullptr;
18162  
18163    // There is a loop latch, return the incoming value if it comes from
18164    // that. This reduction pattern occasionally turns up.
18165    if (P->getIncomingBlock(0) == BBLatch) {
18166      Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18167    } else if (P->getIncomingBlock(1) == BBLatch) {
18168      Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18169    }
18170  
18171    if (Rdx && DominatedReduxValue(Rdx))
18172      return Rdx;
18173  
18174    return nullptr;
18175  }
18176  
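       /// Matches a reduction binary operation: either a plain binary
       /// operator or one of the min/max intrinsics. On success, the two
       /// operands are returned in \p V0 and \p V1.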
18177  static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18178    if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18179      return true;
18180    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18181      return true;
18182    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18183      return true;
18184    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18185      return true;
18186    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18187      return true;
18188    if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18189      return true;
18190    if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18191      return true;
18192    if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18193      return true;
18194    if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18195      return true;
18196    return false;
18197  }
18198  
18199  /// We could have an initial reduction that is not an add.
18200  ///  r *= v1 + v2 + v3 + v4
18201  /// In such a case start looking for a tree rooted in the first '+'.
18202  /// \Returns the new root if found, which may be nullptr if not an instruction.
18203  static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18204                                                   Instruction *Root) {
18205    assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18206            isa<IntrinsicInst>(Root)) &&
18207           "Expected binop, select, or intrinsic for reduction matching");
18208    Value *LHS =
18209        Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18210    Value *RHS =
18211        Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18212    if (LHS == Phi)
18213      return dyn_cast<Instruction>(RHS);
18214    if (RHS == Phi)
18215      return dyn_cast<Instruction>(LHS);
18216    return nullptr;
18217  }
18218  
18219  /// \Returns the first operand of \p I that does not match \p Phi. If the
18220  /// operand is not an instruction, it returns nullptr.
18221  static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18222    Value *Op0 = nullptr;
18223    Value *Op1 = nullptr;
18224    if (!matchRdxBop(I, Op0, Op1))
18225      return nullptr;
18226    return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18227  }
18228  
18229  /// \Returns true if \p I is a candidate instruction for reduction vectorization.
18230  static bool isReductionCandidate(Instruction *I) {
18231    bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18232    Value *B0 = nullptr, *B1 = nullptr;
18233    bool IsBinop = matchRdxBop(I, B0, B1);
18234    return IsBinop || IsSelect;
18235  }
18236  
18237  bool SLPVectorizerPass::vectorizeHorReduction(
18238      PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
18239      TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18240    if (!ShouldVectorizeHor)
18241      return false;
18242    bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18243  
18244    if (Root->getParent() != BB || isa<PHINode>(Root))
18245      return false;
18246  
18247    // If we can find a secondary reduction root, use that instead.
18248    auto SelectRoot = [&]() {
18249      if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18250          HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18251        if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18252          return NewRoot;
18253      return Root;
18254    };
18255  
18256    // Start analysis starting from Root instruction. If horizontal reduction is
18257    // Start the analysis from the Root instruction. If a horizontal reduction
18258    // is found, try to vectorize it. If it is not a horizontal reduction, or
18259    // vectorization is not possible or not effective, and the currently
18260    // analyzed instruction is a binary operation, try to vectorize the
18261    // operands using a pre-order DFS traversal. If the operands were not
18262    // vectorized, repeat the same procedure considering each operand as a
18263    // possible root of a horizontal reduction.
18264    // Interrupt the process if the Root instruction itself was vectorized or
18265    // all sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
18266    // If a horizontal reduction was not matched or vectorized, collect the
18267    // instructions for possible later vectorization attempts.
18268    Stack.emplace(SelectRoot(), 0);
18269    SmallPtrSet<Value *, 8> VisitedInstrs;
18270    bool Res = false;
18271    auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18272      if (R.isAnalyzedReductionRoot(Inst))
18273        return nullptr;
18274      if (!isReductionCandidate(Inst))
18275        return nullptr;
18276      HorizontalReduction HorRdx;
18277      if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18278        return nullptr;
18279      return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18280    };
18281    auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18282      if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18283        FutureSeed = getNonPhiOperand(Root, P);
18284        if (!FutureSeed)
18285          return false;
18286      }
18287      // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18288      // analysis is done separately.
18289      if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18290        PostponedInsts.push_back(FutureSeed);
18291      return true;
18292    };
18293  
18294    while (!Stack.empty()) {
18295      Instruction *Inst;
18296      unsigned Level;
18297      std::tie(Inst, Level) = Stack.front();
18298      Stack.pop();
18299      // Do not try to analyze instruction that has already been vectorized.
18300      // This may happen when we vectorize instruction operands on a previous
18301      // iteration while stack was populated before that happened.
18302      if (R.isDeleted(Inst))
18303        continue;
18304      if (Value *VectorizedV = TryToReduce(Inst)) {
18305        Res = true;
18306        if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18307          // Try to find another reduction.
18308          Stack.emplace(I, Level);
18309          continue;
18310        }
18311        if (R.isDeleted(Inst))
18312          continue;
18313      } else {
18314        // We could not vectorize `Inst` so try to use it as a future seed.
18315        if (!TryAppendToPostponedInsts(Inst)) {
18316          assert(Stack.empty() && "Expected empty stack");
18317          break;
18318        }
18319      }
18320  
18321      // Try to vectorize operands.
18322      // Continue analysis for the instruction from the same basic block only to
18323      // save compile time.
18324      if (++Level < RecursionMaxDepth)
18325        for (auto *Op : Inst->operand_values())
18326          if (VisitedInstrs.insert(Op).second)
18327            if (auto *I = dyn_cast<Instruction>(Op))
18328              // Do not try to vectorize CmpInst operands; this is done
18329              // separately.
18330              if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18331                  !R.isDeleted(I) && I->getParent() == BB)
18332                Stack.emplace(I, Level);
18333    }
18334    return Res;
18335  }
18336  
18337  bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18338                                                   BasicBlock *BB, BoUpSLP &R,
18339                                                   TargetTransformInfo *TTI) {
18340    SmallVector<WeakTrackingVH> PostponedInsts;
18341    bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18342    Res |= tryToVectorize(PostponedInsts, R);
18343    return Res;
18344  }
18345  
18346  bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18347                                         BoUpSLP &R) {
18348    bool Res = false;
18349    for (Value *V : Insts)
18350      if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18351        Res |= tryToVectorize(Inst, R);
18352    return Res;
18353  }
18354  
18355  bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18356                                                   BasicBlock *BB, BoUpSLP &R,
18357                                                   bool MaxVFOnly) {
18358    if (!R.canMapToVector(IVI->getType()))
18359      return false;
18360  
18361    SmallVector<Value *, 16> BuildVectorOpds;
18362    SmallVector<Value *, 16> BuildVectorInsts;
18363    if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18364      return false;
18365  
18366    if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18367      R.getORE()->emit([&]() {
18368        return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18369               << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18370                  "trying reduction first.";
18371      });
18372      return false;
18373    }
18374    LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18375    // Aggregate value is unlikely to be processed in vector register.
18376    return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18377  }
18378  
18379  bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18380                                                     BasicBlock *BB, BoUpSLP &R,
18381                                                     bool MaxVFOnly) {
18382    SmallVector<Value *, 16> BuildVectorInsts;
18383    SmallVector<Value *, 16> BuildVectorOpds;
18384    SmallVector<int> Mask;
18385    if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18386        (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18387         isFixedVectorShuffle(BuildVectorOpds, Mask)))
18388      return false;
18389  
18390    if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18391      R.getORE()->emit([&]() {
18392        return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18393               << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18394                  "trying reduction first.";
18395      });
18396      return false;
18397    }
18398    LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18399    return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18400  }
18401  
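       /// Sorts \p Incoming with \p Comparator, groups consecutive elements
       /// accepted by \p AreCompatible, and feeds each group to
       /// \p TryToVectorizeHelper. Elements that were not vectorized in their
       /// group are accumulated and retried later, ultimately without the
       /// maximal-VF restriction.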
18402  template <typename T>
18403  static bool tryToVectorizeSequence(
18404      SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18405      function_ref<bool(T *, T *)> AreCompatible,
18406      function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18407      bool MaxVFOnly, BoUpSLP &R) {
18408    bool Changed = false;
18409    // Sort by type, parent, operands.
18410    stable_sort(Incoming, Comparator);
18411  
18412    // Try to vectorize elements based on their type.
18413    SmallVector<T *> Candidates;
18414    SmallVector<T *> VL;
18415    for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18416         VL.clear()) {
18417      // Look for the next elements with the same type, parent and operand
18418      // kinds.
18419      auto *I = dyn_cast<Instruction>(*IncIt);
18420      if (!I || R.isDeleted(I)) {
18421        ++IncIt;
18422        continue;
18423      }
18424      auto *SameTypeIt = IncIt;
18425      while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18426                                 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18427                                 AreCompatible(*SameTypeIt, *IncIt))) {
18428        auto *I = dyn_cast<Instruction>(*SameTypeIt);
18429        ++SameTypeIt;
18430        if (I && !R.isDeleted(I))
18431          VL.push_back(cast<T>(I));
18432      }
18433  
18434      // Try to vectorize them.
18435      unsigned NumElts = VL.size();
18436      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18437                        << NumElts << ")\n");
18438      // The vectorization is a 3-stage attempt:
18439      // 1. Try to vectorize instructions with the same/alternate opcodes, using
18440      // the size of the maximal register at first.
18441      // 2. Try to vectorize the remaining instructions with the same type, if
18442      // possible. This may give better vectorization results than trying to
18443      // vectorize only instructions with the same/alternate opcodes.
18444      // 3. Make a final attempt to vectorize all instructions with the
18445      // same/alternate ops only; this may result in some extra final
18446      // vectorization.
18447      if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18448        // Success; start over because instructions might have been changed.
18449        Changed = true;
18450        VL.swap(Candidates);
18451        Candidates.clear();
18452        for (T *V : VL) {
18453          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18454            Candidates.push_back(V);
18455        }
18456      } else {
18457        /// \Returns the minimum number of elements that we will attempt to
18458        /// vectorize.
18459        auto GetMinNumElements = [&R](Value *V) {
18460          unsigned EltSize = R.getVectorElementSize(V);
18461          return std::max(2U, R.getMaxVecRegSize() / EltSize);
18462        };
18463        if (NumElts < GetMinNumElements(*IncIt) &&
18464            (Candidates.empty() ||
18465             Candidates.front()->getType() == (*IncIt)->getType())) {
18466          for (T *V : VL) {
18467            if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18468              Candidates.push_back(V);
18469          }
18470        }
18471      }
18472      // Final attempt to vectorize instructions with the same types.
18473      if (Candidates.size() > 1 &&
18474          (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18475        if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18476          // Success; start over because instructions might have been changed.
18477          Changed = true;
18478        } else if (MaxVFOnly) {
18479          // Try to vectorize using small vectors.
18480          SmallVector<T *> VL;
18481          for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18482               VL.clear()) {
18483            auto *I = dyn_cast<Instruction>(*It);
18484            if (!I || R.isDeleted(I)) {
18485              ++It;
18486              continue;
18487            }
18488            auto *SameTypeIt = It;
18489            while (SameTypeIt != End &&
18490                   (!isa<Instruction>(*SameTypeIt) ||
18491                    R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18492                    AreCompatible(*SameTypeIt, *It))) {
18493              auto *I = dyn_cast<Instruction>(*SameTypeIt);
18494              ++SameTypeIt;
18495              if (I && !R.isDeleted(I))
18496                VL.push_back(cast<T>(I));
18497            }
18498            unsigned NumElts = VL.size();
18499            if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18500                                                    /*MaxVFOnly=*/false))
18501              Changed = true;
18502            It = SameTypeIt;
18503          }
18504        }
18505        Candidates.clear();
18506      }
18507  
18508      // Start over at the next instruction of a different type (or the end).
18509      IncIt = SameTypeIt;
18510    }
18511    return Changed;
18512  }
18513  
18514  /// Compare two cmp instructions. If IsCompatibility is true, the function
18515  /// returns true if the two cmps have the same/swapped predicates and
18516  /// compatible corresponding operands. If IsCompatibility is false, the
18517  /// function implements a strict weak ordering between two cmp instructions,
18518  /// returning true if the first instruction is "less" than the second, i.e.
18519  /// its predicate is less than the predicate of the second or its operand IDs
18520  /// are less than the operand IDs of the second cmp instruction.
18521  template <bool IsCompatibility>
18522  static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18523                         const DominatorTree &DT) {
18524    assert(isValidElementType(V->getType()) &&
18525           isValidElementType(V2->getType()) &&
18526           "Expected valid element types only.");
18527    if (V == V2)
18528      return IsCompatibility;
18529    auto *CI1 = cast<CmpInst>(V);
18530    auto *CI2 = cast<CmpInst>(V2);
18531    if (CI1->getOperand(0)->getType()->getTypeID() <
18532        CI2->getOperand(0)->getType()->getTypeID())
18533      return !IsCompatibility;
18534    if (CI1->getOperand(0)->getType()->getTypeID() >
18535        CI2->getOperand(0)->getType()->getTypeID())
18536      return false;
18537    CmpInst::Predicate Pred1 = CI1->getPredicate();
18538    CmpInst::Predicate Pred2 = CI2->getPredicate();
18539    CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18540    CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18541    CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18542    CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18543    if (BasePred1 < BasePred2)
18544      return !IsCompatibility;
18545    if (BasePred1 > BasePred2)
18546      return false;
18547    // Compare operands.
18548    bool CI1Preds = Pred1 == BasePred1;
18549    bool CI2Preds = Pred2 == BasePred1;
18550    for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18551      auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18552      auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18553      if (Op1 == Op2)
18554        continue;
18555      if (Op1->getValueID() < Op2->getValueID())
18556        return !IsCompatibility;
18557      if (Op1->getValueID() > Op2->getValueID())
18558        return false;
18559      if (auto *I1 = dyn_cast<Instruction>(Op1))
18560        if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18561          if (IsCompatibility) {
18562            if (I1->getParent() != I2->getParent())
18563              return false;
18564          } else {
18565            // Try to compare nodes with same parent.
18566            DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18567            DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18568            if (!NodeI1)
18569              return NodeI2 != nullptr;
18570            if (!NodeI2)
18571              return false;
18572            assert((NodeI1 == NodeI2) ==
18573                       (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18574                   "Different nodes should have different DFS numbers");
18575            if (NodeI1 != NodeI2)
18576              return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18577          }
18578          InstructionsState S = getSameOpcode({I1, I2}, TLI);
18579          if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18580            continue;
18581          if (IsCompatibility)
18582            return false;
18583          if (I1->getOpcode() != I2->getOpcode())
18584            return I1->getOpcode() < I2->getOpcode();
18585        }
18586    }
18587    return IsCompatibility;
18588  }
18589  
18590  template <typename ItT>
18591  bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18592                                            BasicBlock *BB, BoUpSLP &R) {
18593    bool Changed = false;
18594    // Try to find reductions first.
18595    for (CmpInst *I : CmpInsts) {
18596      if (R.isDeleted(I))
18597        continue;
18598      for (Value *Op : I->operands())
18599        if (auto *RootOp = dyn_cast<Instruction>(Op)) {
18600          Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18601          if (R.isDeleted(I))
18602            break;
18603        }
18604    }
18605    // Try to vectorize operands as vector bundles.
18606    for (CmpInst *I : CmpInsts) {
18607      if (R.isDeleted(I))
18608        continue;
18609      Changed |= tryToVectorize(I, R);
18610    }
18611    // Try to vectorize list of compares.
18612    // Sort by type, compare predicate, etc.
18613    auto CompareSorter = [&](Value *V, Value *V2) {
18614      if (V == V2)
18615        return false;
18616      return compareCmp<false>(V, V2, *TLI, *DT);
18617    };
18618  
18619    auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18620      if (V1 == V2)
18621        return true;
18622      return compareCmp<true>(V1, V2, *TLI, *DT);
18623    };
18624  
18625    SmallVector<Value *> Vals;
18626    for (Instruction *V : CmpInsts)
18627      if (!R.isDeleted(V) && isValidElementType(V->getType()))
18628        Vals.push_back(V);
18629    if (Vals.size() <= 1)
18630      return Changed;
18631    Changed |= tryToVectorizeSequence<Value>(
18632        Vals, CompareSorter, AreCompatibleCompares,
18633        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18634          // Exclude possible reductions from other blocks.
18635          bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18636            return any_of(V->users(), [V](User *U) {
18637              auto *Select = dyn_cast<SelectInst>(U);
18638              return Select &&
18639                     Select->getParent() != cast<Instruction>(V)->getParent();
18640            });
18641          });
18642          if (ArePossiblyReducedInOtherBlock)
18643            return false;
18644          return tryToVectorizeList(Candidates, R, MaxVFOnly);
18645        },
18646        /*MaxVFOnly=*/true, R);
18647    return Changed;
18648  }
18649  
18650  bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18651                                           BasicBlock *BB, BoUpSLP &R) {
18652    assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18653           "This function only accepts Insert instructions");
18654    bool OpsChanged = false;
18655    SmallVector<WeakTrackingVH> PostponedInsts;
18656    for (auto *I : reverse(Instructions)) {
18657      // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18658      if (R.isDeleted(I) || isa<CmpInst>(I))
18659        continue;
18660      if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18661        OpsChanged |=
18662            vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18663      } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18664        OpsChanged |=
18665            vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18666      }
18667      // pass2 - try to vectorize reductions only
18668      if (R.isDeleted(I))
18669        continue;
18670      OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18671      if (R.isDeleted(I) || isa<CmpInst>(I))
18672        continue;
18673      // pass3 - try to match and vectorize a buildvector sequence.
18674      if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18675        OpsChanged |=
18676            vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18677      } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18678        OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18679                                                 /*MaxVFOnly=*/false);
18680      }
18681    }
18682    // Now try to vectorize postponed instructions.
18683    OpsChanged |= tryToVectorize(PostponedInsts, R);
18684  
18685    Instructions.clear();
18686    return OpsChanged;
18687  }
18688  
18689  bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18690    bool Changed = false;
18691    SmallVector<Value *, 4> Incoming;
18692    SmallPtrSet<Value *, 16> VisitedInstrs;
18693    // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18694    // node. This makes it easier to identify the chains that can be
18695    // vectorized in a better way.
18696    DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
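         // The comparator below defines a strict weak ordering over the PHIs:
         // first by type ID, then lane by lane over the gathered non-phi
         // operands (instructions ordered by dominance and opcode, then
         // non-undef constants, then other values by value ID, with undefs
         // last).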
18697    auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18698      assert(isValidElementType(V1->getType()) &&
18699             isValidElementType(V2->getType()) &&
18700             "Expected vectorizable types only.");
18701      // It is fine to compare type IDs here, since we expect only vectorizable
18702      // types, like ints, floats and pointers; we don't care about other types.
18703      if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18704        return true;
18705      if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18706        return false;
18707      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18708      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18709      if (Opcodes1.size() < Opcodes2.size())
18710        return true;
18711      if (Opcodes1.size() > Opcodes2.size())
18712        return false;
18713      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18714        {
18715          // Instructions come first.
18716          auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18717          auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18718          if (I1 && I2) {
18719            DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18720            DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18721            if (!NodeI1)
18722              return NodeI2 != nullptr;
18723            if (!NodeI2)
18724              return false;
18725            assert((NodeI1 == NodeI2) ==
18726                       (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18727                   "Different nodes should have different DFS numbers");
18728            if (NodeI1 != NodeI2)
18729              return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18730            InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18731            if (S.getOpcode() && !S.isAltShuffle())
18732              continue;
18733            return I1->getOpcode() < I2->getOpcode();
18734          }
18735          if (I1)
18736            return true;
18737          if (I2)
18738            return false;
18739        }
18740        {
18741          // Non-undef constants come next.
18742          bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18743          bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18744          if (C1 && C2)
18745            continue;
18746          if (C1)
18747            return true;
18748          if (C2)
18749            return false;
18750        }
18751        bool U1 = isa<UndefValue>(Opcodes1[I]);
18752        bool U2 = isa<UndefValue>(Opcodes2[I]);
18753        {
18754          // Non-constant non-instructions come next.
18755          if (!U1 && !U2) {
18756            auto ValID1 = Opcodes1[I]->getValueID();
18757            auto ValID2 = Opcodes2[I]->getValueID();
18758            if (ValID1 == ValID2)
18759              continue;
18760            if (ValID1 < ValID2)
18761              return true;
18762            if (ValID1 > ValID2)
18763              return false;
18764          }
18765          if (!U1)
18766            return true;
18767          if (!U2)
18768            return false;
18769        }
18770        // Undefs come last.
18771        assert(U1 && U2 && "The only thing left should be undef & undef.");
18772        continue;
18773      }
18774      return false;
18775    };
18776    auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18777      if (V1 == V2)
18778        return true;
18779      if (V1->getType() != V2->getType())
18780        return false;
18781      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18782      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18783      if (Opcodes1.size() != Opcodes2.size())
18784        return false;
18785      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18786        // Undefs are compatible with any other value.
18787        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18788          continue;
18789        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18790          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18791            if (R.isDeleted(I1) || R.isDeleted(I2))
18792              return false;
18793            if (I1->getParent() != I2->getParent())
18794              return false;
18795            InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18796            if (S.getOpcode())
18797              continue;
18798            return false;
18799          }
18800        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18801          continue;
18802        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18803          return false;
18804      }
18805      return true;
18806    };
18807  
18808    bool HaveVectorizedPhiNodes = false;
18809    do {
18810      // Collect the incoming values from the PHIs.
18811      Incoming.clear();
18812      for (Instruction &I : *BB) {
18813        auto *P = dyn_cast<PHINode>(&I);
18814        if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18815          break;
18816  
18817        // No need to analyze deleted, vectorized and non-vectorizable
18818        // instructions.
18819        if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18820            isValidElementType(P->getType()))
18821          Incoming.push_back(P);
18822      }
18823  
18824      if (Incoming.size() <= 1)
18825        break;
18826  
18827      // Find the corresponding non-phi nodes for better matching when trying to
18828      // build the tree.
18829      for (Value *V : Incoming) {
18830        SmallVectorImpl<Value *> &Opcodes =
18831            PHIToOpcodes.try_emplace(V).first->getSecond();
18832        if (!Opcodes.empty())
18833          continue;
18834        SmallVector<Value *, 4> Nodes(1, V);
18835        SmallPtrSet<Value *, 4> Visited;
18836        while (!Nodes.empty()) {
18837          auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18838          if (!Visited.insert(PHI).second)
18839            continue;
18840          for (Value *V : PHI->incoming_values()) {
18841            if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18842              Nodes.push_back(PHI1);
18843              continue;
18844            }
18845            Opcodes.emplace_back(V);
18846          }
18847        }
18848      }
18849  
18850      HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18851          Incoming, PHICompare, AreCompatiblePHIs,
18852          [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18853            return tryToVectorizeList(Candidates, R, MaxVFOnly);
18854          },
18855          /*MaxVFOnly=*/true, R);
18856      Changed |= HaveVectorizedPhiNodes;
18857      if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
18858            auto *PHI = dyn_cast<PHINode>(P.first);
18859            return !PHI || R.isDeleted(PHI);
18860          }))
18861        PHIToOpcodes.clear();
18862      VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18863    } while (HaveVectorizedPhiNodes);
18864  
18865    VisitedInstrs.clear();
18866  
18867    InstSetVector PostProcessInserts;
18868    SmallSetVector<CmpInst *, 8> PostProcessCmps;
18869    // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
18870    // true, also vectorizes `PostProcessCmps`.
18871    auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18872      bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18873      if (VectorizeCmps) {
18874        Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18875        PostProcessCmps.clear();
18876      }
18877      PostProcessInserts.clear();
18878      return Changed;
18879    };
18880    // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18881    auto IsInPostProcessInstrs = [&](Instruction *I) {
18882      if (auto *Cmp = dyn_cast<CmpInst>(I))
18883        return PostProcessCmps.contains(Cmp);
18884      return isa<InsertElementInst, InsertValueInst>(I) &&
18885             PostProcessInserts.contains(I);
18886    };
18887    // Returns true if `I` is an instruction without users, such as a
18888    // terminator, a store, or a function call whose return value is ignored
18889    // (determined by the instruction type, except for CallInst and InvokeInst).
18890    auto HasNoUsers = [](Instruction *I) {
18891      return I->use_empty() &&
18892             (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18893    };
18894    for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18895      // Skip instructions with scalable types. The number of elements is
18896      // unknown at compile time for scalable types.
18897      if (isa<ScalableVectorType>(It->getType()))
18898        continue;
18899  
18900      // Skip instructions marked for the deletion.
18901      // Skip instructions marked for deletion.
18902        continue;
18903      // We may go through BB multiple times, so skip the ones already checked.
18904      if (!VisitedInstrs.insert(&*It).second) {
18905        if (HasNoUsers(&*It) &&
18906            VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18907          // We would like to start over since some instructions are deleted
18908          // and the iterator may become invalid.
18909          Changed = true;
18910          It = BB->begin();
18911          E = BB->end();
18912        }
18913        continue;
18914      }
18915  
18916      if (isa<DbgInfoIntrinsic>(It))
18917        continue;
18918  
18919      // Try to vectorize reductions that use PHINodes.
18920      if (PHINode *P = dyn_cast<PHINode>(It)) {
18921        // Check that the PHI is a reduction PHI.
18922        if (P->getNumIncomingValues() == 2) {
18923          // Try to match and vectorize a horizontal reduction.
18924          Instruction *Root = getReductionInstr(DT, P, BB, LI);
18925          if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18926            Changed = true;
18927            It = BB->begin();
18928            E = BB->end();
18929            continue;
18930          }
18931        }
18932        // Try to vectorize the incoming values of the PHI, to catch reductions
18933        // that feed into PHIs.
18934        for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
18935          // Skip if the incoming block is the current BB for now. Also, bypass
18936          // unreachable IR for efficiency and to avoid crashing.
18937          // TODO: Collect the skipped incoming values and try to vectorize them
18938          // after processing BB.
18939          if (BB == P->getIncomingBlock(I) ||
18940              !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18941            continue;
18942  
18943          // Postponed instructions should not be vectorized here, delay their
18944          // vectorization.
18945          if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18946              PI && !IsInPostProcessInstrs(PI)) {
18947            bool Res = vectorizeRootInstruction(nullptr, PI,
18948                                                P->getIncomingBlock(I), R, TTI);
18949            Changed |= Res;
18950            if (Res && R.isDeleted(P)) {
18951              It = BB->begin();
18952              E = BB->end();
18953              break;
18954            }
18955          }
18956        }
18957        continue;
18958      }
18959  
18960      if (HasNoUsers(&*It)) {
18961        bool OpsChanged = false;
18962        auto *SI = dyn_cast<StoreInst>(It);
18963        bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18964        if (SI) {
18965          auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18966          // Try to vectorize the chain in the store, if this is the only
18967          // store to the address in the block.
18968          // TODO: This is just a temporary solution to save compile time. Need
18969          // to investigate if we can safely turn on slp-vectorize-hor-store
18970          // instead to allow lookup for reduction chains in all non-vectorized
18971          // stores (need to check side effects and compile time).
18972          TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18973                                SI->getValueOperand()->hasOneUse();
18974        }
18975        if (TryToVectorizeRoot) {
18976          for (auto *V : It->operand_values()) {
18977          // Postponed instructions should not be vectorized here; delay their
18978          // vectorization.
18979            if (auto *VI = dyn_cast<Instruction>(V);
18980                VI && !IsInPostProcessInstrs(VI))
18981              // Try to match and vectorize a horizontal reduction.
18982              OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18983          }
18984        }
18985        // Start vectorization of post-process list of instructions from the
18986        // top-tree instructions to try to vectorize as many instructions as
18987        // possible.
18988        OpsChanged |=
18989            VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18990        if (OpsChanged) {
18991        // We would like to start over since some instructions are deleted
18992        // and the iterator may have been invalidated.
18993          Changed = true;
18994          It = BB->begin();
18995          E = BB->end();
18996          continue;
18997        }
18998      }
18999  
19000      if (isa<InsertElementInst, InsertValueInst>(It))
19001        PostProcessInserts.insert(&*It);
19002      else if (isa<CmpInst>(It))
19003        PostProcessCmps.insert(cast<CmpInst>(&*It));
19004    }
19005  
19006    return Changed;
19007  }
19008  
19009  bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
19010    auto Changed = false;
19011    for (auto &Entry : GEPs) {
19012      // If the getelementptr list has fewer than two elements, there's nothing
19013      // to do.
19014      if (Entry.second.size() < 2)
19015        continue;
19016  
19017      LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
19018                        << Entry.second.size() << ".\n");
19019  
19020      // Process the GEP list in chunks suitable for the target's supported
19021      // vector size. If a vector register can't hold 1 element, we are done. We
19022      // are trying to vectorize the index computations, so the maximum number of
19023      // elements is based on the size of the index expression, rather than the
19024      // size of the GEP itself (the target's pointer size).
19025      auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
19026        return !R.isDeleted(GEP);
19027      });
19028      if (It == Entry.second.end())
19029        continue;
19030      unsigned MaxVecRegSize = R.getMaxVecRegSize();
19031      unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19032      if (MaxVecRegSize < EltSize)
19033        continue;
19034  
19035      unsigned MaxElts = MaxVecRegSize / EltSize;
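           // Worked example (illustrative numbers only): with a 256-bit maximum
           // vector register and 64-bit index expressions, MaxElts is
           // 256 / 64 = 4, so the GEP list below is processed in chunks of at
           // most four candidates.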
19036      for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19037        auto Len = std::min<unsigned>(BE - BI, MaxElts);
19038        ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19039  
19040        // Initialize a set of candidate getelementptrs. Note that we use a
19041        // SetVector here to preserve program order. If the index computations
19042        // are vectorizable and begin with loads, we want to minimize the chance
19043        // of having to reorder them later.
19044        SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19045  
19046        // Some of the candidates may have already been vectorized (and hence
19047        // marked as deleted) after we initially collected them, or their index
19048        // may have been optimized to a constant value. In either case remove
19049        // them from the set of candidates.
19050        Candidates.remove_if([&R](Value *I) {
19051          return R.isDeleted(cast<Instruction>(I)) ||
19052                 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19053        });
19054  
19055        // Remove from the set of candidates all pairs of getelementptrs with
19056        // constant differences. Such getelementptrs are likely not good
19057        // candidates for vectorization in a bottom-up phase since one can be
19058        // computed from the other. We also ensure all candidate getelementptr
19059        // indices are unique.
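             // Illustration (hypothetical IR, not from this file): given
             //   %g0 = getelementptr inbounds i32, ptr %base, i64 %i
             //   %g1 = getelementptr inbounds i32, ptr %base, i64 %i.next
             // where SCEV proves %i.next == %i + 1, the two addresses differ by
             // a constant, so both getelementptrs are dropped from Candidates.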
19060        for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19061          auto *GEPI = GEPList[I];
19062          if (!Candidates.count(GEPI))
19063            continue;
19064          auto *SCEVI = SE->getSCEV(GEPList[I]);
19065          for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19066            auto *GEPJ = GEPList[J];
19067            auto *SCEVJ = SE->getSCEV(GEPList[J]);
19068            if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19069              Candidates.remove(GEPI);
19070              Candidates.remove(GEPJ);
19071            } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19072              Candidates.remove(GEPJ);
19073            }
19074          }
19075        }
19076  
19077        // We break out of the above computation as soon as we know there are
19078        // fewer than two candidates remaining.
19079        if (Candidates.size() < 2)
19080          continue;
19081  
19082        // Add the single, non-constant index of each candidate to the bundle. We
19083        // ensured the indices met these constraints when we originally collected
19084        // the getelementptrs.
19085        SmallVector<Value *, 16> Bundle(Candidates.size());
19086        auto BundleIndex = 0u;
19087        for (auto *V : Candidates) {
19088          auto *GEP = cast<GetElementPtrInst>(V);
19089          auto *GEPIdx = GEP->idx_begin()->get();
19090          assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19091          Bundle[BundleIndex++] = GEPIdx;
19092        }
19093  
19094        // Try to vectorize the indices. We are currently only interested in
19095        // gather-like cases of the form:
19096        //
19097        // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19098        //
19099        // where the loads of "a", the loads of "b", and the subtractions can be
19100        // performed in parallel. It's likely that detecting this pattern in a
19101        // bottom-up phase will be simpler and less costly than building a
19102        // full-blown top-down phase beginning at the consecutive loads.
19103        Changed |= tryToVectorizeList(Bundle, R);
19104      }
19105    }
19106    return Changed;
19107  }
19108  
19109  bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19110    bool Changed = false;
19111    // Sort by type, base pointer and value operand. Value operands must be
19112    // compatible (have the same opcode and the same parent), otherwise it is
19113    // definitely not profitable to try to vectorize them.
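         // Illustration (hypothetical): two i32 stores whose value operands are
         // 'add' instructions in the same basic block compare as equivalent and
         // end up adjacent after sorting, while a store of a float value is
         // ordered into a different group by its type ID.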
19114    auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19115      if (V->getValueOperand()->getType()->getTypeID() <
19116          V2->getValueOperand()->getType()->getTypeID())
19117        return true;
19118      if (V->getValueOperand()->getType()->getTypeID() >
19119          V2->getValueOperand()->getType()->getTypeID())
19120        return false;
19121      if (V->getPointerOperandType()->getTypeID() <
19122          V2->getPointerOperandType()->getTypeID())
19123        return true;
19124      if (V->getPointerOperandType()->getTypeID() >
19125          V2->getPointerOperandType()->getTypeID())
19126        return false;
19127      // UndefValues are compatible with all other values.
19128      if (isa<UndefValue>(V->getValueOperand()) ||
19129          isa<UndefValue>(V2->getValueOperand()))
19130        return false;
19131      if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19132        if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19133          DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
19134              DT->getNode(I1->getParent());
19135          DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
19136              DT->getNode(I2->getParent());
19137          assert(NodeI1 && "Should only process reachable instructions");
19138          assert(NodeI2 && "Should only process reachable instructions");
19139          assert((NodeI1 == NodeI2) ==
19140                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19141                 "Different nodes should have different DFS numbers");
19142          if (NodeI1 != NodeI2)
19143            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19144          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19145          if (S.getOpcode())
19146            return false;
19147          return I1->getOpcode() < I2->getOpcode();
19148        }
19149      if (isa<Constant>(V->getValueOperand()) &&
19150          isa<Constant>(V2->getValueOperand()))
19151        return false;
19152      return V->getValueOperand()->getValueID() <
19153             V2->getValueOperand()->getValueID();
19154    };
19155  
19156    auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19157      if (V1 == V2)
19158        return true;
19159      if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19160        return false;
19161      if (V1->getPointerOperandType() != V2->getPointerOperandType())
19162        return false;
19163      // Undefs are compatible with any other value.
19164      if (isa<UndefValue>(V1->getValueOperand()) ||
19165          isa<UndefValue>(V2->getValueOperand()))
19166        return true;
19167      if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
19168        if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19169          if (I1->getParent() != I2->getParent())
19170            return false;
19171          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19172          return S.getOpcode() > 0;
19173        }
19174      if (isa<Constant>(V1->getValueOperand()) &&
19175          isa<Constant>(V2->getValueOperand()))
19176        return true;
19177      return V1->getValueOperand()->getValueID() ==
19178             V2->getValueOperand()->getValueID();
19179    };
19180  
19181    // Attempt to sort and vectorize each of the store-groups.
19182    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
19183    for (auto &Pair : Stores) {
19184      if (Pair.second.size() < 2)
19185        continue;
19186  
19187      LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19188                        << Pair.second.size() << ".\n");
19189  
19190      if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
19191        continue;
19192  
19193      // Reverse the stores to do bottom-to-top analysis. This is important when
19194      // there are several stores to the same address: in that case we need to
19195      // follow the store order (reversed, to satisfy the memory dependencies).
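           // Illustration (hypothetical IR): given "store i32 %x, ptr %p"
           // followed later by "store i32 %y, ptr %p", the reversed walk visits
           // the store of %y before the store of %x.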
19196      SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19197                                              Pair.second.rend());
19198      Changed |= tryToVectorizeSequence<StoreInst>(
19199          ReversedStores, StoreSorter, AreCompatibleStores,
19200          [&](ArrayRef<StoreInst *> Candidates, bool) {
19201            return vectorizeStores(Candidates, R, Attempted);
19202          },
19203          /*MaxVFOnly=*/false, R);
19204    }
19205    return Changed;
19206  }
19207