xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 6c05f3a74f30934ee60919cc97e16ec69b542b06)
1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/PriorityQueue.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/ScopeExit.h"
25 #include "llvm/ADT/SetOperations.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallBitVector.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallString.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/ADT/iterator.h"
33 #include "llvm/ADT/iterator_range.h"
34 #include "llvm/Analysis/AliasAnalysis.h"
35 #include "llvm/Analysis/AssumptionCache.h"
36 #include "llvm/Analysis/CodeMetrics.h"
37 #include "llvm/Analysis/ConstantFolding.h"
38 #include "llvm/Analysis/DemandedBits.h"
39 #include "llvm/Analysis/GlobalsModRef.h"
40 #include "llvm/Analysis/IVDescriptors.h"
41 #include "llvm/Analysis/LoopAccessAnalysis.h"
42 #include "llvm/Analysis/LoopInfo.h"
43 #include "llvm/Analysis/MemoryLocation.h"
44 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
45 #include "llvm/Analysis/ScalarEvolution.h"
46 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
47 #include "llvm/Analysis/TargetLibraryInfo.h"
48 #include "llvm/Analysis/TargetTransformInfo.h"
49 #include "llvm/Analysis/ValueTracking.h"
50 #include "llvm/Analysis/VectorUtils.h"
51 #include "llvm/IR/Attributes.h"
52 #include "llvm/IR/BasicBlock.h"
53 #include "llvm/IR/Constant.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DerivedTypes.h"
57 #include "llvm/IR/Dominators.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/IRBuilder.h"
60 #include "llvm/IR/InstrTypes.h"
61 #include "llvm/IR/Instruction.h"
62 #include "llvm/IR/Instructions.h"
63 #include "llvm/IR/IntrinsicInst.h"
64 #include "llvm/IR/Intrinsics.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/User.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/IR/ValueHandle.h"
73 #ifdef EXPENSIVE_CHECKS
74 #include "llvm/IR/Verifier.h"
75 #endif
76 #include "llvm/Pass.h"
77 #include "llvm/Support/Casting.h"
78 #include "llvm/Support/CommandLine.h"
79 #include "llvm/Support/Compiler.h"
80 #include "llvm/Support/DOTGraphTraits.h"
81 #include "llvm/Support/Debug.h"
82 #include "llvm/Support/ErrorHandling.h"
83 #include "llvm/Support/GraphWriter.h"
84 #include "llvm/Support/InstructionCost.h"
85 #include "llvm/Support/KnownBits.h"
86 #include "llvm/Support/MathExtras.h"
87 #include "llvm/Support/raw_ostream.h"
88 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
89 #include "llvm/Transforms/Utils/Local.h"
90 #include "llvm/Transforms/Utils/LoopUtils.h"
91 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
92 #include <algorithm>
93 #include <cassert>
94 #include <cstdint>
95 #include <iterator>
96 #include <memory>
97 #include <optional>
98 #include <set>
99 #include <string>
100 #include <tuple>
101 #include <utility>
102 
103 using namespace llvm;
104 using namespace llvm::PatternMatch;
105 using namespace slpvectorizer;
106 
107 #define SV_NAME "slp-vectorizer"
108 #define DEBUG_TYPE "SLP"
109 
110 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111 
112 static cl::opt<bool>
113     RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114                         cl::desc("Run the SLP vectorization passes"));
115 
116 static cl::opt<bool>
117     SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118              cl::desc("Enable vectorization for wider vector utilization"));
119 
120 static cl::opt<int>
121     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
122                      cl::desc("Only vectorize if you gain more than this "
123                               "number "));
124 
125 static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
126     "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127     cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128              "heuristics and makes vectorization decision via cost modeling."));
129 
130 static cl::opt<bool>
131 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132                    cl::desc("Attempt to vectorize horizontal reductions"));
133 
134 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
135     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136     cl::desc(
137         "Attempt to vectorize horizontal reductions feeding into a store"));
138 
139 // NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140 // even if we match a reduction but do not vectorize in the end.
141 static cl::opt<bool> AllowHorRdxIdenityOptimization(
142     "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143     cl::desc("Allow optimization of original scalar identity operations on "
144              "matched horizontal reductions."));
145 
146 static cl::opt<int>
147 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
148     cl::desc("Attempt to vectorize for this register size in bits"));
149 
150 static cl::opt<unsigned>
151 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
152     cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153 
154 /// Limits the size of scheduling regions in a block.
155 /// It avoid long compile times for _very_ large blocks where vector
156 /// instructions are spread over a wide range.
157 /// This limit is way higher than needed by real-world functions.
158 static cl::opt<int>
159 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160     cl::desc("Limit the size of the SLP scheduling region per block"));
161 
162 static cl::opt<int> MinVectorRegSizeOption(
163     "slp-min-reg-size", cl::init(128), cl::Hidden,
164     cl::desc("Attempt to vectorize for this register size in bits"));
165 
166 static cl::opt<unsigned> RecursionMaxDepth(
167     "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168     cl::desc("Limit the recursion depth when building a vectorizable tree"));
169 
170 static cl::opt<unsigned> MinTreeSize(
171     "slp-min-tree-size", cl::init(3), cl::Hidden,
172     cl::desc("Only vectorize small trees if they are fully vectorizable"));
173 
174 // The maximum depth that the look-ahead score heuristic will explore.
175 // The higher this value, the higher the compilation time overhead.
176 static cl::opt<int> LookAheadMaxDepth(
177     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178     cl::desc("The maximum look-ahead depth for operand reordering scores"));
179 
180 // The maximum depth that the look-ahead score heuristic will explore
181 // when it probing among candidates for vectorization tree roots.
182 // The higher this value, the higher the compilation time overhead but unlike
183 // similar limit for operands ordering this is less frequently used, hence
184 // impact of higher value is less noticeable.
185 static cl::opt<int> RootLookAheadMaxDepth(
186     "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187     cl::desc("The maximum look-ahead depth for searching best rooting option"));
188 
189 static cl::opt<unsigned> MinProfitableStridedLoads(
190     "slp-min-strided-loads", cl::init(2), cl::Hidden,
191     cl::desc("The minimum number of loads, which should be considered strided, "
192              "if the stride is > 1 or is runtime value"));
193 
194 static cl::opt<unsigned> MaxProfitableLoadStride(
195     "slp-max-stride", cl::init(8), cl::Hidden,
196     cl::desc("The maximum stride, considered to be profitable."));
197 
198 static cl::opt<bool>
199     ViewSLPTree("view-slp-tree", cl::Hidden,
200                 cl::desc("Display the SLP trees with Graphviz"));
201 
202 static cl::opt<bool> VectorizeNonPowerOf2(
203     "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204     cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205 
206 // Limit the number of alias checks. The limit is chosen so that
207 // it has no negative effect on the llvm benchmarks.
208 static const unsigned AliasedCheckLimit = 10;
209 
210 // Limit of the number of uses for potentially transformed instructions/values,
211 // used in checks to avoid compile-time explode.
212 static constexpr int UsesLimit = 64;
213 
214 // Another limit for the alias checks: The maximum distance between load/store
215 // instructions where alias checks are done.
216 // This limit is useful for very large basic blocks.
217 static const unsigned MaxMemDepDistance = 160;
218 
219 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220 /// regions to be handled.
221 static const int MinScheduleRegionSize = 16;
222 
223 /// Maximum allowed number of operands in the PHI nodes.
224 static const unsigned MaxPHINumOperands = 128;
225 
226 /// Predicate for the element types that the SLP vectorizer supports.
227 ///
228 /// The most important thing to filter here are types which are invalid in LLVM
229 /// vectors. We also filter target specific types which have absolutely no
230 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
231 /// avoids spending time checking the cost model and realizing that they will
232 /// be inevitably scalarized.
233 static bool isValidElementType(Type *Ty) {
234   // TODO: Support ScalableVectorType.
235   if (SLPReVec && isa<FixedVectorType>(Ty))
236     Ty = Ty->getScalarType();
237   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238          !Ty->isPPC_FP128Ty();
239 }
240 
241 /// \returns the number of elements for Ty.
242 static unsigned getNumElements(Type *Ty) {
243   assert(!isa<ScalableVectorType>(Ty) &&
244          "ScalableVectorType is not supported.");
245   if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246     return VecTy->getNumElements();
247   return 1;
248 }
249 
250 /// \returns the vector type of ScalarTy based on vectorization factor.
251 static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252   return FixedVectorType::get(ScalarTy->getScalarType(),
253                               VF * getNumElements(ScalarTy));
254 }
255 
256 /// \returns True if the value is a constant (but not globals/constant
257 /// expressions).
258 static bool isConstant(Value *V) {
259   return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
260 }
261 
262 /// Checks if \p V is one of vector-like instructions, i.e. undef,
263 /// insertelement/extractelement with constant indices for fixed vector type or
264 /// extractvalue instruction.
265 static bool isVectorLikeInstWithConstOps(Value *V) {
266   if (!isa<InsertElementInst, ExtractElementInst>(V) &&
267       !isa<ExtractValueInst, UndefValue>(V))
268     return false;
269   auto *I = dyn_cast<Instruction>(V);
270   if (!I || isa<ExtractValueInst>(I))
271     return true;
272   if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
273     return false;
274   if (isa<ExtractElementInst>(I))
275     return isConstant(I->getOperand(1));
276   assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277   return isConstant(I->getOperand(2));
278 }
279 
280 /// Returns power-of-2 number of elements in a single register (part), given the
281 /// total number of elements \p Size and number of registers (parts) \p
282 /// NumParts.
283 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284   return PowerOf2Ceil(divideCeil(Size, NumParts));
285 }
286 
287 /// Returns correct remaining number of elements, considering total amount \p
288 /// Size, (power-of-2 number) of elements in a single register \p PartNumElems
289 /// and current register (part) \p Part.
290 static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
291                             unsigned Part) {
292   return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
293 }
294 
295 #if !defined(NDEBUG)
296 /// Print a short descriptor of the instruction bundle suitable for debug output.
297 static std::string shortBundleName(ArrayRef<Value *> VL) {
298   std::string Result;
299   raw_string_ostream OS(Result);
300   OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301   OS.flush();
302   return Result;
303 }
304 #endif
305 
306 /// \returns true if all of the instructions in \p VL are in the same block or
307 /// false otherwise.
308 static bool allSameBlock(ArrayRef<Value *> VL) {
309   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
310   if (!I0)
311     return false;
312   if (all_of(VL, isVectorLikeInstWithConstOps))
313     return true;
314 
315   BasicBlock *BB = I0->getParent();
316   for (int I = 1, E = VL.size(); I < E; I++) {
317     auto *II = dyn_cast<Instruction>(VL[I]);
318     if (!II)
319       return false;
320 
321     if (BB != II->getParent())
322       return false;
323   }
324   return true;
325 }
326 
327 /// \returns True if all of the values in \p VL are constants (but not
328 /// globals/constant expressions).
329 static bool allConstant(ArrayRef<Value *> VL) {
330   // Constant expressions and globals can't be vectorized like normal integer/FP
331   // constants.
332   return all_of(VL, isConstant);
333 }
334 
335 /// \returns True if all of the values in \p VL are identical or some of them
336 /// are UndefValue.
337 static bool isSplat(ArrayRef<Value *> VL) {
338   Value *FirstNonUndef = nullptr;
339   for (Value *V : VL) {
340     if (isa<UndefValue>(V))
341       continue;
342     if (!FirstNonUndef) {
343       FirstNonUndef = V;
344       continue;
345     }
346     if (V != FirstNonUndef)
347       return false;
348   }
349   return FirstNonUndef != nullptr;
350 }
351 
352 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
353 static bool isCommutative(Instruction *I) {
354   if (auto *Cmp = dyn_cast<CmpInst>(I))
355     return Cmp->isCommutative();
356   if (auto *BO = dyn_cast<BinaryOperator>(I))
357     return BO->isCommutative() ||
358            (BO->getOpcode() == Instruction::Sub &&
359             !BO->hasNUsesOrMore(UsesLimit) &&
360             all_of(
361                 BO->uses(),
362                 [](const Use &U) {
363                   // Commutative, if icmp eq/ne sub, 0
364                   ICmpInst::Predicate Pred;
365                   if (match(U.getUser(),
366                             m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
367                       (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
368                     return true;
369                   // Commutative, if abs(sub nsw, true) or abs(sub, false).
370                   ConstantInt *Flag;
371                   return match(U.getUser(),
372                                m_Intrinsic<Intrinsic::abs>(
373                                    m_Specific(U.get()), m_ConstantInt(Flag))) &&
374                          (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
375                           Flag->isOne());
376                 })) ||
377            (BO->getOpcode() == Instruction::FSub &&
378             !BO->hasNUsesOrMore(UsesLimit) &&
379             all_of(BO->uses(), [](const Use &U) {
380               return match(U.getUser(),
381                            m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
382             }));
383   return I->isCommutative();
384 }
385 
386 template <typename T>
387 static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
388                                                      unsigned Offset) {
389   static_assert(std::is_same_v<T, InsertElementInst> ||
390                     std::is_same_v<T, ExtractElementInst>,
391                 "unsupported T");
392   int Index = Offset;
393   if (const auto *IE = dyn_cast<T>(Inst)) {
394     const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
395     if (!VT)
396       return std::nullopt;
397     const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
398     if (!CI)
399       return std::nullopt;
400     if (CI->getValue().uge(VT->getNumElements()))
401       return std::nullopt;
402     Index *= VT->getNumElements();
403     Index += CI->getZExtValue();
404     return Index;
405   }
406   return std::nullopt;
407 }
408 
409 /// \returns inserting or extracting index of InsertElement, ExtractElement or
410 /// InsertValue instruction, using Offset as base offset for index.
411 /// \returns std::nullopt if the index is not an immediate.
412 static std::optional<unsigned> getElementIndex(const Value *Inst,
413                                                unsigned Offset = 0) {
414   if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
415     return Index;
416   if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
417     return Index;
418 
419   int Index = Offset;
420 
421   const auto *IV = dyn_cast<InsertValueInst>(Inst);
422   if (!IV)
423     return std::nullopt;
424 
425   Type *CurrentType = IV->getType();
426   for (unsigned I : IV->indices()) {
427     if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
428       Index *= ST->getNumElements();
429       CurrentType = ST->getElementType(I);
430     } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
431       Index *= AT->getNumElements();
432       CurrentType = AT->getElementType();
433     } else {
434       return std::nullopt;
435     }
436     Index += I;
437   }
438   return Index;
439 }
440 
441 namespace {
442 /// Specifies the way the mask should be analyzed for undefs/poisonous elements
443 /// in the shuffle mask.
444 enum class UseMask {
445   FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
446             ///< check for the mask elements for the first argument (mask
447             ///< indices are in range [0:VF)).
448   SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
449              ///< for the mask elements for the second argument (mask indices
450              ///< are in range [VF:2*VF))
451   UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
452                ///< future shuffle elements and mark them as ones as being used
453                ///< in future. Non-undef elements are considered as unused since
454                ///< they're already marked as used in the mask.
455 };
456 } // namespace
457 
458 /// Prepares a use bitset for the given mask either for the first argument or
459 /// for the second.
460 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461                                    UseMask MaskArg) {
462   SmallBitVector UseMask(VF, true);
463   for (auto [Idx, Value] : enumerate(Mask)) {
464     if (Value == PoisonMaskElem) {
465       if (MaskArg == UseMask::UndefsAsMask)
466         UseMask.reset(Idx);
467       continue;
468     }
469     if (MaskArg == UseMask::FirstArg && Value < VF)
470       UseMask.reset(Value);
471     else if (MaskArg == UseMask::SecondArg && Value >= VF)
472       UseMask.reset(Value - VF);
473   }
474   return UseMask;
475 }
476 
477 /// Checks if the given value is actually an undefined constant vector.
478 /// Also, if the \p UseMask is not empty, tries to check if the non-masked
479 /// elements actually mask the insertelement buildvector, if any.
480 template <bool IsPoisonOnly = false>
481 static SmallBitVector isUndefVector(const Value *V,
482                                     const SmallBitVector &UseMask = {}) {
483   SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
484   using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
485   if (isa<T>(V))
486     return Res;
487   auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
488   if (!VecTy)
489     return Res.reset();
490   auto *C = dyn_cast<Constant>(V);
491   if (!C) {
492     if (!UseMask.empty()) {
493       const Value *Base = V;
494       while (auto *II = dyn_cast<InsertElementInst>(Base)) {
495         Base = II->getOperand(0);
496         if (isa<T>(II->getOperand(1)))
497           continue;
498         std::optional<unsigned> Idx = getElementIndex(II);
499         if (!Idx) {
500           Res.reset();
501           return Res;
502         }
503         if (*Idx < UseMask.size() && !UseMask.test(*Idx))
504           Res.reset(*Idx);
505       }
506       // TODO: Add analysis for shuffles here too.
507       if (V == Base) {
508         Res.reset();
509       } else {
510         SmallBitVector SubMask(UseMask.size(), false);
511         Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
512       }
513     } else {
514       Res.reset();
515     }
516     return Res;
517   }
518   for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
519     if (Constant *Elem = C->getAggregateElement(I))
520       if (!isa<T>(Elem) &&
521           (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
522         Res.reset(I);
523   }
524   return Res;
525 }
526 
527 /// Checks if the vector of instructions can be represented as a shuffle, like:
528 /// %x0 = extractelement <4 x i8> %x, i32 0
529 /// %x3 = extractelement <4 x i8> %x, i32 3
530 /// %y1 = extractelement <4 x i8> %y, i32 1
531 /// %y2 = extractelement <4 x i8> %y, i32 2
532 /// %x0x0 = mul i8 %x0, %x0
533 /// %x3x3 = mul i8 %x3, %x3
534 /// %y1y1 = mul i8 %y1, %y1
535 /// %y2y2 = mul i8 %y2, %y2
536 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540 /// ret <4 x i8> %ins4
541 /// can be transformed into:
542 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543 ///                                                         i32 6>
544 /// %2 = mul <4 x i8> %1, %1
545 /// ret <4 x i8> %2
546 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
547 /// TODO: Can we split off and reuse the shuffle mask detection from
548 /// ShuffleVectorInst/getShuffleCost?
549 static std::optional<TargetTransformInfo::ShuffleKind>
550 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
551   const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
552   if (It == VL.end())
553     return std::nullopt;
554   unsigned Size =
555       std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
556         auto *EI = dyn_cast<ExtractElementInst>(V);
557         if (!EI)
558           return S;
559         auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
560         if (!VTy)
561           return S;
562         return std::max(S, VTy->getNumElements());
563       });
564 
565   Value *Vec1 = nullptr;
566   Value *Vec2 = nullptr;
567   bool HasNonUndefVec = any_of(VL, [](Value *V) {
568     auto *EE = dyn_cast<ExtractElementInst>(V);
569     if (!EE)
570       return false;
571     Value *Vec = EE->getVectorOperand();
572     if (isa<UndefValue>(Vec))
573       return false;
574     return isGuaranteedNotToBePoison(Vec);
575   });
576   enum ShuffleMode { Unknown, Select, Permute };
577   ShuffleMode CommonShuffleMode = Unknown;
578   Mask.assign(VL.size(), PoisonMaskElem);
579   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
580     // Undef can be represented as an undef element in a vector.
581     if (isa<UndefValue>(VL[I]))
582       continue;
583     auto *EI = cast<ExtractElementInst>(VL[I]);
584     if (isa<ScalableVectorType>(EI->getVectorOperandType()))
585       return std::nullopt;
586     auto *Vec = EI->getVectorOperand();
587     // We can extractelement from undef or poison vector.
588     if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
589       continue;
590     // All vector operands must have the same number of vector elements.
591     if (isa<UndefValue>(Vec)) {
592       Mask[I] = I;
593     } else {
594       if (isa<UndefValue>(EI->getIndexOperand()))
595         continue;
596       auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
597       if (!Idx)
598         return std::nullopt;
599       // Undefined behavior if Idx is negative or >= Size.
600       if (Idx->getValue().uge(Size))
601         continue;
602       unsigned IntIdx = Idx->getValue().getZExtValue();
603       Mask[I] = IntIdx;
604     }
605     if (isUndefVector(Vec).all() && HasNonUndefVec)
606       continue;
607     // For correct shuffling we have to have at most 2 different vector operands
608     // in all extractelement instructions.
609     if (!Vec1 || Vec1 == Vec) {
610       Vec1 = Vec;
611     } else if (!Vec2 || Vec2 == Vec) {
612       Vec2 = Vec;
613       Mask[I] += Size;
614     } else {
615       return std::nullopt;
616     }
617     if (CommonShuffleMode == Permute)
618       continue;
619     // If the extract index is not the same as the operation number, it is a
620     // permutation.
621     if (Mask[I] % Size != I) {
622       CommonShuffleMode = Permute;
623       continue;
624     }
625     CommonShuffleMode = Select;
626   }
627   // If we're not crossing lanes in different vectors, consider it as blending.
628   if (CommonShuffleMode == Select && Vec2)
629     return TargetTransformInfo::SK_Select;
630   // If Vec2 was never used, we have a permutation of a single vector, otherwise
631   // we have permutation of 2 vectors.
632   return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
633               : TargetTransformInfo::SK_PermuteSingleSrc;
634 }
635 
636 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
637 static std::optional<unsigned> getExtractIndex(Instruction *E) {
638   unsigned Opcode = E->getOpcode();
639   assert((Opcode == Instruction::ExtractElement ||
640           Opcode == Instruction::ExtractValue) &&
641          "Expected extractelement or extractvalue instruction.");
642   if (Opcode == Instruction::ExtractElement) {
643     auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
644     if (!CI)
645       return std::nullopt;
646     return CI->getZExtValue();
647   }
648   auto *EI = cast<ExtractValueInst>(E);
649   if (EI->getNumIndices() != 1)
650     return std::nullopt;
651   return *EI->idx_begin();
652 }
653 
654 namespace {
655 
656 /// Main data required for vectorization of instructions.
657 struct InstructionsState {
658   /// The very first instruction in the list with the main opcode.
659   Value *OpValue = nullptr;
660 
661   /// The main/alternate instruction.
662   Instruction *MainOp = nullptr;
663   Instruction *AltOp = nullptr;
664 
665   /// The main/alternate opcodes for the list of instructions.
666   unsigned getOpcode() const {
667     return MainOp ? MainOp->getOpcode() : 0;
668   }
669 
670   unsigned getAltOpcode() const {
671     return AltOp ? AltOp->getOpcode() : 0;
672   }
673 
674   /// Some of the instructions in the list have alternate opcodes.
675   bool isAltShuffle() const { return AltOp != MainOp; }
676 
677   bool isOpcodeOrAlt(Instruction *I) const {
678     unsigned CheckedOpcode = I->getOpcode();
679     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680   }
681 
682   InstructionsState() = delete;
683   InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685 };
686 
687 } // end anonymous namespace
688 
689 /// Chooses the correct key for scheduling data. If \p Op has the same (or
690 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691 /// OpValue.
692 static Value *isOneOf(const InstructionsState &S, Value *Op) {
693   auto *I = dyn_cast<Instruction>(Op);
694   if (I && S.isOpcodeOrAlt(I))
695     return Op;
696   return S.OpValue;
697 }
698 
699 /// \returns true if \p Opcode is allowed as part of the main/alternate
700 /// instruction for SLP vectorization.
701 ///
702 /// Example of unsupported opcode is SDIV that can potentially cause UB if the
703 /// "shuffled out" lane would result in division by zero.
704 static bool isValidForAlternation(unsigned Opcode) {
705   if (Instruction::isIntDivRem(Opcode))
706     return false;
707 
708   return true;
709 }
710 
711 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712                                        const TargetLibraryInfo &TLI,
713                                        unsigned BaseIndex = 0);
714 
715 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716 /// compatible instructions or constants, or just some other regular values.
717 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718                                 Value *Op1, const TargetLibraryInfo &TLI) {
719   return (isConstant(BaseOp0) && isConstant(Op0)) ||
720          (isConstant(BaseOp1) && isConstant(Op1)) ||
721          (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
722           !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
723          BaseOp0 == Op0 || BaseOp1 == Op1 ||
724          getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
725          getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
726 }
727 
728 /// \returns true if a compare instruction \p CI has similar "look" and
729 /// same predicate as \p BaseCI, "as is" or with its operands and predicate
730 /// swapped, false otherwise.
731 static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732                                const TargetLibraryInfo &TLI) {
733   assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734          "Assessing comparisons of different types?");
735   CmpInst::Predicate BasePred = BaseCI->getPredicate();
736   CmpInst::Predicate Pred = CI->getPredicate();
737   CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
738 
739   Value *BaseOp0 = BaseCI->getOperand(0);
740   Value *BaseOp1 = BaseCI->getOperand(1);
741   Value *Op0 = CI->getOperand(0);
742   Value *Op1 = CI->getOperand(1);
743 
744   return (BasePred == Pred &&
745           areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746          (BasePred == SwappedPred &&
747           areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
748 }
749 
750 /// \returns analysis of the Instructions in \p VL described in
751 /// InstructionsState, the Opcode that we suppose the whole list
752 /// could be vectorized even if its structure is diverse.
753 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754                                        const TargetLibraryInfo &TLI,
755                                        unsigned BaseIndex) {
756   // Make sure these are all Instructions.
757   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
758     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759 
760   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
761   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
762   bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
763   CmpInst::Predicate BasePred =
764       IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
765               : CmpInst::BAD_ICMP_PREDICATE;
766   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
767   unsigned AltOpcode = Opcode;
768   unsigned AltIndex = BaseIndex;
769 
770   bool SwappedPredsCompatible = [&]() {
771     if (!IsCmpOp)
772       return false;
773     SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
774     UniquePreds.insert(BasePred);
775     UniqueNonSwappedPreds.insert(BasePred);
776     for (Value *V : VL) {
777       auto *I = dyn_cast<CmpInst>(V);
778       if (!I)
779         return false;
780       CmpInst::Predicate CurrentPred = I->getPredicate();
781       CmpInst::Predicate SwappedCurrentPred =
782           CmpInst::getSwappedPredicate(CurrentPred);
783       UniqueNonSwappedPreds.insert(CurrentPred);
784       if (!UniquePreds.contains(CurrentPred) &&
785           !UniquePreds.contains(SwappedCurrentPred))
786         UniquePreds.insert(CurrentPred);
787     }
788     // Total number of predicates > 2, but if consider swapped predicates
789     // compatible only 2, consider swappable predicates as compatible opcodes,
790     // not alternate.
791     return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
792   }();
793   // Check for one alternate opcode from another BinaryOperator.
794   // TODO - generalize to support all operators (types, calls etc.).
795   auto *IBase = cast<Instruction>(VL[BaseIndex]);
796   Intrinsic::ID BaseID = 0;
797   SmallVector<VFInfo> BaseMappings;
798   if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
799     BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
800     BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
801     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
802       return InstructionsState(VL[BaseIndex], nullptr, nullptr);
803   }
804   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
805     auto *I = cast<Instruction>(VL[Cnt]);
806     unsigned InstOpcode = I->getOpcode();
807     if (IsBinOp && isa<BinaryOperator>(I)) {
808       if (InstOpcode == Opcode || InstOpcode == AltOpcode)
809         continue;
810       if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
811           isValidForAlternation(Opcode)) {
812         AltOpcode = InstOpcode;
813         AltIndex = Cnt;
814         continue;
815       }
816     } else if (IsCastOp && isa<CastInst>(I)) {
817       Value *Op0 = IBase->getOperand(0);
818       Type *Ty0 = Op0->getType();
819       Value *Op1 = I->getOperand(0);
820       Type *Ty1 = Op1->getType();
821       if (Ty0 == Ty1) {
822         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
823           continue;
824         if (Opcode == AltOpcode) {
825           assert(isValidForAlternation(Opcode) &&
826                  isValidForAlternation(InstOpcode) &&
827                  "Cast isn't safe for alternation, logic needs to be updated!");
828           AltOpcode = InstOpcode;
829           AltIndex = Cnt;
830           continue;
831         }
832       }
833     } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
834       auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
835       Type *Ty0 = BaseInst->getOperand(0)->getType();
836       Type *Ty1 = Inst->getOperand(0)->getType();
837       if (Ty0 == Ty1) {
838         assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
839         // Check for compatible operands. If the corresponding operands are not
840         // compatible - need to perform alternate vectorization.
841         CmpInst::Predicate CurrentPred = Inst->getPredicate();
842         CmpInst::Predicate SwappedCurrentPred =
843             CmpInst::getSwappedPredicate(CurrentPred);
844 
845         if ((E == 2 || SwappedPredsCompatible) &&
846             (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
847           continue;
848 
849         if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
850           continue;
851         auto *AltInst = cast<CmpInst>(VL[AltIndex]);
852         if (AltIndex != BaseIndex) {
853           if (isCmpSameOrSwapped(AltInst, Inst, TLI))
854             continue;
855         } else if (BasePred != CurrentPred) {
856           assert(
857               isValidForAlternation(InstOpcode) &&
858               "CmpInst isn't safe for alternation, logic needs to be updated!");
859           AltIndex = Cnt;
860           continue;
861         }
862         CmpInst::Predicate AltPred = AltInst->getPredicate();
863         if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
864             AltPred == CurrentPred || AltPred == SwappedCurrentPred)
865           continue;
866       }
867     } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
868       if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
869         if (Gep->getNumOperands() != 2 ||
870             Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
871           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
872       } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
873         if (!isVectorLikeInstWithConstOps(EI))
874           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875       } else if (auto *LI = dyn_cast<LoadInst>(I)) {
876         auto *BaseLI = cast<LoadInst>(IBase);
877         if (!LI->isSimple() || !BaseLI->isSimple())
878           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879       } else if (auto *Call = dyn_cast<CallInst>(I)) {
880         auto *CallBase = cast<CallInst>(IBase);
881         if (Call->getCalledFunction() != CallBase->getCalledFunction())
882           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
883         if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
884             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
885                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
886                         CallBase->op_begin() +
887                             CallBase->getBundleOperandsStartIndex())))
888           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889         Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
890         if (ID != BaseID)
891           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892         if (!ID) {
893           SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
894           if (Mappings.size() != BaseMappings.size() ||
895               Mappings.front().ISA != BaseMappings.front().ISA ||
896               Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
897               Mappings.front().VectorName != BaseMappings.front().VectorName ||
898               Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
899               Mappings.front().Shape.Parameters !=
900                   BaseMappings.front().Shape.Parameters)
901             return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902         }
903       }
904       continue;
905     }
906     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907   }
908 
909   return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
910                            cast<Instruction>(VL[AltIndex]));
911 }
912 
913 /// \returns true if all of the values in \p VL have the same type or false
914 /// otherwise.
915 static bool allSameType(ArrayRef<Value *> VL) {
916   Type *Ty = VL.front()->getType();
917   return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
918 }
919 
920 /// \returns True if in-tree use also needs extract. This refers to
921 /// possible scalar operand in vectorized instruction.
922 static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923                                         TargetLibraryInfo *TLI) {
924   unsigned Opcode = UserInst->getOpcode();
925   switch (Opcode) {
926   case Instruction::Load: {
927     LoadInst *LI = cast<LoadInst>(UserInst);
928     return (LI->getPointerOperand() == Scalar);
929   }
930   case Instruction::Store: {
931     StoreInst *SI = cast<StoreInst>(UserInst);
932     return (SI->getPointerOperand() == Scalar);
933   }
934   case Instruction::Call: {
935     CallInst *CI = cast<CallInst>(UserInst);
936     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937     return any_of(enumerate(CI->args()), [&](auto &&Arg) {
938       return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939              Arg.value().get() == Scalar;
940     });
941   }
942   default:
943     return false;
944   }
945 }
946 
947 /// \returns the AA location that is being access by the instruction.
948 static MemoryLocation getLocation(Instruction *I) {
949   if (StoreInst *SI = dyn_cast<StoreInst>(I))
950     return MemoryLocation::get(SI);
951   if (LoadInst *LI = dyn_cast<LoadInst>(I))
952     return MemoryLocation::get(LI);
953   return MemoryLocation();
954 }
955 
956 /// \returns True if the instruction is not a volatile or atomic load/store.
957 static bool isSimple(Instruction *I) {
958   if (LoadInst *LI = dyn_cast<LoadInst>(I))
959     return LI->isSimple();
960   if (StoreInst *SI = dyn_cast<StoreInst>(I))
961     return SI->isSimple();
962   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
963     return !MI->isVolatile();
964   return true;
965 }
966 
967 /// Shuffles \p Mask in accordance with the given \p SubMask.
968 /// \param ExtendingManyInputs Supports reshuffling of the mask with not only
969 /// one but two input vectors.
970 static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
971                     bool ExtendingManyInputs = false) {
972   if (SubMask.empty())
973     return;
974   assert(
975       (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
976        // Check if input scalars were extended to match the size of other node.
977        (SubMask.size() == Mask.size() &&
978         std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
979                     [](int Idx) { return Idx == PoisonMaskElem; }))) &&
980       "SubMask with many inputs support must be larger than the mask.");
981   if (Mask.empty()) {
982     Mask.append(SubMask.begin(), SubMask.end());
983     return;
984   }
985   SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
986   int TermValue = std::min(Mask.size(), SubMask.size());
987   for (int I = 0, E = SubMask.size(); I < E; ++I) {
988     if (SubMask[I] == PoisonMaskElem ||
989         (!ExtendingManyInputs &&
990          (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
991       continue;
992     NewMask[I] = Mask[SubMask[I]];
993   }
994   Mask.swap(NewMask);
995 }
996 
997 /// Order may have elements assigned special value (size) which is out of
998 /// bounds. Such indices only appear on places which correspond to undef values
999 /// (see canReuseExtract for details) and used in order to avoid undef values
1000 /// have effect on operands ordering.
1001 /// The first loop below simply finds all unused indices and then the next loop
1002 /// nest assigns these indices for undef values positions.
1003 /// As an example below Order has two undef positions and they have assigned
1004 /// values 3 and 7 respectively:
1005 /// before:  6 9 5 4 9 2 1 0
1006 /// after:   6 3 5 4 7 2 1 0
1007 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008   const unsigned Sz = Order.size();
1009   SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010   SmallBitVector MaskedIndices(Sz);
1011   for (unsigned I = 0; I < Sz; ++I) {
1012     if (Order[I] < Sz)
1013       UnusedIndices.reset(Order[I]);
1014     else
1015       MaskedIndices.set(I);
1016   }
1017   if (MaskedIndices.none())
1018     return;
1019   assert(UnusedIndices.count() == MaskedIndices.count() &&
1020          "Non-synced masked/available indices.");
1021   int Idx = UnusedIndices.find_first();
1022   int MIdx = MaskedIndices.find_first();
1023   while (MIdx >= 0) {
1024     assert(Idx >= 0 && "Indices must be synced.");
1025     Order[MIdx] = Idx;
1026     Idx = UnusedIndices.find_next(Idx);
1027     MIdx = MaskedIndices.find_next(MIdx);
1028   }
1029 }
1030 
1031 /// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032 /// Opcode1.
1033 SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034                                unsigned Opcode1) {
1035   SmallBitVector OpcodeMask(VL.size(), false);
1036   for (unsigned Lane : seq<unsigned>(VL.size()))
1037     if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1038       OpcodeMask.set(Lane);
1039   return OpcodeMask;
1040 }
1041 
1042 namespace llvm {
1043 
1044 static void inversePermutation(ArrayRef<unsigned> Indices,
1045                                SmallVectorImpl<int> &Mask) {
1046   Mask.clear();
1047   const unsigned E = Indices.size();
1048   Mask.resize(E, PoisonMaskElem);
1049   for (unsigned I = 0; I < E; ++I)
1050     Mask[Indices[I]] = I;
1051 }
1052 
1053 /// Reorders the list of scalars in accordance with the given \p Mask.
1054 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055                            ArrayRef<int> Mask) {
1056   assert(!Mask.empty() && "Expected non-empty mask.");
1057   SmallVector<Value *> Prev(Scalars.size(),
1058                             PoisonValue::get(Scalars.front()->getType()));
1059   Prev.swap(Scalars);
1060   for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061     if (Mask[I] != PoisonMaskElem)
1062       Scalars[Mask[I]] = Prev[I];
1063 }
1064 
1065 /// Checks if the provided value does not require scheduling. It does not
1066 /// require scheduling if this is not an instruction or it is an instruction
1067 /// that does not read/write memory and all operands are either not instructions
1068 /// or phi nodes or instructions from different blocks.
1069 static bool areAllOperandsNonInsts(Value *V) {
1070   auto *I = dyn_cast<Instruction>(V);
1071   if (!I)
1072     return true;
1073   return !mayHaveNonDefUseDependency(*I) &&
1074     all_of(I->operands(), [I](Value *V) {
1075       auto *IO = dyn_cast<Instruction>(V);
1076       if (!IO)
1077         return true;
1078       return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1079     });
1080 }
1081 
1082 /// Checks if the provided value does not require scheduling. It does not
1083 /// require scheduling if this is not an instruction or it is an instruction
1084 /// that does not read/write memory and all users are phi nodes or instructions
1085 /// from the different blocks.
1086 static bool isUsedOutsideBlock(Value *V) {
1087   auto *I = dyn_cast<Instruction>(V);
1088   if (!I)
1089     return true;
1090   // Limits the number of uses to save compile time.
1091   return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1092          all_of(I->users(), [I](User *U) {
1093            auto *IU = dyn_cast<Instruction>(U);
1094            if (!IU)
1095              return true;
1096            return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1097          });
1098 }
1099 
1100 /// Checks if the specified value does not require scheduling. It does not
1101 /// require scheduling if all operands and all users do not need to be scheduled
1102 /// in the current basic block.
1103 static bool doesNotNeedToBeScheduled(Value *V) {
1104   return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105 }
1106 
1107 /// Checks if the specified array of instructions does not require scheduling.
1108 /// It is so if all either instructions have operands that do not require
1109 /// scheduling or their users do not require scheduling since they are phis or
1110 /// in other basic blocks.
1111 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112   return !VL.empty() &&
1113          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1114 }
1115 
1116 namespace slpvectorizer {
1117 
1118 /// Bottom Up SLP Vectorizer.
1119 class BoUpSLP {
1120   struct TreeEntry;
1121   struct ScheduleData;
1122   class ShuffleCostEstimator;
1123   class ShuffleInstructionBuilder;
1124 
1125 public:
1126   /// Tracks the state we can represent the loads in the given sequence.
1127   enum class LoadsState {
1128     Gather,
1129     Vectorize,
1130     ScatterVectorize,
1131     StridedVectorize
1132   };
1133 
1134   using ValueList = SmallVector<Value *, 8>;
1135   using InstrList = SmallVector<Instruction *, 16>;
1136   using ValueSet = SmallPtrSet<Value *, 16>;
1137   using StoreList = SmallVector<StoreInst *, 8>;
1138   using ExtraValueToDebugLocsMap =
1139       MapVector<Value *, SmallVector<Instruction *, 2>>;
1140   using OrdersType = SmallVector<unsigned, 4>;
1141 
1142   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1143           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1144           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1145           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1146       : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1147         AC(AC), DB(DB), DL(DL), ORE(ORE),
1148         Builder(Se->getContext(), TargetFolder(*DL)) {
1149     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1150     // Use the vector register size specified by the target unless overridden
1151     // by a command-line option.
1152     // TODO: It would be better to limit the vectorization factor based on
1153     //       data type rather than just register size. For example, x86 AVX has
1154     //       256-bit registers, but it does not support integer operations
1155     //       at that width (that requires AVX2).
1156     if (MaxVectorRegSizeOption.getNumOccurrences())
1157       MaxVecRegSize = MaxVectorRegSizeOption;
1158     else
1159       MaxVecRegSize =
1160           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1161               .getFixedValue();
1162 
1163     if (MinVectorRegSizeOption.getNumOccurrences())
1164       MinVecRegSize = MinVectorRegSizeOption;
1165     else
1166       MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1167   }
1168 
1169   /// Vectorize the tree that starts with the elements in \p VL.
1170   /// Returns the vectorized root.
1171   Value *vectorizeTree();
1172 
1173   /// Vectorize the tree but with the list of externally used values \p
1174   /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1175   /// generated extractvalue instructions.
1176   /// \param ReplacedExternals containd list of replaced external values
1177   /// {scalar, replace} after emitting extractelement for external uses.
1178   Value *
1179   vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180                 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181                 Instruction *ReductionRoot = nullptr);
1182 
1183   /// \returns the cost incurred by unwanted spills and fills, caused by
1184   /// holding live values over call sites.
1185   InstructionCost getSpillCost() const;
1186 
1187   /// \returns the vectorization cost of the subtree that starts at \p VL.
1188   /// A negative number means that this is profitable.
1189   InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190 
1191   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193   void buildTree(ArrayRef<Value *> Roots,
1194                  const SmallDenseSet<Value *> &UserIgnoreLst);
1195 
1196   /// Construct a vectorizable tree that starts at \p Roots.
1197   void buildTree(ArrayRef<Value *> Roots);
1198 
1199   /// Returns whether the root node has in-tree uses.
1200   bool doesRootHaveInTreeUses() const {
1201     return !VectorizableTree.empty() &&
1202            !VectorizableTree.front()->UserTreeIndices.empty();
1203   }
1204 
1205   /// Return the scalars of the root node.
1206   ArrayRef<Value *> getRootNodeScalars() const {
1207     assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208     return VectorizableTree.front()->Scalars;
1209   }
1210 
1211   /// Checks if the root graph node can be emitted with narrower bitwidth at
1212   /// codegen and returns it signedness, if so.
1213   bool isSignedMinBitwidthRootNode() const {
1214     return MinBWs.at(VectorizableTree.front().get()).second;
1215   }
1216 
1217   /// Builds external uses of the vectorized scalars, i.e. the list of
1218   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1219   /// ExternallyUsedValues contains additional list of external uses to handle
1220   /// vectorization of reductions.
1221   void
1222   buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223 
1224   /// Transforms graph nodes to target specific representations, if profitable.
1225   void transformNodes();
1226 
1227   /// Clear the internal data structures that are created by 'buildTree'.
1228   void deleteTree() {
1229     VectorizableTree.clear();
1230     ScalarToTreeEntry.clear();
1231     MultiNodeScalars.clear();
1232     MustGather.clear();
1233     NonScheduledFirst.clear();
1234     EntryToLastInstruction.clear();
1235     ExternalUses.clear();
1236     ExternalUsesAsGEPs.clear();
1237     for (auto &Iter : BlocksSchedules) {
1238       BlockScheduling *BS = Iter.second.get();
1239       BS->clear();
1240     }
1241     MinBWs.clear();
1242     ReductionBitWidth = 0;
1243     CastMaxMinBWSizes.reset();
1244     ExtraBitWidthNodes.clear();
1245     InstrElementSize.clear();
1246     UserIgnoreList = nullptr;
1247     PostponedGathers.clear();
1248     ValueToGatherNodes.clear();
1249   }
1250 
1251   unsigned getTreeSize() const { return VectorizableTree.size(); }
1252 
1253   /// Perform LICM and CSE on the newly generated gather sequences.
1254   void optimizeGatherSequence();
1255 
1256   /// Checks if the specified gather tree entry \p TE can be represented as a
1257   /// shuffled vector entry + (possibly) permutation with other gathers. It
1258   /// implements the checks only for possibly ordered scalars (Loads,
1259   /// ExtractElement, ExtractValue), which can be part of the graph.
1260   std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261 
1262   /// Sort loads into increasing pointers offsets to allow greater clustering.
1263   std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264 
1265   /// Gets reordering data for the given tree entry. If the entry is vectorized
1266   /// - just return ReorderIndices, otherwise check if the scalars can be
1267   /// reordered and return the most optimal order.
1268   /// \return std::nullopt if ordering is not important, empty order, if
1269   /// identity order is important, or the actual order.
1270   /// \param TopToBottom If true, include the order of vectorized stores and
1271   /// insertelement nodes, otherwise skip them.
1272   std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273                                               bool TopToBottom);
1274 
1275   /// Reorders the current graph to the most profitable order starting from the
1276   /// root node to the leaf nodes. The best order is chosen only from the nodes
1277   /// of the same size (vectorization factor). Smaller nodes are considered
1278   /// parts of subgraph with smaller VF and they are reordered independently. We
1279   /// can make it because we still need to extend smaller nodes to the wider VF
1280   /// and we can merge reordering shuffles with the widening shuffles.
1281   void reorderTopToBottom();
1282 
1283   /// Reorders the current graph to the most profitable order starting from
1284   /// leaves to the root. It allows to rotate small subgraphs and reduce the
1285   /// number of reshuffles if the leaf nodes use the same order. In this case we
1286   /// can merge the orders and just shuffle user node instead of shuffling its
1287   /// operands. Plus, even the leaf nodes have different orders, it allows to
1288   /// sink reordering in the graph closer to the root node and merge it later
1289   /// during analysis.
1290   void reorderBottomToTop(bool IgnoreReorder = false);
1291 
1292   /// \return The vector element size in bits to use when vectorizing the
1293   /// expression tree ending at \p V. If V is a store, the size is the width of
1294   /// the stored value. Otherwise, the size is the width of the largest loaded
1295   /// value reaching V. This method is used by the vectorizer to calculate
1296   /// vectorization factors.
1297   unsigned getVectorElementSize(Value *V);
1298 
1299   /// Compute the minimum type sizes required to represent the entries in a
1300   /// vectorizable tree.
1301   void computeMinimumValueSizes();
1302 
1303   // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1304   unsigned getMaxVecRegSize() const {
1305     return MaxVecRegSize;
1306   }
1307 
1308   // \returns minimum vector register size as set by cl::opt.
1309   unsigned getMinVecRegSize() const {
1310     return MinVecRegSize;
1311   }
1312 
1313   unsigned getMinVF(unsigned Sz) const {
1314     return std::max(2U, getMinVecRegSize() / Sz);
1315   }
1316 
1317   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318     unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319       MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320     return MaxVF ? MaxVF : UINT_MAX;
1321   }
1322 
1323   /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324   /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325   /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326   /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327   ///
1328   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329   unsigned canMapToVector(Type *T) const;
1330 
1331   /// \returns True if the VectorizableTree is both tiny and not fully
1332   /// vectorizable. We do not vectorize such trees.
1333   bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334 
1335   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336   /// can be load combined in the backend. Load combining may not be allowed in
1337   /// the IR optimizer, so we do not want to alter the pattern. For example,
1338   /// partially transforming a scalar bswap() pattern into vector code is
1339   /// effectively impossible for the backend to undo.
1340   /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341   ///       may not be necessary.
1342   bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343 
1344   /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345   /// can be load combined in the backend. Load combining may not be allowed in
1346   /// the IR optimizer, so we do not want to alter the pattern. For example,
1347   /// partially transforming a scalar bswap() pattern into vector code is
1348   /// effectively impossible for the backend to undo.
1349   /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350   ///       may not be necessary.
1351   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352 
1353   /// Checks if the given array of loads can be represented as a vectorized,
1354   /// scatter or just simple gather.
1355   /// \param VL list of loads.
1356   /// \param VL0 main load value.
1357   /// \param Order returned order of load instructions.
1358   /// \param PointerOps returned list of pointer operands.
1359   /// \param TryRecursiveCheck used to check if long masked gather can be
1360   /// represented as a serie of loads/insert subvector, if profitable.
1361   LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362                                SmallVectorImpl<unsigned> &Order,
1363                                SmallVectorImpl<Value *> &PointerOps,
1364                                bool TryRecursiveCheck = true) const;
1365 
1366   OptimizationRemarkEmitter *getORE() { return ORE; }
1367 
1368   /// This structure holds any data we need about the edges being traversed
1369   /// during buildTree_rec(). We keep track of:
1370   /// (i) the user TreeEntry index, and
1371   /// (ii) the index of the edge.
1372   struct EdgeInfo {
1373     EdgeInfo() = default;
1374     EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1375         : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1376     /// The user TreeEntry.
1377     TreeEntry *UserTE = nullptr;
1378     /// The operand index of the use.
1379     unsigned EdgeIdx = UINT_MAX;
1380 #ifndef NDEBUG
1381     friend inline raw_ostream &operator<<(raw_ostream &OS,
1382                                           const BoUpSLP::EdgeInfo &EI) {
1383       EI.dump(OS);
1384       return OS;
1385     }
1386     /// Debug print.
1387     void dump(raw_ostream &OS) const {
1388       OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1389          << " EdgeIdx:" << EdgeIdx << "}";
1390     }
1391     LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1392 #endif
1393     bool operator == (const EdgeInfo &Other) const {
1394       return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1395     }
1396   };
1397 
1398   /// A helper class used for scoring candidates for two consecutive lanes.
1399   class LookAheadHeuristics {
1400     const TargetLibraryInfo &TLI;
1401     const DataLayout &DL;
1402     ScalarEvolution &SE;
1403     const BoUpSLP &R;
1404     int NumLanes; // Total number of lanes (aka vectorization factor).
1405     int MaxLevel; // The maximum recursion depth for accumulating score.
1406 
1407   public:
1408     LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1409                         ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1410                         int MaxLevel)
1411         : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1412           MaxLevel(MaxLevel) {}
1413 
1414     // The hard-coded scores listed here are not very important, though it shall
1415     // be higher for better matches to improve the resulting cost. When
1416     // computing the scores of matching one sub-tree with another, we are
1417     // basically counting the number of values that are matching. So even if all
1418     // scores are set to 1, we would still get a decent matching result.
1419     // However, sometimes we have to break ties. For example we may have to
1420     // choose between matching loads vs matching opcodes. This is what these
1421     // scores are helping us with: they provide the order of preference. Also,
1422     // this is important if the scalar is externally used or used in another
1423     // tree entry node in the different lane.
1424 
1425     /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426     static const int ScoreConsecutiveLoads = 4;
1427     /// The same load multiple times. This should have a better score than
1428     /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1429     /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1430     /// a vector load and 1.0 for a broadcast.
1431     static const int ScoreSplatLoads = 3;
1432     /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433     static const int ScoreReversedLoads = 3;
1434     /// A load candidate for masked gather.
1435     static const int ScoreMaskedGatherCandidate = 1;
1436     /// ExtractElementInst from same vector and consecutive indexes.
1437     static const int ScoreConsecutiveExtracts = 4;
1438     /// ExtractElementInst from same vector and reversed indices.
1439     static const int ScoreReversedExtracts = 3;
1440     /// Constants.
1441     static const int ScoreConstants = 2;
1442     /// Instructions with the same opcode.
1443     static const int ScoreSameOpcode = 2;
1444     /// Instructions with alt opcodes (e.g, add + sub).
1445     static const int ScoreAltOpcodes = 1;
1446     /// Identical instructions (a.k.a. splat or broadcast).
1447     static const int ScoreSplat = 1;
1448     /// Matching with an undef is preferable to failing.
1449     static const int ScoreUndef = 1;
1450     /// Score for failing to find a decent match.
1451     static const int ScoreFail = 0;
1452     /// Score if all users are vectorized.
1453     static const int ScoreAllUserVectorized = 1;
1454 
1455     /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456     /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457     /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458     /// MainAltOps.
1459     int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1460                         ArrayRef<Value *> MainAltOps) const {
1461       if (!isValidElementType(V1->getType()) ||
1462           !isValidElementType(V2->getType()))
1463         return LookAheadHeuristics::ScoreFail;
1464 
1465       if (V1 == V2) {
1466         if (isa<LoadInst>(V1)) {
1467           // Retruns true if the users of V1 and V2 won't need to be extracted.
1468           auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1469             // Bail out if we have too many uses to save compilation time.
1470             if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1471               return false;
1472 
1473             auto AllUsersVectorized = [U1, U2, this](Value *V) {
1474               return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1475                 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1476               });
1477             };
1478             return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1479           };
1480           // A broadcast of a load can be cheaper on some targets.
1481           if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1482                                           ElementCount::getFixed(NumLanes)) &&
1483               ((int)V1->getNumUses() == NumLanes ||
1484                AllUsersAreInternal(V1, V2)))
1485             return LookAheadHeuristics::ScoreSplatLoads;
1486         }
1487         return LookAheadHeuristics::ScoreSplat;
1488       }
1489 
1490       auto CheckSameEntryOrFail = [&]() {
1491         if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1492             TE1 && TE1 == R.getTreeEntry(V2))
1493           return LookAheadHeuristics::ScoreSplatLoads;
1494         return LookAheadHeuristics::ScoreFail;
1495       };
1496 
1497       auto *LI1 = dyn_cast<LoadInst>(V1);
1498       auto *LI2 = dyn_cast<LoadInst>(V2);
1499       if (LI1 && LI2) {
1500         if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1501             !LI2->isSimple())
1502           return CheckSameEntryOrFail();
1503 
1504         std::optional<int> Dist = getPointersDiff(
1505             LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1506             LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1507         if (!Dist || *Dist == 0) {
1508           if (getUnderlyingObject(LI1->getPointerOperand()) ==
1509                   getUnderlyingObject(LI2->getPointerOperand()) &&
1510               R.TTI->isLegalMaskedGather(
1511                   getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1512             return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1513           return CheckSameEntryOrFail();
1514         }
1515         // The distance is too large - still may be profitable to use masked
1516         // loads/gathers.
1517         if (std::abs(*Dist) > NumLanes / 2)
1518           return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1519         // This still will detect consecutive loads, but we might have "holes"
1520         // in some cases. It is ok for non-power-2 vectorization and may produce
1521         // better results. It should not affect current vectorization.
1522         return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1523                            : LookAheadHeuristics::ScoreReversedLoads;
1524       }
1525 
1526       auto *C1 = dyn_cast<Constant>(V1);
1527       auto *C2 = dyn_cast<Constant>(V2);
1528       if (C1 && C2)
1529         return LookAheadHeuristics::ScoreConstants;
1530 
1531       // Extracts from consecutive indexes of the same vector better score as
1532       // the extracts could be optimized away.
1533       Value *EV1;
1534       ConstantInt *Ex1Idx;
1535       if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1536         // Undefs are always profitable for extractelements.
1537         // Compiler can easily combine poison and extractelement <non-poison> or
1538         // undef and extractelement <poison>. But combining undef +
1539         // extractelement <non-poison-but-may-produce-poison> requires some
1540         // extra operations.
1541         if (isa<UndefValue>(V2))
1542           return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1543                      ? LookAheadHeuristics::ScoreConsecutiveExtracts
1544                      : LookAheadHeuristics::ScoreSameOpcode;
1545         Value *EV2 = nullptr;
1546         ConstantInt *Ex2Idx = nullptr;
1547         if (match(V2,
1548                   m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1549                                                          m_Undef())))) {
1550           // Undefs are always profitable for extractelements.
1551           if (!Ex2Idx)
1552             return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553           if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1554             return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555           if (EV2 == EV1) {
1556             int Idx1 = Ex1Idx->getZExtValue();
1557             int Idx2 = Ex2Idx->getZExtValue();
1558             int Dist = Idx2 - Idx1;
1559             // The distance is too large - still may be profitable to use
1560             // shuffles.
1561             if (std::abs(Dist) == 0)
1562               return LookAheadHeuristics::ScoreSplat;
1563             if (std::abs(Dist) > NumLanes / 2)
1564               return LookAheadHeuristics::ScoreSameOpcode;
1565             return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566                               : LookAheadHeuristics::ScoreReversedExtracts;
1567           }
1568           return LookAheadHeuristics::ScoreAltOpcodes;
1569         }
1570         return CheckSameEntryOrFail();
1571       }
1572 
1573       auto *I1 = dyn_cast<Instruction>(V1);
1574       auto *I2 = dyn_cast<Instruction>(V2);
1575       if (I1 && I2) {
1576         if (I1->getParent() != I2->getParent())
1577           return CheckSameEntryOrFail();
1578         SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579         Ops.push_back(I1);
1580         Ops.push_back(I2);
1581         InstructionsState S = getSameOpcode(Ops, TLI);
1582         // Note: Only consider instructions with <= 2 operands to avoid
1583         // complexity explosion.
1584         if (S.getOpcode() &&
1585             (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586              !S.isAltShuffle()) &&
1587             all_of(Ops, [&S](Value *V) {
1588               return cast<Instruction>(V)->getNumOperands() ==
1589                      S.MainOp->getNumOperands();
1590             }))
1591           return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592                                   : LookAheadHeuristics::ScoreSameOpcode;
1593       }
1594 
1595       if (isa<UndefValue>(V2))
1596         return LookAheadHeuristics::ScoreUndef;
1597 
1598       return CheckSameEntryOrFail();
1599     }
1600 
1601     /// Go through the operands of \p LHS and \p RHS recursively until
1602     /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
1603     /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604     /// of \p U1 and \p U2), except at the beginning of the recursion where
1605     /// these are set to nullptr.
1606     ///
1607     /// For example:
1608     /// \verbatim
1609     ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
1610     ///     \ /         \ /         \ /        \ /
1611     ///      +           +           +          +
1612     ///     G1          G2          G3         G4
1613     /// \endverbatim
1614     /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615     /// each level recursively, accumulating the score. It starts from matching
1616     /// the additions at level 0, then moves on to the loads (level 1). The
1617     /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618     /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619     /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620     /// Please note that the order of the operands does not matter, as we
1621     /// evaluate the score of all profitable combinations of operands. In
1622     /// other words the score of G1 and G4 is the same as G1 and G2. This
1623     /// heuristic is based on ideas described in:
1624     ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
1625     ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626     ///   Luís F. W. Góes
1627     int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628                            Instruction *U2, int CurrLevel,
1629                            ArrayRef<Value *> MainAltOps) const {
1630 
1631       // Get the shallow score of V1 and V2.
1632       int ShallowScoreAtThisLevel =
1633           getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1634 
1635       // If reached MaxLevel,
1636       //  or if V1 and V2 are not instructions,
1637       //  or if they are SPLAT,
1638       //  or if they are not consecutive,
1639       //  or if profitable to vectorize loads or extractelements, early return
1640       //  the current cost.
1641       auto *I1 = dyn_cast<Instruction>(LHS);
1642       auto *I2 = dyn_cast<Instruction>(RHS);
1643       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1644           ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1645           (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1646             (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1647             (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1648            ShallowScoreAtThisLevel))
1649         return ShallowScoreAtThisLevel;
1650       assert(I1 && I2 && "Should have early exited.");
1651 
1652       // Contains the I2 operand indexes that got matched with I1 operands.
1653       SmallSet<unsigned, 4> Op2Used;
1654 
1655       // Recursion towards the operands of I1 and I2. We are trying all possible
1656       // operand pairs, and keeping track of the best score.
1657       for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1658            OpIdx1 != NumOperands1; ++OpIdx1) {
1659         // Try to pair op1I with the best operand of I2.
1660         int MaxTmpScore = 0;
1661         unsigned MaxOpIdx2 = 0;
1662         bool FoundBest = false;
1663         // If I2 is commutative try all combinations.
1664         unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1665         unsigned ToIdx = isCommutative(I2)
1666                              ? I2->getNumOperands()
1667                              : std::min(I2->getNumOperands(), OpIdx1 + 1);
1668         assert(FromIdx <= ToIdx && "Bad index");
1669         for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1670           // Skip operands already paired with OpIdx1.
1671           if (Op2Used.count(OpIdx2))
1672             continue;
1673           // Recursively calculate the cost at each level
1674           int TmpScore =
1675               getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1676                                  I1, I2, CurrLevel + 1, std::nullopt);
1677           // Look for the best score.
1678           if (TmpScore > LookAheadHeuristics::ScoreFail &&
1679               TmpScore > MaxTmpScore) {
1680             MaxTmpScore = TmpScore;
1681             MaxOpIdx2 = OpIdx2;
1682             FoundBest = true;
1683           }
1684         }
1685         if (FoundBest) {
1686           // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1687           Op2Used.insert(MaxOpIdx2);
1688           ShallowScoreAtThisLevel += MaxTmpScore;
1689         }
1690       }
1691       return ShallowScoreAtThisLevel;
1692     }
1693   };
1694   /// A helper data structure to hold the operands of a vector of instructions.
1695   /// This supports a fixed vector length for all operand vectors.
1696   class VLOperands {
1697     /// For each operand we need (i) the value, and (ii) the opcode that it
1698     /// would be attached to if the expression was in a left-linearized form.
1699     /// This is required to avoid illegal operand reordering.
1700     /// For example:
1701     /// \verbatim
1702     ///                         0 Op1
1703     ///                         |/
1704     /// Op1 Op2   Linearized    + Op2
1705     ///   \ /     ---------->   |/
1706     ///    -                    -
1707     ///
1708     /// Op1 - Op2            (0 + Op1) - Op2
1709     /// \endverbatim
1710     ///
1711     /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712     ///
1713     /// Another way to think of this is to track all the operations across the
1714     /// path from the operand all the way to the root of the tree and to
1715     /// calculate the operation that corresponds to this path. For example, the
1716     /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717     /// corresponding operation is a '-' (which matches the one in the
1718     /// linearized tree, as shown above).
1719     ///
1720     /// For lack of a better term, we refer to this operation as Accumulated
1721     /// Path Operation (APO).
1722     struct OperandData {
1723       OperandData() = default;
1724       OperandData(Value *V, bool APO, bool IsUsed)
1725           : V(V), APO(APO), IsUsed(IsUsed) {}
1726       /// The operand value.
1727       Value *V = nullptr;
1728       /// TreeEntries only allow a single opcode, or an alternate sequence of
1729       /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1730       /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1731       /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1732       /// (e.g., Add/Mul)
1733       bool APO = false;
1734       /// Helper data for the reordering function.
1735       bool IsUsed = false;
1736     };
1737 
1738     /// During operand reordering, we are trying to select the operand at lane
1739     /// that matches best with the operand at the neighboring lane. Our
1740     /// selection is based on the type of value we are looking for. For example,
1741     /// if the neighboring lane has a load, we need to look for a load that is
1742     /// accessing a consecutive address. These strategies are summarized in the
1743     /// 'ReorderingMode' enumerator.
1744     enum class ReorderingMode {
1745       Load,     ///< Matching loads to consecutive memory addresses
1746       Opcode,   ///< Matching instructions based on opcode (same or alternate)
1747       Constant, ///< Matching constants
1748       Splat,    ///< Matching the same instruction multiple times (broadcast)
1749       Failed,   ///< We failed to create a vectorizable group
1750     };
1751 
1752     using OperandDataVec = SmallVector<OperandData, 2>;
1753 
1754     /// A vector of operand vectors.
1755     SmallVector<OperandDataVec, 4> OpsVec;
1756 
1757     const TargetLibraryInfo &TLI;
1758     const DataLayout &DL;
1759     ScalarEvolution &SE;
1760     const BoUpSLP &R;
1761     const Loop *L = nullptr;
1762 
1763     /// \returns the operand data at \p OpIdx and \p Lane.
1764     OperandData &getData(unsigned OpIdx, unsigned Lane) {
1765       return OpsVec[OpIdx][Lane];
1766     }
1767 
1768     /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1769     const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1770       return OpsVec[OpIdx][Lane];
1771     }
1772 
1773     /// Clears the used flag for all entries.
1774     void clearUsed() {
1775       for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1776            OpIdx != NumOperands; ++OpIdx)
1777         for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1778              ++Lane)
1779           OpsVec[OpIdx][Lane].IsUsed = false;
1780     }
1781 
1782     /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1783     void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1784       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1785     }
1786 
1787     /// \param Lane lane of the operands under analysis.
1788     /// \param OpIdx operand index in \p Lane lane we're looking the best
1789     /// candidate for.
1790     /// \param Idx operand index of the current candidate value.
1791     /// \returns The additional score due to possible broadcasting of the
1792     /// elements in the lane. It is more profitable to have power-of-2 unique
1793     /// elements in the lane, it will be vectorized with higher probability
1794     /// after removing duplicates. Currently the SLP vectorizer supports only
1795     /// vectorization of the power-of-2 number of unique scalars.
1796     int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797       Value *IdxLaneV = getData(Idx, Lane).V;
1798       if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799         return 0;
1800       SmallPtrSet<Value *, 4> Uniques;
1801       for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802         if (Ln == Lane)
1803           continue;
1804         Value *OpIdxLnV = getData(OpIdx, Ln).V;
1805         if (!isa<Instruction>(OpIdxLnV))
1806           return 0;
1807         Uniques.insert(OpIdxLnV);
1808       }
1809       int UniquesCount = Uniques.size();
1810       int UniquesCntWithIdxLaneV =
1811           Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812       Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813       int UniquesCntWithOpIdxLaneV =
1814           Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815       if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816         return 0;
1817       return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1818               UniquesCntWithOpIdxLaneV) -
1819              (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820     }
1821 
1822     /// \param Lane lane of the operands under analysis.
1823     /// \param OpIdx operand index in \p Lane lane we're looking the best
1824     /// candidate for.
1825     /// \param Idx operand index of the current candidate value.
1826     /// \returns The additional score for the scalar which users are all
1827     /// vectorized.
1828     int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829       Value *IdxLaneV = getData(Idx, Lane).V;
1830       Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831       // Do not care about number of uses for vector-like instructions
1832       // (extractelement/extractvalue with constant indices), they are extracts
1833       // themselves and already externally used. Vectorization of such
1834       // instructions does not add extra extractelement instruction, just may
1835       // remove it.
1836       if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1837           isVectorLikeInstWithConstOps(OpIdxLaneV))
1838         return LookAheadHeuristics::ScoreAllUserVectorized;
1839       auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1840       if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1841         return 0;
1842       return R.areAllUsersVectorized(IdxLaneI)
1843                  ? LookAheadHeuristics::ScoreAllUserVectorized
1844                  : 0;
1845     }
1846 
1847     /// Score scaling factor for fully compatible instructions but with
1848     /// different number of external uses. Allows better selection of the
1849     /// instructions with less external uses.
1850     static const int ScoreScaleFactor = 10;
1851 
1852     /// \Returns the look-ahead score, which tells us how much the sub-trees
1853     /// rooted at \p LHS and \p RHS match, the more they match the higher the
1854     /// score. This helps break ties in an informed way when we cannot decide on
1855     /// the order of the operands by just considering the immediate
1856     /// predecessors.
1857     int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858                           int Lane, unsigned OpIdx, unsigned Idx,
1859                           bool &IsUsed) {
1860       LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861                                     LookAheadMaxDepth);
1862       // Keep track of the instruction stack as we recurse into the operands
1863       // during the look-ahead score exploration.
1864       int Score =
1865           LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866                                        /*CurrLevel=*/1, MainAltOps);
1867       if (Score) {
1868         int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869         if (Score <= -SplatScore) {
1870           // Set the minimum score for splat-like sequence to avoid setting
1871           // failed state.
1872           Score = 1;
1873         } else {
1874           Score += SplatScore;
1875           // Scale score to see the difference between different operands
1876           // and similar operands but all vectorized/not all vectorized
1877           // uses. It does not affect actual selection of the best
1878           // compatible operand in general, just allows to select the
1879           // operand with all vectorized uses.
1880           Score *= ScoreScaleFactor;
1881           Score += getExternalUseScore(Lane, OpIdx, Idx);
1882           IsUsed = true;
1883         }
1884       }
1885       return Score;
1886     }
1887 
1888     /// Best defined scores per lanes between the passes. Used to choose the
1889     /// best operand (with the highest score) between the passes.
1890     /// The key - {Operand Index, Lane}.
1891     /// The value - the best score between the passes for the lane and the
1892     /// operand.
1893     SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894         BestScoresPerLanes;
1895 
1896     // Search all operands in Ops[*][Lane] for the one that matches best
1897     // Ops[OpIdx][LastLane] and return its opreand index.
1898     // If no good match can be found, return std::nullopt.
1899     std::optional<unsigned>
1900     getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901                    ArrayRef<ReorderingMode> ReorderingModes,
1902                    ArrayRef<Value *> MainAltOps) {
1903       unsigned NumOperands = getNumOperands();
1904 
1905       // The operand of the previous lane at OpIdx.
1906       Value *OpLastLane = getData(OpIdx, LastLane).V;
1907 
1908       // Our strategy mode for OpIdx.
1909       ReorderingMode RMode = ReorderingModes[OpIdx];
1910       if (RMode == ReorderingMode::Failed)
1911         return std::nullopt;
1912 
1913       // The linearized opcode of the operand at OpIdx, Lane.
1914       bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915 
1916       // The best operand index and its score.
1917       // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918       // are using the score to differentiate between the two.
1919       struct BestOpData {
1920         std::optional<unsigned> Idx;
1921         unsigned Score = 0;
1922       } BestOp;
1923       BestOp.Score =
1924           BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1925               .first->second;
1926 
1927       // Track if the operand must be marked as used. If the operand is set to
1928       // Score 1 explicitly (because of non power-of-2 unique scalars, we may
1929       // want to reestimate the operands again on the following iterations).
1930       bool IsUsed = RMode == ReorderingMode::Splat ||
1931                     RMode == ReorderingMode::Constant ||
1932                     RMode == ReorderingMode::Load;
1933       // Iterate through all unused operands and look for the best.
1934       for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935         // Get the operand at Idx and Lane.
1936         OperandData &OpData = getData(Idx, Lane);
1937         Value *Op = OpData.V;
1938         bool OpAPO = OpData.APO;
1939 
1940         // Skip already selected operands.
1941         if (OpData.IsUsed)
1942           continue;
1943 
1944         // Skip if we are trying to move the operand to a position with a
1945         // different opcode in the linearized tree form. This would break the
1946         // semantics.
1947         if (OpAPO != OpIdxAPO)
1948           continue;
1949 
1950         // Look for an operand that matches the current mode.
1951         switch (RMode) {
1952         case ReorderingMode::Load:
1953         case ReorderingMode::Opcode: {
1954           bool LeftToRight = Lane > LastLane;
1955           Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956           Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1957           int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1958                                         OpIdx, Idx, IsUsed);
1959           if (Score > static_cast<int>(BestOp.Score) ||
1960               (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1961                Idx == OpIdx)) {
1962             BestOp.Idx = Idx;
1963             BestOp.Score = Score;
1964             BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1965           }
1966           break;
1967         }
1968         case ReorderingMode::Constant:
1969           if (isa<Constant>(Op) ||
1970               (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1971             BestOp.Idx = Idx;
1972             if (isa<Constant>(Op)) {
1973               BestOp.Score = LookAheadHeuristics::ScoreConstants;
1974               BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1975                   LookAheadHeuristics::ScoreConstants;
1976             }
1977             if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1978               IsUsed = false;
1979           }
1980           break;
1981         case ReorderingMode::Splat:
1982           if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1983             IsUsed = Op == OpLastLane;
1984             if (Op == OpLastLane) {
1985               BestOp.Score = LookAheadHeuristics::ScoreSplat;
1986               BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1987                   LookAheadHeuristics::ScoreSplat;
1988             }
1989             BestOp.Idx = Idx;
1990           }
1991           break;
1992         case ReorderingMode::Failed:
1993           llvm_unreachable("Not expected Failed reordering mode.");
1994         }
1995       }
1996 
1997       if (BestOp.Idx) {
1998         getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1999         return BestOp.Idx;
2000       }
2001       // If we could not find a good match return std::nullopt.
2002       return std::nullopt;
2003     }
2004 
2005     /// Helper for reorderOperandVecs.
2006     /// \returns the lane that we should start reordering from. This is the one
2007     /// which has the least number of operands that can freely move about or
2008     /// less profitable because it already has the most optimal set of operands.
2009     unsigned getBestLaneToStartReordering() const {
2010       unsigned Min = UINT_MAX;
2011       unsigned SameOpNumber = 0;
2012       // std::pair<unsigned, unsigned> is used to implement a simple voting
2013       // algorithm and choose the lane with the least number of operands that
2014       // can freely move about or less profitable because it already has the
2015       // most optimal set of operands. The first unsigned is a counter for
2016       // voting, the second unsigned is the counter of lanes with instructions
2017       // with same/alternate opcodes and same parent basic block.
2018       MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2019       // Try to be closer to the original results, if we have multiple lanes
2020       // with same cost. If 2 lanes have the same cost, use the one with the
2021       // lowest index.
2022       for (int I = getNumLanes(); I > 0; --I) {
2023         unsigned Lane = I - 1;
2024         OperandsOrderData NumFreeOpsHash =
2025             getMaxNumOperandsThatCanBeReordered(Lane);
2026         // Compare the number of operands that can move and choose the one with
2027         // the least number.
2028         if (NumFreeOpsHash.NumOfAPOs < Min) {
2029           Min = NumFreeOpsHash.NumOfAPOs;
2030           SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2031           HashMap.clear();
2032           HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2033         } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2034                    NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2035           // Select the most optimal lane in terms of number of operands that
2036           // should be moved around.
2037           SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2038           HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2039         } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2040                    NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2041           auto *It = HashMap.find(NumFreeOpsHash.Hash);
2042           if (It == HashMap.end())
2043             HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2044           else
2045             ++It->second.first;
2046         }
2047       }
2048       // Select the lane with the minimum counter.
2049       unsigned BestLane = 0;
2050       unsigned CntMin = UINT_MAX;
2051       for (const auto &Data : reverse(HashMap)) {
2052         if (Data.second.first < CntMin) {
2053           CntMin = Data.second.first;
2054           BestLane = Data.second.second;
2055         }
2056       }
2057       return BestLane;
2058     }
2059 
2060     /// Data structure that helps to reorder operands.
2061     struct OperandsOrderData {
2062       /// The best number of operands with the same APOs, which can be
2063       /// reordered.
2064       unsigned NumOfAPOs = UINT_MAX;
2065       /// Number of operands with the same/alternate instruction opcode and
2066       /// parent.
2067       unsigned NumOpsWithSameOpcodeParent = 0;
2068       /// Hash for the actual operands ordering.
2069       /// Used to count operands, actually their position id and opcode
2070       /// value. It is used in the voting mechanism to find the lane with the
2071       /// least number of operands that can freely move about or less profitable
2072       /// because it already has the most optimal set of operands. Can be
2073       /// replaced with SmallVector<unsigned> instead but hash code is faster
2074       /// and requires less memory.
2075       unsigned Hash = 0;
2076     };
2077     /// \returns the maximum number of operands that are allowed to be reordered
2078     /// for \p Lane and the number of compatible instructions(with the same
2079     /// parent/opcode). This is used as a heuristic for selecting the first lane
2080     /// to start operand reordering.
2081     OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2082       unsigned CntTrue = 0;
2083       unsigned NumOperands = getNumOperands();
2084       // Operands with the same APO can be reordered. We therefore need to count
2085       // how many of them we have for each APO, like this: Cnt[APO] = x.
2086       // Since we only have two APOs, namely true and false, we can avoid using
2087       // a map. Instead we can simply count the number of operands that
2088       // correspond to one of them (in this case the 'true' APO), and calculate
2089       // the other by subtracting it from the total number of operands.
2090       // Operands with the same instruction opcode and parent are more
2091       // profitable since we don't need to move them in many cases, with a high
2092       // probability such lane already can be vectorized effectively.
2093       bool AllUndefs = true;
2094       unsigned NumOpsWithSameOpcodeParent = 0;
2095       Instruction *OpcodeI = nullptr;
2096       BasicBlock *Parent = nullptr;
2097       unsigned Hash = 0;
2098       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2099         const OperandData &OpData = getData(OpIdx, Lane);
2100         if (OpData.APO)
2101           ++CntTrue;
2102         // Use Boyer-Moore majority voting for finding the majority opcode and
2103         // the number of times it occurs.
2104         if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2105           if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2106               I->getParent() != Parent) {
2107             if (NumOpsWithSameOpcodeParent == 0) {
2108               NumOpsWithSameOpcodeParent = 1;
2109               OpcodeI = I;
2110               Parent = I->getParent();
2111             } else {
2112               --NumOpsWithSameOpcodeParent;
2113             }
2114           } else {
2115             ++NumOpsWithSameOpcodeParent;
2116           }
2117         }
2118         Hash = hash_combine(
2119             Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120         AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2121       }
2122       if (AllUndefs)
2123         return {};
2124       OperandsOrderData Data;
2125       Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2126       Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127       Data.Hash = Hash;
2128       return Data;
2129     }
2130 
2131     /// Go through the instructions in VL and append their operands.
2132     void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133       assert(!VL.empty() && "Bad VL");
2134       assert((empty() || VL.size() == getNumLanes()) &&
2135              "Expected same number of lanes");
2136       assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137       unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2138       constexpr unsigned IntrinsicNumOperands = 2;
2139       if (isa<IntrinsicInst>(VL[0]))
2140         NumOperands = IntrinsicNumOperands;
2141       OpsVec.resize(NumOperands);
2142       unsigned NumLanes = VL.size();
2143       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144         OpsVec[OpIdx].resize(NumLanes);
2145         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146           assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147           // Our tree has just 3 nodes: the root and two operands.
2148           // It is therefore trivial to get the APO. We only need to check the
2149           // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150           // RHS operand. The LHS operand of both add and sub is never attached
2151           // to an inversese operation in the linearized form, therefore its APO
2152           // is false. The RHS is true only if VL[Lane] is an inverse operation.
2153 
2154           // Since operand reordering is performed on groups of commutative
2155           // operations or alternating sequences (e.g., +, -), we can safely
2156           // tell the inverse operations by checking commutativity.
2157           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2158           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159           OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2160                                  APO, false};
2161         }
2162       }
2163     }
2164 
2165     /// \returns the number of operands.
2166     unsigned getNumOperands() const { return OpsVec.size(); }
2167 
2168     /// \returns the number of lanes.
2169     unsigned getNumLanes() const { return OpsVec[0].size(); }
2170 
2171     /// \returns the operand value at \p OpIdx and \p Lane.
2172     Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173       return getData(OpIdx, Lane).V;
2174     }
2175 
2176     /// \returns true if the data structure is empty.
2177     bool empty() const { return OpsVec.empty(); }
2178 
2179     /// Clears the data.
2180     void clear() { OpsVec.clear(); }
2181 
2182     /// \Returns true if there are enough operands identical to \p Op to fill
2183     /// the whole vector (it is mixed with constants or loop invariant values).
2184     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2185     bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186       bool OpAPO = getData(OpIdx, Lane).APO;
2187       bool IsInvariant = L && L->isLoopInvariant(Op);
2188       unsigned Cnt = 0;
2189       for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190         if (Ln == Lane)
2191           continue;
2192         // This is set to true if we found a candidate for broadcast at Lane.
2193         bool FoundCandidate = false;
2194         for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195           OperandData &Data = getData(OpI, Ln);
2196           if (Data.APO != OpAPO || Data.IsUsed)
2197             continue;
2198           Value *OpILane = getValue(OpI, Lane);
2199           bool IsConstantOp = isa<Constant>(OpILane);
2200           // Consider the broadcast candidate if:
2201           // 1. Same value is found in one of the operands.
2202           if (Data.V == Op ||
2203               // 2. The operand in the given lane is not constant but there is a
2204               // constant operand in another lane (which can be moved to the
2205               // given lane). In this case we can represent it as a simple
2206               // permutation of constant and broadcast.
2207               (!IsConstantOp &&
2208                ((Lns > 2 && isa<Constant>(Data.V)) ||
2209                 // 2.1. If we have only 2 lanes, need to check that value in the
2210                 // next lane does not build same opcode sequence.
2211                 (Lns == 2 &&
2212                  !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2213                       .getOpcode() &&
2214                  isa<Constant>(Data.V)))) ||
2215               // 3. The operand in the current lane is loop invariant (can be
2216               // hoisted out) and another operand is also a loop invariant
2217               // (though not a constant). In this case the whole vector can be
2218               // hoisted out.
2219               // FIXME: need to teach the cost model about this case for better
2220               // estimation.
2221               (IsInvariant && !isa<Constant>(Data.V) &&
2222                !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2223                L->isLoopInvariant(Data.V))) {
2224             FoundCandidate = true;
2225             Data.IsUsed = Data.V == Op;
2226             if (Data.V == Op)
2227               ++Cnt;
2228             break;
2229           }
2230         }
2231         if (!FoundCandidate)
2232           return false;
2233       }
2234       return getNumLanes() == 2 || Cnt > 1;
2235     }
2236 
2237     /// Checks if there is at least single compatible operand in lanes other
2238     /// than \p Lane, compatible with the operand \p Op.
2239     bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240       bool OpAPO = getData(OpIdx, Lane).APO;
2241       for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242         if (Ln == Lane)
2243           continue;
2244         if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2245               const OperandData &Data = getData(OpI, Ln);
2246               if (Data.APO != OpAPO || Data.IsUsed)
2247                 return true;
2248               Value *OpILn = getValue(OpI, Ln);
2249               return (L && L->isLoopInvariant(OpILn)) ||
2250                      (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2251                       Op->getParent() == cast<Instruction>(OpILn)->getParent());
2252             }))
2253           return true;
2254       }
2255       return false;
2256     }
2257 
2258   public:
2259     /// Initialize with all the operands of the instruction vector \p RootVL.
2260     VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262           L(R.LI->getLoopFor(
2263               (cast<Instruction>(RootVL.front())->getParent()))) {
2264       // Append all the operands of RootVL.
2265       appendOperandsOfVL(RootVL);
2266     }
2267 
2268     /// \Returns a value vector with the operands across all lanes for the
2269     /// opearnd at \p OpIdx.
2270     ValueList getVL(unsigned OpIdx) const {
2271       ValueList OpVL(OpsVec[OpIdx].size());
2272       assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273              "Expected same num of lanes across all operands");
2274       for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275         OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276       return OpVL;
2277     }
2278 
2279     // Performs operand reordering for 2 or more operands.
2280     // The original operands are in OrigOps[OpIdx][Lane].
2281     // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2282     void reorder() {
2283       unsigned NumOperands = getNumOperands();
2284       unsigned NumLanes = getNumLanes();
2285       // Each operand has its own mode. We are using this mode to help us select
2286       // the instructions for each lane, so that they match best with the ones
2287       // we have selected so far.
2288       SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2289 
2290       // This is a greedy single-pass algorithm. We are going over each lane
2291       // once and deciding on the best order right away with no back-tracking.
2292       // However, in order to increase its effectiveness, we start with the lane
2293       // that has operands that can move the least. For example, given the
2294       // following lanes:
2295       //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
2296       //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
2297       //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
2298       //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
2299       // we will start at Lane 1, since the operands of the subtraction cannot
2300       // be reordered. Then we will visit the rest of the lanes in a circular
2301       // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2302 
2303       // Find the first lane that we will start our search from.
2304       unsigned FirstLane = getBestLaneToStartReordering();
2305 
2306       // Initialize the modes.
2307       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2308         Value *OpLane0 = getValue(OpIdx, FirstLane);
2309         // Keep track if we have instructions with all the same opcode on one
2310         // side.
2311         if (isa<LoadInst>(OpLane0))
2312           ReorderingModes[OpIdx] = ReorderingMode::Load;
2313         else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2314           // Check if OpLane0 should be broadcast.
2315           if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2316               !canBeVectorized(OpILane0, OpIdx, FirstLane))
2317             ReorderingModes[OpIdx] = ReorderingMode::Splat;
2318           else
2319             ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2320         } else if (isa<Constant>(OpLane0))
2321           ReorderingModes[OpIdx] = ReorderingMode::Constant;
2322         else if (isa<Argument>(OpLane0))
2323           // Our best hope is a Splat. It may save some cost in some cases.
2324           ReorderingModes[OpIdx] = ReorderingMode::Splat;
2325         else
2326           // NOTE: This should be unreachable.
2327           ReorderingModes[OpIdx] = ReorderingMode::Failed;
2328       }
2329 
2330       // Check that we don't have same operands. No need to reorder if operands
2331       // are just perfect diamond or shuffled diamond match. Do not do it only
2332       // for possible broadcasts or non-power of 2 number of scalars (just for
2333       // now).
2334       auto &&SkipReordering = [this]() {
2335         SmallPtrSet<Value *, 4> UniqueValues;
2336         ArrayRef<OperandData> Op0 = OpsVec.front();
2337         for (const OperandData &Data : Op0)
2338           UniqueValues.insert(Data.V);
2339         for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2340           if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2341                 return !UniqueValues.contains(Data.V);
2342               }))
2343             return false;
2344         }
2345         // TODO: Check if we can remove a check for non-power-2 number of
2346         // scalars after full support of non-power-2 vectorization.
2347         return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2348       };
2349 
2350       // If the initial strategy fails for any of the operand indexes, then we
2351       // perform reordering again in a second pass. This helps avoid assigning
2352       // high priority to the failed strategy, and should improve reordering for
2353       // the non-failed operand indexes.
2354       for (int Pass = 0; Pass != 2; ++Pass) {
2355         // Check if no need to reorder operands since they're are perfect or
2356         // shuffled diamond match.
2357         // Need to do it to avoid extra external use cost counting for
2358         // shuffled matches, which may cause regressions.
2359         if (SkipReordering())
2360           break;
2361         // Skip the second pass if the first pass did not fail.
2362         bool StrategyFailed = false;
2363         // Mark all operand data as free to use.
2364         clearUsed();
2365         // We keep the original operand order for the FirstLane, so reorder the
2366         // rest of the lanes. We are visiting the nodes in a circular fashion,
2367         // using FirstLane as the center point and increasing the radius
2368         // distance.
2369         SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370         for (unsigned I = 0; I < NumOperands; ++I)
2371           MainAltOps[I].push_back(getData(I, FirstLane).V);
2372 
2373         for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374           // Visit the lane on the right and then the lane on the left.
2375           for (int Direction : {+1, -1}) {
2376             int Lane = FirstLane + Direction * Distance;
2377             if (Lane < 0 || Lane >= (int)NumLanes)
2378               continue;
2379             int LastLane = Lane - Direction;
2380             assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381                    "Out of bounds");
2382             // Look for a good match for each operand.
2383             for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384               // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2385               std::optional<unsigned> BestIdx = getBestOperand(
2386                   OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2387               // By not selecting a value, we allow the operands that follow to
2388               // select a better matching value. We will get a non-null value in
2389               // the next run of getBestOperand().
2390               if (BestIdx) {
2391                 // Swap the current operand with the one returned by
2392                 // getBestOperand().
2393                 swap(OpIdx, *BestIdx, Lane);
2394               } else {
2395                 // Enable the second pass.
2396                 StrategyFailed = true;
2397               }
2398               // Try to get the alternate opcode and follow it during analysis.
2399               if (MainAltOps[OpIdx].size() != 2) {
2400                 OperandData &AltOp = getData(OpIdx, Lane);
2401                 InstructionsState OpS =
2402                     getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403                 if (OpS.getOpcode() && OpS.isAltShuffle())
2404                   MainAltOps[OpIdx].push_back(AltOp.V);
2405               }
2406             }
2407           }
2408         }
2409         // Skip second pass if the strategy did not fail.
2410         if (!StrategyFailed)
2411           break;
2412       }
2413     }
2414 
2415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2416     LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417       switch (RMode) {
2418       case ReorderingMode::Load:
2419         return "Load";
2420       case ReorderingMode::Opcode:
2421         return "Opcode";
2422       case ReorderingMode::Constant:
2423         return "Constant";
2424       case ReorderingMode::Splat:
2425         return "Splat";
2426       case ReorderingMode::Failed:
2427         return "Failed";
2428       }
2429       llvm_unreachable("Unimplemented Reordering Type");
2430     }
2431 
2432     LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433                                                    raw_ostream &OS) {
2434       return OS << getModeStr(RMode);
2435     }
2436 
2437     /// Debug print.
2438     LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439       printMode(RMode, dbgs());
2440     }
2441 
2442     friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443       return printMode(RMode, OS);
2444     }
2445 
2446     LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447       const unsigned Indent = 2;
2448       unsigned Cnt = 0;
2449       for (const OperandDataVec &OpDataVec : OpsVec) {
2450         OS << "Operand " << Cnt++ << "\n";
2451         for (const OperandData &OpData : OpDataVec) {
2452           OS.indent(Indent) << "{";
2453           if (Value *V = OpData.V)
2454             OS << *V;
2455           else
2456             OS << "null";
2457           OS << ", APO:" << OpData.APO << "}\n";
2458         }
2459         OS << "\n";
2460       }
2461       return OS;
2462     }
2463 
2464     /// Debug print.
2465     LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466 #endif
2467   };
2468 
2469   /// Evaluate each pair in \p Candidates and return index into \p Candidates
2470   /// for a pair which have highest score deemed to have best chance to form
2471   /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2472   /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2473   /// of the cost, considered to be good enough score.
2474   std::optional<int>
2475   findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476                    int Limit = LookAheadHeuristics::ScoreFail) const {
2477     LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478                                   RootLookAheadMaxDepth);
2479     int BestScore = Limit;
2480     std::optional<int> Index;
2481     for (int I : seq<int>(0, Candidates.size())) {
2482       int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2483                                                Candidates[I].second,
2484                                                /*U1=*/nullptr, /*U2=*/nullptr,
2485                                                /*Level=*/1, std::nullopt);
2486       if (Score > BestScore) {
2487         BestScore = Score;
2488         Index = I;
2489       }
2490     }
2491     return Index;
2492   }
2493 
2494   /// Checks if the instruction is marked for deletion.
2495   bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2496 
2497   /// Removes an instruction from its block and eventually deletes it.
2498   /// It's like Instruction::eraseFromParent() except that the actual deletion
2499   /// is delayed until BoUpSLP is destructed.
2500   void eraseInstruction(Instruction *I) {
2501     DeletedInstructions.insert(I);
2502   }
2503 
2504   /// Remove instructions from the parent function and clear the operands of \p
2505   /// DeadVals instructions, marking for deletion trivially dead operands.
2506   template <typename T>
2507   void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2508     SmallVector<WeakTrackingVH> DeadInsts;
2509     for (T *V : DeadVals) {
2510       auto *I = cast<Instruction>(V);
2511       DeletedInstructions.insert(I);
2512     }
2513     DenseSet<Value *> Processed;
2514     for (T *V : DeadVals) {
2515       if (!V || !Processed.insert(V).second)
2516         continue;
2517       auto *I = cast<Instruction>(V);
2518       salvageDebugInfo(*I);
2519       SmallVector<const TreeEntry *> Entries;
2520       if (const TreeEntry *Entry = getTreeEntry(I)) {
2521         Entries.push_back(Entry);
2522         auto It = MultiNodeScalars.find(I);
2523         if (It != MultiNodeScalars.end())
2524           Entries.append(It->second.begin(), It->second.end());
2525       }
2526       for (Use &U : I->operands()) {
2527         if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2528             OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2529             wouldInstructionBeTriviallyDead(OpI, TLI) &&
2530             (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2531                return Entry->VectorizedValue == OpI;
2532              })))
2533           DeadInsts.push_back(OpI);
2534       }
2535       I->dropAllReferences();
2536     }
2537     for (T *V : DeadVals) {
2538       auto *I = cast<Instruction>(V);
2539       if (!I->getParent())
2540         continue;
2541       assert((I->use_empty() || all_of(I->uses(),
2542                                        [&](Use &U) {
2543                                          return isDeleted(
2544                                              cast<Instruction>(U.getUser()));
2545                                        })) &&
2546              "trying to erase instruction with users.");
2547       I->removeFromParent();
2548       SE->forgetValue(I);
2549     }
2550     // Process the dead instruction list until empty.
2551     while (!DeadInsts.empty()) {
2552       Value *V = DeadInsts.pop_back_val();
2553       Instruction *VI = cast_or_null<Instruction>(V);
2554       if (!VI || !VI->getParent())
2555         continue;
2556       assert(isInstructionTriviallyDead(VI, TLI) &&
2557              "Live instruction found in dead worklist!");
2558       assert(VI->use_empty() && "Instructions with uses are not dead.");
2559 
2560       // Don't lose the debug info while deleting the instructions.
2561       salvageDebugInfo(*VI);
2562 
2563       // Null out all of the instruction's operands to see if any operand
2564       // becomes dead as we go.
2565       for (Use &OpU : VI->operands()) {
2566         Value *OpV = OpU.get();
2567         if (!OpV)
2568           continue;
2569         OpU.set(nullptr);
2570 
2571         if (!OpV->use_empty())
2572           continue;
2573 
2574         // If the operand is an instruction that became dead as we nulled out
2575         // the operand, and if it is 'trivially' dead, delete it in a future
2576         // loop iteration.
2577         if (auto *OpI = dyn_cast<Instruction>(OpV))
2578           if (!DeletedInstructions.contains(OpI) &&
2579               isInstructionTriviallyDead(OpI, TLI))
2580             DeadInsts.push_back(OpI);
2581       }
2582 
2583       VI->removeFromParent();
2584       DeletedInstructions.insert(VI);
2585       SE->forgetValue(VI);
2586     }
2587   }
2588 
2589   /// Checks if the instruction was already analyzed for being possible
2590   /// reduction root.
2591   bool isAnalyzedReductionRoot(Instruction *I) const {
2592     return AnalyzedReductionsRoots.count(I);
2593   }
2594   /// Register given instruction as already analyzed for being possible
2595   /// reduction root.
2596   void analyzedReductionRoot(Instruction *I) {
2597     AnalyzedReductionsRoots.insert(I);
2598   }
2599   /// Checks if the provided list of reduced values was checked already for
2600   /// vectorization.
2601   bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602     return AnalyzedReductionVals.contains(hash_value(VL));
2603   }
2604   /// Adds the list of reduced values to list of already checked values for the
2605   /// vectorization.
2606   void analyzedReductionVals(ArrayRef<Value *> VL) {
2607     AnalyzedReductionVals.insert(hash_value(VL));
2608   }
2609   /// Clear the list of the analyzed reduction root instructions.
2610   void clearReductionData() {
2611     AnalyzedReductionsRoots.clear();
2612     AnalyzedReductionVals.clear();
2613     AnalyzedMinBWVals.clear();
2614   }
2615   /// Checks if the given value is gathered in one of the nodes.
2616   bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617     return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2618   }
2619   /// Checks if the given value is gathered in one of the nodes.
2620   bool isGathered(const Value *V) const {
2621     return MustGather.contains(V);
2622   }
2623   /// Checks if the specified value was not schedule.
2624   bool isNotScheduled(const Value *V) const {
2625     return NonScheduledFirst.contains(V);
2626   }
2627 
2628   /// Check if the value is vectorized in the tree.
2629   bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630 
2631   ~BoUpSLP();
2632 
2633 private:
2634   /// Determine if a node \p E in can be demoted to a smaller type with a
2635   /// truncation. We collect the entries that will be demoted in ToDemote.
2636   /// \param E Node for analysis
2637   /// \param ToDemote indices of the nodes to be demoted.
2638   bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639                              unsigned &BitWidth,
2640                              SmallVectorImpl<unsigned> &ToDemote,
2641                              DenseSet<const TreeEntry *> &Visited,
2642                              unsigned &MaxDepthLevel,
2643                              bool &IsProfitableToDemote,
2644                              bool IsTruncRoot) const;
2645 
2646   /// Check if the operands on the edges \p Edges of the \p UserTE allows
2647   /// reordering (i.e. the operands can be reordered because they have only one
2648   /// user and reordarable).
2649   /// \param ReorderableGathers List of all gather nodes that require reordering
2650   /// (e.g., gather of extractlements or partially vectorizable loads).
2651   /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652   /// reordering, subset of \p NonVectorized.
2653   bool
2654   canReorderOperands(TreeEntry *UserTE,
2655                      SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656                      ArrayRef<TreeEntry *> ReorderableGathers,
2657                      SmallVectorImpl<TreeEntry *> &GatherOps);
2658 
2659   /// Checks if the given \p TE is a gather node with clustered reused scalars
2660   /// and reorders it per given \p Mask.
2661   void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662 
2663   /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664   /// if any. If it is not vectorized (gather node), returns nullptr.
2665   TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666     ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667     TreeEntry *TE = nullptr;
2668     const auto *It = find_if(VL, [&](Value *V) {
2669       TE = getTreeEntry(V);
2670       if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2671         return true;
2672       auto It = MultiNodeScalars.find(V);
2673       if (It != MultiNodeScalars.end()) {
2674         for (TreeEntry *E : It->second) {
2675           if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2676             TE = E;
2677             return true;
2678           }
2679         }
2680       }
2681       return false;
2682     });
2683     if (It != VL.end()) {
2684       assert(TE->isSame(VL) && "Expected same scalars.");
2685       return TE;
2686     }
2687     return nullptr;
2688   }
2689 
2690   /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691   /// if any. If it is not vectorized (gather node), returns nullptr.
2692   const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693                                         unsigned OpIdx) const {
2694     return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695         const_cast<TreeEntry *>(UserTE), OpIdx);
2696   }
2697 
2698   /// Checks if all users of \p I are the part of the vectorization tree.
2699   bool areAllUsersVectorized(
2700       Instruction *I,
2701       const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702 
2703   /// Return information about the vector formed for the specified index
2704   /// of a vector of (the same) instruction.
2705   TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706 
2707   /// \ returns the graph entry for the \p Idx operand of the \p E entry.
2708   const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709 
2710   /// \returns Cast context for the given graph node.
2711   TargetTransformInfo::CastContextHint
2712   getCastContextHint(const TreeEntry &TE) const;
2713 
2714   /// \returns the cost of the vectorizable entry.
2715   InstructionCost getEntryCost(const TreeEntry *E,
2716                                ArrayRef<Value *> VectorizedVals,
2717                                SmallPtrSetImpl<Value *> &CheckedExtracts);
2718 
2719   /// This is the recursive part of buildTree.
2720   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721                      const EdgeInfo &EI);
2722 
2723   /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724   /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725   /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726   /// returns false, setting \p CurrentOrder to either an empty vector or a
2727   /// non-identity permutation that allows to reuse extract instructions.
2728   /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729   /// extract order.
2730   bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731                        SmallVectorImpl<unsigned> &CurrentOrder,
2732                        bool ResizeAllowed = false) const;
2733 
2734   /// Vectorize a single entry in the tree.
2735   /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2736   /// avoid issues with def-use order.
2737   Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738 
2739   /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740   /// \p E.
2741   /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2742   /// avoid issues with def-use order.
2743   Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744 
2745   /// Create a new vector from a list of scalar values.  Produces a sequence
2746   /// which exploits values reused across lanes, and arranges the inserts
2747   /// for ease of later optimization.
2748   template <typename BVTy, typename ResTy, typename... Args>
2749   ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750 
2751   /// Create a new vector from a list of scalar values.  Produces a sequence
2752   /// which exploits values reused across lanes, and arranges the inserts
2753   /// for ease of later optimization.
2754   Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755 
2756   /// Returns the instruction in the bundle, which can be used as a base point
2757   /// for scheduling. Usually it is the last instruction in the bundle, except
2758   /// for the case when all operands are external (in this case, it is the first
2759   /// instruction in the list).
2760   Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761 
2762   /// Tries to find extractelement instructions with constant indices from fixed
2763   /// vector type and gather such instructions into a bunch, which highly likely
2764   /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2765   /// was successful, the matched scalars are replaced by poison values in \p VL
2766   /// for future analysis.
2767   std::optional<TargetTransformInfo::ShuffleKind>
2768   tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769                                            SmallVectorImpl<int> &Mask) const;
2770 
2771   /// Tries to find extractelement instructions with constant indices from fixed
2772   /// vector type and gather such instructions into a bunch, which highly likely
2773   /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2774   /// was successful, the matched scalars are replaced by poison values in \p VL
2775   /// for future analysis.
2776   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777   tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778                              SmallVectorImpl<int> &Mask,
2779                              unsigned NumParts) const;
2780 
2781   /// Checks if the gathered \p VL can be represented as a single register
2782   /// shuffle(s) of previous tree entries.
2783   /// \param TE Tree entry checked for permutation.
2784   /// \param VL List of scalars (a subset of the TE scalar), checked for
2785   /// permutations. Must form single-register vector.
2786   /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787   /// commands to build the mask using the original vector value, without
2788   /// relying on the potential reordering.
2789   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790   /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791   std::optional<TargetTransformInfo::ShuffleKind>
2792   isGatherShuffledSingleRegisterEntry(
2793       const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794       SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795       bool ForOrder);
2796 
2797   /// Checks if the gathered \p VL can be represented as multi-register
2798   /// shuffle(s) of previous tree entries.
2799   /// \param TE Tree entry checked for permutation.
2800   /// \param VL List of scalars (a subset of the TE scalar), checked for
2801   /// permutations.
2802   /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803   /// commands to build the mask using the original vector value, without
2804   /// relying on the potential reordering.
2805   /// \returns per-register series of ShuffleKind, if gathered values can be
2806   /// represented as shuffles of previous tree entries. \p Mask is filled with
2807   /// the shuffle mask (also on per-register base).
2808   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809   isGatherShuffledEntry(
2810       const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811       SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812       unsigned NumParts, bool ForOrder = false);
2813 
2814   /// \returns the scalarization cost for this list of values. Assuming that
2815   /// this subtree gets vectorized, we may need to extract the values from the
2816   /// roots. This method calculates the cost of extracting the values.
2817   /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818   InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819                                 Type *ScalarTy) const;
2820 
2821   /// Set the Builder insert point to one after the last instruction in
2822   /// the bundle
2823   void setInsertPointAfterBundle(const TreeEntry *E);
2824 
2825   /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2826   /// specified, the starting vector value is poison.
2827   Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828 
2829   /// \returns whether the VectorizableTree is fully vectorizable and will
2830   /// be beneficial even the tree height is tiny.
2831   bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832 
2833   /// Reorder commutative or alt operands to get better probability of
2834   /// generating vectorized code.
2835   static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836                                              SmallVectorImpl<Value *> &Left,
2837                                              SmallVectorImpl<Value *> &Right,
2838                                              const BoUpSLP &R);
2839 
2840   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841   /// users of \p TE and collects the stores. It returns the map from the store
2842   /// pointers to the collected stores.
2843   DenseMap<Value *, SmallVector<StoreInst *>>
2844   collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845 
2846   /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847   /// stores in \p StoresVec can form a vector instruction. If so it returns
2848   /// true and populates \p ReorderIndices with the shuffle indices of the
2849   /// stores when compared to the sorted vector.
2850   bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851                      OrdersType &ReorderIndices) const;
2852 
2853   /// Iterates through the users of \p TE, looking for scalar stores that can be
2854   /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855   /// their order and builds an order index vector for each store bundle. It
2856   /// returns all these order vectors found.
2857   /// We run this after the tree has formed, otherwise we may come across user
2858   /// instructions that are not yet in the tree.
2859   SmallVector<OrdersType, 1>
2860   findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861 
2862   struct TreeEntry {
2863     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2864     TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865 
2866     /// \returns Common mask for reorder indices and reused scalars.
2867     SmallVector<int> getCommonMask() const {
2868       SmallVector<int> Mask;
2869       inversePermutation(ReorderIndices, Mask);
2870       ::addMask(Mask, ReuseShuffleIndices);
2871       return Mask;
2872     }
2873 
2874     /// \returns true if the scalars in VL are equal to this entry.
2875     bool isSame(ArrayRef<Value *> VL) const {
2876       auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2877         if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2878           return std::equal(VL.begin(), VL.end(), Scalars.begin());
2879         return VL.size() == Mask.size() &&
2880                std::equal(VL.begin(), VL.end(), Mask.begin(),
2881                           [Scalars](Value *V, int Idx) {
2882                             return (isa<UndefValue>(V) &&
2883                                     Idx == PoisonMaskElem) ||
2884                                    (Idx != PoisonMaskElem && V == Scalars[Idx]);
2885                           });
2886       };
2887       if (!ReorderIndices.empty()) {
2888         // TODO: implement matching if the nodes are just reordered, still can
2889         // treat the vector as the same if the list of scalars matches VL
2890         // directly, without reordering.
2891         SmallVector<int> Mask;
2892         inversePermutation(ReorderIndices, Mask);
2893         if (VL.size() == Scalars.size())
2894           return IsSame(Scalars, Mask);
2895         if (VL.size() == ReuseShuffleIndices.size()) {
2896           ::addMask(Mask, ReuseShuffleIndices);
2897           return IsSame(Scalars, Mask);
2898         }
2899         return false;
2900       }
2901       return IsSame(Scalars, ReuseShuffleIndices);
2902     }
2903 
2904     bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2905       return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2906              UserTreeIndices.front().UserTE == UserEI.UserTE;
2907     }
2908 
2909     /// \returns true if current entry has same operands as \p TE.
2910     bool hasEqualOperands(const TreeEntry &TE) const {
2911       if (TE.getNumOperands() != getNumOperands())
2912         return false;
2913       SmallBitVector Used(getNumOperands());
2914       for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915         unsigned PrevCount = Used.count();
2916         for (unsigned K = 0; K < E; ++K) {
2917           if (Used.test(K))
2918             continue;
2919           if (getOperand(K) == TE.getOperand(I)) {
2920             Used.set(K);
2921             break;
2922           }
2923         }
2924         // Check if we actually found the matching operand.
2925         if (PrevCount == Used.count())
2926           return false;
2927       }
2928       return true;
2929     }
2930 
2931     /// \return Final vectorization factor for the node. Defined by the total
2932     /// number of vectorized scalars, including those, used several times in the
2933     /// entry and counted in the \a ReuseShuffleIndices, if any.
2934     unsigned getVectorFactor() const {
2935       if (!ReuseShuffleIndices.empty())
2936         return ReuseShuffleIndices.size();
2937       return Scalars.size();
2938     };
2939 
2940     /// Checks if the current node is a gather node.
2941     bool isGather() const {return State == NeedToGather; }
2942 
2943     /// A vector of scalars.
2944     ValueList Scalars;
2945 
2946     /// The Scalars are vectorized into this value. It is initialized to Null.
2947     WeakTrackingVH VectorizedValue = nullptr;
2948 
2949     /// New vector phi instructions emitted for the vectorized phi nodes.
2950     PHINode *PHI = nullptr;
2951 
2952     /// Do we need to gather this sequence or vectorize it
2953     /// (either with vector instruction or with scatter/gather
2954     /// intrinsics for store/load)?
2955     enum EntryState {
2956       Vectorize,
2957       ScatterVectorize,
2958       StridedVectorize,
2959       NeedToGather
2960     };
2961     EntryState State;
2962 
2963     /// Does this sequence require some shuffling?
2964     SmallVector<int, 4> ReuseShuffleIndices;
2965 
2966     /// Does this entry require reordering?
2967     SmallVector<unsigned, 4> ReorderIndices;
2968 
2969     /// Points back to the VectorizableTree.
2970     ///
2971     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
2972     /// to be a pointer and needs to be able to initialize the child iterator.
2973     /// Thus we need a reference back to the container to translate the indices
2974     /// to entries.
2975     VecTreeTy &Container;
2976 
2977     /// The TreeEntry index containing the user of this entry.  We can actually
2978     /// have multiple users so the data structure is not truly a tree.
2979     SmallVector<EdgeInfo, 1> UserTreeIndices;
2980 
2981     /// The index of this treeEntry in VectorizableTree.
2982     int Idx = -1;
2983 
2984   private:
2985     /// The operands of each instruction in each lane Operands[op_index][lane].
2986     /// Note: This helps avoid the replication of the code that performs the
2987     /// reordering of operands during buildTree_rec() and vectorizeTree().
2988     SmallVector<ValueList, 2> Operands;
2989 
2990     /// The main/alternate instruction.
2991     Instruction *MainOp = nullptr;
2992     Instruction *AltOp = nullptr;
2993 
2994   public:
2995     /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2996     void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997       if (Operands.size() < OpIdx + 1)
2998         Operands.resize(OpIdx + 1);
2999       assert(Operands[OpIdx].empty() && "Already resized?");
3000       assert(OpVL.size() <= Scalars.size() &&
3001              "Number of operands is greater than the number of scalars.");
3002       Operands[OpIdx].resize(OpVL.size());
3003       copy(OpVL, Operands[OpIdx].begin());
3004     }
3005 
3006     /// Set the operands of this bundle in their original order.
3007     void setOperandsInOrder() {
3008       assert(Operands.empty() && "Already initialized?");
3009       auto *I0 = cast<Instruction>(Scalars[0]);
3010       Operands.resize(I0->getNumOperands());
3011       unsigned NumLanes = Scalars.size();
3012       for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013            OpIdx != NumOperands; ++OpIdx) {
3014         Operands[OpIdx].resize(NumLanes);
3015         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016           auto *I = cast<Instruction>(Scalars[Lane]);
3017           assert(I->getNumOperands() == NumOperands &&
3018                  "Expected same number of operands");
3019           Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020         }
3021       }
3022     }
3023 
3024     /// Reorders operands of the node to the given mask \p Mask.
3025     void reorderOperands(ArrayRef<int> Mask) {
3026       for (ValueList &Operand : Operands)
3027         reorderScalars(Operand, Mask);
3028     }
3029 
3030     /// \returns the \p OpIdx operand of this TreeEntry.
3031     ValueList &getOperand(unsigned OpIdx) {
3032       assert(OpIdx < Operands.size() && "Off bounds");
3033       return Operands[OpIdx];
3034     }
3035 
3036     /// \returns the \p OpIdx operand of this TreeEntry.
3037     ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3038       assert(OpIdx < Operands.size() && "Off bounds");
3039       return Operands[OpIdx];
3040     }
3041 
3042     /// \returns the number of operands.
3043     unsigned getNumOperands() const { return Operands.size(); }
3044 
3045     /// \return the single \p OpIdx operand.
3046     Value *getSingleOperand(unsigned OpIdx) const {
3047       assert(OpIdx < Operands.size() && "Off bounds");
3048       assert(!Operands[OpIdx].empty() && "No operand available");
3049       return Operands[OpIdx][0];
3050     }
3051 
3052     /// Some of the instructions in the list have alternate opcodes.
3053     bool isAltShuffle() const { return MainOp != AltOp; }
3054 
3055     bool isOpcodeOrAlt(Instruction *I) const {
3056       unsigned CheckedOpcode = I->getOpcode();
3057       return (getOpcode() == CheckedOpcode ||
3058               getAltOpcode() == CheckedOpcode);
3059     }
3060 
3061     /// Chooses the correct key for scheduling data. If \p Op has the same (or
3062     /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
3063     /// \p OpValue.
3064     Value *isOneOf(Value *Op) const {
3065       auto *I = dyn_cast<Instruction>(Op);
3066       if (I && isOpcodeOrAlt(I))
3067         return Op;
3068       return MainOp;
3069     }
3070 
3071     void setOperations(const InstructionsState &S) {
3072       MainOp = S.MainOp;
3073       AltOp = S.AltOp;
3074     }
3075 
3076     Instruction *getMainOp() const {
3077       return MainOp;
3078     }
3079 
3080     Instruction *getAltOp() const {
3081       return AltOp;
3082     }
3083 
3084     /// The main/alternate opcodes for the list of instructions.
3085     unsigned getOpcode() const {
3086       return MainOp ? MainOp->getOpcode() : 0;
3087     }
3088 
3089     unsigned getAltOpcode() const {
3090       return AltOp ? AltOp->getOpcode() : 0;
3091     }
3092 
3093     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
3094     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
3095     int findLaneForValue(Value *V) const {
3096       unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097       assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098       if (!ReorderIndices.empty())
3099         FoundLane = ReorderIndices[FoundLane];
3100       assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101       if (!ReuseShuffleIndices.empty()) {
3102         FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103                                   find(ReuseShuffleIndices, FoundLane));
3104       }
3105       return FoundLane;
3106     }
3107 
3108     /// Build a shuffle mask for graph entry which represents a merge of main
3109     /// and alternate operations.
3110     void
3111     buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3112                           SmallVectorImpl<int> &Mask,
3113                           SmallVectorImpl<Value *> *OpScalars = nullptr,
3114                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115 
3116     /// Return true if this is a non-power-of-2 node.
3117     bool isNonPowOf2Vec() const {
3118       bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120              "Reshuffling not supported with non-power-of-2 vectors yet.");
3121       return IsNonPowerOf2;
3122     }
3123 
3124 #ifndef NDEBUG
3125     /// Debug printer.
3126     LLVM_DUMP_METHOD void dump() const {
3127       dbgs() << Idx << ".\n";
3128       for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3129         dbgs() << "Operand " << OpI << ":\n";
3130         for (const Value *V : Operands[OpI])
3131           dbgs().indent(2) << *V << "\n";
3132       }
3133       dbgs() << "Scalars: \n";
3134       for (Value *V : Scalars)
3135         dbgs().indent(2) << *V << "\n";
3136       dbgs() << "State: ";
3137       switch (State) {
3138       case Vectorize:
3139         dbgs() << "Vectorize\n";
3140         break;
3141       case ScatterVectorize:
3142         dbgs() << "ScatterVectorize\n";
3143         break;
3144       case StridedVectorize:
3145         dbgs() << "StridedVectorize\n";
3146         break;
3147       case NeedToGather:
3148         dbgs() << "NeedToGather\n";
3149         break;
3150       }
3151       dbgs() << "MainOp: ";
3152       if (MainOp)
3153         dbgs() << *MainOp << "\n";
3154       else
3155         dbgs() << "NULL\n";
3156       dbgs() << "AltOp: ";
3157       if (AltOp)
3158         dbgs() << *AltOp << "\n";
3159       else
3160         dbgs() << "NULL\n";
3161       dbgs() << "VectorizedValue: ";
3162       if (VectorizedValue)
3163         dbgs() << *VectorizedValue << "\n";
3164       else
3165         dbgs() << "NULL\n";
3166       dbgs() << "ReuseShuffleIndices: ";
3167       if (ReuseShuffleIndices.empty())
3168         dbgs() << "Empty";
3169       else
3170         for (int ReuseIdx : ReuseShuffleIndices)
3171           dbgs() << ReuseIdx << ", ";
3172       dbgs() << "\n";
3173       dbgs() << "ReorderIndices: ";
3174       for (unsigned ReorderIdx : ReorderIndices)
3175         dbgs() << ReorderIdx << ", ";
3176       dbgs() << "\n";
3177       dbgs() << "UserTreeIndices: ";
3178       for (const auto &EInfo : UserTreeIndices)
3179         dbgs() << EInfo << ", ";
3180       dbgs() << "\n";
3181     }
3182 #endif
3183   };
3184 
3185 #ifndef NDEBUG
3186   void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187                      InstructionCost VecCost, InstructionCost ScalarCost,
3188                      StringRef Banner) const {
3189     dbgs() << "SLP: " << Banner << ":\n";
3190     E->dump();
3191     dbgs() << "SLP: Costs:\n";
3192     dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193     dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
3194     dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
3195     dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
3196            << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197   }
3198 #endif
3199 
3200   /// Create a new VectorizableTree entry.
3201   TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202                           std::optional<ScheduleData *> Bundle,
3203                           const InstructionsState &S,
3204                           const EdgeInfo &UserTreeIdx,
3205                           ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206                           ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207     TreeEntry::EntryState EntryState =
3208         Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209     return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210                         ReuseShuffleIndices, ReorderIndices);
3211   }
3212 
3213   TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3214                           TreeEntry::EntryState EntryState,
3215                           std::optional<ScheduleData *> Bundle,
3216                           const InstructionsState &S,
3217                           const EdgeInfo &UserTreeIdx,
3218                           ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3219                           ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3220     assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3221             (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3222            "Need to vectorize gather entry?");
3223     VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3224     TreeEntry *Last = VectorizableTree.back().get();
3225     Last->Idx = VectorizableTree.size() - 1;
3226     Last->State = EntryState;
3227     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3228                                      ReuseShuffleIndices.end());
3229     if (ReorderIndices.empty()) {
3230       Last->Scalars.assign(VL.begin(), VL.end());
3231       Last->setOperations(S);
3232     } else {
3233       // Reorder scalars and build final mask.
3234       Last->Scalars.assign(VL.size(), nullptr);
3235       transform(ReorderIndices, Last->Scalars.begin(),
3236                 [VL](unsigned Idx) -> Value * {
3237                   if (Idx >= VL.size())
3238                     return UndefValue::get(VL.front()->getType());
3239                   return VL[Idx];
3240                 });
3241       InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3242       Last->setOperations(S);
3243       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3244     }
3245     if (!Last->isGather()) {
3246       for (Value *V : VL) {
3247         const TreeEntry *TE = getTreeEntry(V);
3248         assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3249                "Scalar already in tree!");
3250         if (TE) {
3251           if (TE != Last)
3252             MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3253           continue;
3254         }
3255         ScalarToTreeEntry[V] = Last;
3256       }
3257       // Update the scheduler bundle to point to this TreeEntry.
3258       ScheduleData *BundleMember = *Bundle;
3259       assert((BundleMember || isa<PHINode>(S.MainOp) ||
3260               isVectorLikeInstWithConstOps(S.MainOp) ||
3261               doesNotNeedToSchedule(VL)) &&
3262              "Bundle and VL out of sync");
3263       if (BundleMember) {
3264         for (Value *V : VL) {
3265           if (doesNotNeedToBeScheduled(V))
3266             continue;
3267           if (!BundleMember)
3268             continue;
3269           BundleMember->TE = Last;
3270           BundleMember = BundleMember->NextInBundle;
3271         }
3272       }
3273       assert(!BundleMember && "Bundle and VL out of sync");
3274     } else {
3275       // Build a map for gathered scalars to the nodes where they are used.
3276       bool AllConstsOrCasts = true;
3277       for (Value *V : VL)
3278         if (!isConstant(V)) {
3279           auto *I = dyn_cast<CastInst>(V);
3280           AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3281           ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3282         }
3283       if (AllConstsOrCasts)
3284         CastMaxMinBWSizes =
3285             std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3286       MustGather.insert(VL.begin(), VL.end());
3287     }
3288 
3289     if (UserTreeIdx.UserTE) {
3290       Last->UserTreeIndices.push_back(UserTreeIdx);
3291       assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3292              "Reordering isn't implemented for non-power-of-2 nodes yet");
3293     }
3294     return Last;
3295   }
3296 
3297   /// -- Vectorization State --
3298   /// Holds all of the tree entries.
3299   TreeEntry::VecTreeTy VectorizableTree;
3300 
3301 #ifndef NDEBUG
3302   /// Debug printer.
3303   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304     for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305       VectorizableTree[Id]->dump();
3306       dbgs() << "\n";
3307     }
3308   }
3309 #endif
3310 
3311   TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3312 
3313   const TreeEntry *getTreeEntry(Value *V) const {
3314     return ScalarToTreeEntry.lookup(V);
3315   }
3316 
3317   /// Check that the operand node of alternate node does not generate
3318   /// buildvector sequence. If it is, then probably not worth it to build
3319   /// alternate shuffle, if number of buildvector operands + alternate
3320   /// instruction > than the number of buildvector instructions.
3321   /// \param S the instructions state of the analyzed values.
3322   /// \param VL list of the instructions with alternate opcodes.
3323   bool areAltOperandsProfitable(const InstructionsState &S,
3324                                 ArrayRef<Value *> VL) const;
3325 
3326   /// Checks if the specified list of the instructions/values can be vectorized
3327   /// and fills required data before actual scheduling of the instructions.
3328   TreeEntry::EntryState getScalarsVectorizationState(
3329       InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3330       OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3331 
3332   /// Maps a specific scalar to its tree entry.
3333   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3334 
3335   /// List of scalars, used in several vectorize nodes, and the list of the
3336   /// nodes.
3337   SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3338 
3339   /// Maps a value to the proposed vectorizable size.
3340   SmallDenseMap<Value *, unsigned> InstrElementSize;
3341 
3342   /// A list of scalars that we found that we need to keep as scalars.
3343   ValueSet MustGather;
3344 
3345   /// A set of first non-schedulable values.
3346   ValueSet NonScheduledFirst;
3347 
3348   /// A map between the vectorized entries and the last instructions in the
3349   /// bundles. The bundles are built in use order, not in the def order of the
3350   /// instructions. So, we cannot rely directly on the last instruction in the
3351   /// bundle being the last instruction in the program order during
3352   /// vectorization process since the basic blocks are affected, need to
3353   /// pre-gather them before.
3354   DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3355 
3356   /// List of gather nodes, depending on other gather/vector nodes, which should
3357   /// be emitted after the vector instruction emission process to correctly
3358   /// handle order of the vector instructions and shuffles.
3359   SetVector<const TreeEntry *> PostponedGathers;
3360 
3361   using ValueToGatherNodesMap =
3362       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363   ValueToGatherNodesMap ValueToGatherNodes;
3364 
3365   /// This POD struct describes one external user in the vectorized tree.
3366   struct ExternalUser {
3367     ExternalUser(Value *S, llvm::User *U, int L)
3368         : Scalar(S), User(U), Lane(L) {}
3369 
3370     // Which scalar in our function.
3371     Value *Scalar;
3372 
3373     // Which user that uses the scalar.
3374     llvm::User *User;
3375 
3376     // Which lane does the scalar belong to.
3377     int Lane;
3378   };
3379   using UserList = SmallVector<ExternalUser, 16>;
3380 
3381   /// Checks if two instructions may access the same memory.
3382   ///
3383   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384   /// is invariant in the calling loop.
3385   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386                  Instruction *Inst2) {
3387     if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388       return true;
3389     // First check if the result is already in the cache.
3390     AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391     auto It = AliasCache.find(Key);
3392     if (It != AliasCache.end())
3393       return It->second;
3394     bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395     // Store the result in the cache.
3396     AliasCache.try_emplace(Key, Aliased);
3397     AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398     return Aliased;
3399   }
3400 
3401   using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402 
3403   /// Cache for alias results.
3404   /// TODO: consider moving this to the AliasAnalysis itself.
3405   DenseMap<AliasCacheKey, bool> AliasCache;
3406 
3407   // Cache for pointerMayBeCaptured calls inside AA.  This is preserved
3408   // globally through SLP because we don't perform any action which
3409   // invalidates capture results.
3410   BatchAAResults BatchAA;
3411 
3412   /// Temporary store for deleted instructions. Instructions will be deleted
3413   /// eventually when the BoUpSLP is destructed.  The deferral is required to
3414   /// ensure that there are no incorrect collisions in the AliasCache, which
3415   /// can happen if a new instruction is allocated at the same address as a
3416   /// previously deleted instruction.
3417   DenseSet<Instruction *> DeletedInstructions;
3418 
3419   /// Set of the instruction, being analyzed already for reductions.
3420   SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421 
3422   /// Set of hashes for the list of reduction values already being analyzed.
3423   DenseSet<size_t> AnalyzedReductionVals;
3424 
3425   /// Values, already been analyzed for mininmal bitwidth and found to be
3426   /// non-profitable.
3427   DenseSet<Value *> AnalyzedMinBWVals;
3428 
3429   /// A list of values that need to extracted out of the tree.
3430   /// This list holds pairs of (Internal Scalar : External User). External User
3431   /// can be nullptr, it means that this Internal Scalar will be used later,
3432   /// after vectorization.
3433   UserList ExternalUses;
3434 
3435   /// A list of GEPs which can be reaplced by scalar GEPs instead of
3436   /// extractelement instructions.
3437   SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438 
3439   /// Values used only by @llvm.assume calls.
3440   SmallPtrSet<const Value *, 32> EphValues;
3441 
3442   /// Holds all of the instructions that we gathered, shuffle instructions and
3443   /// extractelements.
3444   SetVector<Instruction *> GatherShuffleExtractSeq;
3445 
3446   /// A list of blocks that we are going to CSE.
3447   DenseSet<BasicBlock *> CSEBlocks;
3448 
3449   /// Contains all scheduling relevant data for an instruction.
3450   /// A ScheduleData either represents a single instruction or a member of an
3451   /// instruction bundle (= a group of instructions which is combined into a
3452   /// vector instruction).
3453   struct ScheduleData {
3454     // The initial value for the dependency counters. It means that the
3455     // dependencies are not calculated yet.
3456     enum { InvalidDeps = -1 };
3457 
3458     ScheduleData() = default;
3459 
3460     void init(int BlockSchedulingRegionID, Value *OpVal) {
3461       FirstInBundle = this;
3462       NextInBundle = nullptr;
3463       NextLoadStore = nullptr;
3464       IsScheduled = false;
3465       SchedulingRegionID = BlockSchedulingRegionID;
3466       clearDependencies();
3467       OpValue = OpVal;
3468       TE = nullptr;
3469     }
3470 
3471     /// Verify basic self consistency properties
3472     void verify() {
3473       if (hasValidDependencies()) {
3474         assert(UnscheduledDeps <= Dependencies && "invariant");
3475       } else {
3476         assert(UnscheduledDeps == Dependencies && "invariant");
3477       }
3478 
3479       if (IsScheduled) {
3480         assert(isSchedulingEntity() &&
3481                 "unexpected scheduled state");
3482         for (const ScheduleData *BundleMember = this; BundleMember;
3483              BundleMember = BundleMember->NextInBundle) {
3484           assert(BundleMember->hasValidDependencies() &&
3485                  BundleMember->UnscheduledDeps == 0 &&
3486                  "unexpected scheduled state");
3487           assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488                  "only bundle is marked scheduled");
3489         }
3490       }
3491 
3492       assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493              "all bundle members must be in same basic block");
3494     }
3495 
3496     /// Returns true if the dependency information has been calculated.
3497     /// Note that depenendency validity can vary between instructions within
3498     /// a single bundle.
3499     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500 
3501     /// Returns true for single instructions and for bundle representatives
3502     /// (= the head of a bundle).
3503     bool isSchedulingEntity() const { return FirstInBundle == this; }
3504 
3505     /// Returns true if it represents an instruction bundle and not only a
3506     /// single instruction.
3507     bool isPartOfBundle() const {
3508       return NextInBundle != nullptr || FirstInBundle != this || TE;
3509     }
3510 
3511     /// Returns true if it is ready for scheduling, i.e. it has no more
3512     /// unscheduled depending instructions/bundles.
3513     bool isReady() const {
3514       assert(isSchedulingEntity() &&
3515              "can't consider non-scheduling entity for ready list");
3516       return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517     }
3518 
3519     /// Modifies the number of unscheduled dependencies for this instruction,
3520     /// and returns the number of remaining dependencies for the containing
3521     /// bundle.
3522     int incrementUnscheduledDeps(int Incr) {
3523       assert(hasValidDependencies() &&
3524              "increment of unscheduled deps would be meaningless");
3525       UnscheduledDeps += Incr;
3526       return FirstInBundle->unscheduledDepsInBundle();
3527     }
3528 
3529     /// Sets the number of unscheduled dependencies to the number of
3530     /// dependencies.
3531     void resetUnscheduledDeps() {
3532       UnscheduledDeps = Dependencies;
3533     }
3534 
3535     /// Clears all dependency information.
3536     void clearDependencies() {
3537       Dependencies = InvalidDeps;
3538       resetUnscheduledDeps();
3539       MemoryDependencies.clear();
3540       ControlDependencies.clear();
3541     }
3542 
3543     int unscheduledDepsInBundle() const {
3544       assert(isSchedulingEntity() && "only meaningful on the bundle");
3545       int Sum = 0;
3546       for (const ScheduleData *BundleMember = this; BundleMember;
3547            BundleMember = BundleMember->NextInBundle) {
3548         if (BundleMember->UnscheduledDeps == InvalidDeps)
3549           return InvalidDeps;
3550         Sum += BundleMember->UnscheduledDeps;
3551       }
3552       return Sum;
3553     }
3554 
3555     void dump(raw_ostream &os) const {
3556       if (!isSchedulingEntity()) {
3557         os << "/ " << *Inst;
3558       } else if (NextInBundle) {
3559         os << '[' << *Inst;
3560         ScheduleData *SD = NextInBundle;
3561         while (SD) {
3562           os << ';' << *SD->Inst;
3563           SD = SD->NextInBundle;
3564         }
3565         os << ']';
3566       } else {
3567         os << *Inst;
3568       }
3569     }
3570 
3571     Instruction *Inst = nullptr;
3572 
3573     /// Opcode of the current instruction in the schedule data.
3574     Value *OpValue = nullptr;
3575 
3576     /// The TreeEntry that this instruction corresponds to.
3577     TreeEntry *TE = nullptr;
3578 
3579     /// Points to the head in an instruction bundle (and always to this for
3580     /// single instructions).
3581     ScheduleData *FirstInBundle = nullptr;
3582 
3583     /// Single linked list of all instructions in a bundle. Null if it is a
3584     /// single instruction.
3585     ScheduleData *NextInBundle = nullptr;
3586 
3587     /// Single linked list of all memory instructions (e.g. load, store, call)
3588     /// in the block - until the end of the scheduling region.
3589     ScheduleData *NextLoadStore = nullptr;
3590 
3591     /// The dependent memory instructions.
3592     /// This list is derived on demand in calculateDependencies().
3593     SmallVector<ScheduleData *, 4> MemoryDependencies;
3594 
3595     /// List of instructions which this instruction could be control dependent
3596     /// on.  Allowing such nodes to be scheduled below this one could introduce
3597     /// a runtime fault which didn't exist in the original program.
3598     /// ex: this is a load or udiv following a readonly call which inf loops
3599     SmallVector<ScheduleData *, 4> ControlDependencies;
3600 
3601     /// This ScheduleData is in the current scheduling region if this matches
3602     /// the current SchedulingRegionID of BlockScheduling.
3603     int SchedulingRegionID = 0;
3604 
3605     /// Used for getting a "good" final ordering of instructions.
3606     int SchedulingPriority = 0;
3607 
3608     /// The number of dependencies. Constitutes of the number of users of the
3609     /// instruction plus the number of dependent memory instructions (if any).
3610     /// This value is calculated on demand.
3611     /// If InvalidDeps, the number of dependencies is not calculated yet.
3612     int Dependencies = InvalidDeps;
3613 
3614     /// The number of dependencies minus the number of dependencies of scheduled
3615     /// instructions. As soon as this is zero, the instruction/bundle gets ready
3616     /// for scheduling.
3617     /// Note that this is negative as long as Dependencies is not calculated.
3618     int UnscheduledDeps = InvalidDeps;
3619 
3620     /// True if this instruction is scheduled (or considered as scheduled in the
3621     /// dry-run).
3622     bool IsScheduled = false;
3623   };
3624 
3625 #ifndef NDEBUG
3626   friend inline raw_ostream &operator<<(raw_ostream &os,
3627                                         const BoUpSLP::ScheduleData &SD) {
3628     SD.dump(os);
3629     return os;
3630   }
3631 #endif
3632 
3633   friend struct GraphTraits<BoUpSLP *>;
3634   friend struct DOTGraphTraits<BoUpSLP *>;
3635 
3636   /// Contains all scheduling data for a basic block.
3637   /// It does not schedules instructions, which are not memory read/write
3638   /// instructions and their operands are either constants, or arguments, or
3639   /// phis, or instructions from others blocks, or their users are phis or from
3640   /// the other blocks. The resulting vector instructions can be placed at the
3641   /// beginning of the basic block without scheduling (if operands does not need
3642   /// to be scheduled) or at the end of the block (if users are outside of the
3643   /// block). It allows to save some compile time and memory used by the
3644   /// compiler.
3645   /// ScheduleData is assigned for each instruction in between the boundaries of
3646   /// the tree entry, even for those, which are not part of the graph. It is
3647   /// required to correctly follow the dependencies between the instructions and
3648   /// their correct scheduling. The ScheduleData is not allocated for the
3649   /// instructions, which do not require scheduling, like phis, nodes with
3650   /// extractelements/insertelements only or nodes with instructions, with
3651   /// uses/operands outside of the block.
3652   struct BlockScheduling {
3653     BlockScheduling(BasicBlock *BB)
3654         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655 
3656     void clear() {
3657       ReadyInsts.clear();
3658       ScheduleStart = nullptr;
3659       ScheduleEnd = nullptr;
3660       FirstLoadStoreInRegion = nullptr;
3661       LastLoadStoreInRegion = nullptr;
3662       RegionHasStackSave = false;
3663 
3664       // Reduce the maximum schedule region size by the size of the
3665       // previous scheduling run.
3666       ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667       if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668         ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669       ScheduleRegionSize = 0;
3670 
3671       // Make a new scheduling region, i.e. all existing ScheduleData is not
3672       // in the new region yet.
3673       ++SchedulingRegionID;
3674     }
3675 
3676     ScheduleData *getScheduleData(Instruction *I) {
3677       if (BB != I->getParent())
3678         // Avoid lookup if can't possibly be in map.
3679         return nullptr;
3680       ScheduleData *SD = ScheduleDataMap.lookup(I);
3681       if (SD && isInSchedulingRegion(SD))
3682         return SD;
3683       return nullptr;
3684     }
3685 
3686     ScheduleData *getScheduleData(Value *V) {
3687       if (auto *I = dyn_cast<Instruction>(V))
3688         return getScheduleData(I);
3689       return nullptr;
3690     }
3691 
3692     ScheduleData *getScheduleData(Value *V, Value *Key) {
3693       if (V == Key)
3694         return getScheduleData(V);
3695       auto I = ExtraScheduleDataMap.find(V);
3696       if (I != ExtraScheduleDataMap.end()) {
3697         ScheduleData *SD = I->second.lookup(Key);
3698         if (SD && isInSchedulingRegion(SD))
3699           return SD;
3700       }
3701       return nullptr;
3702     }
3703 
3704     bool isInSchedulingRegion(ScheduleData *SD) const {
3705       return SD->SchedulingRegionID == SchedulingRegionID;
3706     }
3707 
3708     /// Marks an instruction as scheduled and puts all dependent ready
3709     /// instructions into the ready-list.
3710     template <typename ReadyListType>
3711     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3712       SD->IsScheduled = true;
3713       LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
3714 
3715       for (ScheduleData *BundleMember = SD; BundleMember;
3716            BundleMember = BundleMember->NextInBundle) {
3717         if (BundleMember->Inst != BundleMember->OpValue)
3718           continue;
3719 
3720         // Handle the def-use chain dependencies.
3721 
3722         // Decrement the unscheduled counter and insert to ready list if ready.
3723         auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3724           doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3725             if (OpDef && OpDef->hasValidDependencies() &&
3726                 OpDef->incrementUnscheduledDeps(-1) == 0) {
3727               // There are no more unscheduled dependencies after
3728               // decrementing, so we can put the dependent instruction
3729               // into the ready list.
3730               ScheduleData *DepBundle = OpDef->FirstInBundle;
3731               assert(!DepBundle->IsScheduled &&
3732                      "already scheduled bundle gets ready");
3733               ReadyList.insert(DepBundle);
3734               LLVM_DEBUG(dbgs()
3735                          << "SLP:    gets ready (def): " << *DepBundle << "\n");
3736             }
3737           });
3738         };
3739 
3740         // If BundleMember is a vector bundle, its operands may have been
3741         // reordered during buildTree(). We therefore need to get its operands
3742         // through the TreeEntry.
3743         if (TreeEntry *TE = BundleMember->TE) {
3744           // Need to search for the lane since the tree entry can be reordered.
3745           int Lane = std::distance(TE->Scalars.begin(),
3746                                    find(TE->Scalars, BundleMember->Inst));
3747           assert(Lane >= 0 && "Lane not set");
3748 
3749           // Since vectorization tree is being built recursively this assertion
3750           // ensures that the tree entry has all operands set before reaching
3751           // this code. Couple of exceptions known at the moment are extracts
3752           // where their second (immediate) operand is not added. Since
3753           // immediates do not affect scheduler behavior this is considered
3754           // okay.
3755           auto *In = BundleMember->Inst;
3756           assert(
3757               In &&
3758               (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3759                In->getNumOperands() == TE->getNumOperands()) &&
3760               "Missed TreeEntry operands?");
3761           (void)In; // fake use to avoid build failure when assertions disabled
3762 
3763           for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3764                OpIdx != NumOperands; ++OpIdx)
3765             if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3766               DecrUnsched(I);
3767         } else {
3768           // If BundleMember is a stand-alone instruction, no operand reordering
3769           // has taken place, so we directly access its operands.
3770           for (Use &U : BundleMember->Inst->operands())
3771             if (auto *I = dyn_cast<Instruction>(U.get()))
3772               DecrUnsched(I);
3773         }
3774         // Handle the memory dependencies.
3775         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3776           if (MemoryDepSD->hasValidDependencies() &&
3777               MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3778             // There are no more unscheduled dependencies after decrementing,
3779             // so we can put the dependent instruction into the ready list.
3780             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3781             assert(!DepBundle->IsScheduled &&
3782                    "already scheduled bundle gets ready");
3783             ReadyList.insert(DepBundle);
3784             LLVM_DEBUG(dbgs()
3785                        << "SLP:    gets ready (mem): " << *DepBundle << "\n");
3786           }
3787         }
3788         // Handle the control dependencies.
3789         for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3790           if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3791             // There are no more unscheduled dependencies after decrementing,
3792             // so we can put the dependent instruction into the ready list.
3793             ScheduleData *DepBundle = DepSD->FirstInBundle;
3794             assert(!DepBundle->IsScheduled &&
3795                    "already scheduled bundle gets ready");
3796             ReadyList.insert(DepBundle);
3797             LLVM_DEBUG(dbgs()
3798                        << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
3799           }
3800         }
3801       }
3802     }
3803 
3804     /// Verify basic self consistency properties of the data structure.
3805     void verify() {
3806       if (!ScheduleStart)
3807         return;
3808 
3809       assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810              ScheduleStart->comesBefore(ScheduleEnd) &&
3811              "Not a valid scheduling region?");
3812 
3813       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814         auto *SD = getScheduleData(I);
3815         if (!SD)
3816           continue;
3817         assert(isInSchedulingRegion(SD) &&
3818                "primary schedule data not in window?");
3819         assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820                "entire bundle in window!");
3821         (void)SD;
3822         doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823       }
3824 
3825       for (auto *SD : ReadyInsts) {
3826         assert(SD->isSchedulingEntity() && SD->isReady() &&
3827                "item in ready list not ready?");
3828         (void)SD;
3829       }
3830     }
3831 
3832     void doForAllOpcodes(Value *V,
3833                          function_ref<void(ScheduleData *SD)> Action) {
3834       if (ScheduleData *SD = getScheduleData(V))
3835         Action(SD);
3836       auto I = ExtraScheduleDataMap.find(V);
3837       if (I != ExtraScheduleDataMap.end())
3838         for (auto &P : I->second)
3839           if (isInSchedulingRegion(P.second))
3840             Action(P.second);
3841     }
3842 
3843     /// Put all instructions into the ReadyList which are ready for scheduling.
3844     template <typename ReadyListType>
3845     void initialFillReadyList(ReadyListType &ReadyList) {
3846       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847         doForAllOpcodes(I, [&](ScheduleData *SD) {
3848           if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849               SD->isReady()) {
3850             ReadyList.insert(SD);
3851             LLVM_DEBUG(dbgs()
3852                        << "SLP:    initially in ready list: " << *SD << "\n");
3853           }
3854         });
3855       }
3856     }
3857 
3858     /// Build a bundle from the ScheduleData nodes corresponding to the
3859     /// scalar instruction for each lane.
3860     ScheduleData *buildBundle(ArrayRef<Value *> VL);
3861 
3862     /// Checks if a bundle of instructions can be scheduled, i.e. has no
3863     /// cyclic dependencies. This is only a dry-run, no instructions are
3864     /// actually moved at this stage.
3865     /// \returns the scheduling bundle. The returned Optional value is not
3866     /// std::nullopt if \p VL is allowed to be scheduled.
3867     std::optional<ScheduleData *>
3868     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3869                       const InstructionsState &S);
3870 
3871     /// Un-bundles a group of instructions.
3872     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3873 
3874     /// Allocates schedule data chunk.
3875     ScheduleData *allocateScheduleDataChunks();
3876 
3877     /// Extends the scheduling region so that V is inside the region.
3878     /// \returns true if the region size is within the limit.
3879     bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3880 
3881     /// Initialize the ScheduleData structures for new instructions in the
3882     /// scheduling region.
3883     void initScheduleData(Instruction *FromI, Instruction *ToI,
3884                           ScheduleData *PrevLoadStore,
3885                           ScheduleData *NextLoadStore);
3886 
3887     /// Updates the dependency information of a bundle and of all instructions/
3888     /// bundles which depend on the original bundle.
3889     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3890                                BoUpSLP *SLP);
3891 
3892     /// Sets all instruction in the scheduling region to un-scheduled.
3893     void resetSchedule();
3894 
3895     BasicBlock *BB;
3896 
3897     /// Simple memory allocation for ScheduleData.
3898     SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3899 
3900     /// The size of a ScheduleData array in ScheduleDataChunks.
3901     int ChunkSize;
3902 
3903     /// The allocator position in the current chunk, which is the last entry
3904     /// of ScheduleDataChunks.
3905     int ChunkPos;
3906 
3907     /// Attaches ScheduleData to Instruction.
3908     /// Note that the mapping survives during all vectorization iterations, i.e.
3909     /// ScheduleData structures are recycled.
3910     DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3911 
3912     /// Attaches ScheduleData to Instruction with the leading key.
3913     DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3914         ExtraScheduleDataMap;
3915 
3916     /// The ready-list for scheduling (only used for the dry-run).
3917     SetVector<ScheduleData *> ReadyInsts;
3918 
3919     /// The first instruction of the scheduling region.
3920     Instruction *ScheduleStart = nullptr;
3921 
3922     /// The first instruction _after_ the scheduling region.
3923     Instruction *ScheduleEnd = nullptr;
3924 
3925     /// The first memory accessing instruction in the scheduling region
3926     /// (can be null).
3927     ScheduleData *FirstLoadStoreInRegion = nullptr;
3928 
3929     /// The last memory accessing instruction in the scheduling region
3930     /// (can be null).
3931     ScheduleData *LastLoadStoreInRegion = nullptr;
3932 
3933     /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3934     /// region?  Used to optimize the dependence calculation for the
3935     /// common case where there isn't.
3936     bool RegionHasStackSave = false;
3937 
3938     /// The current size of the scheduling region.
3939     int ScheduleRegionSize = 0;
3940 
3941     /// The maximum size allowed for the scheduling region.
3942     int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3943 
3944     /// The ID of the scheduling region. For a new vectorization iteration this
3945     /// is incremented which "removes" all ScheduleData from the region.
3946     /// Make sure that the initial SchedulingRegionID is greater than the
3947     /// initial SchedulingRegionID in ScheduleData (which is 0).
3948     int SchedulingRegionID = 1;
3949   };
3950 
3951   /// Attaches the BlockScheduling structures to basic blocks.
3952   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3953 
3954   /// Performs the "real" scheduling. Done before vectorization is actually
3955   /// performed in a basic block.
3956   void scheduleBlock(BlockScheduling *BS);
3957 
3958   /// List of users to ignore during scheduling and that don't need extracting.
3959   const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960 
3961   /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962   /// sorted SmallVectors of unsigned.
3963   struct OrdersTypeDenseMapInfo {
3964     static OrdersType getEmptyKey() {
3965       OrdersType V;
3966       V.push_back(~1U);
3967       return V;
3968     }
3969 
3970     static OrdersType getTombstoneKey() {
3971       OrdersType V;
3972       V.push_back(~2U);
3973       return V;
3974     }
3975 
3976     static unsigned getHashValue(const OrdersType &V) {
3977       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978     }
3979 
3980     static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981       return LHS == RHS;
3982     }
3983   };
3984 
3985   // Analysis and block reference.
3986   Function *F;
3987   ScalarEvolution *SE;
3988   TargetTransformInfo *TTI;
3989   TargetLibraryInfo *TLI;
3990   LoopInfo *LI;
3991   DominatorTree *DT;
3992   AssumptionCache *AC;
3993   DemandedBits *DB;
3994   const DataLayout *DL;
3995   OptimizationRemarkEmitter *ORE;
3996 
3997   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3998   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3999 
4000   /// Instruction builder to construct the vectorized tree.
4001   IRBuilder<TargetFolder> Builder;
4002 
4003   /// A map of scalar integer values to the smallest bit width with which they
4004   /// can legally be represented. The values map to (width, signed) pairs,
4005   /// where "width" indicates the minimum bit width and "signed" is True if the
4006   /// value must be signed-extended, rather than zero-extended, back to its
4007   /// original width.
4008   DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4009 
4010   /// Final size of the reduced vector, if the current graph represents the
4011   /// input for the reduction and it was possible to narrow the size of the
4012   /// reduction.
4013   unsigned ReductionBitWidth = 0;
4014 
4015   /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4016   /// type sizes, used in the tree.
4017   std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4018 
4019   /// Indices of the vectorized nodes, which supposed to be the roots of the new
4020   /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4021   DenseSet<unsigned> ExtraBitWidthNodes;
4022 };
4023 
4024 } // end namespace slpvectorizer
4025 
4026 template <> struct GraphTraits<BoUpSLP *> {
4027   using TreeEntry = BoUpSLP::TreeEntry;
4028 
4029   /// NodeRef has to be a pointer per the GraphWriter.
4030   using NodeRef = TreeEntry *;
4031 
4032   using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4033 
4034   /// Add the VectorizableTree to the index iterator to be able to return
4035   /// TreeEntry pointers.
4036   struct ChildIteratorType
4037       : public iterator_adaptor_base<
4038             ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4039     ContainerTy &VectorizableTree;
4040 
4041     ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4042                       ContainerTy &VT)
4043         : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4044 
4045     NodeRef operator*() { return I->UserTE; }
4046   };
4047 
4048   static NodeRef getEntryNode(BoUpSLP &R) {
4049     return R.VectorizableTree[0].get();
4050   }
4051 
4052   static ChildIteratorType child_begin(NodeRef N) {
4053     return {N->UserTreeIndices.begin(), N->Container};
4054   }
4055 
4056   static ChildIteratorType child_end(NodeRef N) {
4057     return {N->UserTreeIndices.end(), N->Container};
4058   }
4059 
4060   /// For the node iterator we just need to turn the TreeEntry iterator into a
4061   /// TreeEntry* iterator so that it dereferences to NodeRef.
4062   class nodes_iterator {
4063     using ItTy = ContainerTy::iterator;
4064     ItTy It;
4065 
4066   public:
4067     nodes_iterator(const ItTy &It2) : It(It2) {}
4068     NodeRef operator*() { return It->get(); }
4069     nodes_iterator operator++() {
4070       ++It;
4071       return *this;
4072     }
4073     bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4074   };
4075 
4076   static nodes_iterator nodes_begin(BoUpSLP *R) {
4077     return nodes_iterator(R->VectorizableTree.begin());
4078   }
4079 
4080   static nodes_iterator nodes_end(BoUpSLP *R) {
4081     return nodes_iterator(R->VectorizableTree.end());
4082   }
4083 
4084   static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4085 };
4086 
4087 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088   using TreeEntry = BoUpSLP::TreeEntry;
4089 
4090   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091 
4092   std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093     std::string Str;
4094     raw_string_ostream OS(Str);
4095     OS << Entry->Idx << ".\n";
4096     if (isSplat(Entry->Scalars))
4097       OS << "<splat> ";
4098     for (auto *V : Entry->Scalars) {
4099       OS << *V;
4100       if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101             return EU.Scalar == V;
4102           }))
4103         OS << " <extract>";
4104       OS << "\n";
4105     }
4106     return Str;
4107   }
4108 
4109   static std::string getNodeAttributes(const TreeEntry *Entry,
4110                                        const BoUpSLP *) {
4111     if (Entry->isGather())
4112       return "color=red";
4113     if (Entry->State == TreeEntry::ScatterVectorize ||
4114         Entry->State == TreeEntry::StridedVectorize)
4115       return "color=blue";
4116     return "";
4117   }
4118 };
4119 
4120 } // end namespace llvm
4121 
4122 BoUpSLP::~BoUpSLP() {
4123   SmallVector<WeakTrackingVH> DeadInsts;
4124   for (auto *I : DeletedInstructions) {
4125     if (!I->getParent()) {
4126       // Temporarily insert instruction back to erase them from parent and
4127       // memory later.
4128       if (isa<PHINode>(I))
4129         // Phi nodes must be the very first instructions in the block.
4130         I->insertBefore(F->getEntryBlock(),
4131                         F->getEntryBlock().getFirstNonPHIIt());
4132       else
4133         I->insertBefore(F->getEntryBlock().getTerminator());
4134       continue;
4135     }
4136     for (Use &U : I->operands()) {
4137       auto *Op = dyn_cast<Instruction>(U.get());
4138       if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4139           wouldInstructionBeTriviallyDead(Op, TLI))
4140         DeadInsts.emplace_back(Op);
4141     }
4142     I->dropAllReferences();
4143   }
4144   for (auto *I : DeletedInstructions) {
4145     assert(I->use_empty() &&
4146            "trying to erase instruction with users.");
4147     I->eraseFromParent();
4148   }
4149 
4150   // Cleanup any dead scalar code feeding the vectorized instructions
4151   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4152 
4153 #ifdef EXPENSIVE_CHECKS
4154   // If we could guarantee that this call is not extremely slow, we could
4155   // remove the ifdef limitation (see PR47712).
4156   assert(!verifyFunction(*F, &dbgs()));
4157 #endif
4158 }
4159 
4160 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161 /// contains original mask for the scalars reused in the node. Procedure
4162 /// transform this mask in accordance with the given \p Mask.
4163 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165          "Expected non-empty mask.");
4166   SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167   Prev.swap(Reuses);
4168   for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169     if (Mask[I] != PoisonMaskElem)
4170       Reuses[Mask[I]] = Prev[I];
4171 }
4172 
4173 /// Reorders the given \p Order according to the given \p Mask. \p Order - is
4174 /// the original order of the scalars. Procedure transforms the provided order
4175 /// in accordance with the given \p Mask. If the resulting \p Order is just an
4176 /// identity order, \p Order is cleared.
4177 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4178                          bool BottomOrder = false) {
4179   assert(!Mask.empty() && "Expected non-empty mask.");
4180   unsigned Sz = Mask.size();
4181   if (BottomOrder) {
4182     SmallVector<unsigned> PrevOrder;
4183     if (Order.empty()) {
4184       PrevOrder.resize(Sz);
4185       std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4186     } else {
4187       PrevOrder.swap(Order);
4188     }
4189     Order.assign(Sz, Sz);
4190     for (unsigned I = 0; I < Sz; ++I)
4191       if (Mask[I] != PoisonMaskElem)
4192         Order[I] = PrevOrder[Mask[I]];
4193     if (all_of(enumerate(Order), [&](const auto &Data) {
4194           return Data.value() == Sz || Data.index() == Data.value();
4195         })) {
4196       Order.clear();
4197       return;
4198     }
4199     fixupOrderingIndices(Order);
4200     return;
4201   }
4202   SmallVector<int> MaskOrder;
4203   if (Order.empty()) {
4204     MaskOrder.resize(Sz);
4205     std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4206   } else {
4207     inversePermutation(Order, MaskOrder);
4208   }
4209   reorderReuses(MaskOrder, Mask);
4210   if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4211     Order.clear();
4212     return;
4213   }
4214   Order.assign(Sz, Sz);
4215   for (unsigned I = 0; I < Sz; ++I)
4216     if (MaskOrder[I] != PoisonMaskElem)
4217       Order[MaskOrder[I]] = I;
4218   fixupOrderingIndices(Order);
4219 }
4220 
4221 std::optional<BoUpSLP::OrdersType>
4222 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223   assert(TE.isGather() && "Expected gather node only.");
4224   // Try to find subvector extract/insert patterns and reorder only such
4225   // patterns.
4226   SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227   Type *ScalarTy = GatheredScalars.front()->getType();
4228   int NumScalars = GatheredScalars.size();
4229   if (!isValidElementType(ScalarTy))
4230     return std::nullopt;
4231   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4232   int NumParts = TTI->getNumberOfParts(VecTy);
4233   if (NumParts == 0 || NumParts >= NumScalars)
4234     NumParts = 1;
4235   SmallVector<int> ExtractMask;
4236   SmallVector<int> Mask;
4237   SmallVector<SmallVector<const TreeEntry *>> Entries;
4238   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4239       tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4240   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4241       isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4242                             /*ForOrder=*/true);
4243   // No shuffled operands - ignore.
4244   if (GatherShuffles.empty() && ExtractShuffles.empty())
4245     return std::nullopt;
4246   OrdersType CurrentOrder(NumScalars, NumScalars);
4247   if (GatherShuffles.size() == 1 &&
4248       *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249       Entries.front().front()->isSame(TE.Scalars)) {
4250     // Perfect match in the graph, will reuse the previously vectorized
4251     // node. Cost is 0.
4252     std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4253     return CurrentOrder;
4254   }
4255   auto IsSplatMask = [](ArrayRef<int> Mask) {
4256     int SingleElt = PoisonMaskElem;
4257     return all_of(Mask, [&](int I) {
4258       if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259         SingleElt = I;
4260       return I == PoisonMaskElem || I == SingleElt;
4261     });
4262   };
4263   // Exclusive broadcast mask - ignore.
4264   if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265        (Entries.size() != 1 ||
4266         Entries.front().front()->ReorderIndices.empty())) ||
4267       (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268     return std::nullopt;
4269   SmallBitVector ShuffledSubMasks(NumParts);
4270   auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271                                   ArrayRef<int> Mask, int PartSz, int NumParts,
4272                                   function_ref<unsigned(unsigned)> GetVF) {
4273     for (int I : seq<int>(0, NumParts)) {
4274       if (ShuffledSubMasks.test(I))
4275         continue;
4276       const int VF = GetVF(I);
4277       if (VF == 0)
4278         continue;
4279       unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4280       MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4281       // Shuffle of at least 2 vectors - ignore.
4282       if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4283         std::fill(Slice.begin(), Slice.end(), NumScalars);
4284         ShuffledSubMasks.set(I);
4285         continue;
4286       }
4287       // Try to include as much elements from the mask as possible.
4288       int FirstMin = INT_MAX;
4289       int SecondVecFound = false;
4290       for (int K : seq<int>(Limit)) {
4291         int Idx = Mask[I * PartSz + K];
4292         if (Idx == PoisonMaskElem) {
4293           Value *V = GatheredScalars[I * PartSz + K];
4294           if (isConstant(V) && !isa<PoisonValue>(V)) {
4295             SecondVecFound = true;
4296             break;
4297           }
4298           continue;
4299         }
4300         if (Idx < VF) {
4301           if (FirstMin > Idx)
4302             FirstMin = Idx;
4303         } else {
4304           SecondVecFound = true;
4305           break;
4306         }
4307       }
4308       FirstMin = (FirstMin / PartSz) * PartSz;
4309       // Shuffle of at least 2 vectors - ignore.
4310       if (SecondVecFound) {
4311         std::fill(Slice.begin(), Slice.end(), NumScalars);
4312         ShuffledSubMasks.set(I);
4313         continue;
4314       }
4315       for (int K : seq<int>(Limit)) {
4316         int Idx = Mask[I * PartSz + K];
4317         if (Idx == PoisonMaskElem)
4318           continue;
4319         Idx -= FirstMin;
4320         if (Idx >= PartSz) {
4321           SecondVecFound = true;
4322           break;
4323         }
4324         if (CurrentOrder[I * PartSz + Idx] >
4325                 static_cast<unsigned>(I * PartSz + K) &&
4326             CurrentOrder[I * PartSz + Idx] !=
4327                 static_cast<unsigned>(I * PartSz + Idx))
4328           CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329       }
4330       // Shuffle of at least 2 vectors - ignore.
4331       if (SecondVecFound) {
4332         std::fill(Slice.begin(), Slice.end(), NumScalars);
4333         ShuffledSubMasks.set(I);
4334         continue;
4335       }
4336     }
4337   };
4338   int PartSz = getPartNumElems(NumScalars, NumParts);
4339   if (!ExtractShuffles.empty())
4340     TransformMaskToOrder(
4341         CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342           if (!ExtractShuffles[I])
4343             return 0U;
4344           unsigned VF = 0;
4345           unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4346           for (unsigned Idx : seq<unsigned>(Sz)) {
4347             int K = I * PartSz + Idx;
4348             if (ExtractMask[K] == PoisonMaskElem)
4349               continue;
4350             if (!TE.ReuseShuffleIndices.empty())
4351               K = TE.ReuseShuffleIndices[K];
4352             if (!TE.ReorderIndices.empty())
4353               K = std::distance(TE.ReorderIndices.begin(),
4354                                 find(TE.ReorderIndices, K));
4355             auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4356             if (!EI)
4357               continue;
4358             VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4359                                   ->getElementCount()
4360                                   .getKnownMinValue());
4361           }
4362           return VF;
4363         });
4364   // Check special corner case - single shuffle of the same entry.
4365   if (GatherShuffles.size() == 1 && NumParts != 1) {
4366     if (ShuffledSubMasks.any())
4367       return std::nullopt;
4368     PartSz = NumScalars;
4369     NumParts = 1;
4370   }
4371   if (!Entries.empty())
4372     TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373       if (!GatherShuffles[I])
4374         return 0U;
4375       return std::max(Entries[I].front()->getVectorFactor(),
4376                       Entries[I].back()->getVectorFactor());
4377     });
4378   int NumUndefs =
4379       count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4380   if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381     return std::nullopt;
4382   return std::move(CurrentOrder);
4383 }
4384 
4385 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386                                   const TargetLibraryInfo &TLI,
4387                                   bool CompareOpcodes = true) {
4388   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4389     return false;
4390   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4391   if (!GEP1)
4392     return false;
4393   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4394   if (!GEP2)
4395     return false;
4396   return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397          ((isConstant(GEP1->getOperand(1)) &&
4398            isConstant(GEP2->getOperand(1))) ||
4399           !CompareOpcodes ||
4400           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4401               .getOpcode());
4402 }
4403 
4404 /// Calculates minimal alignment as a common alignment.
4405 template <typename T>
4406 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4407   Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408   for (Value *V : VL.drop_front())
4409     CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410   return CommonAlignment;
4411 }
4412 
4413 /// Check if \p Order represents reverse order.
4414 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415   unsigned Sz = Order.size();
4416   return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4417     return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418   });
4419 }
4420 
4421 /// Checks if the provided list of pointers \p Pointers represents the strided
4422 /// pointers for type ElemTy. If they are not, std::nullopt is returned.
4423 /// Otherwise, if \p Inst is not specified, just initialized optional value is
4424 /// returned to show that the pointers represent strided pointers. If \p Inst
4425 /// specified, the runtime stride is materialized before the given \p Inst.
4426 /// \returns std::nullopt if the pointers are not pointers with the runtime
4427 /// stride, nullptr or actual stride value, otherwise.
4428 static std::optional<Value *>
4429 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4430                   const DataLayout &DL, ScalarEvolution &SE,
4431                   SmallVectorImpl<unsigned> &SortedIndices,
4432                   Instruction *Inst = nullptr) {
4433   SmallVector<const SCEV *> SCEVs;
4434   const SCEV *PtrSCEVLowest = nullptr;
4435   const SCEV *PtrSCEVHighest = nullptr;
4436   // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4437   // addresses).
4438   for (Value *Ptr : PointerOps) {
4439     const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4440     if (!PtrSCEV)
4441       return std::nullopt;
4442     SCEVs.push_back(PtrSCEV);
4443     if (!PtrSCEVLowest && !PtrSCEVHighest) {
4444       PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4445       continue;
4446     }
4447     const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4448     if (isa<SCEVCouldNotCompute>(Diff))
4449       return std::nullopt;
4450     if (Diff->isNonConstantNegative()) {
4451       PtrSCEVLowest = PtrSCEV;
4452       continue;
4453     }
4454     const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4455     if (isa<SCEVCouldNotCompute>(Diff1))
4456       return std::nullopt;
4457     if (Diff1->isNonConstantNegative()) {
4458       PtrSCEVHighest = PtrSCEV;
4459       continue;
4460     }
4461   }
4462   // Dist = PtrSCEVHighest - PtrSCEVLowest;
4463   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4464   if (isa<SCEVCouldNotCompute>(Dist))
4465     return std::nullopt;
4466   int Size = DL.getTypeStoreSize(ElemTy);
4467   auto TryGetStride = [&](const SCEV *Dist,
4468                           const SCEV *Multiplier) -> const SCEV * {
4469     if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4470       if (M->getOperand(0) == Multiplier)
4471         return M->getOperand(1);
4472       if (M->getOperand(1) == Multiplier)
4473         return M->getOperand(0);
4474       return nullptr;
4475     }
4476     if (Multiplier == Dist)
4477       return SE.getConstant(Dist->getType(), 1);
4478     return SE.getUDivExactExpr(Dist, Multiplier);
4479   };
4480   // Stride_in_elements = Dist / element_size * (num_elems - 1).
4481   const SCEV *Stride = nullptr;
4482   if (Size != 1 || SCEVs.size() > 2) {
4483     const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4484     Stride = TryGetStride(Dist, Sz);
4485     if (!Stride)
4486       return std::nullopt;
4487   }
4488   if (!Stride || isa<SCEVConstant>(Stride))
4489     return std::nullopt;
4490   // Iterate through all pointers and check if all distances are
4491   // unique multiple of Stride.
4492   using DistOrdPair = std::pair<int64_t, int>;
4493   auto Compare = llvm::less_first();
4494   std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4495   int Cnt = 0;
4496   bool IsConsecutive = true;
4497   for (const SCEV *PtrSCEV : SCEVs) {
4498     unsigned Dist = 0;
4499     if (PtrSCEV != PtrSCEVLowest) {
4500       const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4501       const SCEV *Coeff = TryGetStride(Diff, Stride);
4502       if (!Coeff)
4503         return std::nullopt;
4504       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4505       if (!SC || isa<SCEVCouldNotCompute>(SC))
4506         return std::nullopt;
4507       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4508                                                   SE.getMulExpr(Stride, SC)))
4509                ->isZero())
4510         return std::nullopt;
4511       Dist = SC->getAPInt().getZExtValue();
4512     }
4513     // If the strides are not the same or repeated, we can't vectorize.
4514     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4515       return std::nullopt;
4516     auto Res = Offsets.emplace(Dist, Cnt);
4517     if (!Res.second)
4518       return std::nullopt;
4519     // Consecutive order if the inserted element is the last one.
4520     IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4521     ++Cnt;
4522   }
4523   if (Offsets.size() != SCEVs.size())
4524     return std::nullopt;
4525   SortedIndices.clear();
4526   if (!IsConsecutive) {
4527     // Fill SortedIndices array only if it is non-consecutive.
4528     SortedIndices.resize(PointerOps.size());
4529     Cnt = 0;
4530     for (const std::pair<int64_t, int> &Pair : Offsets) {
4531       SortedIndices[Cnt] = Pair.second;
4532       ++Cnt;
4533     }
4534   }
4535   if (!Inst)
4536     return nullptr;
4537   SCEVExpander Expander(SE, DL, "strided-load-vec");
4538   return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4539 }
4540 
4541 static std::pair<InstructionCost, InstructionCost>
4542 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4543             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4544             Type *ScalarTy, VectorType *VecTy);
4545 
4546 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4547     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4548     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4549   // Check that a vectorized load would load the same memory as a scalar
4550   // load. For example, we don't want to vectorize loads that are smaller
4551   // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4552   // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4553   // from such a struct, we read/write packed bits disagreeing with the
4554   // unvectorized version.
4555   Type *ScalarTy = VL0->getType();
4556 
4557   if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4558     return LoadsState::Gather;
4559 
4560   // Make sure all loads in the bundle are simple - we can't vectorize
4561   // atomic or volatile loads.
4562   PointerOps.clear();
4563   const unsigned Sz = VL.size();
4564   PointerOps.resize(Sz);
4565   auto *POIter = PointerOps.begin();
4566   for (Value *V : VL) {
4567     auto *L = cast<LoadInst>(V);
4568     if (!L->isSimple())
4569       return LoadsState::Gather;
4570     *POIter = L->getPointerOperand();
4571     ++POIter;
4572   }
4573 
4574   Order.clear();
4575   auto *VecTy = getWidenedType(ScalarTy, Sz);
4576   // Check the order of pointer operands or that all pointers are the same.
4577   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4578   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4579   if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4580     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4581                                    "supported with VectorizeNonPowerOf2");
4582     return LoadsState::Gather;
4583   }
4584 
4585   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4586   if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4587       TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4588       calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4589     return LoadsState::StridedVectorize;
4590   if (IsSorted || all_of(PointerOps, [&](Value *P) {
4591         return arePointersCompatible(P, PointerOps.front(), *TLI);
4592       })) {
4593     if (IsSorted) {
4594       Value *Ptr0;
4595       Value *PtrN;
4596       if (Order.empty()) {
4597         Ptr0 = PointerOps.front();
4598         PtrN = PointerOps.back();
4599       } else {
4600         Ptr0 = PointerOps[Order.front()];
4601         PtrN = PointerOps[Order.back()];
4602       }
4603       std::optional<int> Diff =
4604           getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4605       // Check that the sorted loads are consecutive.
4606       if (static_cast<unsigned>(*Diff) == Sz - 1)
4607         return LoadsState::Vectorize;
4608       // Simple check if not a strided access - clear order.
4609       bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4610       // Try to generate strided load node if:
4611       // 1. Target with strided load support is detected.
4612       // 2. The number of loads is greater than MinProfitableStridedLoads,
4613       // or the potential stride <= MaxProfitableLoadStride and the
4614       // potential stride is power-of-2 (to avoid perf regressions for the very
4615       // small number of loads) and max distance > number of loads, or potential
4616       // stride is -1.
4617       // 3. The loads are ordered, or number of unordered loads <=
4618       // MaxProfitableUnorderedLoads, or loads are in reversed order.
4619       // (this check is to avoid extra costs for very expensive shuffles).
4620       if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4621                                   (static_cast<unsigned>(std::abs(*Diff)) <=
4622                                        MaxProfitableLoadStride * Sz &&
4623                                    isPowerOf2_32(std::abs(*Diff)))) &&
4624                                  static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4625                                 *Diff == -(static_cast<int>(Sz) - 1))) {
4626         int Stride = *Diff / static_cast<int>(Sz - 1);
4627         if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4628           Align Alignment =
4629               cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4630                   ->getAlign();
4631           if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4632             // Iterate through all pointers and check if all distances are
4633             // unique multiple of Dist.
4634             SmallSet<int, 4> Dists;
4635             for (Value *Ptr : PointerOps) {
4636               int Dist = 0;
4637               if (Ptr == PtrN)
4638                 Dist = *Diff;
4639               else if (Ptr != Ptr0)
4640                 Dist =
4641                     *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4642               // If the strides are not the same or repeated, we can't
4643               // vectorize.
4644               if (((Dist / Stride) * Stride) != Dist ||
4645                   !Dists.insert(Dist).second)
4646                 break;
4647             }
4648             if (Dists.size() == Sz)
4649               return LoadsState::StridedVectorize;
4650           }
4651         }
4652       }
4653     }
4654     auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4655       unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4656       unsigned MinVF = getMinVF(Sz);
4657       unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4658       MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4659       for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4660         unsigned VectorizedCnt = 0;
4661         SmallVector<LoadsState> States;
4662         for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4663              Cnt += VF, ++VectorizedCnt) {
4664           ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4665           SmallVector<unsigned> Order;
4666           SmallVector<Value *> PointerOps;
4667           LoadsState LS =
4668               canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4669                                 /*TryRecursiveCheck=*/false);
4670           // Check that the sorted loads are consecutive.
4671           if (LS == LoadsState::Gather)
4672             break;
4673           // If need the reorder - consider as high-cost masked gather for now.
4674           if ((LS == LoadsState::Vectorize ||
4675                LS == LoadsState::StridedVectorize) &&
4676               !Order.empty() && !isReverseOrder(Order))
4677             LS = LoadsState::ScatterVectorize;
4678           States.push_back(LS);
4679         }
4680         // Can be vectorized later as a serie of loads/insertelements.
4681         if (VectorizedCnt == VL.size() / VF) {
4682           // Compare masked gather cost and loads + insersubvector costs.
4683           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4684           auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4685               TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4686               CostKind, ScalarTy, VecTy);
4687           InstructionCost MaskedGatherCost =
4688               TTI.getGatherScatterOpCost(
4689                   Instruction::Load, VecTy,
4690                   cast<LoadInst>(VL0)->getPointerOperand(),
4691                   /*VariableMask=*/false, CommonAlignment, CostKind) +
4692               VectorGEPCost - ScalarGEPCost;
4693           InstructionCost VecLdCost = 0;
4694           auto *SubVecTy = getWidenedType(ScalarTy, VF);
4695           for (auto [I, LS] : enumerate(States)) {
4696             auto *LI0 = cast<LoadInst>(VL[I * VF]);
4697             switch (LS) {
4698             case LoadsState::Vectorize: {
4699               auto [ScalarGEPCost, VectorGEPCost] =
4700                   getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4701                               LI0->getPointerOperand(), Instruction::Load,
4702                               CostKind, ScalarTy, SubVecTy);
4703               VecLdCost += TTI.getMemoryOpCost(
4704                                Instruction::Load, SubVecTy, LI0->getAlign(),
4705                                LI0->getPointerAddressSpace(), CostKind,
4706                                TTI::OperandValueInfo()) +
4707                            VectorGEPCost - ScalarGEPCost;
4708               break;
4709             }
4710             case LoadsState::StridedVectorize: {
4711               auto [ScalarGEPCost, VectorGEPCost] =
4712                   getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4713                               LI0->getPointerOperand(), Instruction::Load,
4714                               CostKind, ScalarTy, SubVecTy);
4715               VecLdCost +=
4716                   TTI.getStridedMemoryOpCost(
4717                       Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4718                       /*VariableMask=*/false, CommonAlignment, CostKind) +
4719                   VectorGEPCost - ScalarGEPCost;
4720               break;
4721             }
4722             case LoadsState::ScatterVectorize: {
4723               auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4724                   TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4725                   LI0->getPointerOperand(), Instruction::GetElementPtr,
4726                   CostKind, ScalarTy, SubVecTy);
4727               VecLdCost +=
4728                   TTI.getGatherScatterOpCost(
4729                       Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4730                       /*VariableMask=*/false, CommonAlignment, CostKind) +
4731                   VectorGEPCost - ScalarGEPCost;
4732               break;
4733             }
4734             case LoadsState::Gather:
4735               llvm_unreachable(
4736                   "Expected only consecutive, strided or masked gather loads.");
4737             }
4738             SmallVector<int> ShuffleMask(VL.size());
4739             for (int Idx : seq<int>(0, VL.size()))
4740               ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4741             VecLdCost +=
4742                 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4743                                    CostKind, I * VF, SubVecTy);
4744           }
4745           // If masked gather cost is higher - better to vectorize, so
4746           // consider it as a gather node. It will be better estimated
4747           // later.
4748           if (MaskedGatherCost >= VecLdCost)
4749             return true;
4750         }
4751       }
4752       return false;
4753     };
4754     // TODO: need to improve analysis of the pointers, if not all of them are
4755     // GEPs or have > 2 operands, we end up with a gather node, which just
4756     // increases the cost.
4757     Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4758     bool ProfitableGatherPointers =
4759         L && Sz > 2 &&
4760         static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4761           return L->isLoopInvariant(V);
4762         })) <= Sz / 2;
4763     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4764           auto *GEP = dyn_cast<GetElementPtrInst>(P);
4765           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4766                  (GEP && GEP->getNumOperands() == 2 &&
4767                   isa<Constant, Instruction>(GEP->getOperand(1)));
4768         })) {
4769       Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4770       if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4771           !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4772         // Check if potential masked gather can be represented as series
4773         // of loads + insertsubvectors.
4774         if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4775           // If masked gather cost is higher - better to vectorize, so
4776           // consider it as a gather node. It will be better estimated
4777           // later.
4778           return LoadsState::Gather;
4779         }
4780         return LoadsState::ScatterVectorize;
4781       }
4782     }
4783   }
4784 
4785   return LoadsState::Gather;
4786 }
4787 
4788 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789                                    const DataLayout &DL, ScalarEvolution &SE,
4790                                    SmallVectorImpl<unsigned> &SortedIndices) {
4791   assert(llvm::all_of(
4792              VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793          "Expected list of pointer operands.");
4794   // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4795   // Ptr into, sort and return the sorted indices with values next to one
4796   // another.
4797   MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798   Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4799 
4800   unsigned Cnt = 1;
4801   for (Value *Ptr : VL.drop_front()) {
4802     bool Found = any_of(Bases, [&](auto &Base) {
4803       std::optional<int> Diff =
4804           getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805                           /*StrictCheck=*/true);
4806       if (!Diff)
4807         return false;
4808 
4809       Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810       return true;
4811     });
4812 
4813     if (!Found) {
4814       // If we haven't found enough to usefully cluster, return early.
4815       if (Bases.size() > VL.size() / 2 - 1)
4816         return false;
4817 
4818       // Not found already - add a new Base
4819       Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4820     }
4821   }
4822 
4823   // For each of the bases sort the pointers by Offset and check if any of the
4824   // base become consecutively allocated.
4825   bool AnyConsecutive = false;
4826   for (auto &Base : Bases) {
4827     auto &Vec = Base.second;
4828     if (Vec.size() > 1) {
4829       llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4830                                 const std::tuple<Value *, int, unsigned> &Y) {
4831         return std::get<1>(X) < std::get<1>(Y);
4832       });
4833       int InitialOffset = std::get<1>(Vec[0]);
4834       AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4835         return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836       });
4837     }
4838   }
4839 
4840   // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4841   SortedIndices.clear();
4842   if (!AnyConsecutive)
4843     return false;
4844 
4845   for (auto &Base : Bases) {
4846     for (auto &T : Base.second)
4847       SortedIndices.push_back(std::get<2>(T));
4848   }
4849 
4850   assert(SortedIndices.size() == VL.size() &&
4851          "Expected SortedIndices to be the size of VL");
4852   return true;
4853 }
4854 
4855 std::optional<BoUpSLP::OrdersType>
4856 BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857   assert(TE.isGather() && "Expected gather node only.");
4858   Type *ScalarTy = TE.Scalars[0]->getType();
4859 
4860   SmallVector<Value *> Ptrs;
4861   Ptrs.reserve(TE.Scalars.size());
4862   for (Value *V : TE.Scalars) {
4863     auto *L = dyn_cast<LoadInst>(V);
4864     if (!L || !L->isSimple())
4865       return std::nullopt;
4866     Ptrs.push_back(L->getPointerOperand());
4867   }
4868 
4869   BoUpSLP::OrdersType Order;
4870   if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4871     return std::move(Order);
4872   return std::nullopt;
4873 }
4874 
4875 /// Check if two insertelement instructions are from the same buildvector.
4876 static bool areTwoInsertFromSameBuildVector(
4877     InsertElementInst *VU, InsertElementInst *V,
4878     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4879   // Instructions must be from the same basic blocks.
4880   if (VU->getParent() != V->getParent())
4881     return false;
4882   // Checks if 2 insertelements are from the same buildvector.
4883   if (VU->getType() != V->getType())
4884     return false;
4885   // Multiple used inserts are separate nodes.
4886   if (!VU->hasOneUse() && !V->hasOneUse())
4887     return false;
4888   auto *IE1 = VU;
4889   auto *IE2 = V;
4890   std::optional<unsigned> Idx1 = getElementIndex(IE1);
4891   std::optional<unsigned> Idx2 = getElementIndex(IE2);
4892   if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4893     return false;
4894   // Go through the vector operand of insertelement instructions trying to find
4895   // either VU as the original vector for IE2 or V as the original vector for
4896   // IE1.
4897   SmallBitVector ReusedIdx(
4898       cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4899   bool IsReusedIdx = false;
4900   do {
4901     if (IE2 == VU && !IE1)
4902       return VU->hasOneUse();
4903     if (IE1 == V && !IE2)
4904       return V->hasOneUse();
4905     if (IE1 && IE1 != V) {
4906       unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
4907       IsReusedIdx |= ReusedIdx.test(Idx1);
4908       ReusedIdx.set(Idx1);
4909       if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4910         IE1 = nullptr;
4911       else
4912         IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4913     }
4914     if (IE2 && IE2 != VU) {
4915       unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
4916       IsReusedIdx |= ReusedIdx.test(Idx2);
4917       ReusedIdx.set(Idx2);
4918       if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4919         IE2 = nullptr;
4920       else
4921         IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4922     }
4923   } while (!IsReusedIdx && (IE1 || IE2));
4924   return false;
4925 }
4926 
4927 std::optional<BoUpSLP::OrdersType>
4928 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4929   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4930   if (TE.isNonPowOf2Vec())
4931     return std::nullopt;
4932 
4933   // No need to reorder if need to shuffle reuses, still need to shuffle the
4934   // node.
4935   if (!TE.ReuseShuffleIndices.empty()) {
4936     if (isSplat(TE.Scalars))
4937       return std::nullopt;
4938     // Check if reuse shuffle indices can be improved by reordering.
4939     // For this, check that reuse mask is "clustered", i.e. each scalar values
4940     // is used once in each submask of size <number_of_scalars>.
4941     // Example: 4 scalar values.
4942     // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4943     //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4944     //                           element 3 is used twice in the second submask.
4945     unsigned Sz = TE.Scalars.size();
4946     if (TE.isGather()) {
4947       if (std::optional<OrdersType> CurrentOrder =
4948               findReusedOrderedScalars(TE)) {
4949         SmallVector<int> Mask;
4950         fixupOrderingIndices(*CurrentOrder);
4951         inversePermutation(*CurrentOrder, Mask);
4952         ::addMask(Mask, TE.ReuseShuffleIndices);
4953         OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4954         unsigned Sz = TE.Scalars.size();
4955         for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4956           for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4957             if (Idx != PoisonMaskElem)
4958               Res[Idx + K * Sz] = I + K * Sz;
4959         }
4960         return std::move(Res);
4961       }
4962     }
4963     if (Sz == 2 && TE.getVectorFactor() == 4 &&
4964         TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
4965                                              2 * TE.getVectorFactor())) == 1)
4966       return std::nullopt;
4967     if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4968                                                      Sz)) {
4969       SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4970       if (TE.ReorderIndices.empty())
4971         std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4972       else
4973         inversePermutation(TE.ReorderIndices, ReorderMask);
4974       ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4975       unsigned VF = ReorderMask.size();
4976       OrdersType ResOrder(VF, VF);
4977       unsigned NumParts = divideCeil(VF, Sz);
4978       SmallBitVector UsedVals(NumParts);
4979       for (unsigned I = 0; I < VF; I += Sz) {
4980         int Val = PoisonMaskElem;
4981         unsigned UndefCnt = 0;
4982         unsigned Limit = std::min(Sz, VF - I);
4983         if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
4984                    [&](int Idx) {
4985                      if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4986                        Val = Idx;
4987                      if (Idx == PoisonMaskElem)
4988                        ++UndefCnt;
4989                      return Idx != PoisonMaskElem && Idx != Val;
4990                    }) ||
4991             Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4992             UndefCnt > Sz / 2)
4993           return std::nullopt;
4994         UsedVals.set(Val);
4995         for (unsigned K = 0; K < NumParts; ++K)
4996           ResOrder[Val + Sz * K] = I + K;
4997       }
4998       return std::move(ResOrder);
4999     }
5000     unsigned VF = TE.getVectorFactor();
5001     // Try build correct order for extractelement instructions.
5002     SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5003                                 TE.ReuseShuffleIndices.end());
5004     if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5005         all_of(TE.Scalars, [Sz](Value *V) {
5006           std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5007           return Idx && *Idx < Sz;
5008         })) {
5009       SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5010       if (TE.ReorderIndices.empty())
5011         std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5012       else
5013         inversePermutation(TE.ReorderIndices, ReorderMask);
5014       for (unsigned I = 0; I < VF; ++I) {
5015         int &Idx = ReusedMask[I];
5016         if (Idx == PoisonMaskElem)
5017           continue;
5018         Value *V = TE.Scalars[ReorderMask[Idx]];
5019         std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5020         Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5021       }
5022     }
5023     // Build the order of the VF size, need to reorder reuses shuffles, they are
5024     // always of VF size.
5025     OrdersType ResOrder(VF);
5026     std::iota(ResOrder.begin(), ResOrder.end(), 0);
5027     auto *It = ResOrder.begin();
5028     for (unsigned K = 0; K < VF; K += Sz) {
5029       OrdersType CurrentOrder(TE.ReorderIndices);
5030       SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5031       if (SubMask.front() == PoisonMaskElem)
5032         std::iota(SubMask.begin(), SubMask.end(), 0);
5033       reorderOrder(CurrentOrder, SubMask);
5034       transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5035       std::advance(It, Sz);
5036     }
5037     if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5038           return Data.index() == Data.value();
5039         }))
5040       return std::nullopt; // No need to reorder.
5041     return std::move(ResOrder);
5042   }
5043   if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5044       any_of(TE.UserTreeIndices,
5045              [](const EdgeInfo &EI) {
5046                return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5047              }) &&
5048       (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5049     return std::nullopt;
5050   if ((TE.State == TreeEntry::Vectorize ||
5051        TE.State == TreeEntry::StridedVectorize) &&
5052       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5053        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5054       !TE.isAltShuffle())
5055     return TE.ReorderIndices;
5056   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5057     auto PHICompare = [&](unsigned I1, unsigned I2) {
5058       Value *V1 = TE.Scalars[I1];
5059       Value *V2 = TE.Scalars[I2];
5060       if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5061         return false;
5062       if (V1->getNumUses() < V2->getNumUses())
5063         return true;
5064       if (V1->getNumUses() > V2->getNumUses())
5065         return false;
5066       auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5067       auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5068       if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5069         if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5070           if (!areTwoInsertFromSameBuildVector(
5071                   IE1, IE2,
5072                   [](InsertElementInst *II) { return II->getOperand(0); }))
5073             return I1 < I2;
5074           return getElementIndex(IE1) < getElementIndex(IE2);
5075         }
5076       if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5077         if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5078           if (EE1->getOperand(0) != EE2->getOperand(0))
5079             return I1 < I2;
5080           return getElementIndex(EE1) < getElementIndex(EE2);
5081         }
5082       return I1 < I2;
5083     };
5084     auto IsIdentityOrder = [](const OrdersType &Order) {
5085       for (unsigned Idx : seq<unsigned>(0, Order.size()))
5086         if (Idx != Order[Idx])
5087           return false;
5088       return true;
5089     };
5090     if (!TE.ReorderIndices.empty())
5091       return TE.ReorderIndices;
5092     DenseMap<unsigned, unsigned> PhiToId;
5093     SmallVector<unsigned> Phis(TE.Scalars.size());
5094     std::iota(Phis.begin(), Phis.end(), 0);
5095     OrdersType ResOrder(TE.Scalars.size());
5096     for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5097       PhiToId[Id] = Id;
5098     stable_sort(Phis, PHICompare);
5099     for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5100       ResOrder[Id] = PhiToId[Phis[Id]];
5101     if (IsIdentityOrder(ResOrder))
5102       return std::nullopt; // No need to reorder.
5103     return std::move(ResOrder);
5104   }
5105   if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5106     // TODO: add analysis of other gather nodes with extractelement
5107     // instructions and other values/instructions, not only undefs.
5108     if ((TE.getOpcode() == Instruction::ExtractElement ||
5109          (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5110           any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5111         all_of(TE.Scalars, [](Value *V) {
5112           auto *EE = dyn_cast<ExtractElementInst>(V);
5113           return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5114         })) {
5115       // Check that gather of extractelements can be represented as
5116       // just a shuffle of a single vector.
5117       OrdersType CurrentOrder;
5118       bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5119                                    /*ResizeAllowed=*/true);
5120       if (Reuse || !CurrentOrder.empty())
5121         return std::move(CurrentOrder);
5122     }
5123     // If the gather node is <undef, v, .., poison> and
5124     // insertelement poison, v, 0 [+ permute]
5125     // is cheaper than
5126     // insertelement poison, v, n - try to reorder.
5127     // If rotating the whole graph, exclude the permute cost, the whole graph
5128     // might be transformed.
5129     int Sz = TE.Scalars.size();
5130     if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5131         count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5132       const auto *It =
5133           find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5134       if (It == TE.Scalars.begin())
5135         return OrdersType();
5136       auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5137       if (It != TE.Scalars.end()) {
5138         OrdersType Order(Sz, Sz);
5139         unsigned Idx = std::distance(TE.Scalars.begin(), It);
5140         Order[Idx] = 0;
5141         fixupOrderingIndices(Order);
5142         SmallVector<int> Mask;
5143         inversePermutation(Order, Mask);
5144         InstructionCost PermuteCost =
5145             TopToBottom
5146                 ? 0
5147                 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
5148         InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5149             Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5150             PoisonValue::get(Ty), *It);
5151         InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5152             Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5153             PoisonValue::get(Ty), *It);
5154         if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5155           OrdersType Order(Sz, Sz);
5156           Order[Idx] = 0;
5157           return std::move(Order);
5158         }
5159       }
5160     }
5161     if (isSplat(TE.Scalars))
5162       return std::nullopt;
5163     if (TE.Scalars.size() >= 4)
5164       if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5165         return Order;
5166     if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5167       return CurrentOrder;
5168   }
5169   return std::nullopt;
5170 }
5171 
5172 /// Checks if the given mask is a "clustered" mask with the same clusters of
5173 /// size \p Sz, which are not identity submasks.
5174 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175                                                unsigned Sz) {
5176   ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5177   if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5178     return false;
5179   for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180     ArrayRef<int> Cluster = Mask.slice(I, Sz);
5181     if (Cluster != FirstCluster)
5182       return false;
5183   }
5184   return true;
5185 }
5186 
5187 void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188   // Reorder reuses mask.
5189   reorderReuses(TE.ReuseShuffleIndices, Mask);
5190   const unsigned Sz = TE.Scalars.size();
5191   // For vectorized and non-clustered reused no need to do anything else.
5192   if (!TE.isGather() ||
5193       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5194                                                    Sz) ||
5195       !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5196     return;
5197   SmallVector<int> NewMask;
5198   inversePermutation(TE.ReorderIndices, NewMask);
5199   addMask(NewMask, TE.ReuseShuffleIndices);
5200   // Clear reorder since it is going to be applied to the new mask.
5201   TE.ReorderIndices.clear();
5202   // Try to improve gathered nodes with clustered reuses, if possible.
5203   ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5204   SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205   inversePermutation(NewOrder, NewMask);
5206   reorderScalars(TE.Scalars, NewMask);
5207   // Fill the reuses mask with the identity submasks.
5208   for (auto *It = TE.ReuseShuffleIndices.begin(),
5209             *End = TE.ReuseShuffleIndices.end();
5210        It != End; std::advance(It, Sz))
5211     std::iota(It, std::next(It, Sz), 0);
5212 }
5213 
5214 static void combineOrders(MutableArrayRef<unsigned> Order,
5215                           ArrayRef<unsigned> SecondaryOrder) {
5216   assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217          "Expected same size of orders");
5218   unsigned Sz = Order.size();
5219   SmallBitVector UsedIndices(Sz);
5220   for (unsigned Idx : seq<unsigned>(0, Sz)) {
5221     if (Order[Idx] != Sz)
5222       UsedIndices.set(Order[Idx]);
5223   }
5224   if (SecondaryOrder.empty()) {
5225     for (unsigned Idx : seq<unsigned>(0, Sz))
5226       if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227         Order[Idx] = Idx;
5228   } else {
5229     for (unsigned Idx : seq<unsigned>(0, Sz))
5230       if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231           !UsedIndices.test(SecondaryOrder[Idx]))
5232         Order[Idx] = SecondaryOrder[Idx];
5233   }
5234 }
5235 
5236 void BoUpSLP::reorderTopToBottom() {
5237   // Maps VF to the graph nodes.
5238   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239   // ExtractElement gather nodes which can be vectorized and need to handle
5240   // their ordering.
5241   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242 
5243   // Phi nodes can have preferred ordering based on their result users
5244   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245 
5246   // AltShuffles can also have a preferred ordering that leads to fewer
5247   // instructions, e.g., the addsub instruction in x86.
5248   DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249 
5250   // Maps a TreeEntry to the reorder indices of external users.
5251   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252       ExternalUserReorderMap;
5253   // Find all reorderable nodes with the given VF.
5254   // Currently the are vectorized stores,loads,extracts + some gathering of
5255   // extracts.
5256   for_each(VectorizableTree, [&, &TTIRef = *TTI](
5257                                  const std::unique_ptr<TreeEntry> &TE) {
5258     // Look for external users that will probably be vectorized.
5259     SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260         findExternalStoreUsersReorderIndices(TE.get());
5261     if (!ExternalUserReorderIndices.empty()) {
5262       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5263       ExternalUserReorderMap.try_emplace(TE.get(),
5264                                          std::move(ExternalUserReorderIndices));
5265     }
5266 
5267     // Patterns like [fadd,fsub] can be combined into a single instruction in
5268     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269     // to take into account their order when looking for the most used order.
5270     if (TE->isAltShuffle()) {
5271       VectorType *VecTy =
5272           getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5273       unsigned Opcode0 = TE->getOpcode();
5274       unsigned Opcode1 = TE->getAltOpcode();
5275       SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5276       // If this pattern is supported by the target then we consider the order.
5277       if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278         VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5279         AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5280       }
5281       // TODO: Check the reverse order too.
5282     }
5283 
5284     if (std::optional<OrdersType> CurrentOrder =
5285             getReorderingData(*TE, /*TopToBottom=*/true)) {
5286       // Do not include ordering for nodes used in the alt opcode vectorization,
5287       // better to reorder them during bottom-to-top stage. If follow the order
5288       // here, it causes reordering of the whole graph though actually it is
5289       // profitable just to reorder the subgraph that starts from the alternate
5290       // opcode vectorization node. Such nodes already end-up with the shuffle
5291       // instruction and it is just enough to change this shuffle rather than
5292       // rotate the scalars for the whole graph.
5293       unsigned Cnt = 0;
5294       const TreeEntry *UserTE = TE.get();
5295       while (UserTE && Cnt < RecursionMaxDepth) {
5296         if (UserTE->UserTreeIndices.size() != 1)
5297           break;
5298         if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5299               return EI.UserTE->State == TreeEntry::Vectorize &&
5300                      EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301             }))
5302           return;
5303         UserTE = UserTE->UserTreeIndices.back().UserTE;
5304         ++Cnt;
5305       }
5306       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5307       if (!(TE->State == TreeEntry::Vectorize ||
5308             TE->State == TreeEntry::StridedVectorize) ||
5309           !TE->ReuseShuffleIndices.empty())
5310         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5311       if (TE->State == TreeEntry::Vectorize &&
5312           TE->getOpcode() == Instruction::PHI)
5313         PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5314     }
5315   });
5316 
5317   // Reorder the graph nodes according to their vectorization factor.
5318   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319        VF /= 2) {
5320     auto It = VFToOrderedEntries.find(VF);
5321     if (It == VFToOrderedEntries.end())
5322       continue;
5323     // Try to find the most profitable order. We just are looking for the most
5324     // used order and reorder scalar elements in the nodes according to this
5325     // mostly used order.
5326     ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327     // All operands are reordered and used only in this node - propagate the
5328     // most used order to the user node.
5329     MapVector<OrdersType, unsigned,
5330               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331         OrdersUses;
5332     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5333     for (const TreeEntry *OpTE : OrderedEntries) {
5334       // No need to reorder this nodes, still need to extend and to use shuffle,
5335       // just need to merge reordering shuffle and the reuse shuffle.
5336       if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5337         continue;
5338       // Count number of orders uses.
5339       const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340                            &PhisToOrders]() -> const OrdersType & {
5341         if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342           auto It = GathersToOrders.find(OpTE);
5343           if (It != GathersToOrders.end())
5344             return It->second;
5345         }
5346         if (OpTE->isAltShuffle()) {
5347           auto It = AltShufflesToOrders.find(OpTE);
5348           if (It != AltShufflesToOrders.end())
5349             return It->second;
5350         }
5351         if (OpTE->State == TreeEntry::Vectorize &&
5352             OpTE->getOpcode() == Instruction::PHI) {
5353           auto It = PhisToOrders.find(OpTE);
5354           if (It != PhisToOrders.end())
5355             return It->second;
5356         }
5357         return OpTE->ReorderIndices;
5358       }();
5359       // First consider the order of the external scalar users.
5360       auto It = ExternalUserReorderMap.find(OpTE);
5361       if (It != ExternalUserReorderMap.end()) {
5362         const auto &ExternalUserReorderIndices = It->second;
5363         // If the OpTE vector factor != number of scalars - use natural order,
5364         // it is an attempt to reorder node with reused scalars but with
5365         // external uses.
5366         if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367           OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5368               ExternalUserReorderIndices.size();
5369         } else {
5370           for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371             ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5372         }
5373         // No other useful reorder data in this entry.
5374         if (Order.empty())
5375           continue;
5376       }
5377       // Stores actually store the mask, not the order, need to invert.
5378       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380         SmallVector<int> Mask;
5381         inversePermutation(Order, Mask);
5382         unsigned E = Order.size();
5383         OrdersType CurrentOrder(E, E);
5384         transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5385           return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386         });
5387         fixupOrderingIndices(CurrentOrder);
5388         ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5389       } else {
5390         ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5391       }
5392     }
5393     if (OrdersUses.empty())
5394       continue;
5395     auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396       const unsigned Sz = Order.size();
5397       for (unsigned Idx : seq<unsigned>(0, Sz))
5398         if (Idx != Order[Idx] && Order[Idx] != Sz)
5399           return false;
5400       return true;
5401     };
5402     // Choose the most used order.
5403     unsigned IdentityCnt = 0;
5404     unsigned FilledIdentityCnt = 0;
5405     OrdersType IdentityOrder(VF, VF);
5406     for (auto &Pair : OrdersUses) {
5407       if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408         if (!Pair.first.empty())
5409           FilledIdentityCnt += Pair.second;
5410         IdentityCnt += Pair.second;
5411         combineOrders(IdentityOrder, Pair.first);
5412       }
5413     }
5414     MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415     unsigned Cnt = IdentityCnt;
5416     for (auto &Pair : OrdersUses) {
5417       // Prefer identity order. But, if filled identity found (non-empty order)
5418       // with same number of uses, as the new candidate order, we can choose
5419       // this candidate order.
5420       if (Cnt < Pair.second ||
5421           (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422            Cnt == Pair.second && !BestOrder.empty() &&
5423            IsIdentityOrder(BestOrder))) {
5424         combineOrders(Pair.first, BestOrder);
5425         BestOrder = Pair.first;
5426         Cnt = Pair.second;
5427       } else {
5428         combineOrders(BestOrder, Pair.first);
5429       }
5430     }
5431     // Set order of the user node.
5432     if (IsIdentityOrder(BestOrder))
5433       continue;
5434     fixupOrderingIndices(BestOrder);
5435     SmallVector<int> Mask;
5436     inversePermutation(BestOrder, Mask);
5437     SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438     unsigned E = BestOrder.size();
5439     transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5440       return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441     });
5442     // Do an actual reordering, if profitable.
5443     for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444       // Just do the reordering for the nodes with the given VF.
5445       if (TE->Scalars.size() != VF) {
5446         if (TE->ReuseShuffleIndices.size() == VF) {
5447           // Need to reorder the reuses masks of the operands with smaller VF to
5448           // be able to find the match between the graph nodes and scalar
5449           // operands of the given node during vectorization/cost estimation.
5450           assert(all_of(TE->UserTreeIndices,
5451                         [VF, &TE](const EdgeInfo &EI) {
5452                           return EI.UserTE->Scalars.size() == VF ||
5453                                  EI.UserTE->Scalars.size() ==
5454                                      TE->Scalars.size();
5455                         }) &&
5456                  "All users must be of VF size.");
5457           // Update ordering of the operands with the smaller VF than the given
5458           // one.
5459           reorderNodeWithReuses(*TE, Mask);
5460         }
5461         continue;
5462       }
5463       if ((TE->State == TreeEntry::Vectorize ||
5464            TE->State == TreeEntry::StridedVectorize) &&
5465           isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5466               InsertElementInst>(TE->getMainOp()) &&
5467           !TE->isAltShuffle()) {
5468         // Build correct orders for extract{element,value}, loads and
5469         // stores.
5470         reorderOrder(TE->ReorderIndices, Mask);
5471         if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5472           TE->reorderOperands(Mask);
5473       } else {
5474         // Reorder the node and its operands.
5475         TE->reorderOperands(Mask);
5476         assert(TE->ReorderIndices.empty() &&
5477                "Expected empty reorder sequence.");
5478         reorderScalars(TE->Scalars, Mask);
5479       }
5480       if (!TE->ReuseShuffleIndices.empty()) {
5481         // Apply reversed order to keep the original ordering of the reused
5482         // elements to avoid extra reorder indices shuffling.
5483         OrdersType CurrentOrder;
5484         reorderOrder(CurrentOrder, MaskOrder);
5485         SmallVector<int> NewReuses;
5486         inversePermutation(CurrentOrder, NewReuses);
5487         addMask(NewReuses, TE->ReuseShuffleIndices);
5488         TE->ReuseShuffleIndices.swap(NewReuses);
5489       }
5490     }
5491   }
5492 }
5493 
5494 bool BoUpSLP::canReorderOperands(
5495     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5496     ArrayRef<TreeEntry *> ReorderableGathers,
5497     SmallVectorImpl<TreeEntry *> &GatherOps) {
5498   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5499   if (UserTE->isNonPowOf2Vec())
5500     return false;
5501 
5502   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5503     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5504           return OpData.first == I &&
5505                  (OpData.second->State == TreeEntry::Vectorize ||
5506                   OpData.second->State == TreeEntry::StridedVectorize);
5507         }))
5508       continue;
5509     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5510       // Do not reorder if operand node is used by many user nodes.
5511       if (any_of(TE->UserTreeIndices,
5512                  [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5513         return false;
5514       // Add the node to the list of the ordered nodes with the identity
5515       // order.
5516       Edges.emplace_back(I, TE);
5517       // Add ScatterVectorize nodes to the list of operands, where just
5518       // reordering of the scalars is required. Similar to the gathers, so
5519       // simply add to the list of gathered ops.
5520       // If there are reused scalars, process this node as a regular vectorize
5521       // node, just reorder reuses mask.
5522       if (TE->State != TreeEntry::Vectorize &&
5523           TE->State != TreeEntry::StridedVectorize &&
5524           TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5525         GatherOps.push_back(TE);
5526       continue;
5527     }
5528     TreeEntry *Gather = nullptr;
5529     if (count_if(ReorderableGathers,
5530                  [&Gather, UserTE, I](TreeEntry *TE) {
5531                    assert(TE->State != TreeEntry::Vectorize &&
5532                           TE->State != TreeEntry::StridedVectorize &&
5533                           "Only non-vectorized nodes are expected.");
5534                    if (any_of(TE->UserTreeIndices,
5535                               [UserTE, I](const EdgeInfo &EI) {
5536                                 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5537                               })) {
5538                      assert(TE->isSame(UserTE->getOperand(I)) &&
5539                             "Operand entry does not match operands.");
5540                      Gather = TE;
5541                      return true;
5542                    }
5543                    return false;
5544                  }) > 1 &&
5545         !allConstant(UserTE->getOperand(I)))
5546       return false;
5547     if (Gather)
5548       GatherOps.push_back(Gather);
5549   }
5550   return true;
5551 }
5552 
5553 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5554   SetVector<TreeEntry *> OrderedEntries;
5555   DenseSet<const TreeEntry *> GathersToOrders;
5556   // Find all reorderable leaf nodes with the given VF.
5557   // Currently the are vectorized loads,extracts without alternate operands +
5558   // some gathering of extracts.
5559   SmallVector<TreeEntry *> NonVectorized;
5560   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5561     if (TE->State != TreeEntry::Vectorize &&
5562         TE->State != TreeEntry::StridedVectorize)
5563       NonVectorized.push_back(TE.get());
5564     if (std::optional<OrdersType> CurrentOrder =
5565             getReorderingData(*TE, /*TopToBottom=*/false)) {
5566       OrderedEntries.insert(TE.get());
5567       if (!(TE->State == TreeEntry::Vectorize ||
5568             TE->State == TreeEntry::StridedVectorize) ||
5569           !TE->ReuseShuffleIndices.empty())
5570         GathersToOrders.insert(TE.get());
5571     }
5572   }
5573 
5574   // 1. Propagate order to the graph nodes, which use only reordered nodes.
5575   // I.e., if the node has operands, that are reordered, try to make at least
5576   // one operand order in the natural order and reorder others + reorder the
5577   // user node itself.
5578   SmallPtrSet<const TreeEntry *, 4> Visited;
5579   while (!OrderedEntries.empty()) {
5580     // 1. Filter out only reordered nodes.
5581     // 2. If the entry has multiple uses - skip it and jump to the next node.
5582     DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5583     SmallVector<TreeEntry *> Filtered;
5584     for (TreeEntry *TE : OrderedEntries) {
5585       if (!(TE->State == TreeEntry::Vectorize ||
5586             TE->State == TreeEntry::StridedVectorize ||
5587             (TE->isGather() && GathersToOrders.contains(TE))) ||
5588           TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5589           !all_of(drop_begin(TE->UserTreeIndices),
5590                   [TE](const EdgeInfo &EI) {
5591                     return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5592                   }) ||
5593           !Visited.insert(TE).second) {
5594         Filtered.push_back(TE);
5595         continue;
5596       }
5597       // Build a map between user nodes and their operands order to speedup
5598       // search. The graph currently does not provide this dependency directly.
5599       for (EdgeInfo &EI : TE->UserTreeIndices) {
5600         TreeEntry *UserTE = EI.UserTE;
5601         auto It = Users.find(UserTE);
5602         if (It == Users.end())
5603           It = Users.insert({UserTE, {}}).first;
5604         It->second.emplace_back(EI.EdgeIdx, TE);
5605       }
5606     }
5607     // Erase filtered entries.
5608     for (TreeEntry *TE : Filtered)
5609       OrderedEntries.remove(TE);
5610     SmallVector<
5611         std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5612         UsersVec(Users.begin(), Users.end());
5613     sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5614       return Data1.first->Idx > Data2.first->Idx;
5615     });
5616     for (auto &Data : UsersVec) {
5617       // Check that operands are used only in the User node.
5618       SmallVector<TreeEntry *> GatherOps;
5619       if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5620                               GatherOps)) {
5621         for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5622           OrderedEntries.remove(Op.second);
5623         continue;
5624       }
5625       // All operands are reordered and used only in this node - propagate the
5626       // most used order to the user node.
5627       MapVector<OrdersType, unsigned,
5628                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5629           OrdersUses;
5630       // Do the analysis for each tree entry only once, otherwise the order of
5631       // the same node my be considered several times, though might be not
5632       // profitable.
5633       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5634       SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5635       for (const auto &Op : Data.second) {
5636         TreeEntry *OpTE = Op.second;
5637         if (!VisitedOps.insert(OpTE).second)
5638           continue;
5639         if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5640           continue;
5641         const auto Order = [&]() -> const OrdersType {
5642           if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5643             return getReorderingData(*OpTE, /*TopToBottom=*/false)
5644                 .value_or(OrdersType(1));
5645           return OpTE->ReorderIndices;
5646         }();
5647         // The order is partially ordered, skip it in favor of fully non-ordered
5648         // orders.
5649         if (Order.size() == 1)
5650           continue;
5651         unsigned NumOps = count_if(
5652             Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5653               return P.second == OpTE;
5654             });
5655         // Stores actually store the mask, not the order, need to invert.
5656         if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5657             OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5658           SmallVector<int> Mask;
5659           inversePermutation(Order, Mask);
5660           unsigned E = Order.size();
5661           OrdersType CurrentOrder(E, E);
5662           transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5663             return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5664           });
5665           fixupOrderingIndices(CurrentOrder);
5666           OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5667               NumOps;
5668         } else {
5669           OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5670         }
5671         auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5672         const auto AllowsReordering = [&](const TreeEntry *TE) {
5673           // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5674           if (TE->isNonPowOf2Vec())
5675             return false;
5676           if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5677               (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5678               (IgnoreReorder && TE->Idx == 0))
5679             return true;
5680           if (TE->isGather()) {
5681             if (GathersToOrders.contains(TE))
5682               return !getReorderingData(*TE, /*TopToBottom=*/false)
5683                           .value_or(OrdersType(1))
5684                           .empty();
5685             return true;
5686           }
5687           return false;
5688         };
5689         for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5690           TreeEntry *UserTE = EI.UserTE;
5691           if (!VisitedUsers.insert(UserTE).second)
5692             continue;
5693           // May reorder user node if it requires reordering, has reused
5694           // scalars, is an alternate op vectorize node or its op nodes require
5695           // reordering.
5696           if (AllowsReordering(UserTE))
5697             continue;
5698           // Check if users allow reordering.
5699           // Currently look up just 1 level of operands to avoid increase of
5700           // the compile time.
5701           // Profitable to reorder if definitely more operands allow
5702           // reordering rather than those with natural order.
5703           ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5704           if (static_cast<unsigned>(count_if(
5705                   Ops, [UserTE, &AllowsReordering](
5706                            const std::pair<unsigned, TreeEntry *> &Op) {
5707                     return AllowsReordering(Op.second) &&
5708                            all_of(Op.second->UserTreeIndices,
5709                                   [UserTE](const EdgeInfo &EI) {
5710                                     return EI.UserTE == UserTE;
5711                                   });
5712                   })) <= Ops.size() / 2)
5713             ++Res.first->second;
5714         }
5715       }
5716       if (OrdersUses.empty()) {
5717         for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5718           OrderedEntries.remove(Op.second);
5719         continue;
5720       }
5721       auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5722         const unsigned Sz = Order.size();
5723         for (unsigned Idx : seq<unsigned>(0, Sz))
5724           if (Idx != Order[Idx] && Order[Idx] != Sz)
5725             return false;
5726         return true;
5727       };
5728       // Choose the most used order.
5729       unsigned IdentityCnt = 0;
5730       unsigned VF = Data.second.front().second->getVectorFactor();
5731       OrdersType IdentityOrder(VF, VF);
5732       for (auto &Pair : OrdersUses) {
5733         if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5734           IdentityCnt += Pair.second;
5735           combineOrders(IdentityOrder, Pair.first);
5736         }
5737       }
5738       MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5739       unsigned Cnt = IdentityCnt;
5740       for (auto &Pair : OrdersUses) {
5741         // Prefer identity order. But, if filled identity found (non-empty
5742         // order) with same number of uses, as the new candidate order, we can
5743         // choose this candidate order.
5744         if (Cnt < Pair.second) {
5745           combineOrders(Pair.first, BestOrder);
5746           BestOrder = Pair.first;
5747           Cnt = Pair.second;
5748         } else {
5749           combineOrders(BestOrder, Pair.first);
5750         }
5751       }
5752       // Set order of the user node.
5753       if (IsIdentityOrder(BestOrder)) {
5754         for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5755           OrderedEntries.remove(Op.second);
5756         continue;
5757       }
5758       fixupOrderingIndices(BestOrder);
5759       // Erase operands from OrderedEntries list and adjust their orders.
5760       VisitedOps.clear();
5761       SmallVector<int> Mask;
5762       inversePermutation(BestOrder, Mask);
5763       SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5764       unsigned E = BestOrder.size();
5765       transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5766         return I < E ? static_cast<int>(I) : PoisonMaskElem;
5767       });
5768       for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5769         TreeEntry *TE = Op.second;
5770         OrderedEntries.remove(TE);
5771         if (!VisitedOps.insert(TE).second)
5772           continue;
5773         if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5774           reorderNodeWithReuses(*TE, Mask);
5775           continue;
5776         }
5777         // Gathers are processed separately.
5778         if (TE->State != TreeEntry::Vectorize &&
5779             TE->State != TreeEntry::StridedVectorize &&
5780             (TE->State != TreeEntry::ScatterVectorize ||
5781              TE->ReorderIndices.empty()))
5782           continue;
5783         assert((BestOrder.size() == TE->ReorderIndices.size() ||
5784                 TE->ReorderIndices.empty()) &&
5785                "Non-matching sizes of user/operand entries.");
5786         reorderOrder(TE->ReorderIndices, Mask);
5787         if (IgnoreReorder && TE == VectorizableTree.front().get())
5788           IgnoreReorder = false;
5789       }
5790       // For gathers just need to reorder its scalars.
5791       for (TreeEntry *Gather : GatherOps) {
5792         assert(Gather->ReorderIndices.empty() &&
5793                "Unexpected reordering of gathers.");
5794         if (!Gather->ReuseShuffleIndices.empty()) {
5795           // Just reorder reuses indices.
5796           reorderReuses(Gather->ReuseShuffleIndices, Mask);
5797           continue;
5798         }
5799         reorderScalars(Gather->Scalars, Mask);
5800         OrderedEntries.remove(Gather);
5801       }
5802       // Reorder operands of the user node and set the ordering for the user
5803       // node itself.
5804       if (Data.first->State != TreeEntry::Vectorize ||
5805           !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5806               Data.first->getMainOp()) ||
5807           Data.first->isAltShuffle())
5808         Data.first->reorderOperands(Mask);
5809       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5810           Data.first->isAltShuffle() ||
5811           Data.first->State == TreeEntry::StridedVectorize) {
5812         reorderScalars(Data.first->Scalars, Mask);
5813         reorderOrder(Data.first->ReorderIndices, MaskOrder,
5814                      /*BottomOrder=*/true);
5815         if (Data.first->ReuseShuffleIndices.empty() &&
5816             !Data.first->ReorderIndices.empty() &&
5817             !Data.first->isAltShuffle()) {
5818           // Insert user node to the list to try to sink reordering deeper in
5819           // the graph.
5820           OrderedEntries.insert(Data.first);
5821         }
5822       } else {
5823         reorderOrder(Data.first->ReorderIndices, Mask);
5824       }
5825     }
5826   }
5827   // If the reordering is unnecessary, just remove the reorder.
5828   if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5829       VectorizableTree.front()->ReuseShuffleIndices.empty())
5830     VectorizableTree.front()->ReorderIndices.clear();
5831 }
5832 
5833 void BoUpSLP::buildExternalUses(
5834     const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5835   DenseMap<Value *, unsigned> ScalarToExtUses;
5836   // Collect the values that we need to extract from the tree.
5837   for (auto &TEPtr : VectorizableTree) {
5838     TreeEntry *Entry = TEPtr.get();
5839 
5840     // No need to handle users of gathered values.
5841     if (Entry->isGather())
5842       continue;
5843 
5844     // For each lane:
5845     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5846       Value *Scalar = Entry->Scalars[Lane];
5847       if (!isa<Instruction>(Scalar))
5848         continue;
5849       // All uses must be replaced already? No need to do it again.
5850       auto It = ScalarToExtUses.find(Scalar);
5851       if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5852         continue;
5853 
5854       // Check if the scalar is externally used as an extra arg.
5855       const auto *ExtI = ExternallyUsedValues.find(Scalar);
5856       if (ExtI != ExternallyUsedValues.end()) {
5857         int FoundLane = Entry->findLaneForValue(Scalar);
5858         LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5859                           << FoundLane << " from " << *Scalar << ".\n");
5860         ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5861         ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5862         continue;
5863       }
5864       for (User *U : Scalar->users()) {
5865         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5866 
5867         Instruction *UserInst = dyn_cast<Instruction>(U);
5868         if (!UserInst || isDeleted(UserInst))
5869           continue;
5870 
5871         // Ignore users in the user ignore list.
5872         if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5873           continue;
5874 
5875         // Skip in-tree scalars that become vectors
5876         if (TreeEntry *UseEntry = getTreeEntry(U)) {
5877           // Some in-tree scalars will remain as scalar in vectorized
5878           // instructions. If that is the case, the one in FoundLane will
5879           // be used.
5880           if (UseEntry->State == TreeEntry::ScatterVectorize ||
5881               !doesInTreeUserNeedToExtract(
5882                   Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5883             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5884                               << ".\n");
5885             assert(!UseEntry->isGather() && "Bad state");
5886             continue;
5887           }
5888           U = nullptr;
5889           if (It != ScalarToExtUses.end()) {
5890             ExternalUses[It->second].User = nullptr;
5891             break;
5892           }
5893         }
5894 
5895         if (U && Scalar->hasNUsesOrMore(UsesLimit))
5896           U = nullptr;
5897         int FoundLane = Entry->findLaneForValue(Scalar);
5898         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5899                           << " from lane " << FoundLane << " from " << *Scalar
5900                           << ".\n");
5901         It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5902         ExternalUses.emplace_back(Scalar, U, FoundLane);
5903         if (!U)
5904           break;
5905       }
5906     }
5907   }
5908 }
5909 
5910 DenseMap<Value *, SmallVector<StoreInst *>>
5911 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912   DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913   for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5914     Value *V = TE->Scalars[Lane];
5915     // To save compilation time we don't visit if we have too many users.
5916     if (V->hasNUsesOrMore(UsesLimit))
5917       break;
5918 
5919     // Collect stores per pointer object.
5920     for (User *U : V->users()) {
5921       auto *SI = dyn_cast<StoreInst>(U);
5922       if (SI == nullptr || !SI->isSimple() ||
5923           !isValidElementType(SI->getValueOperand()->getType()))
5924         continue;
5925       // Skip entry if already
5926       if (getTreeEntry(U))
5927         continue;
5928 
5929       Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5930       auto &StoresVec = PtrToStoresMap[Ptr];
5931       // For now just keep one store per pointer object per lane.
5932       // TODO: Extend this to support multiple stores per pointer per lane
5933       if (StoresVec.size() > Lane)
5934         continue;
5935       // Skip if in different BBs.
5936       if (!StoresVec.empty() &&
5937           SI->getParent() != StoresVec.back()->getParent())
5938         continue;
5939       // Make sure that the stores are of the same type.
5940       if (!StoresVec.empty() &&
5941           SI->getValueOperand()->getType() !=
5942               StoresVec.back()->getValueOperand()->getType())
5943         continue;
5944       StoresVec.push_back(SI);
5945     }
5946   }
5947   return PtrToStoresMap;
5948 }
5949 
5950 bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951                             OrdersType &ReorderIndices) const {
5952   // We check whether the stores in StoreVec can form a vector by sorting them
5953   // and checking whether they are consecutive.
5954 
5955   // To avoid calling getPointersDiff() while sorting we create a vector of
5956   // pairs {store, offset from first} and sort this instead.
5957   SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958   StoreInst *S0 = StoresVec[0];
5959   StoreOffsetVec[0] = {S0, 0};
5960   Type *S0Ty = S0->getValueOperand()->getType();
5961   Value *S0Ptr = S0->getPointerOperand();
5962   for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5963     StoreInst *SI = StoresVec[Idx];
5964     std::optional<int> Diff =
5965         getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5966                         SI->getPointerOperand(), *DL, *SE,
5967                         /*StrictCheck=*/true);
5968     // We failed to compare the pointers so just abandon this StoresVec.
5969     if (!Diff)
5970       return false;
5971     StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972   }
5973 
5974   // Sort the vector based on the pointers. We create a copy because we may
5975   // need the original later for calculating the reorder (shuffle) indices.
5976   stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5977                                  const std::pair<StoreInst *, int> &Pair2) {
5978     int Offset1 = Pair1.second;
5979     int Offset2 = Pair2.second;
5980     return Offset1 < Offset2;
5981   });
5982 
5983   // Check if the stores are consecutive by checking if their difference is 1.
5984   for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5985     if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986       return false;
5987 
5988   // Calculate the shuffle indices according to their offset against the sorted
5989   // StoreOffsetVec.
5990   ReorderIndices.reserve(StoresVec.size());
5991   for (StoreInst *SI : StoresVec) {
5992     unsigned Idx = find_if(StoreOffsetVec,
5993                            [SI](const std::pair<StoreInst *, int> &Pair) {
5994                              return Pair.first == SI;
5995                            }) -
5996                    StoreOffsetVec.begin();
5997     ReorderIndices.push_back(Idx);
5998   }
5999   // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000   // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001   // same convention here.
6002   auto IsIdentityOrder = [](const OrdersType &Order) {
6003     for (unsigned Idx : seq<unsigned>(0, Order.size()))
6004       if (Idx != Order[Idx])
6005         return false;
6006     return true;
6007   };
6008   if (IsIdentityOrder(ReorderIndices))
6009     ReorderIndices.clear();
6010 
6011   return true;
6012 }
6013 
6014 #ifndef NDEBUG
6015 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016   for (unsigned Idx : Order)
6017     dbgs() << Idx << ", ";
6018   dbgs() << "\n";
6019 }
6020 #endif
6021 
6022 SmallVector<BoUpSLP::OrdersType, 1>
6023 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024   unsigned NumLanes = TE->Scalars.size();
6025 
6026   DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027       collectUserStores(TE);
6028 
6029   // Holds the reorder indices for each candidate store vector that is a user of
6030   // the current TreeEntry.
6031   SmallVector<OrdersType, 1> ExternalReorderIndices;
6032 
6033   // Now inspect the stores collected per pointer and look for vectorization
6034   // candidates. For each candidate calculate the reorder index vector and push
6035   // it into `ExternalReorderIndices`
6036   for (const auto &Pair : PtrToStoresMap) {
6037     auto &StoresVec = Pair.second;
6038     // If we have fewer than NumLanes stores, then we can't form a vector.
6039     if (StoresVec.size() != NumLanes)
6040       continue;
6041 
6042     // If the stores are not consecutive then abandon this StoresVec.
6043     OrdersType ReorderIndices;
6044     if (!canFormVector(StoresVec, ReorderIndices))
6045       continue;
6046 
6047     // We now know that the scalars in StoresVec can form a vector instruction,
6048     // so set the reorder indices.
6049     ExternalReorderIndices.push_back(ReorderIndices);
6050   }
6051   return ExternalReorderIndices;
6052 }
6053 
6054 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055                         const SmallDenseSet<Value *> &UserIgnoreLst) {
6056   deleteTree();
6057   UserIgnoreList = &UserIgnoreLst;
6058   if (!allSameType(Roots))
6059     return;
6060   buildTree_rec(Roots, 0, EdgeInfo());
6061 }
6062 
6063 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064   deleteTree();
6065   if (!allSameType(Roots))
6066     return;
6067   buildTree_rec(Roots, 0, EdgeInfo());
6068 }
6069 
6070 /// \return true if the specified list of values has only one instruction that
6071 /// requires scheduling, false otherwise.
6072 #ifndef NDEBUG
6073 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074   Value *NeedsScheduling = nullptr;
6075   for (Value *V : VL) {
6076     if (doesNotNeedToBeScheduled(V))
6077       continue;
6078     if (!NeedsScheduling) {
6079       NeedsScheduling = V;
6080       continue;
6081     }
6082     return false;
6083   }
6084   return NeedsScheduling;
6085 }
6086 #endif
6087 
6088 /// Generates key/subkey pair for the given value to provide effective sorting
6089 /// of the values and better detection of the vectorizable values sequences. The
6090 /// keys/subkeys can be used for better sorting of the values themselves (keys)
6091 /// and in values subgroups (subkeys).
6092 static std::pair<size_t, size_t> generateKeySubkey(
6093     Value *V, const TargetLibraryInfo *TLI,
6094     function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6095     bool AllowAlternate) {
6096   hash_code Key = hash_value(V->getValueID() + 2);
6097   hash_code SubKey = hash_value(0);
6098   // Sort the loads by the distance between the pointers.
6099   if (auto *LI = dyn_cast<LoadInst>(V)) {
6100     Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
6101     if (LI->isSimple())
6102       SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6103     else
6104       Key = SubKey = hash_value(LI);
6105   } else if (isVectorLikeInstWithConstOps(V)) {
6106     // Sort extracts by the vector operands.
6107     if (isa<ExtractElementInst, UndefValue>(V))
6108       Key = hash_value(Value::UndefValueVal + 1);
6109     if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
6110       if (!isUndefVector(EI->getVectorOperand()).all() &&
6111           !isa<UndefValue>(EI->getIndexOperand()))
6112         SubKey = hash_value(EI->getVectorOperand());
6113     }
6114   } else if (auto *I = dyn_cast<Instruction>(V)) {
6115     // Sort other instructions just by the opcodes except for CMPInst.
6116     // For CMP also sort by the predicate kind.
6117     if ((isa<BinaryOperator, CastInst>(I)) &&
6118         isValidForAlternation(I->getOpcode())) {
6119       if (AllowAlternate)
6120         Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6121       else
6122         Key = hash_combine(hash_value(I->getOpcode()), Key);
6123       SubKey = hash_combine(
6124           hash_value(I->getOpcode()), hash_value(I->getType()),
6125           hash_value(isa<BinaryOperator>(I)
6126                          ? I->getType()
6127                          : cast<CastInst>(I)->getOperand(0)->getType()));
6128       // For casts, look through the only operand to improve compile time.
6129       if (isa<CastInst>(I)) {
6130         std::pair<size_t, size_t> OpVals =
6131             generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6132                               /*AllowAlternate=*/true);
6133         Key = hash_combine(OpVals.first, Key);
6134         SubKey = hash_combine(OpVals.first, SubKey);
6135       }
6136     } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6137       CmpInst::Predicate Pred = CI->getPredicate();
6138       if (CI->isCommutative())
6139         Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6140       CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
6141       SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6142                             hash_value(SwapPred),
6143                             hash_value(CI->getOperand(0)->getType()));
6144     } else if (auto *Call = dyn_cast<CallInst>(I)) {
6145       Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
6146       if (isTriviallyVectorizable(ID)) {
6147         SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
6148       } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
6149         SubKey = hash_combine(hash_value(I->getOpcode()),
6150                               hash_value(Call->getCalledFunction()));
6151       } else {
6152         Key = hash_combine(hash_value(Call), Key);
6153         SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
6154       }
6155       for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6156         SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
6157                               hash_value(Op.Tag), SubKey);
6158     } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
6159       if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6160         SubKey = hash_value(Gep->getPointerOperand());
6161       else
6162         SubKey = hash_value(Gep);
6163     } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
6164                !isa<ConstantInt>(I->getOperand(1))) {
6165       // Do not try to vectorize instructions with potentially high cost.
6166       SubKey = hash_value(I);
6167     } else {
6168       SubKey = hash_value(I->getOpcode());
6169     }
6170     Key = hash_combine(hash_value(I->getParent()), Key);
6171   }
6172   return std::make_pair(Key, SubKey);
6173 }
6174 
6175 /// Checks if the specified instruction \p I is an alternate operation for
6176 /// the given \p MainOp and \p AltOp instructions.
6177 static bool isAlternateInstruction(const Instruction *I,
6178                                    const Instruction *MainOp,
6179                                    const Instruction *AltOp,
6180                                    const TargetLibraryInfo &TLI);
6181 
6182 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183                                        ArrayRef<Value *> VL) const {
6184   unsigned Opcode0 = S.getOpcode();
6185   unsigned Opcode1 = S.getAltOpcode();
6186   SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187   // If this pattern is supported by the target then consider it profitable.
6188   if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6189                            Opcode0, Opcode1, OpcodeMask))
6190     return true;
6191   SmallVector<ValueList> Operands;
6192   for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6193     Operands.emplace_back();
6194     // Prepare the operand vector.
6195     for (Value *V : VL)
6196       Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6197   }
6198   if (Operands.size() == 2) {
6199     // Try find best operands candidates.
6200     for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6201       SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202       Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6203       Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6204       Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6205       std::optional<int> Res = findBestRootPair(Candidates);
6206       switch (Res.value_or(0)) {
6207       case 0:
6208         break;
6209       case 1:
6210         std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6211         break;
6212       case 2:
6213         std::swap(Operands[0][I], Operands[1][I]);
6214         break;
6215       default:
6216         llvm_unreachable("Unexpected index.");
6217       }
6218     }
6219   }
6220   DenseSet<unsigned> UniqueOpcodes;
6221   constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222   unsigned NonInstCnt = 0;
6223   // Estimate number of instructions, required for the vectorized node and for
6224   // the buildvector node.
6225   unsigned UndefCnt = 0;
6226   // Count the number of extra shuffles, required for vector nodes.
6227   unsigned ExtraShuffleInsts = 0;
6228   // Check that operands do not contain same values and create either perfect
6229   // diamond match or shuffled match.
6230   if (Operands.size() == 2) {
6231     // Do not count same operands twice.
6232     if (Operands.front() == Operands.back()) {
6233       Operands.erase(Operands.begin());
6234     } else if (!allConstant(Operands.front()) &&
6235                all_of(Operands.front(), [&](Value *V) {
6236                  return is_contained(Operands.back(), V);
6237                })) {
6238       Operands.erase(Operands.begin());
6239       ++ExtraShuffleInsts;
6240     }
6241   }
6242   const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6243   // Vectorize node, if:
6244   // 1. at least single operand is constant or splat.
6245   // 2. Operands have many loop invariants (the instructions are not loop
6246   // invariants).
6247   // 3. At least single unique operands is supposed to vectorized.
6248   return none_of(Operands,
6249                  [&](ArrayRef<Value *> Op) {
6250                    if (allConstant(Op) ||
6251                        (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6252                         getSameOpcode(Op, *TLI).MainOp))
6253                      return false;
6254                    DenseMap<Value *, unsigned> Uniques;
6255                    for (Value *V : Op) {
6256                      if (isa<Constant, ExtractElementInst>(V) ||
6257                          getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258                        if (isa<UndefValue>(V))
6259                          ++UndefCnt;
6260                        continue;
6261                      }
6262                      auto Res = Uniques.try_emplace(V, 0);
6263                      // Found first duplicate - need to add shuffle.
6264                      if (!Res.second && Res.first->second == 1)
6265                        ++ExtraShuffleInsts;
6266                      ++Res.first->getSecond();
6267                      if (auto *I = dyn_cast<Instruction>(V))
6268                        UniqueOpcodes.insert(I->getOpcode());
6269                      else if (Res.second)
6270                        ++NonInstCnt;
6271                    }
6272                    return none_of(Uniques, [&](const auto &P) {
6273                      return P.first->hasNUsesOrMore(P.second + 1) &&
6274                             none_of(P.first->users(), [&](User *U) {
6275                               return getTreeEntry(U) || Uniques.contains(U);
6276                             });
6277                    });
6278                  }) ||
6279          // Do not vectorize node, if estimated number of vector instructions is
6280          // more than estimated number of buildvector instructions. Number of
6281          // vector operands is number of vector instructions + number of vector
6282          // instructions for operands (buildvectors). Number of buildvector
6283          // instructions is just number_of_operands * number_of_scalars.
6284          (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6285           (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6286            NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6287 }
6288 
6289 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6290     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6291     OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6292   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6293 
6294   unsigned ShuffleOrOp =
6295       S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6296   auto *VL0 = cast<Instruction>(S.OpValue);
6297   switch (ShuffleOrOp) {
6298   case Instruction::PHI: {
6299     // Too many operands - gather, most probably won't be vectorized.
6300     if (VL0->getNumOperands() > MaxPHINumOperands)
6301       return TreeEntry::NeedToGather;
6302     // Check for terminator values (e.g. invoke).
6303     for (Value *V : VL)
6304       for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6305         Instruction *Term = dyn_cast<Instruction>(Incoming);
6306         if (Term && Term->isTerminator()) {
6307           LLVM_DEBUG(dbgs()
6308                      << "SLP: Need to swizzle PHINodes (terminator use).\n");
6309           return TreeEntry::NeedToGather;
6310         }
6311       }
6312 
6313     return TreeEntry::Vectorize;
6314   }
6315   case Instruction::ExtractValue:
6316   case Instruction::ExtractElement: {
6317     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6318     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6319     if (!isPowerOf2_32(VL.size()))
6320       return TreeEntry::NeedToGather;
6321     if (Reuse || !CurrentOrder.empty())
6322       return TreeEntry::Vectorize;
6323     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6324     return TreeEntry::NeedToGather;
6325   }
6326   case Instruction::InsertElement: {
6327     // Check that we have a buildvector and not a shuffle of 2 or more
6328     // different vectors.
6329     ValueSet SourceVectors;
6330     for (Value *V : VL) {
6331       SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6332       assert(getElementIndex(V) != std::nullopt &&
6333              "Non-constant or undef index?");
6334     }
6335 
6336     if (count_if(VL, [&SourceVectors](Value *V) {
6337           return !SourceVectors.contains(V);
6338         }) >= 2) {
6339       // Found 2nd source vector - cancel.
6340       LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6341                            "different source vectors.\n");
6342       return TreeEntry::NeedToGather;
6343     }
6344 
6345     return TreeEntry::Vectorize;
6346   }
6347   case Instruction::Load: {
6348     // Check that a vectorized load would load the same memory as a scalar
6349     // load. For example, we don't want to vectorize loads that are smaller
6350     // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6351     // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6352     // from such a struct, we read/write packed bits disagreeing with the
6353     // unvectorized version.
6354     switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6355     case LoadsState::Vectorize:
6356       return TreeEntry::Vectorize;
6357     case LoadsState::ScatterVectorize:
6358       return TreeEntry::ScatterVectorize;
6359     case LoadsState::StridedVectorize:
6360       return TreeEntry::StridedVectorize;
6361     case LoadsState::Gather:
6362 #ifndef NDEBUG
6363       Type *ScalarTy = VL0->getType();
6364       if (DL->getTypeSizeInBits(ScalarTy) !=
6365           DL->getTypeAllocSizeInBits(ScalarTy))
6366         LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6367       else if (any_of(VL,
6368                       [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6369         LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6370       else
6371         LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6372 #endif // NDEBUG
6373       return TreeEntry::NeedToGather;
6374     }
6375     llvm_unreachable("Unexpected state of loads");
6376   }
6377   case Instruction::ZExt:
6378   case Instruction::SExt:
6379   case Instruction::FPToUI:
6380   case Instruction::FPToSI:
6381   case Instruction::FPExt:
6382   case Instruction::PtrToInt:
6383   case Instruction::IntToPtr:
6384   case Instruction::SIToFP:
6385   case Instruction::UIToFP:
6386   case Instruction::Trunc:
6387   case Instruction::FPTrunc:
6388   case Instruction::BitCast: {
6389     Type *SrcTy = VL0->getOperand(0)->getType();
6390     for (Value *V : VL) {
6391       Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6392       if (Ty != SrcTy || !isValidElementType(Ty)) {
6393         LLVM_DEBUG(
6394             dbgs() << "SLP: Gathering casts with different src types.\n");
6395         return TreeEntry::NeedToGather;
6396       }
6397     }
6398     return TreeEntry::Vectorize;
6399   }
6400   case Instruction::ICmp:
6401   case Instruction::FCmp: {
6402     // Check that all of the compares have the same predicate.
6403     CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6404     CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6405     Type *ComparedTy = VL0->getOperand(0)->getType();
6406     for (Value *V : VL) {
6407       CmpInst *Cmp = cast<CmpInst>(V);
6408       if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6409           Cmp->getOperand(0)->getType() != ComparedTy) {
6410         LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6411         return TreeEntry::NeedToGather;
6412       }
6413     }
6414     return TreeEntry::Vectorize;
6415   }
6416   case Instruction::Select:
6417   case Instruction::FNeg:
6418   case Instruction::Add:
6419   case Instruction::FAdd:
6420   case Instruction::Sub:
6421   case Instruction::FSub:
6422   case Instruction::Mul:
6423   case Instruction::FMul:
6424   case Instruction::UDiv:
6425   case Instruction::SDiv:
6426   case Instruction::FDiv:
6427   case Instruction::URem:
6428   case Instruction::SRem:
6429   case Instruction::FRem:
6430   case Instruction::Shl:
6431   case Instruction::LShr:
6432   case Instruction::AShr:
6433   case Instruction::And:
6434   case Instruction::Or:
6435   case Instruction::Xor:
6436     return TreeEntry::Vectorize;
6437   case Instruction::GetElementPtr: {
6438     // We don't combine GEPs with complicated (nested) indexing.
6439     for (Value *V : VL) {
6440       auto *I = dyn_cast<GetElementPtrInst>(V);
6441       if (!I)
6442         continue;
6443       if (I->getNumOperands() != 2) {
6444         LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6445         return TreeEntry::NeedToGather;
6446       }
6447     }
6448 
6449     // We can't combine several GEPs into one vector if they operate on
6450     // different types.
6451     Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6452     for (Value *V : VL) {
6453       auto *GEP = dyn_cast<GEPOperator>(V);
6454       if (!GEP)
6455         continue;
6456       Type *CurTy = GEP->getSourceElementType();
6457       if (Ty0 != CurTy) {
6458         LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6459         return TreeEntry::NeedToGather;
6460       }
6461     }
6462 
6463     // We don't combine GEPs with non-constant indexes.
6464     Type *Ty1 = VL0->getOperand(1)->getType();
6465     for (Value *V : VL) {
6466       auto *I = dyn_cast<GetElementPtrInst>(V);
6467       if (!I)
6468         continue;
6469       auto *Op = I->getOperand(1);
6470       if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6471           (Op->getType() != Ty1 &&
6472            ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6473             Op->getType()->getScalarSizeInBits() >
6474                 DL->getIndexSizeInBits(
6475                     V->getType()->getPointerAddressSpace())))) {
6476         LLVM_DEBUG(
6477             dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6478         return TreeEntry::NeedToGather;
6479       }
6480     }
6481 
6482     return TreeEntry::Vectorize;
6483   }
6484   case Instruction::Store: {
6485     // Check if the stores are consecutive or if we need to swizzle them.
6486     llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6487     // Avoid types that are padded when being allocated as scalars, while
6488     // being packed together in a vector (such as i1).
6489     if (DL->getTypeSizeInBits(ScalarTy) !=
6490         DL->getTypeAllocSizeInBits(ScalarTy)) {
6491       LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6492       return TreeEntry::NeedToGather;
6493     }
6494     // Make sure all stores in the bundle are simple - we can't vectorize
6495     // atomic or volatile stores.
6496     for (Value *V : VL) {
6497       auto *SI = cast<StoreInst>(V);
6498       if (!SI->isSimple()) {
6499         LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6500         return TreeEntry::NeedToGather;
6501       }
6502       PointerOps.push_back(SI->getPointerOperand());
6503     }
6504 
6505     // Check the order of pointer operands.
6506     if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6507       Value *Ptr0;
6508       Value *PtrN;
6509       if (CurrentOrder.empty()) {
6510         Ptr0 = PointerOps.front();
6511         PtrN = PointerOps.back();
6512       } else {
6513         Ptr0 = PointerOps[CurrentOrder.front()];
6514         PtrN = PointerOps[CurrentOrder.back()];
6515       }
6516       std::optional<int> Dist =
6517           getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6518       // Check that the sorted pointer operands are consecutive.
6519       if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6520         return TreeEntry::Vectorize;
6521     }
6522 
6523     LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6524     return TreeEntry::NeedToGather;
6525   }
6526   case Instruction::Call: {
6527     // Check if the calls are all to the same vectorizable intrinsic or
6528     // library function.
6529     CallInst *CI = cast<CallInst>(VL0);
6530     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6531 
6532     VFShape Shape = VFShape::get(
6533         CI->getFunctionType(),
6534         ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6535         false /*HasGlobalPred*/);
6536     Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6537 
6538     if (!VecFunc && !isTriviallyVectorizable(ID)) {
6539       LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6540       return TreeEntry::NeedToGather;
6541     }
6542     Function *F = CI->getCalledFunction();
6543     unsigned NumArgs = CI->arg_size();
6544     SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6545     for (unsigned J = 0; J != NumArgs; ++J)
6546       if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6547         ScalarArgs[J] = CI->getArgOperand(J);
6548     for (Value *V : VL) {
6549       CallInst *CI2 = dyn_cast<CallInst>(V);
6550       if (!CI2 || CI2->getCalledFunction() != F ||
6551           getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6552           (VecFunc &&
6553            VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6554           !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6555         LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6556                           << "\n");
6557         return TreeEntry::NeedToGather;
6558       }
6559       // Some intrinsics have scalar arguments and should be same in order for
6560       // them to be vectorized.
6561       for (unsigned J = 0; J != NumArgs; ++J) {
6562         if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6563           Value *A1J = CI2->getArgOperand(J);
6564           if (ScalarArgs[J] != A1J) {
6565             LLVM_DEBUG(dbgs()
6566                        << "SLP: mismatched arguments in call:" << *CI
6567                        << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6568             return TreeEntry::NeedToGather;
6569           }
6570         }
6571       }
6572       // Verify that the bundle operands are identical between the two calls.
6573       if (CI->hasOperandBundles() &&
6574           !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6575                       CI->op_begin() + CI->getBundleOperandsEndIndex(),
6576                       CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6577         LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6578                           << "!=" << *V << '\n');
6579         return TreeEntry::NeedToGather;
6580       }
6581     }
6582 
6583     return TreeEntry::Vectorize;
6584   }
6585   case Instruction::ShuffleVector: {
6586     // If this is not an alternate sequence of opcode like add-sub
6587     // then do not vectorize this instruction.
6588     if (!S.isAltShuffle()) {
6589       LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6590       return TreeEntry::NeedToGather;
6591     }
6592     if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6593       LLVM_DEBUG(
6594           dbgs()
6595           << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6596              "the whole alt sequence is not profitable.\n");
6597       return TreeEntry::NeedToGather;
6598     }
6599 
6600     return TreeEntry::Vectorize;
6601   }
6602   default:
6603     LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6604     return TreeEntry::NeedToGather;
6605   }
6606 }
6607 
6608 namespace {
6609 /// Allows to correctly handle operands of the phi nodes based on the \p Main
6610 /// PHINode order of incoming basic blocks/values.
6611 class PHIHandler {
6612   DominatorTree &DT;
6613   PHINode *Main = nullptr;
6614   SmallVector<Value *> Phis;
6615   SmallVector<SmallVector<Value *>> Operands;
6616 
6617 public:
6618   PHIHandler() = delete;
6619   PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6620       : DT(DT), Main(Main), Phis(Phis),
6621         Operands(Main->getNumIncomingValues(),
6622                  SmallVector<Value *>(Phis.size(), nullptr)) {}
6623   void buildOperands() {
6624     constexpr unsigned FastLimit = 4;
6625     if (Main->getNumIncomingValues() <= FastLimit) {
6626       for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6627         BasicBlock *InBB = Main->getIncomingBlock(I);
6628         if (!DT.isReachableFromEntry(InBB)) {
6629           Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6630           continue;
6631         }
6632         // Prepare the operand vector.
6633         for (auto [Idx, V] : enumerate(Phis)) {
6634           auto *P = cast<PHINode>(V);
6635           if (P->getIncomingBlock(I) == InBB)
6636             Operands[I][Idx] = P->getIncomingValue(I);
6637           else
6638             Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6639         }
6640       }
6641       return;
6642     }
6643     SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6644     for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6645       BasicBlock *InBB = Main->getIncomingBlock(I);
6646       if (!DT.isReachableFromEntry(InBB)) {
6647         Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6648         continue;
6649       }
6650       Blocks.try_emplace(InBB).first->second.push_back(I);
6651     }
6652     for (auto [Idx, V] : enumerate(Phis)) {
6653       auto *P = cast<PHINode>(V);
6654       for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6655         BasicBlock *InBB = P->getIncomingBlock(I);
6656         if (InBB == Main->getIncomingBlock(I)) {
6657           if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6658             continue;
6659           Operands[I][Idx] = P->getIncomingValue(I);
6660           continue;
6661         }
6662         auto It = Blocks.find(InBB);
6663         if (It == Blocks.end())
6664           continue;
6665         Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6666       }
6667     }
6668     for (const auto &P : Blocks) {
6669       if (P.getSecond().size() <= 1)
6670         continue;
6671       unsigned BasicI = P.getSecond().front();
6672       for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6673         assert(all_of(enumerate(Operands[I]),
6674                       [&](const auto &Data) {
6675                         return !Data.value() ||
6676                                Data.value() == Operands[BasicI][Data.index()];
6677                       }) &&
6678                "Expected empty operands list.");
6679         Operands[I] = Operands[BasicI];
6680       }
6681     }
6682   }
6683   ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6684 };
6685 } // namespace
6686 
6687 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688                             const EdgeInfo &UserTreeIdx) {
6689   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690 
6691   SmallVector<int> ReuseShuffleIndices;
6692   SmallVector<Value *> UniqueValues;
6693   SmallVector<Value *> NonUniqueValueVL;
6694   auto TryToFindDuplicates = [&](const InstructionsState &S,
6695                                  bool DoNotFail = false) {
6696     // Check that every instruction appears once in this bundle.
6697     DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698     for (Value *V : VL) {
6699       if (isConstant(V)) {
6700         ReuseShuffleIndices.emplace_back(
6701             isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6702         UniqueValues.emplace_back(V);
6703         continue;
6704       }
6705       auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6706       ReuseShuffleIndices.emplace_back(Res.first->second);
6707       if (Res.second)
6708         UniqueValues.emplace_back(V);
6709     }
6710     size_t NumUniqueScalarValues = UniqueValues.size();
6711     if (NumUniqueScalarValues == VL.size()) {
6712       ReuseShuffleIndices.clear();
6713     } else {
6714       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
6715       if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717                              "for nodes with padding.\n");
6718         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719         return false;
6720       }
6721       LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722       if (NumUniqueScalarValues <= 1 ||
6723           (UniquePositions.size() == 1 && all_of(UniqueValues,
6724                                                  [](Value *V) {
6725                                                    return isa<UndefValue>(V) ||
6726                                                           !isConstant(V);
6727                                                  })) ||
6728           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6729         if (DoNotFail && UniquePositions.size() > 1 &&
6730             NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731             all_of(UniqueValues, [=](Value *V) {
6732               return isa<ExtractElementInst>(V) ||
6733                      areAllUsersVectorized(cast<Instruction>(V),
6734                                            UserIgnoreList);
6735             })) {
6736           unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6737           if (PWSz == VL.size()) {
6738             ReuseShuffleIndices.clear();
6739           } else {
6740             NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6741             NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6742                                     UniqueValues.back());
6743             VL = NonUniqueValueVL;
6744           }
6745           return true;
6746         }
6747         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749         return false;
6750       }
6751       VL = UniqueValues;
6752     }
6753     return true;
6754   };
6755 
6756   InstructionsState S = getSameOpcode(VL, *TLI);
6757 
6758   // Don't vectorize ephemeral values.
6759   if (!EphValues.empty()) {
6760     for (Value *V : VL) {
6761       if (EphValues.count(V)) {
6762         LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763                           << ") is ephemeral.\n");
6764         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765         return;
6766       }
6767     }
6768   }
6769 
6770   // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771   // a load), in which case peek through to include it in the tree, without
6772   // ballooning over-budget.
6773   if (Depth >= RecursionMaxDepth &&
6774       !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6775         VL.size() >= 4 &&
6776         (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6777            return match(I,
6778                         m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6779                   cast<Instruction>(I)->getOpcode() ==
6780                       cast<Instruction>(S.MainOp)->getOpcode();
6781          })))) {
6782     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783     if (TryToFindDuplicates(S))
6784       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785                    ReuseShuffleIndices);
6786     return;
6787   }
6788 
6789   // Don't handle scalable vectors
6790   if (S.getOpcode() == Instruction::ExtractElement &&
6791       isa<ScalableVectorType>(
6792           cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6793     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794     if (TryToFindDuplicates(S))
6795       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796                    ReuseShuffleIndices);
6797     return;
6798   }
6799 
6800   // Don't handle vectors.
6801   if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802       !isa<InsertElementInst>(S.OpValue)) {
6803     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805     return;
6806   }
6807 
6808   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6809     if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810       LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812       return;
6813     }
6814 
6815   // If all of the operands are identical or constant we have a simple solution.
6816   // If we deal with insert/extract instructions, they all must have constant
6817   // indices, otherwise we should gather them, not try to vectorize.
6818   // If alternate op node with 2 elements with gathered operands - do not
6819   // vectorize.
6820   auto &&NotProfitableForVectorization = [&S, this,
6821                                           Depth](ArrayRef<Value *> VL) {
6822     if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823       return false;
6824     if (VectorizableTree.size() < MinTreeSize)
6825       return false;
6826     if (Depth >= RecursionMaxDepth - 1)
6827       return true;
6828     // Check if all operands are extracts, part of vector node or can build a
6829     // regular vectorize node.
6830     SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6831     for (Value *V : VL) {
6832       auto *I = cast<Instruction>(V);
6833       InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6834         return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6835       }));
6836     }
6837     bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6838     if ((IsCommutative &&
6839          std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6840         (!IsCommutative &&
6841          all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6842       return true;
6843     assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844     SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845     auto *I1 = cast<Instruction>(VL.front());
6846     auto *I2 = cast<Instruction>(VL.back());
6847     for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848       Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6849                                              I2->getOperand(Op));
6850     if (static_cast<unsigned>(count_if(
6851             Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852               return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6853             })) >= S.MainOp->getNumOperands() / 2)
6854       return false;
6855     if (S.MainOp->getNumOperands() > 2)
6856       return true;
6857     if (IsCommutative) {
6858       // Check permuted operands.
6859       Candidates.clear();
6860       for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861         Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6862                                                I2->getOperand((Op + 1) % E));
6863       if (any_of(
6864               Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865                 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6866               }))
6867         return false;
6868     }
6869     return true;
6870   };
6871   SmallVector<unsigned> SortedIndices;
6872   BasicBlock *BB = nullptr;
6873   bool IsScatterVectorizeUserTE =
6874       UserTreeIdx.UserTE &&
6875       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876   bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6877   bool AreScatterAllGEPSameBlock =
6878       (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879        VL.size() > 2 &&
6880        all_of(VL,
6881               [&BB](Value *V) {
6882                 auto *I = dyn_cast<GetElementPtrInst>(V);
6883                 if (!I)
6884                   return doesNotNeedToBeScheduled(V);
6885                 if (!BB)
6886                   BB = I->getParent();
6887                 return BB == I->getParent() && I->getNumOperands() == 2;
6888               }) &&
6889        BB &&
6890        sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6891                        SortedIndices));
6892   bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893   if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894       (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895            S.OpValue) &&
6896        !all_of(VL, isVectorLikeInstWithConstOps)) ||
6897       NotProfitableForVectorization(VL)) {
6898     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899     if (TryToFindDuplicates(S))
6900       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901                    ReuseShuffleIndices);
6902     return;
6903   }
6904 
6905   // We now know that this is a vector of instructions of the same type from
6906   // the same block.
6907 
6908   // Check if this is a duplicate of another entry.
6909   if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6910     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911     if (!E->isSame(VL)) {
6912       auto It = MultiNodeScalars.find(S.OpValue);
6913       if (It != MultiNodeScalars.end()) {
6914         auto *TEIt = find_if(It->getSecond(),
6915                              [&](TreeEntry *ME) { return ME->isSame(VL); });
6916         if (TEIt != It->getSecond().end())
6917           E = *TEIt;
6918         else
6919           E = nullptr;
6920       } else {
6921         E = nullptr;
6922       }
6923     }
6924     if (!E) {
6925       if (!doesNotNeedToBeScheduled(S.OpValue)) {
6926         LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927         if (TryToFindDuplicates(S))
6928           newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929                        ReuseShuffleIndices);
6930         return;
6931       }
6932     } else {
6933       // Record the reuse of the tree node.  FIXME, currently this is only used
6934       // to properly draw the graph rather than for the actual vectorization.
6935       E->UserTreeIndices.push_back(UserTreeIdx);
6936       LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937                         << ".\n");
6938       return;
6939     }
6940   }
6941 
6942   // Check that none of the instructions in the bundle are already in the tree.
6943   for (Value *V : VL) {
6944     if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6945         doesNotNeedToBeScheduled(V))
6946       continue;
6947     if (getTreeEntry(V)) {
6948       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949                         << ") is already in tree.\n");
6950       if (TryToFindDuplicates(S))
6951         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952                      ReuseShuffleIndices);
6953       return;
6954     }
6955   }
6956 
6957   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6958   if (UserIgnoreList && !UserIgnoreList->empty()) {
6959     for (Value *V : VL) {
6960       if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961         LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962         if (TryToFindDuplicates(S))
6963           newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964                        ReuseShuffleIndices);
6965         return;
6966       }
6967     }
6968   }
6969 
6970   // Special processing for sorted pointers for ScatterVectorize node with
6971   // constant indeces only.
6972   if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973     assert(S.OpValue->getType()->isPointerTy() &&
6974            count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975            "Expected pointers only.");
6976     // Reset S to make it GetElementPtr kind of node.
6977     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6978     assert(It != VL.end() && "Expected at least one GEP.");
6979     S = getSameOpcode(*It, *TLI);
6980   }
6981 
6982   // Check that all of the users of the scalars that we want to vectorize are
6983   // schedulable.
6984   auto *VL0 = cast<Instruction>(S.OpValue);
6985   BB = VL0->getParent();
6986 
6987   if (!DT->isReachableFromEntry(BB)) {
6988     // Don't go into unreachable blocks. They may contain instructions with
6989     // dependency cycles which confuse the final scheduling.
6990     LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992     return;
6993   }
6994 
6995   // Don't go into catchswitch blocks, which can happen with PHIs.
6996   // Such blocks can only have PHIs and the catchswitch.  There is no
6997   // place to insert a shuffle if we need to, so just avoid that issue.
6998   if (isa<CatchSwitchInst>(BB->getTerminator())) {
6999     LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001     return;
7002   }
7003 
7004   // Check that every instruction appears once in this bundle.
7005   if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006     return;
7007 
7008   // Perform specific checks for each particular instruction kind.
7009   OrdersType CurrentOrder;
7010   SmallVector<Value *> PointerOps;
7011   TreeEntry::EntryState State = getScalarsVectorizationState(
7012       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013   if (State == TreeEntry::NeedToGather) {
7014     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015                  ReuseShuffleIndices);
7016     return;
7017   }
7018 
7019   auto &BSRef = BlocksSchedules[BB];
7020   if (!BSRef)
7021     BSRef = std::make_unique<BlockScheduling>(BB);
7022 
7023   BlockScheduling &BS = *BSRef;
7024 
7025   std::optional<ScheduleData *> Bundle =
7026       BS.tryScheduleBundle(UniqueValues, this, S);
7027 #ifdef EXPENSIVE_CHECKS
7028   // Make sure we didn't break any internal invariants
7029   BS.verify();
7030 #endif
7031   if (!Bundle) {
7032     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033     assert((!BS.getScheduleData(VL0) ||
7034             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035            "tryScheduleBundle should cancelScheduling on failure");
7036     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037                  ReuseShuffleIndices);
7038     NonScheduledFirst.insert(VL.front());
7039     return;
7040   }
7041   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042 
7043   unsigned ShuffleOrOp = S.isAltShuffle() ?
7044                 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7045   switch (ShuffleOrOp) {
7046     case Instruction::PHI: {
7047       auto *PH = cast<PHINode>(VL0);
7048 
7049       TreeEntry *TE =
7050           newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052 
7053       // Keeps the reordered operands to avoid code duplication.
7054       PHIHandler Handler(*DT, PH, VL);
7055       Handler.buildOperands();
7056       for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7057         TE->setOperand(I, Handler.getOperands(I));
7058       for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7059         buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7060       return;
7061     }
7062     case Instruction::ExtractValue:
7063     case Instruction::ExtractElement: {
7064       if (CurrentOrder.empty()) {
7065         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066       } else {
7067         LLVM_DEBUG({
7068           dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069                     "with order";
7070           for (unsigned Idx : CurrentOrder)
7071             dbgs() << " " << Idx;
7072           dbgs() << "\n";
7073         });
7074         fixupOrderingIndices(CurrentOrder);
7075       }
7076       // Insert new order with initial value 0, if it does not exist,
7077       // otherwise return the iterator to the existing one.
7078       newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079                    ReuseShuffleIndices, CurrentOrder);
7080       // This is a special case, as it does not gather, but at the same time
7081       // we are not extending buildTree_rec() towards the operands.
7082       ValueList Op0;
7083       Op0.assign(VL.size(), VL0->getOperand(0));
7084       VectorizableTree.back()->setOperand(0, Op0);
7085       return;
7086     }
7087     case Instruction::InsertElement: {
7088       assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089 
7090       auto OrdCompare = [](const std::pair<int, int> &P1,
7091                            const std::pair<int, int> &P2) {
7092         return P1.first > P2.first;
7093       };
7094       PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095                     decltype(OrdCompare)>
7096           Indices(OrdCompare);
7097       for (int I = 0, E = VL.size(); I < E; ++I) {
7098         unsigned Idx = *getElementIndex(VL[I]);
7099         Indices.emplace(Idx, I);
7100       }
7101       OrdersType CurrentOrder(VL.size(), VL.size());
7102       bool IsIdentity = true;
7103       for (int I = 0, E = VL.size(); I < E; ++I) {
7104         CurrentOrder[Indices.top().second] = I;
7105         IsIdentity &= Indices.top().second == I;
7106         Indices.pop();
7107       }
7108       if (IsIdentity)
7109         CurrentOrder.clear();
7110       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111                                    std::nullopt, CurrentOrder);
7112       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113 
7114       TE->setOperandsInOrder();
7115       buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7116       return;
7117     }
7118     case Instruction::Load: {
7119       // Check that a vectorized load would load the same memory as a scalar
7120       // load. For example, we don't want to vectorize loads that are smaller
7121       // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7122       // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7123       // from such a struct, we read/write packed bits disagreeing with the
7124       // unvectorized version.
7125       TreeEntry *TE = nullptr;
7126       fixupOrderingIndices(CurrentOrder);
7127       switch (State) {
7128       case TreeEntry::Vectorize:
7129         TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130                           ReuseShuffleIndices, CurrentOrder);
7131         if (CurrentOrder.empty())
7132           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133         else
7134           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135         TE->setOperandsInOrder();
7136         break;
7137       case TreeEntry::StridedVectorize:
7138         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7139         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7140                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7141         TE->setOperandsInOrder();
7142         LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143         break;
7144       case TreeEntry::ScatterVectorize:
7145         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7147                           UserTreeIdx, ReuseShuffleIndices);
7148         TE->setOperandsInOrder();
7149         buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7150         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151         break;
7152       case TreeEntry::NeedToGather:
7153         llvm_unreachable("Unexpected loads state.");
7154       }
7155       return;
7156     }
7157     case Instruction::ZExt:
7158     case Instruction::SExt:
7159     case Instruction::FPToUI:
7160     case Instruction::FPToSI:
7161     case Instruction::FPExt:
7162     case Instruction::PtrToInt:
7163     case Instruction::IntToPtr:
7164     case Instruction::SIToFP:
7165     case Instruction::UIToFP:
7166     case Instruction::Trunc:
7167     case Instruction::FPTrunc:
7168     case Instruction::BitCast: {
7169       auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170           std::make_pair(std::numeric_limits<unsigned>::min(),
7171                          std::numeric_limits<unsigned>::max()));
7172       if (ShuffleOrOp == Instruction::ZExt ||
7173           ShuffleOrOp == Instruction::SExt) {
7174         CastMaxMinBWSizes = std::make_pair(
7175             std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7176                                PrevMaxBW),
7177             std::min<unsigned>(
7178                 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7179                 PrevMinBW));
7180       } else if (ShuffleOrOp == Instruction::Trunc) {
7181         CastMaxMinBWSizes = std::make_pair(
7182             std::max<unsigned>(
7183                 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7184                 PrevMaxBW),
7185             std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7186                                PrevMinBW));
7187         ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7188       } else if (ShuffleOrOp == Instruction::SIToFP ||
7189                  ShuffleOrOp == Instruction::UIToFP) {
7190         unsigned NumSignBits =
7191             ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7192         if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7193           APInt Mask = DB->getDemandedBits(OpI);
7194           NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7195         }
7196         if (NumSignBits * 2 >=
7197             DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7198           ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7199       }
7200       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201                                    ReuseShuffleIndices);
7202       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203 
7204       TE->setOperandsInOrder();
7205       for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7206         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7207       return;
7208     }
7209     case Instruction::ICmp:
7210     case Instruction::FCmp: {
7211       // Check that all of the compares have the same predicate.
7212       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7213       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214                                    ReuseShuffleIndices);
7215       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216 
7217       ValueList Left, Right;
7218       if (cast<CmpInst>(VL0)->isCommutative()) {
7219         // Commutative predicate - collect + sort operands of the instructions
7220         // so that each side is more likely to have the same opcode.
7221         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7222                "Commutative Predicate mismatch");
7223         reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224       } else {
7225         // Collect operands - commute if it uses the swapped predicate.
7226         for (Value *V : VL) {
7227           auto *Cmp = cast<CmpInst>(V);
7228           Value *LHS = Cmp->getOperand(0);
7229           Value *RHS = Cmp->getOperand(1);
7230           if (Cmp->getPredicate() != P0)
7231             std::swap(LHS, RHS);
7232           Left.push_back(LHS);
7233           Right.push_back(RHS);
7234         }
7235       }
7236       TE->setOperand(0, Left);
7237       TE->setOperand(1, Right);
7238       buildTree_rec(Left, Depth + 1, {TE, 0});
7239       buildTree_rec(Right, Depth + 1, {TE, 1});
7240       if (ShuffleOrOp == Instruction::ICmp) {
7241         unsigned NumSignBits0 =
7242             ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7243         if (NumSignBits0 * 2 >=
7244             DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7245           ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7246         unsigned NumSignBits1 =
7247             ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7248         if (NumSignBits1 * 2 >=
7249             DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7250           ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7251       }
7252       return;
7253     }
7254     case Instruction::Select:
7255     case Instruction::FNeg:
7256     case Instruction::Add:
7257     case Instruction::FAdd:
7258     case Instruction::Sub:
7259     case Instruction::FSub:
7260     case Instruction::Mul:
7261     case Instruction::FMul:
7262     case Instruction::UDiv:
7263     case Instruction::SDiv:
7264     case Instruction::FDiv:
7265     case Instruction::URem:
7266     case Instruction::SRem:
7267     case Instruction::FRem:
7268     case Instruction::Shl:
7269     case Instruction::LShr:
7270     case Instruction::AShr:
7271     case Instruction::And:
7272     case Instruction::Or:
7273     case Instruction::Xor: {
7274       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275                                    ReuseShuffleIndices);
7276       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277 
7278       // Sort operands of the instructions so that each side is more likely to
7279       // have the same opcode.
7280       if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7281         ValueList Left, Right;
7282         reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7283         TE->setOperand(0, Left);
7284         TE->setOperand(1, Right);
7285         buildTree_rec(Left, Depth + 1, {TE, 0});
7286         buildTree_rec(Right, Depth + 1, {TE, 1});
7287         return;
7288       }
7289 
7290       TE->setOperandsInOrder();
7291       for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7292         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7293       return;
7294     }
7295     case Instruction::GetElementPtr: {
7296       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297                                    ReuseShuffleIndices);
7298       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299       SmallVector<ValueList, 2> Operands(2);
7300       // Prepare the operand vector for pointer operands.
7301       for (Value *V : VL) {
7302         auto *GEP = dyn_cast<GetElementPtrInst>(V);
7303         if (!GEP) {
7304           Operands.front().push_back(V);
7305           continue;
7306         }
7307         Operands.front().push_back(GEP->getPointerOperand());
7308       }
7309       TE->setOperand(0, Operands.front());
7310       // Need to cast all indices to the same type before vectorization to
7311       // avoid crash.
7312       // Required to be able to find correct matches between different gather
7313       // nodes and reuse the vectorized values rather than trying to gather them
7314       // again.
7315       int IndexIdx = 1;
7316       Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7317       Type *Ty = all_of(VL,
7318                         [VL0Ty, IndexIdx](Value *V) {
7319                           auto *GEP = dyn_cast<GetElementPtrInst>(V);
7320                           if (!GEP)
7321                             return true;
7322                           return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7323                         })
7324                      ? VL0Ty
7325                      : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7326                                             ->getPointerOperandType()
7327                                             ->getScalarType());
7328       // Prepare the operand vector.
7329       for (Value *V : VL) {
7330         auto *I = dyn_cast<GetElementPtrInst>(V);
7331         if (!I) {
7332           Operands.back().push_back(
7333               ConstantInt::get(Ty, 0, /*isSigned=*/false));
7334           continue;
7335         }
7336         auto *Op = I->getOperand(IndexIdx);
7337         auto *CI = dyn_cast<ConstantInt>(Op);
7338         if (!CI)
7339           Operands.back().push_back(Op);
7340         else
7341           Operands.back().push_back(ConstantFoldIntegerCast(
7342               CI, Ty, CI->getValue().isSignBitSet(), *DL));
7343       }
7344       TE->setOperand(IndexIdx, Operands.back());
7345 
7346       for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347         buildTree_rec(Operands[I], Depth + 1, {TE, I});
7348       return;
7349     }
7350     case Instruction::Store: {
7351       bool Consecutive = CurrentOrder.empty();
7352       if (!Consecutive)
7353         fixupOrderingIndices(CurrentOrder);
7354       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355                                    ReuseShuffleIndices, CurrentOrder);
7356       TE->setOperandsInOrder();
7357       buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7358       if (Consecutive)
7359         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360       else
7361         LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362       return;
7363     }
7364     case Instruction::Call: {
7365       // Check if the calls are all to the same vectorizable intrinsic or
7366       // library function.
7367       CallInst *CI = cast<CallInst>(VL0);
7368       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369 
7370       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371                                    ReuseShuffleIndices);
7372       // Sort operands of the instructions so that each side is more likely to
7373       // have the same opcode.
7374       if (isCommutative(VL0)) {
7375         ValueList Left, Right;
7376         reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7377         TE->setOperand(0, Left);
7378         TE->setOperand(1, Right);
7379         SmallVector<ValueList> Operands;
7380         for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7381           Operands.emplace_back();
7382           if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7383             continue;
7384           for (Value *V : VL) {
7385             auto *CI2 = cast<CallInst>(V);
7386             Operands.back().push_back(CI2->getArgOperand(I));
7387           }
7388           TE->setOperand(I, Operands.back());
7389         }
7390         buildTree_rec(Left, Depth + 1, {TE, 0});
7391         buildTree_rec(Right, Depth + 1, {TE, 1});
7392         for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7393           if (Operands[I - 2].empty())
7394             continue;
7395           buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7396         }
7397         return;
7398       }
7399       TE->setOperandsInOrder();
7400       for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7401         // For scalar operands no need to create an entry since no need to
7402         // vectorize it.
7403         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7404           continue;
7405         ValueList Operands;
7406         // Prepare the operand vector.
7407         for (Value *V : VL) {
7408           auto *CI2 = cast<CallInst>(V);
7409           Operands.push_back(CI2->getArgOperand(I));
7410         }
7411         buildTree_rec(Operands, Depth + 1, {TE, I});
7412       }
7413       return;
7414     }
7415     case Instruction::ShuffleVector: {
7416       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417                                    ReuseShuffleIndices);
7418       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419 
7420       // Reorder operands if reordering would enable vectorization.
7421       auto *CI = dyn_cast<CmpInst>(VL0);
7422       if (isa<BinaryOperator>(VL0) || CI) {
7423         ValueList Left, Right;
7424         if (!CI || all_of(VL, [](Value *V) {
7425               return cast<CmpInst>(V)->isCommutative();
7426             })) {
7427           reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7428         } else {
7429           auto *MainCI = cast<CmpInst>(S.MainOp);
7430           auto *AltCI = cast<CmpInst>(S.AltOp);
7431           CmpInst::Predicate MainP = MainCI->getPredicate();
7432           CmpInst::Predicate AltP = AltCI->getPredicate();
7433           assert(MainP != AltP &&
7434                  "Expected different main/alternate predicates.");
7435           // Collect operands - commute if it uses the swapped predicate or
7436           // alternate operation.
7437           for (Value *V : VL) {
7438             auto *Cmp = cast<CmpInst>(V);
7439             Value *LHS = Cmp->getOperand(0);
7440             Value *RHS = Cmp->getOperand(1);
7441 
7442             if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7443               if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7444                 std::swap(LHS, RHS);
7445             } else {
7446               if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7447                 std::swap(LHS, RHS);
7448             }
7449             Left.push_back(LHS);
7450             Right.push_back(RHS);
7451           }
7452         }
7453         TE->setOperand(0, Left);
7454         TE->setOperand(1, Right);
7455         buildTree_rec(Left, Depth + 1, {TE, 0});
7456         buildTree_rec(Right, Depth + 1, {TE, 1});
7457         return;
7458       }
7459 
7460       TE->setOperandsInOrder();
7461       for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7462         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7463       return;
7464     }
7465     default:
7466       break;
7467   }
7468   llvm_unreachable("Unexpected vectorization of the instructions.");
7469 }
7470 
7471 unsigned BoUpSLP::canMapToVector(Type *T) const {
7472   unsigned N = 1;
7473   Type *EltTy = T;
7474 
7475   while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7476     if (auto *ST = dyn_cast<StructType>(EltTy)) {
7477       // Check that struct is homogeneous.
7478       for (const auto *Ty : ST->elements())
7479         if (Ty != *ST->element_begin())
7480           return 0;
7481       N *= ST->getNumElements();
7482       EltTy = *ST->element_begin();
7483     } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7484       N *= AT->getNumElements();
7485       EltTy = AT->getElementType();
7486     } else {
7487       auto *VT = cast<FixedVectorType>(EltTy);
7488       N *= VT->getNumElements();
7489       EltTy = VT->getElementType();
7490     }
7491   }
7492 
7493   if (!isValidElementType(EltTy))
7494     return 0;
7495   uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7496   if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497       VTSize != DL->getTypeStoreSizeInBits(T))
7498     return 0;
7499   return N;
7500 }
7501 
7502 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7503                               SmallVectorImpl<unsigned> &CurrentOrder,
7504                               bool ResizeAllowed) const {
7505   const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7506   assert(It != VL.end() && "Expected at least one extract instruction.");
7507   auto *E0 = cast<Instruction>(*It);
7508   assert(
7509       all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7510       "Invalid opcode");
7511   // Check if all of the extracts come from the same vector and from the
7512   // correct offset.
7513   Value *Vec = E0->getOperand(0);
7514 
7515   CurrentOrder.clear();
7516 
7517   // We have to extract from a vector/aggregate with the same number of elements.
7518   unsigned NElts;
7519   if (E0->getOpcode() == Instruction::ExtractValue) {
7520     NElts = canMapToVector(Vec->getType());
7521     if (!NElts)
7522       return false;
7523     // Check if load can be rewritten as load of vector.
7524     LoadInst *LI = dyn_cast<LoadInst>(Vec);
7525     if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7526       return false;
7527   } else {
7528     NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7529   }
7530 
7531   unsigned E = VL.size();
7532   if (!ResizeAllowed && NElts != E)
7533     return false;
7534   SmallVector<int> Indices(E, PoisonMaskElem);
7535   unsigned MinIdx = NElts, MaxIdx = 0;
7536   for (auto [I, V] : enumerate(VL)) {
7537     auto *Inst = dyn_cast<Instruction>(V);
7538     if (!Inst)
7539       continue;
7540     if (Inst->getOperand(0) != Vec)
7541       return false;
7542     if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7543       if (isa<UndefValue>(EE->getIndexOperand()))
7544         continue;
7545     std::optional<unsigned> Idx = getExtractIndex(Inst);
7546     if (!Idx)
7547       return false;
7548     const unsigned ExtIdx = *Idx;
7549     if (ExtIdx >= NElts)
7550       continue;
7551     Indices[I] = ExtIdx;
7552     if (MinIdx > ExtIdx)
7553       MinIdx = ExtIdx;
7554     if (MaxIdx < ExtIdx)
7555       MaxIdx = ExtIdx;
7556   }
7557   if (MaxIdx - MinIdx + 1 > E)
7558     return false;
7559   if (MaxIdx + 1 <= E)
7560     MinIdx = 0;
7561 
7562   // Check that all of the indices extract from the correct offset.
7563   bool ShouldKeepOrder = true;
7564   // Assign to all items the initial value E + 1 so we can check if the extract
7565   // instruction index was used already.
7566   // Also, later we can check that all the indices are used and we have a
7567   // consecutive access in the extract instructions, by checking that no
7568   // element of CurrentOrder still has value E + 1.
7569   CurrentOrder.assign(E, E);
7570   for (unsigned I = 0; I < E; ++I) {
7571     if (Indices[I] == PoisonMaskElem)
7572       continue;
7573     const unsigned ExtIdx = Indices[I] - MinIdx;
7574     if (CurrentOrder[ExtIdx] != E) {
7575       CurrentOrder.clear();
7576       return false;
7577     }
7578     ShouldKeepOrder &= ExtIdx == I;
7579     CurrentOrder[ExtIdx] = I;
7580   }
7581   if (ShouldKeepOrder)
7582     CurrentOrder.clear();
7583 
7584   return ShouldKeepOrder;
7585 }
7586 
7587 bool BoUpSLP::areAllUsersVectorized(
7588     Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589   return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7590          all_of(I->users(), [this](User *U) {
7591            return ScalarToTreeEntry.contains(U) ||
7592                   isVectorLikeInstWithConstOps(U) ||
7593                   (isa<ExtractElementInst>(U) && MustGather.contains(U));
7594          });
7595 }
7596 
7597 static std::pair<InstructionCost, InstructionCost>
7598 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599                    TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600                    ArrayRef<Type *> ArgTys) {
7601   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602 
7603   // Calculate the cost of the scalar and vector calls.
7604   FastMathFlags FMF;
7605   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7606     FMF = FPCI->getFastMathFlags();
7607   SmallVector<const Value *> Arguments(CI->args());
7608   IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609                                     dyn_cast<IntrinsicInst>(CI));
7610   auto IntrinsicCost =
7611     TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7612 
7613   auto Shape = VFShape::get(CI->getFunctionType(),
7614                             ElementCount::getFixed(VecTy->getNumElements()),
7615                             false /*HasGlobalPred*/);
7616   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617   auto LibCost = IntrinsicCost;
7618   if (!CI->isNoBuiltin() && VecFunc) {
7619     // Calculate the cost of the vector library call.
7620     // If the corresponding vector call is cheaper, return its cost.
7621     LibCost =
7622         TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7623   }
7624   return {IntrinsicCost, LibCost};
7625 }
7626 
7627 void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628     const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629     SmallVectorImpl<Value *> *OpScalars,
7630     SmallVectorImpl<Value *> *AltScalars) const {
7631   unsigned Sz = Scalars.size();
7632   Mask.assign(Sz, PoisonMaskElem);
7633   SmallVector<int> OrderMask;
7634   if (!ReorderIndices.empty())
7635     inversePermutation(ReorderIndices, OrderMask);
7636   for (unsigned I = 0; I < Sz; ++I) {
7637     unsigned Idx = I;
7638     if (!ReorderIndices.empty())
7639       Idx = OrderMask[I];
7640     auto *OpInst = cast<Instruction>(Scalars[Idx]);
7641     if (IsAltOp(OpInst)) {
7642       Mask[I] = Sz + Idx;
7643       if (AltScalars)
7644         AltScalars->push_back(OpInst);
7645     } else {
7646       Mask[I] = Idx;
7647       if (OpScalars)
7648         OpScalars->push_back(OpInst);
7649     }
7650   }
7651   if (!ReuseShuffleIndices.empty()) {
7652     SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653     transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7654       return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655     });
7656     Mask.swap(NewMask);
7657   }
7658 }
7659 
7660 static bool isAlternateInstruction(const Instruction *I,
7661                                    const Instruction *MainOp,
7662                                    const Instruction *AltOp,
7663                                    const TargetLibraryInfo &TLI) {
7664   if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7665     auto *AltCI = cast<CmpInst>(AltOp);
7666     CmpInst::Predicate MainP = MainCI->getPredicate();
7667     CmpInst::Predicate AltP = AltCI->getPredicate();
7668     assert(MainP != AltP && "Expected different main/alternate predicates.");
7669     auto *CI = cast<CmpInst>(I);
7670     if (isCmpSameOrSwapped(MainCI, CI, TLI))
7671       return false;
7672     if (isCmpSameOrSwapped(AltCI, CI, TLI))
7673       return true;
7674     CmpInst::Predicate P = CI->getPredicate();
7675     CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7676 
7677     assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678            "CmpInst expected to match either main or alternate predicate or "
7679            "their swap.");
7680     (void)AltP;
7681     return MainP != P && MainP != SwappedP;
7682   }
7683   return I->getOpcode() == AltOp->getOpcode();
7684 }
7685 
7686 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687   assert(!Ops.empty());
7688   const auto *Op0 = Ops.front();
7689 
7690   const bool IsConstant = all_of(Ops, [](Value *V) {
7691     // TODO: We should allow undef elements here
7692     return isConstant(V) && !isa<UndefValue>(V);
7693   });
7694   const bool IsUniform = all_of(Ops, [=](Value *V) {
7695     // TODO: We should allow undef elements here
7696     return V == Op0;
7697   });
7698   const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7699     // TODO: We should allow undef elements here
7700     if (auto *CI = dyn_cast<ConstantInt>(V))
7701       return CI->getValue().isPowerOf2();
7702     return false;
7703   });
7704   const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7705     // TODO: We should allow undef elements here
7706     if (auto *CI = dyn_cast<ConstantInt>(V))
7707       return CI->getValue().isNegatedPowerOf2();
7708     return false;
7709   });
7710 
7711   TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712   if (IsConstant && IsUniform)
7713     VK = TTI::OK_UniformConstantValue;
7714   else if (IsConstant)
7715     VK = TTI::OK_NonUniformConstantValue;
7716   else if (IsUniform)
7717     VK = TTI::OK_UniformValue;
7718 
7719   TTI::OperandValueProperties VP = TTI::OP_None;
7720   VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721   VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722 
7723   return {VK, VP};
7724 }
7725 
7726 namespace {
7727 /// The base class for shuffle instruction emission and shuffle cost estimation.
7728 class BaseShuffleAnalysis {
7729 protected:
7730   /// Checks if the mask is an identity mask.
7731   /// \param IsStrict if is true the function returns false if mask size does
7732   /// not match vector size.
7733   static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734                              bool IsStrict) {
7735     int Limit = Mask.size();
7736     int VF = VecTy->getNumElements();
7737     int Index = -1;
7738     if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7739       return true;
7740     if (!IsStrict) {
7741       // Consider extract subvector starting from index 0.
7742       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7743           Index == 0)
7744         return true;
7745       // All VF-size submasks are identity (e.g.
7746       // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747       if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7748             ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7749             return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7750                    ShuffleVectorInst::isIdentityMask(Slice, VF);
7751           }))
7752         return true;
7753     }
7754     return false;
7755   }
7756 
7757   /// Tries to combine 2 different masks into single one.
7758   /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759   /// change the size of the vector, \p LocalVF is the original size of the
7760   /// shuffled vector.
7761   static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762                            ArrayRef<int> ExtMask) {
7763     unsigned VF = Mask.size();
7764     SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765     for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766       if (ExtMask[I] == PoisonMaskElem)
7767         continue;
7768       int MaskedIdx = Mask[ExtMask[I] % VF];
7769       NewMask[I] =
7770           MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771     }
7772     Mask.swap(NewMask);
7773   }
7774 
7775   /// Looks through shuffles trying to reduce final number of shuffles in the
7776   /// code. The function looks through the previously emitted shuffle
7777   /// instructions and properly mark indices in mask as undef.
7778   /// For example, given the code
7779   /// \code
7780   /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781   /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782   /// \endcode
7783   /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7784   /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7785   /// <0, 1, 2, 3> for the shuffle.
7786   /// If 2 operands are of different size, the smallest one will be resized and
7787   /// the mask recalculated properly.
7788   /// For example, given the code
7789   /// \code
7790   /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791   /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792   /// \endcode
7793   /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7794   /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7795   /// <0, 1, 2, 3> for the shuffle.
7796   /// So, it tries to transform permutations to simple vector merge, if
7797   /// possible.
7798   /// \param V The input vector which must be shuffled using the given \p Mask.
7799   /// If the better candidate is found, \p V is set to this best candidate
7800   /// vector.
7801   /// \param Mask The input mask for the shuffle. If the best candidate is found
7802   /// during looking-through-shuffles attempt, it is updated accordingly.
7803   /// \param SinglePermute true if the shuffle operation is originally a
7804   /// single-value-permutation. In this case the look-through-shuffles procedure
7805   /// may look for resizing shuffles as the best candidates.
7806   /// \return true if the shuffle results in the non-resizing identity shuffle
7807   /// (and thus can be ignored), false - otherwise.
7808   static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809                                   bool SinglePermute) {
7810     Value *Op = V;
7811     ShuffleVectorInst *IdentityOp = nullptr;
7812     SmallVector<int> IdentityMask;
7813     while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7814       // Exit if not a fixed vector type or changing size shuffle.
7815       auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7816       if (!SVTy)
7817         break;
7818       // Remember the identity or broadcast mask, if it is not a resizing
7819       // shuffle. If no better candidates are found, this Op and Mask will be
7820       // used in the final shuffle.
7821       if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7822         if (!IdentityOp || !SinglePermute ||
7823             (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7824              !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7825                                                     IdentityMask.size()))) {
7826           IdentityOp = SV;
7827           // Store current mask in the IdentityMask so later we did not lost
7828           // this info if IdentityOp is selected as the best candidate for the
7829           // permutation.
7830           IdentityMask.assign(Mask);
7831         }
7832       }
7833       // Remember the broadcast mask. If no better candidates are found, this Op
7834       // and Mask will be used in the final shuffle.
7835       // Zero splat can be used as identity too, since it might be used with
7836       // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837       // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7838       // expensive, the analysis founds out, that the source vector is just a
7839       // broadcast, this original mask can be transformed to identity mask <0,
7840       // 1, 2, 3>.
7841       // \code
7842       // %0 = shuffle %v, poison, zeroinitalizer
7843       // %res = shuffle %0, poison, <3, 1, 2, 0>
7844       // \endcode
7845       // may be transformed to
7846       // \code
7847       // %0 = shuffle %v, poison, zeroinitalizer
7848       // %res = shuffle %0, poison, <0, 1, 2, 3>
7849       // \endcode
7850       if (SV->isZeroEltSplat()) {
7851         IdentityOp = SV;
7852         IdentityMask.assign(Mask);
7853       }
7854       int LocalVF = Mask.size();
7855       if (auto *SVOpTy =
7856               dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7857         LocalVF = SVOpTy->getNumElements();
7858       SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859       for (auto [Idx, I] : enumerate(Mask)) {
7860         if (I == PoisonMaskElem ||
7861             static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862           continue;
7863         ExtMask[Idx] = SV->getMaskValue(I);
7864       }
7865       bool IsOp1Undef =
7866           isUndefVector(SV->getOperand(0),
7867                         buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7868               .all();
7869       bool IsOp2Undef =
7870           isUndefVector(SV->getOperand(1),
7871                         buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7872               .all();
7873       if (!IsOp1Undef && !IsOp2Undef) {
7874         // Update mask and mark undef elems.
7875         for (int &I : Mask) {
7876           if (I == PoisonMaskElem)
7877             continue;
7878           if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7879               PoisonMaskElem)
7880             I = PoisonMaskElem;
7881         }
7882         break;
7883       }
7884       SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7885                                    SV->getShuffleMask().end());
7886       combineMasks(LocalVF, ShuffleMask, Mask);
7887       Mask.swap(ShuffleMask);
7888       if (IsOp2Undef)
7889         Op = SV->getOperand(0);
7890       else
7891         Op = SV->getOperand(1);
7892     }
7893     if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7894         !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7895         ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7896       if (IdentityOp) {
7897         V = IdentityOp;
7898         assert(Mask.size() == IdentityMask.size() &&
7899                "Expected masks of same sizes.");
7900         // Clear known poison elements.
7901         for (auto [I, Idx] : enumerate(Mask))
7902           if (Idx == PoisonMaskElem)
7903             IdentityMask[I] = PoisonMaskElem;
7904         Mask.swap(IdentityMask);
7905         auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7906         return SinglePermute &&
7907                (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7908                                /*IsStrict=*/true) ||
7909                 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7910                  Shuffle->isZeroEltSplat() &&
7911                  ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7912       }
7913       V = Op;
7914       return false;
7915     }
7916     V = Op;
7917     return true;
7918   }
7919 
7920   /// Smart shuffle instruction emission, walks through shuffles trees and
7921   /// tries to find the best matching vector for the actual shuffle
7922   /// instruction.
7923   template <typename T, typename ShuffleBuilderTy>
7924   static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7925                          ShuffleBuilderTy &Builder) {
7926     assert(V1 && "Expected at least one vector value.");
7927     if (V2)
7928       Builder.resizeToMatch(V1, V2);
7929     int VF = Mask.size();
7930     if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7931       VF = FTy->getNumElements();
7932     if (V2 &&
7933         !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7934       // Peek through shuffles.
7935       Value *Op1 = V1;
7936       Value *Op2 = V2;
7937       int VF =
7938           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7939       SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7940       SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7941       for (int I = 0, E = Mask.size(); I < E; ++I) {
7942         if (Mask[I] < VF)
7943           CombinedMask1[I] = Mask[I];
7944         else
7945           CombinedMask2[I] = Mask[I] - VF;
7946       }
7947       Value *PrevOp1;
7948       Value *PrevOp2;
7949       do {
7950         PrevOp1 = Op1;
7951         PrevOp2 = Op2;
7952         (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7953         (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7954         // Check if we have 2 resizing shuffles - need to peek through operands
7955         // again.
7956         if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7957           if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7958             SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7959             for (auto [Idx, I] : enumerate(CombinedMask1)) {
7960                 if (I == PoisonMaskElem)
7961                 continue;
7962                 ExtMask1[Idx] = SV1->getMaskValue(I);
7963             }
7964             SmallBitVector UseMask1 = buildUseMask(
7965                 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7966                     ->getNumElements(),
7967                 ExtMask1, UseMask::SecondArg);
7968             SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7969             for (auto [Idx, I] : enumerate(CombinedMask2)) {
7970                 if (I == PoisonMaskElem)
7971                 continue;
7972                 ExtMask2[Idx] = SV2->getMaskValue(I);
7973             }
7974             SmallBitVector UseMask2 = buildUseMask(
7975                 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7976                     ->getNumElements(),
7977                 ExtMask2, UseMask::SecondArg);
7978             if (SV1->getOperand(0)->getType() ==
7979                     SV2->getOperand(0)->getType() &&
7980                 SV1->getOperand(0)->getType() != SV1->getType() &&
7981                 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7982                 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7983               Op1 = SV1->getOperand(0);
7984               Op2 = SV2->getOperand(0);
7985               SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7986                                             SV1->getShuffleMask().end());
7987               int LocalVF = ShuffleMask1.size();
7988               if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7989                 LocalVF = FTy->getNumElements();
7990               combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7991               CombinedMask1.swap(ShuffleMask1);
7992               SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7993                                             SV2->getShuffleMask().end());
7994               LocalVF = ShuffleMask2.size();
7995               if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7996                 LocalVF = FTy->getNumElements();
7997               combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7998               CombinedMask2.swap(ShuffleMask2);
7999             }
8000           }
8001       } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002       Builder.resizeToMatch(Op1, Op2);
8003       VF = std::max(cast<VectorType>(Op1->getType())
8004                         ->getElementCount()
8005                         .getKnownMinValue(),
8006                     cast<VectorType>(Op2->getType())
8007                         ->getElementCount()
8008                         .getKnownMinValue());
8009       for (int I = 0, E = Mask.size(); I < E; ++I) {
8010         if (CombinedMask2[I] != PoisonMaskElem) {
8011           assert(CombinedMask1[I] == PoisonMaskElem &&
8012                  "Expected undefined mask element");
8013           CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8014         }
8015       }
8016       if (Op1 == Op2 &&
8017           (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
8018            (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
8019             isa<ShuffleVectorInst>(Op1) &&
8020             cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8021                 ArrayRef(CombinedMask1))))
8022         return Builder.createIdentity(Op1);
8023       return Builder.createShuffleVector(
8024           Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
8025           CombinedMask1);
8026     }
8027     if (isa<PoisonValue>(V1))
8028       return Builder.createPoison(
8029           cast<VectorType>(V1->getType())->getElementType(), Mask.size());
8030     SmallVector<int> NewMask(Mask.begin(), Mask.end());
8031     bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
8032     assert(V1 && "Expected non-null value after looking through shuffles.");
8033 
8034     if (!IsIdentity)
8035       return Builder.createShuffleVector(V1, NewMask);
8036     return Builder.createIdentity(V1);
8037   }
8038 };
8039 } // namespace
8040 
8041 /// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042 /// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
8043 /// subvector pattern.
8044 static InstructionCost
8045 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046                VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048                int Index = 0, VectorType *SubTp = nullptr,
8049                ArrayRef<const Value *> Args = std::nullopt) {
8050   if (Kind != TTI::SK_PermuteTwoSrc)
8051     return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052   int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053   int NumSubElts;
8054   if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055                              Mask, NumSrcElts, NumSubElts, Index)) {
8056     if (Index + NumSubElts > NumSrcElts &&
8057         Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058       return TTI.getShuffleCost(
8059           TTI::SK_InsertSubvector,
8060           getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8061           TTI::TCK_RecipThroughput, Index, Tp);
8062   }
8063   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064 }
8065 
8066 /// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8067 static std::pair<InstructionCost, InstructionCost>
8068 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070             Type *ScalarTy, VectorType *VecTy) {
8071   InstructionCost ScalarCost = 0;
8072   InstructionCost VecCost = 0;
8073   // Here we differentiate two cases: (1) when Ptrs represent a regular
8074   // vectorization tree node (as they are pointer arguments of scattered
8075   // loads) or (2) when Ptrs are the arguments of loads or stores being
8076   // vectorized as plane wide unit-stride load/store since all the
8077   // loads/stores are known to be from/to adjacent locations.
8078   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079     // Case 2: estimate costs for pointer related costs when vectorizing to
8080     // a wide load/store.
8081     // Scalar cost is estimated as a set of pointers with known relationship
8082     // between them.
8083     // For vector code we will use BasePtr as argument for the wide load/store
8084     // but we also need to account all the instructions which are going to
8085     // stay in vectorized code due to uses outside of these scalar
8086     // loads/stores.
8087     ScalarCost = TTI.getPointersChainCost(
8088         Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8089         CostKind);
8090 
8091     SmallVector<const Value *> PtrsRetainedInVecCode;
8092     for (Value *V : Ptrs) {
8093       if (V == BasePtr) {
8094         PtrsRetainedInVecCode.push_back(V);
8095         continue;
8096       }
8097       auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8098       // For simplicity assume Ptr to stay in vectorized code if it's not a
8099       // GEP instruction. We don't care since it's cost considered free.
8100       // TODO: We should check for any uses outside of vectorizable tree
8101       // rather than just single use.
8102       if (!Ptr || !Ptr->hasOneUse())
8103         PtrsRetainedInVecCode.push_back(V);
8104     }
8105 
8106     if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107       // If all pointers stay in vectorized code then we don't have
8108       // any savings on that.
8109       return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8110     }
8111     VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8112                                        TTI::PointersChainInfo::getKnownStride(),
8113                                        VecTy, CostKind);
8114   } else {
8115     // Case 1: Ptrs are the arguments of loads that we are going to transform
8116     // into masked gather load intrinsic.
8117     // All the scalar GEPs will be removed as a result of vectorization.
8118     // For any external uses of some lanes extract element instructions will
8119     // be generated (which cost is estimated separately).
8120     TTI::PointersChainInfo PtrsInfo =
8121         all_of(Ptrs,
8122                [](const Value *V) {
8123                  auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8124                  return Ptr && !Ptr->hasAllConstantIndices();
8125                })
8126             ? TTI::PointersChainInfo::getUnknownStride()
8127             : TTI::PointersChainInfo::getKnownStride();
8128 
8129     ScalarCost =
8130         TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8131     auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8132     if (!BaseGEP) {
8133       auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8134       if (It != Ptrs.end())
8135         BaseGEP = cast<GEPOperator>(*It);
8136     }
8137     if (BaseGEP) {
8138       SmallVector<const Value *> Indices(BaseGEP->indices());
8139       VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8140                                BaseGEP->getPointerOperand(), Indices, VecTy,
8141                                CostKind);
8142     }
8143   }
8144 
8145   return std::make_pair(ScalarCost, VecCost);
8146 }
8147 
8148 void BoUpSLP::transformNodes() {
8149   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150   for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151     TreeEntry &E = *TE;
8152     switch (E.getOpcode()) {
8153     case Instruction::Load: {
8154       // No need to reorder masked gather loads, just reorder the scalar
8155       // operands.
8156       if (E.State != TreeEntry::Vectorize)
8157         break;
8158       Type *ScalarTy = E.getMainOp()->getType();
8159       auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8160       Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8161       // Check if profitable to represent consecutive load + reverse as strided
8162       // load with stride -1.
8163       if (isReverseOrder(E.ReorderIndices) &&
8164           TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8165         SmallVector<int> Mask;
8166         inversePermutation(E.ReorderIndices, Mask);
8167         auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8168         InstructionCost OriginalVecCost =
8169             TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8170                                  BaseLI->getPointerAddressSpace(), CostKind,
8171                                  TTI::OperandValueInfo()) +
8172             ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8173         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8175             /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8176         if (StridedCost < OriginalVecCost)
8177           // Strided load is more profitable than consecutive load + reverse -
8178           // transform the node to strided load.
8179           E.State = TreeEntry::StridedVectorize;
8180       }
8181       break;
8182     }
8183     case Instruction::Store: {
8184       Type *ScalarTy =
8185           cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8186       auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8187       Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8188       // Check if profitable to represent consecutive load + reverse as strided
8189       // load with stride -1.
8190       if (isReverseOrder(E.ReorderIndices) &&
8191           TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8192         SmallVector<int> Mask;
8193         inversePermutation(E.ReorderIndices, Mask);
8194         auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8195         InstructionCost OriginalVecCost =
8196             TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8197                                  BaseSI->getPointerAddressSpace(), CostKind,
8198                                  TTI::OperandValueInfo()) +
8199             ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8200         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201             Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8202             /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8203         if (StridedCost < OriginalVecCost)
8204           // Strided load is more profitable than consecutive load + reverse -
8205           // transform the node to strided load.
8206           E.State = TreeEntry::StridedVectorize;
8207       }
8208       break;
8209     }
8210     default:
8211       break;
8212     }
8213   }
8214 }
8215 
8216 /// Merges shuffle masks and emits final shuffle instruction, if required. It
8217 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
8218 /// when the actual shuffle instruction is generated only if this is actually
8219 /// required. Otherwise, the shuffle instruction emission is delayed till the
8220 /// end of the process, to reduce the number of emitted instructions and further
8221 /// analysis/transformations.
8222 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223   bool IsFinalized = false;
8224   SmallVector<int> CommonMask;
8225   SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226   Type *ScalarTy = nullptr;
8227   const TargetTransformInfo &TTI;
8228   InstructionCost Cost = 0;
8229   SmallDenseSet<Value *> VectorizedVals;
8230   BoUpSLP &R;
8231   SmallPtrSetImpl<Value *> &CheckedExtracts;
8232   constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233   /// While set, still trying to estimate the cost for the same nodes and we
8234   /// can delay actual cost estimation (virtual shuffle instruction emission).
8235   /// May help better estimate the cost if same nodes must be permuted + allows
8236   /// to move most of the long shuffles cost estimation to TTI.
8237   bool SameNodesEstimated = true;
8238 
8239   static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240     if (Ty->getScalarType()->isPointerTy()) {
8241       Constant *Res = ConstantExpr::getIntToPtr(
8242           ConstantInt::getAllOnesValue(
8243               IntegerType::get(Ty->getContext(),
8244                                DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8245           Ty->getScalarType());
8246       if (auto *VTy = dyn_cast<VectorType>(Ty))
8247         Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8248       return Res;
8249     }
8250     return Constant::getAllOnesValue(Ty);
8251   }
8252 
8253   InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254     if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8255       return TTI::TCC_Free;
8256     auto *VecTy = getWidenedType(ScalarTy, VL.size());
8257     InstructionCost GatherCost = 0;
8258     SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259     // Improve gather cost for gather of loads, if we can group some of the
8260     // loads into vector loads.
8261     InstructionsState S = getSameOpcode(VL, *R.TLI);
8262     const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8263     unsigned MinVF = R.getMinVF(2 * Sz);
8264     if (VL.size() > 2 &&
8265         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8266          (InVectors.empty() &&
8267           any_of(seq<unsigned>(0, VL.size() / MinVF),
8268                  [&](unsigned Idx) {
8269                    ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8270                    InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8271                    return S.getOpcode() == Instruction::Load &&
8272                           !S.isAltShuffle();
8273                  }))) &&
8274         !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8275         !isSplat(Gathers)) {
8276       InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8277       SetVector<Value *> VectorizedLoads;
8278       SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8279       SmallVector<unsigned> ScatterVectorized;
8280       unsigned StartIdx = 0;
8281       unsigned VF = VL.size() / 2;
8282       for (; VF >= MinVF; VF /= 2) {
8283         for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8284              Cnt += VF) {
8285           ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8286           if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8287             InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8288             if (SliceS.getOpcode() != Instruction::Load ||
8289                 SliceS.isAltShuffle())
8290               continue;
8291           }
8292           if (!VectorizedLoads.count(Slice.front()) &&
8293               !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8294             SmallVector<Value *> PointerOps;
8295             OrdersType CurrentOrder;
8296             LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8297                                                 CurrentOrder, PointerOps);
8298             switch (LS) {
8299             case LoadsState::Vectorize:
8300             case LoadsState::ScatterVectorize:
8301             case LoadsState::StridedVectorize:
8302               // Mark the vectorized loads so that we don't vectorize them
8303               // again.
8304               // TODO: better handling of loads with reorders.
8305               if (((LS == LoadsState::Vectorize ||
8306                     LS == LoadsState::StridedVectorize) &&
8307                    CurrentOrder.empty()) ||
8308                   (LS == LoadsState::StridedVectorize &&
8309                    isReverseOrder(CurrentOrder)))
8310                 VectorizedStarts.emplace_back(Cnt, LS);
8311               else
8312                 ScatterVectorized.push_back(Cnt);
8313               VectorizedLoads.insert(Slice.begin(), Slice.end());
8314               // If we vectorized initial block, no need to try to vectorize
8315               // it again.
8316               if (Cnt == StartIdx)
8317                 StartIdx += VF;
8318               break;
8319             case LoadsState::Gather:
8320               break;
8321             }
8322           }
8323         }
8324         // Check if the whole array was vectorized already - exit.
8325         if (StartIdx >= VL.size())
8326           break;
8327         // Found vectorizable parts - exit.
8328         if (!VectorizedLoads.empty())
8329           break;
8330       }
8331       if (!VectorizedLoads.empty()) {
8332         unsigned NumParts = TTI.getNumberOfParts(VecTy);
8333         bool NeedInsertSubvectorAnalysis =
8334             !NumParts || (VL.size() / VF) > NumParts;
8335         // Get the cost for gathered loads.
8336         for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8337           if (VectorizedLoads.contains(VL[I]))
8338             continue;
8339           GatherCost +=
8340               getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8341         }
8342         // Exclude potentially vectorized loads from list of gathered
8343         // scalars.
8344         Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8345         // The cost for vectorized loads.
8346         InstructionCost ScalarsCost = 0;
8347         for (Value *V : VectorizedLoads) {
8348           auto *LI = cast<LoadInst>(V);
8349           ScalarsCost +=
8350               TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8351                                   LI->getAlign(), LI->getPointerAddressSpace(),
8352                                   CostKind, TTI::OperandValueInfo(), LI);
8353         }
8354         auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
8355         for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8356           auto *LI = cast<LoadInst>(VL[P.first]);
8357           Align Alignment = LI->getAlign();
8358           GatherCost +=
8359               P.second == LoadsState::Vectorize
8360                   ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8361                                         LI->getPointerAddressSpace(), CostKind,
8362                                         TTI::OperandValueInfo(), LI)
8363                   : TTI.getStridedMemoryOpCost(
8364                         Instruction::Load, LoadTy, LI->getPointerOperand(),
8365                         /*VariableMask=*/false, Alignment, CostKind, LI);
8366           // Estimate GEP cost.
8367           SmallVector<Value *> PointerOps(VF);
8368           for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8369             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8370           auto [ScalarGEPCost, VectorGEPCost] =
8371               getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8372                           Instruction::Load, CostKind, LI->getType(), LoadTy);
8373           GatherCost += VectorGEPCost - ScalarGEPCost;
8374         }
8375         for (unsigned P : ScatterVectorized) {
8376           auto *LI0 = cast<LoadInst>(VL[P]);
8377           ArrayRef<Value *> Slice = VL.slice(P, VF);
8378           Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8379           GatherCost += TTI.getGatherScatterOpCost(
8380               Instruction::Load, LoadTy, LI0->getPointerOperand(),
8381               /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8382           // Estimate GEP cost.
8383           SmallVector<Value *> PointerOps(VF);
8384           for (auto [I, V] : enumerate(Slice))
8385             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8386           OrdersType Order;
8387           if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8388                               Order)) {
8389             // TODO: improve checks if GEPs can be vectorized.
8390             Value *Ptr0 = PointerOps.front();
8391             Type *ScalarTy = Ptr0->getType();
8392             auto *VecTy = getWidenedType(ScalarTy, VF);
8393             auto [ScalarGEPCost, VectorGEPCost] =
8394                 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8395                             CostKind, ScalarTy, VecTy);
8396             GatherCost += VectorGEPCost - ScalarGEPCost;
8397             if (!Order.empty()) {
8398               SmallVector<int> Mask;
8399               inversePermutation(Order, Mask);
8400               GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8401                                              VecTy, Mask, CostKind);
8402             }
8403           } else {
8404             GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8405                                           PointerOps.front()->getType());
8406           }
8407         }
8408         if (NeedInsertSubvectorAnalysis) {
8409           // Add the cost for the subvectors insert.
8410           SmallVector<int> ShuffleMask(VL.size());
8411           for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8412             for (unsigned Idx : seq<unsigned>(0, E))
8413               ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8414             GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8415                                              ShuffleMask, CostKind, I, LoadTy);
8416           }
8417         }
8418         GatherCost -= ScalarsCost;
8419       }
8420       GatherCost = std::min(BaseCost, GatherCost);
8421     } else if (!Root && isSplat(VL)) {
8422       // Found the broadcasting of the single scalar, calculate the cost as
8423       // the broadcast.
8424       const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8425       assert(It != VL.end() && "Expected at least one non-undef value.");
8426       // Add broadcast for non-identity shuffle only.
8427       bool NeedShuffle =
8428           count(VL, *It) > 1 &&
8429           (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8430       if (!NeedShuffle)
8431         return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8432                                       CostKind, std::distance(VL.begin(), It),
8433                                       PoisonValue::get(VecTy), *It);
8434 
8435       SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8436       transform(VL, ShuffleMask.begin(), [](Value *V) {
8437         return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8438       });
8439       InstructionCost InsertCost =
8440           TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8441                                  PoisonValue::get(VecTy), *It);
8442       return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
8443                                              VecTy, ShuffleMask, CostKind,
8444                                              /*Index=*/0, /*SubTp=*/nullptr,
8445                                              /*Args=*/*It);
8446     }
8447     return GatherCost +
8448            (all_of(Gathers, IsaPred<UndefValue>)
8449                 ? TTI::TCC_Free
8450                 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8451                                   ScalarTy));
8452   };
8453 
8454   /// Compute the cost of creating a vector containing the extracted values from
8455   /// \p VL.
8456   InstructionCost
8457   computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8458                      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8459                      unsigned NumParts) {
8460     assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8461     unsigned NumElts =
8462         std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8463           auto *EE = dyn_cast<ExtractElementInst>(V);
8464           if (!EE)
8465             return Sz;
8466           auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8467           if (!VecTy)
8468             return Sz;
8469           return std::max(Sz, VecTy->getNumElements());
8470         });
8471     // FIXME: this must be moved to TTI for better estimation.
8472     unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
8473     auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8474                                         SmallVectorImpl<unsigned> &Indices)
8475         -> std::optional<TTI::ShuffleKind> {
8476       if (NumElts <= EltsPerVector)
8477         return std::nullopt;
8478       int OffsetReg0 =
8479           alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8480                                     [](int S, int I) {
8481                                       if (I == PoisonMaskElem)
8482                                         return S;
8483                                       return std::min(S, I);
8484                                     }),
8485                     EltsPerVector);
8486       int OffsetReg1 = OffsetReg0;
8487       DenseSet<int> RegIndices;
8488       // Check that if trying to permute same single/2 input vectors.
8489       TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8490       int FirstRegId = -1;
8491       Indices.assign(1, OffsetReg0);
8492       for (auto [Pos, I] : enumerate(Mask)) {
8493         if (I == PoisonMaskElem)
8494           continue;
8495         int Idx = I - OffsetReg0;
8496         int RegId =
8497             (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8498         if (FirstRegId < 0)
8499           FirstRegId = RegId;
8500         RegIndices.insert(RegId);
8501         if (RegIndices.size() > 2)
8502           return std::nullopt;
8503         if (RegIndices.size() == 2) {
8504           ShuffleKind = TTI::SK_PermuteTwoSrc;
8505           if (Indices.size() == 1) {
8506             OffsetReg1 = alignDown(
8507                 std::accumulate(
8508                     std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8509                     [&](int S, int I) {
8510                       if (I == PoisonMaskElem)
8511                         return S;
8512                       int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8513                                   ((I - OffsetReg0) % NumElts) / EltsPerVector;
8514                       if (RegId == FirstRegId)
8515                         return S;
8516                       return std::min(S, I);
8517                     }),
8518                 EltsPerVector);
8519             Indices.push_back(OffsetReg1 % NumElts);
8520           }
8521           Idx = I - OffsetReg1;
8522         }
8523         I = (Idx % NumElts) % EltsPerVector +
8524             (RegId == FirstRegId ? 0 : EltsPerVector);
8525       }
8526       return ShuffleKind;
8527     };
8528     InstructionCost Cost = 0;
8529 
8530     // Process extracts in blocks of EltsPerVector to check if the source vector
8531     // operand can be re-used directly. If not, add the cost of creating a
8532     // shuffle to extract the values into a vector register.
8533     for (unsigned Part : seq<unsigned>(NumParts)) {
8534       if (!ShuffleKinds[Part])
8535         continue;
8536       ArrayRef<int> MaskSlice = Mask.slice(
8537           Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8538       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8539       copy(MaskSlice, SubMask.begin());
8540       SmallVector<unsigned, 2> Indices;
8541       std::optional<TTI::ShuffleKind> RegShuffleKind =
8542           CheckPerRegistersShuffle(SubMask, Indices);
8543       if (!RegShuffleKind) {
8544         if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8545             !ShuffleVectorInst::isIdentityMask(
8546                 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8547           Cost +=
8548               ::getShuffleCost(TTI, *ShuffleKinds[Part],
8549                                getWidenedType(ScalarTy, NumElts), MaskSlice);
8550         continue;
8551       }
8552       if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8553           !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8554         Cost +=
8555             ::getShuffleCost(TTI, *RegShuffleKind,
8556                              getWidenedType(ScalarTy, EltsPerVector), SubMask);
8557       }
8558       for (unsigned Idx : Indices) {
8559         assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8560                "SK_ExtractSubvector index out of range");
8561         Cost += ::getShuffleCost(
8562             TTI, TTI::SK_ExtractSubvector,
8563             getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
8564             std::nullopt, CostKind, Idx,
8565             getWidenedType(ScalarTy, EltsPerVector));
8566       }
8567       // Second attempt to check, if just a permute is better estimated than
8568       // subvector extract.
8569       SubMask.assign(NumElts, PoisonMaskElem);
8570       copy(MaskSlice, SubMask.begin());
8571       InstructionCost OriginalCost = ::getShuffleCost(
8572           TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
8573       if (OriginalCost < Cost)
8574         Cost = OriginalCost;
8575     }
8576     return Cost;
8577   }
8578   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8579   /// shuffle emission.
8580   static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581                                         ArrayRef<int> Mask) {
8582     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583       if (Mask[Idx] != PoisonMaskElem)
8584         CommonMask[Idx] = Idx;
8585   }
8586   /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8587   /// mask \p Mask, register number \p Part, that includes \p SliceSize
8588   /// elements.
8589   void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8590                                 ArrayRef<int> Mask, unsigned Part,
8591                                 unsigned SliceSize) {
8592     if (SameNodesEstimated) {
8593       // Delay the cost estimation if the same nodes are reshuffling.
8594       // If we already requested the cost of reshuffling of E1 and E2 before, no
8595       // need to estimate another cost with the sub-Mask, instead include this
8596       // sub-Mask into the CommonMask to estimate it later and avoid double cost
8597       // estimation.
8598       if ((InVectors.size() == 2 &&
8599            InVectors.front().get<const TreeEntry *>() == &E1 &&
8600            InVectors.back().get<const TreeEntry *>() == E2) ||
8601           (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8602         unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8603         assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8604                       [](int Idx) { return Idx == PoisonMaskElem; }) &&
8605                "Expected all poisoned elements.");
8606         ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8607         copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8608         return;
8609       }
8610       // Found non-matching nodes - need to estimate the cost for the matched
8611       // and transform mask.
8612       Cost += createShuffle(InVectors.front(),
8613                             InVectors.size() == 1 ? nullptr : InVectors.back(),
8614                             CommonMask);
8615       transformMaskAfterShuffle(CommonMask, CommonMask);
8616     }
8617     SameNodesEstimated = false;
8618     if (!E2 && InVectors.size() == 1) {
8619       unsigned VF = E1.getVectorFactor();
8620       if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8621         VF = std::max(VF,
8622                       cast<FixedVectorType>(V1->getType())->getNumElements());
8623       } else {
8624         const auto *E = InVectors.front().get<const TreeEntry *>();
8625         VF = std::max(VF, E->getVectorFactor());
8626       }
8627       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8628         if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8629           CommonMask[Idx] = Mask[Idx] + VF;
8630       Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8631       transformMaskAfterShuffle(CommonMask, CommonMask);
8632     } else {
8633       Cost += createShuffle(&E1, E2, Mask);
8634       transformMaskAfterShuffle(CommonMask, Mask);
8635     }
8636   }
8637 
8638   class ShuffleCostBuilder {
8639     const TargetTransformInfo &TTI;
8640 
8641     static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642       int Index = -1;
8643       return Mask.empty() ||
8644              (VF == Mask.size() &&
8645               ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8646              (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8647               Index == 0);
8648     }
8649 
8650   public:
8651     ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652     ~ShuffleCostBuilder() = default;
8653     InstructionCost createShuffleVector(Value *V1, Value *,
8654                                         ArrayRef<int> Mask) const {
8655       // Empty mask or identity mask are free.
8656       unsigned VF =
8657           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8658       if (isEmptyOrIdentity(Mask, VF))
8659         return TTI::TCC_Free;
8660       return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8661                               cast<VectorType>(V1->getType()), Mask);
8662     }
8663     InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664       // Empty mask or identity mask are free.
8665       unsigned VF =
8666           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8667       if (isEmptyOrIdentity(Mask, VF))
8668         return TTI::TCC_Free;
8669       return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
8670                                 cast<VectorType>(V1->getType()), Mask);
8671     }
8672     InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8673     InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674       return TTI::TCC_Free;
8675     }
8676     void resizeToMatch(Value *&, Value *&) const {}
8677   };
8678 
8679   /// Smart shuffle instruction emission, walks through shuffles trees and
8680   /// tries to find the best matching vector for the actual shuffle
8681   /// instruction.
8682   InstructionCost
8683   createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8684                 const PointerUnion<Value *, const TreeEntry *> &P2,
8685                 ArrayRef<int> Mask) {
8686     ShuffleCostBuilder Builder(TTI);
8687     SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8688     Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8689     unsigned CommonVF = Mask.size();
8690     InstructionCost ExtraCost = 0;
8691     auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8692                                         unsigned VF) -> InstructionCost {
8693       if (E.isGather() && allConstant(E.Scalars))
8694         return TTI::TCC_Free;
8695       Type *EScalarTy = E.Scalars.front()->getType();
8696       bool IsSigned = true;
8697       if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8698         EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8699         IsSigned = It->second.second;
8700       }
8701       if (EScalarTy != ScalarTy) {
8702         unsigned CastOpcode = Instruction::Trunc;
8703         unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8704         unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8705         if (DstSz > SrcSz)
8706           CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8707         return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
8708                                     getWidenedType(EScalarTy, VF),
8709                                     TTI::CastContextHint::None, CostKind);
8710       }
8711       return TTI::TCC_Free;
8712     };
8713     auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8714       if (isa<Constant>(V))
8715         return TTI::TCC_Free;
8716       auto *VecTy = cast<VectorType>(V->getType());
8717       Type *EScalarTy = VecTy->getElementType();
8718       if (EScalarTy != ScalarTy) {
8719         bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8720         unsigned CastOpcode = Instruction::Trunc;
8721         unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8722         unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8723         if (DstSz > SrcSz)
8724           CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8725         return TTI.getCastInstrCost(
8726             CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8727             VecTy, TTI::CastContextHint::None, CostKind);
8728       }
8729       return TTI::TCC_Free;
8730     };
8731     if (!V1 && !V2 && !P2.isNull()) {
8732       // Shuffle 2 entry nodes.
8733       const TreeEntry *E = P1.get<const TreeEntry *>();
8734       unsigned VF = E->getVectorFactor();
8735       const TreeEntry *E2 = P2.get<const TreeEntry *>();
8736       CommonVF = std::max(VF, E2->getVectorFactor());
8737       assert(all_of(Mask,
8738                     [=](int Idx) {
8739                       return Idx < 2 * static_cast<int>(CommonVF);
8740                     }) &&
8741              "All elements in mask must be less than 2 * CommonVF.");
8742       if (E->Scalars.size() == E2->Scalars.size()) {
8743         SmallVector<int> EMask = E->getCommonMask();
8744         SmallVector<int> E2Mask = E2->getCommonMask();
8745         if (!EMask.empty() || !E2Mask.empty()) {
8746           for (int &Idx : CommonMask) {
8747             if (Idx == PoisonMaskElem)
8748               continue;
8749             if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8750               Idx = EMask[Idx];
8751             else if (Idx >= static_cast<int>(CommonVF))
8752               Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8753                     E->Scalars.size();
8754           }
8755         }
8756         CommonVF = E->Scalars.size();
8757         ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8758                      GetNodeMinBWAffectedCost(*E2, CommonVF);
8759       } else {
8760         ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8761                      GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8762       }
8763       V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8764       V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8765     } else if (!V1 && P2.isNull()) {
8766       // Shuffle single entry node.
8767       const TreeEntry *E = P1.get<const TreeEntry *>();
8768       unsigned VF = E->getVectorFactor();
8769       CommonVF = VF;
8770       assert(
8771           all_of(Mask,
8772                  [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8773           "All elements in mask must be less than CommonVF.");
8774       if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8775         SmallVector<int> EMask = E->getCommonMask();
8776         assert(!EMask.empty() && "Expected non-empty common mask.");
8777         for (int &Idx : CommonMask) {
8778           if (Idx != PoisonMaskElem)
8779             Idx = EMask[Idx];
8780         }
8781         CommonVF = E->Scalars.size();
8782       }
8783       ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8784       V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8785       // Not identity/broadcast? Try to see if the original vector is better.
8786       if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8787           CommonVF == CommonMask.size() &&
8788           any_of(enumerate(CommonMask),
8789                  [](const auto &&P) {
8790                    return P.value() != PoisonMaskElem &&
8791                           static_cast<unsigned>(P.value()) != P.index();
8792                  }) &&
8793           any_of(CommonMask,
8794                  [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8795         SmallVector<int> ReorderMask;
8796         inversePermutation(E->ReorderIndices, ReorderMask);
8797         ::addMask(CommonMask, ReorderMask);
8798       }
8799     } else if (V1 && P2.isNull()) {
8800       // Shuffle single vector.
8801       ExtraCost += GetValueMinBWAffectedCost(V1);
8802       CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8803       assert(
8804           all_of(Mask,
8805                  [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8806           "All elements in mask must be less than CommonVF.");
8807     } else if (V1 && !V2) {
8808       // Shuffle vector and tree node.
8809       unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8810       const TreeEntry *E2 = P2.get<const TreeEntry *>();
8811       CommonVF = std::max(VF, E2->getVectorFactor());
8812       assert(all_of(Mask,
8813                     [=](int Idx) {
8814                       return Idx < 2 * static_cast<int>(CommonVF);
8815                     }) &&
8816              "All elements in mask must be less than 2 * CommonVF.");
8817       if (E2->Scalars.size() == VF && VF != CommonVF) {
8818         SmallVector<int> E2Mask = E2->getCommonMask();
8819         assert(!E2Mask.empty() && "Expected non-empty common mask.");
8820         for (int &Idx : CommonMask) {
8821           if (Idx == PoisonMaskElem)
8822             continue;
8823           if (Idx >= static_cast<int>(CommonVF))
8824             Idx = E2Mask[Idx - CommonVF] + VF;
8825         }
8826         CommonVF = VF;
8827       }
8828       ExtraCost += GetValueMinBWAffectedCost(V1);
8829       V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8830       ExtraCost += GetNodeMinBWAffectedCost(
8831           *E2, std::min(CommonVF, E2->getVectorFactor()));
8832       V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8833     } else if (!V1 && V2) {
8834       // Shuffle vector and tree node.
8835       unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8836       const TreeEntry *E1 = P1.get<const TreeEntry *>();
8837       CommonVF = std::max(VF, E1->getVectorFactor());
8838       assert(all_of(Mask,
8839                     [=](int Idx) {
8840                       return Idx < 2 * static_cast<int>(CommonVF);
8841                     }) &&
8842              "All elements in mask must be less than 2 * CommonVF.");
8843       if (E1->Scalars.size() == VF && VF != CommonVF) {
8844         SmallVector<int> E1Mask = E1->getCommonMask();
8845         assert(!E1Mask.empty() && "Expected non-empty common mask.");
8846         for (int &Idx : CommonMask) {
8847           if (Idx == PoisonMaskElem)
8848             continue;
8849           if (Idx >= static_cast<int>(CommonVF))
8850             Idx = E1Mask[Idx - CommonVF] + VF;
8851           else
8852             Idx = E1Mask[Idx];
8853         }
8854         CommonVF = VF;
8855       }
8856       ExtraCost += GetNodeMinBWAffectedCost(
8857           *E1, std::min(CommonVF, E1->getVectorFactor()));
8858       V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8859       ExtraCost += GetValueMinBWAffectedCost(V2);
8860       V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8861     } else {
8862       assert(V1 && V2 && "Expected both vectors.");
8863       unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8864       CommonVF =
8865           std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8866       assert(all_of(Mask,
8867                     [=](int Idx) {
8868                       return Idx < 2 * static_cast<int>(CommonVF);
8869                     }) &&
8870              "All elements in mask must be less than 2 * CommonVF.");
8871       ExtraCost +=
8872           GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8873       if (V1->getType() != V2->getType()) {
8874         V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8875         V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8876       } else {
8877         if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8878           V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8879         if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8880           V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8881       }
8882     }
8883     InVectors.front() =
8884         Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8885     if (InVectors.size() == 2)
8886       InVectors.pop_back();
8887     return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8888                            V1, V2, CommonMask, Builder);
8889   }
8890 
8891 public:
8892   ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8893                        ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8894                        SmallPtrSetImpl<Value *> &CheckedExtracts)
8895       : ScalarTy(ScalarTy), TTI(TTI),
8896         VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8897         CheckedExtracts(CheckedExtracts) {}
8898   Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8899                         ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8900                         unsigned NumParts, bool &UseVecBaseAsInput) {
8901     UseVecBaseAsInput = false;
8902     if (Mask.empty())
8903       return nullptr;
8904     Value *VecBase = nullptr;
8905     ArrayRef<Value *> VL = E->Scalars;
8906     // If the resulting type is scalarized, do not adjust the cost.
8907     if (NumParts == VL.size())
8908       return nullptr;
8909     // Check if it can be considered reused if same extractelements were
8910     // vectorized already.
8911     bool PrevNodeFound = any_of(
8912         ArrayRef(R.VectorizableTree).take_front(E->Idx),
8913         [&](const std::unique_ptr<TreeEntry> &TE) {
8914           return ((!TE->isAltShuffle() &&
8915                    TE->getOpcode() == Instruction::ExtractElement) ||
8916                   TE->isGather()) &&
8917                  all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8918                    return VL.size() > Data.index() &&
8919                           (Mask[Data.index()] == PoisonMaskElem ||
8920                            isa<UndefValue>(VL[Data.index()]) ||
8921                            Data.value() == VL[Data.index()]);
8922                  });
8923         });
8924     SmallPtrSet<Value *, 4> UniqueBases;
8925     unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
8926     for (unsigned Part : seq<unsigned>(NumParts)) {
8927       unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
8928       ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
8929       for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
8930         // Ignore non-extractelement scalars.
8931         if (isa<UndefValue>(V) ||
8932             (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8933           continue;
8934         // If all users of instruction are going to be vectorized and this
8935         // instruction itself is not going to be vectorized, consider this
8936         // instruction as dead and remove its cost from the final cost of the
8937         // vectorized tree.
8938         // Also, avoid adjusting the cost for extractelements with multiple uses
8939         // in different graph entries.
8940         auto *EE = cast<ExtractElementInst>(V);
8941         VecBase = EE->getVectorOperand();
8942         UniqueBases.insert(VecBase);
8943         const TreeEntry *VE = R.getTreeEntry(V);
8944         if (!CheckedExtracts.insert(V).second ||
8945             !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8946             any_of(EE->users(),
8947                    [&](User *U) {
8948                      return isa<GetElementPtrInst>(U) &&
8949                             !R.areAllUsersVectorized(cast<Instruction>(U),
8950                                                      &VectorizedVals);
8951                    }) ||
8952             (VE && VE != E))
8953           continue;
8954         std::optional<unsigned> EEIdx = getExtractIndex(EE);
8955         if (!EEIdx)
8956           continue;
8957         unsigned Idx = *EEIdx;
8958         // Take credit for instruction that will become dead.
8959         if (EE->hasOneUse() || !PrevNodeFound) {
8960           Instruction *Ext = EE->user_back();
8961           if (isa<SExtInst, ZExtInst>(Ext) &&
8962               all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8963             // Use getExtractWithExtendCost() to calculate the cost of
8964             // extractelement/ext pair.
8965             Cost -=
8966                 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8967                                              EE->getVectorOperandType(), Idx);
8968             // Add back the cost of s|zext which is subtracted separately.
8969             Cost += TTI.getCastInstrCost(
8970                 Ext->getOpcode(), Ext->getType(), EE->getType(),
8971                 TTI::getCastContextHint(Ext), CostKind, Ext);
8972             continue;
8973           }
8974         }
8975         Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8976                                        CostKind, Idx);
8977       }
8978     }
8979     // Check that gather of extractelements can be represented as just a
8980     // shuffle of a single/two vectors the scalars are extracted from.
8981     // Found the bunch of extractelement instructions that must be gathered
8982     // into a vector and can be represented as a permutation elements in a
8983     // single input vector or of 2 input vectors.
8984     // Done for reused if same extractelements were vectorized already.
8985     if (!PrevNodeFound)
8986       Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8987     InVectors.assign(1, E);
8988     CommonMask.assign(Mask.begin(), Mask.end());
8989     transformMaskAfterShuffle(CommonMask, CommonMask);
8990     SameNodesEstimated = false;
8991     if (NumParts != 1 && UniqueBases.size() != 1) {
8992       UseVecBaseAsInput = true;
8993       VecBase =
8994           Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8995     }
8996     return VecBase;
8997   }
8998   /// Checks if the specified entry \p E needs to be delayed because of its
8999   /// dependency nodes.
9000   std::optional<InstructionCost>
9001   needToDelay(const TreeEntry *,
9002               ArrayRef<SmallVector<const TreeEntry *>>) const {
9003     // No need to delay the cost estimation during analysis.
9004     return std::nullopt;
9005   }
9006   void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007     if (&E1 == &E2) {
9008       assert(all_of(Mask,
9009                     [&](int Idx) {
9010                       return Idx < static_cast<int>(E1.getVectorFactor());
9011                     }) &&
9012              "Expected single vector shuffle mask.");
9013       add(E1, Mask);
9014       return;
9015     }
9016     if (InVectors.empty()) {
9017       CommonMask.assign(Mask.begin(), Mask.end());
9018       InVectors.assign({&E1, &E2});
9019       return;
9020     }
9021     assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9023     unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9024     if (NumParts == 0 || NumParts >= Mask.size())
9025       NumParts = 1;
9026     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9027     const auto *It =
9028         find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9029     unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9030     estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9031   }
9032   void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033     if (InVectors.empty()) {
9034       CommonMask.assign(Mask.begin(), Mask.end());
9035       InVectors.assign(1, &E1);
9036       return;
9037     }
9038     assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9040     unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9041     if (NumParts == 0 || NumParts >= Mask.size())
9042       NumParts = 1;
9043     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9044     const auto *It =
9045         find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9046     unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9047     estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9048     if (!SameNodesEstimated && InVectors.size() == 1)
9049       InVectors.emplace_back(&E1);
9050   }
9051   /// Adds 2 input vectors and the mask for their shuffling.
9052   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9053     // May come only for shuffling of 2 vectors with extractelements, already
9054     // handled in adjustExtracts.
9055     assert(InVectors.size() == 1 &&
9056            all_of(enumerate(CommonMask),
9057                   [&](auto P) {
9058                     if (P.value() == PoisonMaskElem)
9059                       return Mask[P.index()] == PoisonMaskElem;
9060                     auto *EI =
9061                         cast<ExtractElementInst>(InVectors.front()
9062                                                      .get<const TreeEntry *>()
9063                                                      ->Scalars[P.index()]);
9064                     return EI->getVectorOperand() == V1 ||
9065                            EI->getVectorOperand() == V2;
9066                   }) &&
9067            "Expected extractelement vectors.");
9068   }
9069   /// Adds another one input vector and the mask for the shuffling.
9070   void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9071     if (InVectors.empty()) {
9072       assert(CommonMask.empty() && !ForExtracts &&
9073              "Expected empty input mask/vectors.");
9074       CommonMask.assign(Mask.begin(), Mask.end());
9075       InVectors.assign(1, V1);
9076       return;
9077     }
9078     if (ForExtracts) {
9079       // No need to add vectors here, already handled them in adjustExtracts.
9080       assert(InVectors.size() == 1 &&
9081              InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9082              all_of(enumerate(CommonMask),
9083                     [&](auto P) {
9084                       Value *Scalar = InVectors.front()
9085                                           .get<const TreeEntry *>()
9086                                           ->Scalars[P.index()];
9087                       if (P.value() == PoisonMaskElem)
9088                         return P.value() == Mask[P.index()] ||
9089                                isa<UndefValue>(Scalar);
9090                       if (isa<Constant>(V1))
9091                         return true;
9092                       auto *EI = cast<ExtractElementInst>(Scalar);
9093                       return EI->getVectorOperand() == V1;
9094                     }) &&
9095              "Expected only tree entry for extractelement vectors.");
9096       return;
9097     }
9098     assert(!InVectors.empty() && !CommonMask.empty() &&
9099            "Expected only tree entries from extracts/reused buildvectors.");
9100     unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
9101     if (InVectors.size() == 2) {
9102       Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
9103       transformMaskAfterShuffle(CommonMask, CommonMask);
9104       VF = std::max<unsigned>(VF, CommonMask.size());
9105     } else if (const auto *InTE =
9106                    InVectors.front().dyn_cast<const TreeEntry *>()) {
9107       VF = std::max(VF, InTE->getVectorFactor());
9108     } else {
9109       VF = std::max(
9110           VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
9111                   ->getNumElements());
9112     }
9113     InVectors.push_back(V1);
9114     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9115       if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9116         CommonMask[Idx] = Mask[Idx] + VF;
9117   }
9118   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119                 Value *Root = nullptr) {
9120     Cost += getBuildVectorCost(VL, Root);
9121     if (!Root) {
9122       // FIXME: Need to find a way to avoid use of getNullValue here.
9123       SmallVector<Constant *> Vals;
9124       unsigned VF = VL.size();
9125       if (MaskVF != 0)
9126         VF = std::min(VF, MaskVF);
9127       for (Value *V : VL.take_front(VF)) {
9128         if (isa<UndefValue>(V)) {
9129           Vals.push_back(cast<Constant>(V));
9130           continue;
9131         }
9132         Vals.push_back(Constant::getNullValue(V->getType()));
9133       }
9134       return ConstantVector::get(Vals);
9135     }
9136     return ConstantVector::getSplat(
9137         ElementCount::getFixed(
9138             cast<FixedVectorType>(Root->getType())->getNumElements()),
9139         getAllOnesValue(*R.DL, ScalarTy));
9140   }
9141   InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
9142   /// Finalize emission of the shuffles.
9143   InstructionCost
9144   finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9145            function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9146     IsFinalized = true;
9147     if (Action) {
9148       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9149       if (InVectors.size() == 2)
9150         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
9151       else
9152         Cost += createShuffle(Vec, nullptr, CommonMask);
9153       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9154         if (CommonMask[Idx] != PoisonMaskElem)
9155           CommonMask[Idx] = Idx;
9156       assert(VF > 0 &&
9157              "Expected vector length for the final value before action.");
9158       Value *V = Vec.get<Value *>();
9159       Action(V, CommonMask);
9160       InVectors.front() = V;
9161     }
9162     ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
9163     if (CommonMask.empty()) {
9164       assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9165       return Cost;
9166     }
9167     return Cost +
9168            createShuffle(InVectors.front(),
9169                          InVectors.size() == 2 ? InVectors.back() : nullptr,
9170                          CommonMask);
9171   }
9172 
9173   ~ShuffleCostEstimator() {
9174     assert((IsFinalized || CommonMask.empty()) &&
9175            "Shuffle construction must be finalized.");
9176   }
9177 };
9178 
9179 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9180                                                    unsigned Idx) const {
9181   Value *Op = E->getOperand(Idx).front();
9182   if (const TreeEntry *TE = getTreeEntry(Op)) {
9183     if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9184           return EI.EdgeIdx == Idx && EI.UserTE == E;
9185         }) != TE->UserTreeIndices.end())
9186       return TE;
9187     auto MIt = MultiNodeScalars.find(Op);
9188     if (MIt != MultiNodeScalars.end()) {
9189       for (const TreeEntry *TE : MIt->second) {
9190         if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9191               return EI.EdgeIdx == Idx && EI.UserTE == E;
9192             }) != TE->UserTreeIndices.end())
9193           return TE;
9194       }
9195     }
9196   }
9197   const auto *It =
9198       find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9199         return TE->isGather() &&
9200                find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9201                  return EI.EdgeIdx == Idx && EI.UserTE == E;
9202                }) != TE->UserTreeIndices.end();
9203       });
9204   assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9205   return It->get();
9206 }
9207 
9208 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9209   if (TE.State == TreeEntry::ScatterVectorize ||
9210       TE.State == TreeEntry::StridedVectorize)
9211     return TTI::CastContextHint::GatherScatter;
9212   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9213       !TE.isAltShuffle()) {
9214     if (TE.ReorderIndices.empty())
9215       return TTI::CastContextHint::Normal;
9216     SmallVector<int> Mask;
9217     inversePermutation(TE.ReorderIndices, Mask);
9218     if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9219       return TTI::CastContextHint::Reversed;
9220   }
9221   return TTI::CastContextHint::None;
9222 }
9223 
9224 /// Builds the arguments types vector for the given call instruction with the
9225 /// given \p ID for the specified vector factor.
9226 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9227                                                   const Intrinsic::ID ID,
9228                                                   const unsigned VF,
9229                                                   unsigned MinBW) {
9230   SmallVector<Type *> ArgTys;
9231   for (auto [Idx, Arg] : enumerate(CI->args())) {
9232     if (ID != Intrinsic::not_intrinsic) {
9233       if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9234         ArgTys.push_back(Arg->getType());
9235         continue;
9236       }
9237       if (MinBW > 0) {
9238         ArgTys.push_back(
9239             getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9240         continue;
9241       }
9242     }
9243     ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9244   }
9245   return ArgTys;
9246 }
9247 
9248 InstructionCost
9249 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9250                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
9251   ArrayRef<Value *> VL = E->Scalars;
9252 
9253   Type *ScalarTy = VL[0]->getType();
9254   if (!E->isGather()) {
9255     if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9256       ScalarTy = SI->getValueOperand()->getType();
9257     else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9258       ScalarTy = CI->getOperand(0)->getType();
9259     else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9260       ScalarTy = IE->getOperand(1)->getType();
9261   }
9262   if (!isValidElementType(ScalarTy))
9263     return InstructionCost::getInvalid();
9264   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9265 
9266   // If we have computed a smaller type for the expression, update VecTy so
9267   // that the costs will be accurate.
9268   auto It = MinBWs.find(E);
9269   Type *OrigScalarTy = ScalarTy;
9270   if (It != MinBWs.end())
9271     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9272   auto *VecTy = getWidenedType(ScalarTy, VL.size());
9273   unsigned EntryVF = E->getVectorFactor();
9274   auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
9275 
9276   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9277   if (E->isGather()) {
9278     if (allConstant(VL))
9279       return 0;
9280     if (isa<InsertElementInst>(VL[0]))
9281       return InstructionCost::getInvalid();
9282     return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9283         E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9284   }
9285   InstructionCost CommonCost = 0;
9286   SmallVector<int> Mask;
9287   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9288   if (!E->ReorderIndices.empty() &&
9289       (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9290     SmallVector<int> NewMask;
9291     if (E->getOpcode() == Instruction::Store) {
9292       // For stores the order is actually a mask.
9293       NewMask.resize(E->ReorderIndices.size());
9294       copy(E->ReorderIndices, NewMask.begin());
9295     } else {
9296       inversePermutation(E->ReorderIndices, NewMask);
9297     }
9298     ::addMask(Mask, NewMask);
9299   }
9300   if (NeedToShuffleReuses)
9301     ::addMask(Mask, E->ReuseShuffleIndices);
9302   if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9303     CommonCost =
9304         TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9305   assert((E->State == TreeEntry::Vectorize ||
9306           E->State == TreeEntry::ScatterVectorize ||
9307           E->State == TreeEntry::StridedVectorize) &&
9308          "Unhandled state");
9309   assert(E->getOpcode() &&
9310          ((allSameType(VL) && allSameBlock(VL)) ||
9311           (E->getOpcode() == Instruction::GetElementPtr &&
9312            E->getMainOp()->getType()->isPointerTy())) &&
9313          "Invalid VL");
9314   Instruction *VL0 = E->getMainOp();
9315   unsigned ShuffleOrOp =
9316       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9317   SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9318   const unsigned Sz = UniqueValues.size();
9319   SmallBitVector UsedScalars(Sz, false);
9320   for (unsigned I = 0; I < Sz; ++I) {
9321     if (getTreeEntry(UniqueValues[I]) == E)
9322       continue;
9323     UsedScalars.set(I);
9324   }
9325   auto GetCastContextHint = [&](Value *V) {
9326     if (const TreeEntry *OpTE = getTreeEntry(V))
9327       return getCastContextHint(*OpTE);
9328     InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9329     if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9330       return TTI::CastContextHint::GatherScatter;
9331     return TTI::CastContextHint::None;
9332   };
9333   auto GetCostDiff =
9334       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9335           function_ref<InstructionCost(InstructionCost)> VectorCost) {
9336         // Calculate the cost of this instruction.
9337         InstructionCost ScalarCost = 0;
9338         if (isa<CastInst, CallInst>(VL0)) {
9339           // For some of the instructions no need to calculate cost for each
9340           // particular instruction, we can use the cost of the single
9341           // instruction x total number of scalar instructions.
9342           ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9343         } else {
9344           for (unsigned I = 0; I < Sz; ++I) {
9345             if (UsedScalars.test(I))
9346               continue;
9347             ScalarCost += ScalarEltCost(I);
9348           }
9349         }
9350 
9351         InstructionCost VecCost = VectorCost(CommonCost);
9352         // Check if the current node must be resized, if the parent node is not
9353         // resized.
9354         if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9355           const EdgeInfo &EI = E->UserTreeIndices.front();
9356           if ((EI.UserTE->getOpcode() != Instruction::Select ||
9357                EI.EdgeIdx != 0) &&
9358               It != MinBWs.end()) {
9359             auto UserBWIt = MinBWs.find(EI.UserTE);
9360             Type *UserScalarTy =
9361                 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9362             if (UserBWIt != MinBWs.end())
9363               UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9364                                               UserBWIt->second.first);
9365             if (ScalarTy != UserScalarTy) {
9366               unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9367               unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9368               unsigned VecOpcode;
9369               auto *UserVecTy =
9370                   getWidenedType(UserScalarTy, E->getVectorFactor());
9371               if (BWSz > SrcBWSz)
9372                 VecOpcode = Instruction::Trunc;
9373               else
9374                 VecOpcode =
9375                     It->second.second ? Instruction::SExt : Instruction::ZExt;
9376               TTI::CastContextHint CCH = GetCastContextHint(VL0);
9377               VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9378                                                CostKind);
9379             }
9380           }
9381         }
9382         LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9383                                  ScalarCost, "Calculated costs for Tree"));
9384         return VecCost - ScalarCost;
9385       };
9386   // Calculate cost difference from vectorizing set of GEPs.
9387   // Negative value means vectorizing is profitable.
9388   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9389     assert((E->State == TreeEntry::Vectorize ||
9390             E->State == TreeEntry::StridedVectorize) &&
9391            "Entry state expected to be Vectorize or StridedVectorize here.");
9392     InstructionCost ScalarCost = 0;
9393     InstructionCost VecCost = 0;
9394     std::tie(ScalarCost, VecCost) = getGEPCosts(
9395         *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9396     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9397                              "Calculated GEPs cost for Tree"));
9398 
9399     return VecCost - ScalarCost;
9400   };
9401 
9402   switch (ShuffleOrOp) {
9403   case Instruction::PHI: {
9404     // Count reused scalars.
9405     InstructionCost ScalarCost = 0;
9406     SmallPtrSet<const TreeEntry *, 4> CountedOps;
9407     for (Value *V : UniqueValues) {
9408       auto *PHI = dyn_cast<PHINode>(V);
9409       if (!PHI)
9410         continue;
9411 
9412       ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9413       for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9414         Value *Op = PHI->getIncomingValue(I);
9415         Operands[I] = Op;
9416       }
9417       if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9418         if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9419           if (!OpTE->ReuseShuffleIndices.empty())
9420             ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9421                                             OpTE->Scalars.size());
9422     }
9423 
9424     return CommonCost - ScalarCost;
9425   }
9426   case Instruction::ExtractValue:
9427   case Instruction::ExtractElement: {
9428     auto GetScalarCost = [&](unsigned Idx) {
9429       auto *I = cast<Instruction>(UniqueValues[Idx]);
9430       VectorType *SrcVecTy;
9431       if (ShuffleOrOp == Instruction::ExtractElement) {
9432         auto *EE = cast<ExtractElementInst>(I);
9433         SrcVecTy = EE->getVectorOperandType();
9434       } else {
9435         auto *EV = cast<ExtractValueInst>(I);
9436         Type *AggregateTy = EV->getAggregateOperand()->getType();
9437         unsigned NumElts;
9438         if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9439           NumElts = ATy->getNumElements();
9440         else
9441           NumElts = AggregateTy->getStructNumElements();
9442         SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
9443       }
9444       if (I->hasOneUse()) {
9445         Instruction *Ext = I->user_back();
9446         if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9447             all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9448           // Use getExtractWithExtendCost() to calculate the cost of
9449           // extractelement/ext pair.
9450           InstructionCost Cost = TTI->getExtractWithExtendCost(
9451               Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9452           // Subtract the cost of s|zext which is subtracted separately.
9453           Cost -= TTI->getCastInstrCost(
9454               Ext->getOpcode(), Ext->getType(), I->getType(),
9455               TTI::getCastContextHint(Ext), CostKind, Ext);
9456           return Cost;
9457         }
9458       }
9459       return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9460                                      CostKind, *getExtractIndex(I));
9461     };
9462     auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9463     return GetCostDiff(GetScalarCost, GetVectorCost);
9464   }
9465   case Instruction::InsertElement: {
9466     assert(E->ReuseShuffleIndices.empty() &&
9467            "Unique insertelements only are expected.");
9468     auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9469     unsigned const NumElts = SrcVecTy->getNumElements();
9470     unsigned const NumScalars = VL.size();
9471 
9472     unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9473 
9474     SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9475     unsigned OffsetBeg = *getElementIndex(VL.front());
9476     unsigned OffsetEnd = OffsetBeg;
9477     InsertMask[OffsetBeg] = 0;
9478     for (auto [I, V] : enumerate(VL.drop_front())) {
9479       unsigned Idx = *getElementIndex(V);
9480       if (OffsetBeg > Idx)
9481         OffsetBeg = Idx;
9482       else if (OffsetEnd < Idx)
9483         OffsetEnd = Idx;
9484       InsertMask[Idx] = I + 1;
9485     }
9486     unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9487     if (NumOfParts > 0)
9488       VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9489     unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9490                      VecScalarsSz;
9491     unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9492     unsigned InsertVecSz = std::min<unsigned>(
9493         PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9494         ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9495     bool IsWholeSubvector =
9496         OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9497     // Check if we can safely insert a subvector. If it is not possible, just
9498     // generate a whole-sized vector and shuffle the source vector and the new
9499     // subvector.
9500     if (OffsetBeg + InsertVecSz > VecSz) {
9501       // Align OffsetBeg to generate correct mask.
9502       OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9503       InsertVecSz = VecSz;
9504     }
9505 
9506     APInt DemandedElts = APInt::getZero(NumElts);
9507     // TODO: Add support for Instruction::InsertValue.
9508     SmallVector<int> Mask;
9509     if (!E->ReorderIndices.empty()) {
9510       inversePermutation(E->ReorderIndices, Mask);
9511       Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9512     } else {
9513       Mask.assign(VecSz, PoisonMaskElem);
9514       std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9515     }
9516     bool IsIdentity = true;
9517     SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9518     Mask.swap(PrevMask);
9519     for (unsigned I = 0; I < NumScalars; ++I) {
9520       unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
9521       DemandedElts.setBit(InsertIdx);
9522       IsIdentity &= InsertIdx - OffsetBeg == I;
9523       Mask[InsertIdx - OffsetBeg] = I;
9524     }
9525     assert(Offset < NumElts && "Failed to find vector index offset");
9526 
9527     InstructionCost Cost = 0;
9528     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9529                                           /*Insert*/ true, /*Extract*/ false,
9530                                           CostKind);
9531 
9532     // First cost - resize to actual vector size if not identity shuffle or
9533     // need to shift the vector.
9534     // Do not calculate the cost if the actual size is the register size and
9535     // we can merge this shuffle with the following SK_Select.
9536     auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
9537     if (!IsIdentity)
9538       Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
9539                                   InsertVecTy, Mask);
9540     auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9541       return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9542     }));
9543     // Second cost - permutation with subvector, if some elements are from the
9544     // initial vector or inserting a subvector.
9545     // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9546     // subvector of ActualVecTy.
9547     SmallBitVector InMask =
9548         isUndefVector(FirstInsert->getOperand(0),
9549                       buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9550     if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9551       if (InsertVecSz != VecSz) {
9552         auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
9553         Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9554                                     std::nullopt, CostKind, OffsetBeg - Offset,
9555                                     InsertVecTy);
9556       } else {
9557         for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9558           Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9559         for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9560              I <= End; ++I)
9561           if (Mask[I] != PoisonMaskElem)
9562             Mask[I] = I + VecSz;
9563         for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9564           Mask[I] =
9565               ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9566         Cost +=
9567             ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9568       }
9569     }
9570     return Cost;
9571   }
9572   case Instruction::ZExt:
9573   case Instruction::SExt:
9574   case Instruction::FPToUI:
9575   case Instruction::FPToSI:
9576   case Instruction::FPExt:
9577   case Instruction::PtrToInt:
9578   case Instruction::IntToPtr:
9579   case Instruction::SIToFP:
9580   case Instruction::UIToFP:
9581   case Instruction::Trunc:
9582   case Instruction::FPTrunc:
9583   case Instruction::BitCast: {
9584     auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9585     Type *SrcScalarTy = VL0->getOperand(0)->getType();
9586     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9587     unsigned Opcode = ShuffleOrOp;
9588     unsigned VecOpcode = Opcode;
9589     if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9590         (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9591       // Check if the values are candidates to demote.
9592       unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9593       if (SrcIt != MinBWs.end()) {
9594         SrcBWSz = SrcIt->second.first;
9595         SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9596         SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9597       }
9598       unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9599       if (BWSz == SrcBWSz) {
9600         VecOpcode = Instruction::BitCast;
9601       } else if (BWSz < SrcBWSz) {
9602         VecOpcode = Instruction::Trunc;
9603       } else if (It != MinBWs.end()) {
9604         assert(BWSz > SrcBWSz && "Invalid cast!");
9605         VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9606       } else if (SrcIt != MinBWs.end()) {
9607         assert(BWSz > SrcBWSz && "Invalid cast!");
9608         VecOpcode =
9609             SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9610       }
9611     } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9612                !SrcIt->second.second) {
9613       VecOpcode = Instruction::UIToFP;
9614     }
9615     auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9616       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9617       return TTI->getCastInstrCost(Opcode, VL0->getType(),
9618                                    VL0->getOperand(0)->getType(),
9619                                    TTI::getCastContextHint(VI), CostKind, VI);
9620     };
9621     auto GetVectorCost = [=](InstructionCost CommonCost) {
9622       // Do not count cost here if minimum bitwidth is in effect and it is just
9623       // a bitcast (here it is just a noop).
9624       if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9625         return CommonCost;
9626       auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9627       TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9628       return CommonCost +
9629              TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9630                                    VecOpcode == Opcode ? VI : nullptr);
9631     };
9632     return GetCostDiff(GetScalarCost, GetVectorCost);
9633   }
9634   case Instruction::FCmp:
9635   case Instruction::ICmp:
9636   case Instruction::Select: {
9637     CmpInst::Predicate VecPred, SwappedVecPred;
9638     auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9639     if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9640         match(VL0, MatchCmp))
9641       SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9642     else
9643       SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9644                                      ? CmpInst::BAD_FCMP_PREDICATE
9645                                      : CmpInst::BAD_ICMP_PREDICATE;
9646     auto GetScalarCost = [&](unsigned Idx) {
9647       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9648       CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9649                                            ? CmpInst::BAD_FCMP_PREDICATE
9650                                            : CmpInst::BAD_ICMP_PREDICATE;
9651       auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9652       if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9653            !match(VI, MatchCmp)) ||
9654           (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9655         VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9656                                        ? CmpInst::BAD_FCMP_PREDICATE
9657                                        : CmpInst::BAD_ICMP_PREDICATE;
9658 
9659       InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9660           E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9661           CostKind, VI);
9662       auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9663       if (MinMaxID != Intrinsic::not_intrinsic) {
9664         Type *CanonicalType = OrigScalarTy;
9665         if (CanonicalType->isPtrOrPtrVectorTy())
9666           CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9667               CanonicalType->getContext(),
9668               DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9669 
9670         IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9671                                           {CanonicalType, CanonicalType});
9672         InstructionCost IntrinsicCost =
9673             TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9674         // If the selects are the only uses of the compares, they will be
9675         // dead and we can adjust the cost by removing their cost.
9676         if (SelectOnly) {
9677           auto *CI = cast<CmpInst>(VI->getOperand(0));
9678           IntrinsicCost -= TTI->getCmpSelInstrCost(
9679               CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9680               CI->getPredicate(), CostKind, CI);
9681         }
9682         ScalarCost = std::min(ScalarCost, IntrinsicCost);
9683       }
9684 
9685       return ScalarCost;
9686     };
9687     auto GetVectorCost = [&](InstructionCost CommonCost) {
9688       auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9689 
9690       InstructionCost VecCost = TTI->getCmpSelInstrCost(
9691           E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9692       // Check if it is possible and profitable to use min/max for selects
9693       // in VL.
9694       //
9695       auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9696       if (MinMaxID != Intrinsic::not_intrinsic) {
9697         Type *CanonicalType = VecTy;
9698         if (CanonicalType->isPtrOrPtrVectorTy())
9699           CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9700               CanonicalType->getContext(),
9701               DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9702         IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9703                                           {CanonicalType, CanonicalType});
9704         InstructionCost IntrinsicCost =
9705             TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9706         // If the selects are the only uses of the compares, they will be
9707         // dead and we can adjust the cost by removing their cost.
9708         if (SelectOnly) {
9709           auto *CI =
9710               cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9711           IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9712                                                    MaskTy, VecPred, CostKind);
9713         }
9714         VecCost = std::min(VecCost, IntrinsicCost);
9715       }
9716       return VecCost + CommonCost;
9717     };
9718     return GetCostDiff(GetScalarCost, GetVectorCost);
9719   }
9720   case Instruction::FNeg:
9721   case Instruction::Add:
9722   case Instruction::FAdd:
9723   case Instruction::Sub:
9724   case Instruction::FSub:
9725   case Instruction::Mul:
9726   case Instruction::FMul:
9727   case Instruction::UDiv:
9728   case Instruction::SDiv:
9729   case Instruction::FDiv:
9730   case Instruction::URem:
9731   case Instruction::SRem:
9732   case Instruction::FRem:
9733   case Instruction::Shl:
9734   case Instruction::LShr:
9735   case Instruction::AShr:
9736   case Instruction::And:
9737   case Instruction::Or:
9738   case Instruction::Xor: {
9739     auto GetScalarCost = [&](unsigned Idx) {
9740       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9741       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9742       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9743       TTI::OperandValueInfo Op2Info =
9744           TTI::getOperandInfo(VI->getOperand(OpIdx));
9745       SmallVector<const Value *> Operands(VI->operand_values());
9746       return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9747                                          Op1Info, Op2Info, Operands, VI);
9748     };
9749     auto GetVectorCost = [=](InstructionCost CommonCost) {
9750       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9751         for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9752           ArrayRef<Value *> Ops = E->getOperand(I);
9753           if (all_of(Ops, [&](Value *Op) {
9754                 auto *CI = dyn_cast<ConstantInt>(Op);
9755                 return CI && CI->getValue().countr_one() >= It->second.first;
9756               }))
9757             return CommonCost;
9758         }
9759       }
9760       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9761       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9762       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9763       return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9764                                          Op2Info, std::nullopt, nullptr, TLI) +
9765              CommonCost;
9766     };
9767     return GetCostDiff(GetScalarCost, GetVectorCost);
9768   }
9769   case Instruction::GetElementPtr: {
9770     return CommonCost + GetGEPCostDiff(VL, VL0);
9771   }
9772   case Instruction::Load: {
9773     auto GetScalarCost = [&](unsigned Idx) {
9774       auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9775       return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9776                                   VI->getAlign(), VI->getPointerAddressSpace(),
9777                                   CostKind, TTI::OperandValueInfo(), VI);
9778     };
9779     auto *LI0 = cast<LoadInst>(VL0);
9780     auto GetVectorCost = [&](InstructionCost CommonCost) {
9781       InstructionCost VecLdCost;
9782       if (E->State == TreeEntry::Vectorize) {
9783         VecLdCost = TTI->getMemoryOpCost(
9784             Instruction::Load, VecTy, LI0->getAlign(),
9785             LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9786       } else if (E->State == TreeEntry::StridedVectorize) {
9787         Align CommonAlignment =
9788             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9789         VecLdCost = TTI->getStridedMemoryOpCost(
9790             Instruction::Load, VecTy, LI0->getPointerOperand(),
9791             /*VariableMask=*/false, CommonAlignment, CostKind);
9792       } else {
9793         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9794         Align CommonAlignment =
9795             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9796         VecLdCost = TTI->getGatherScatterOpCost(
9797             Instruction::Load, VecTy, LI0->getPointerOperand(),
9798             /*VariableMask=*/false, CommonAlignment, CostKind);
9799       }
9800       return VecLdCost + CommonCost;
9801     };
9802 
9803     InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9804     // If this node generates masked gather load then it is not a terminal node.
9805     // Hence address operand cost is estimated separately.
9806     if (E->State == TreeEntry::ScatterVectorize)
9807       return Cost;
9808 
9809     // Estimate cost of GEPs since this tree node is a terminator.
9810     SmallVector<Value *> PointerOps(VL.size());
9811     for (auto [I, V] : enumerate(VL))
9812       PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9813     return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9814   }
9815   case Instruction::Store: {
9816     bool IsReorder = !E->ReorderIndices.empty();
9817     auto GetScalarCost = [=](unsigned Idx) {
9818       auto *VI = cast<StoreInst>(VL[Idx]);
9819       TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9820       return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9821                                   VI->getAlign(), VI->getPointerAddressSpace(),
9822                                   CostKind, OpInfo, VI);
9823     };
9824     auto *BaseSI =
9825         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9826     auto GetVectorCost = [=](InstructionCost CommonCost) {
9827       // We know that we can merge the stores. Calculate the cost.
9828       InstructionCost VecStCost;
9829       if (E->State == TreeEntry::StridedVectorize) {
9830         Align CommonAlignment =
9831             computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9832         VecStCost = TTI->getStridedMemoryOpCost(
9833             Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9834             /*VariableMask=*/false, CommonAlignment, CostKind);
9835       } else {
9836         assert(E->State == TreeEntry::Vectorize &&
9837                "Expected either strided or consecutive stores.");
9838         TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9839         VecStCost = TTI->getMemoryOpCost(
9840             Instruction::Store, VecTy, BaseSI->getAlign(),
9841             BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9842       }
9843       return VecStCost + CommonCost;
9844     };
9845     SmallVector<Value *> PointerOps(VL.size());
9846     for (auto [I, V] : enumerate(VL)) {
9847       unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9848       PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9849     }
9850 
9851     return GetCostDiff(GetScalarCost, GetVectorCost) +
9852            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9853   }
9854   case Instruction::Call: {
9855     auto GetScalarCost = [&](unsigned Idx) {
9856       auto *CI = cast<CallInst>(UniqueValues[Idx]);
9857       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9858       if (ID != Intrinsic::not_intrinsic) {
9859         IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9860         return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9861       }
9862       return TTI->getCallInstrCost(CI->getCalledFunction(),
9863                                    CI->getFunctionType()->getReturnType(),
9864                                    CI->getFunctionType()->params(), CostKind);
9865     };
9866     auto GetVectorCost = [=](InstructionCost CommonCost) {
9867       auto *CI = cast<CallInst>(VL0);
9868       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9869       SmallVector<Type *> ArgTys =
9870           buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9871                                  It != MinBWs.end() ? It->second.first : 0);
9872       auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9873       return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9874     };
9875     return GetCostDiff(GetScalarCost, GetVectorCost);
9876   }
9877   case Instruction::ShuffleVector: {
9878     assert(E->isAltShuffle() &&
9879            ((Instruction::isBinaryOp(E->getOpcode()) &&
9880              Instruction::isBinaryOp(E->getAltOpcode())) ||
9881             (Instruction::isCast(E->getOpcode()) &&
9882              Instruction::isCast(E->getAltOpcode())) ||
9883             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9884            "Invalid Shuffle Vector Operand");
9885     // Try to find the previous shuffle node with the same operands and same
9886     // main/alternate ops.
9887     auto TryFindNodeWithEqualOperands = [=]() {
9888       for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9889         if (TE.get() == E)
9890           break;
9891         if (TE->isAltShuffle() &&
9892             ((TE->getOpcode() == E->getOpcode() &&
9893               TE->getAltOpcode() == E->getAltOpcode()) ||
9894              (TE->getOpcode() == E->getAltOpcode() &&
9895               TE->getAltOpcode() == E->getOpcode())) &&
9896             TE->hasEqualOperands(*E))
9897           return true;
9898       }
9899       return false;
9900     };
9901     auto GetScalarCost = [&](unsigned Idx) {
9902       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9903       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9904       (void)E;
9905       return TTI->getInstructionCost(VI, CostKind);
9906     };
9907     // Need to clear CommonCost since the final shuffle cost is included into
9908     // vector cost.
9909     auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9910       // VecCost is equal to sum of the cost of creating 2 vectors
9911       // and the cost of creating shuffle.
9912       InstructionCost VecCost = 0;
9913       if (TryFindNodeWithEqualOperands()) {
9914         LLVM_DEBUG({
9915           dbgs() << "SLP: diamond match for alternate node found.\n";
9916           E->dump();
9917         });
9918         // No need to add new vector costs here since we're going to reuse
9919         // same main/alternate vector ops, just do different shuffling.
9920       } else if (Instruction::isBinaryOp(E->getOpcode())) {
9921         VecCost =
9922             TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9923         VecCost +=
9924             TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9925       } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9926         auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9927         VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9928                                             CI0->getPredicate(), CostKind, VL0);
9929         VecCost += TTIRef.getCmpSelInstrCost(
9930             E->getOpcode(), VecTy, MaskTy,
9931             cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9932             E->getAltOp());
9933       } else {
9934         Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9935         auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
9936         if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9937           auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9938           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9939           unsigned SrcBWSz =
9940               DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9941           if (SrcIt != MinBWs.end()) {
9942             SrcBWSz = SrcIt->second.first;
9943             SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9944             SrcTy = getWidenedType(SrcSclTy, VL.size());
9945           }
9946           if (BWSz <= SrcBWSz) {
9947             if (BWSz < SrcBWSz)
9948               VecCost =
9949                   TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9950                                           TTI::CastContextHint::None, CostKind);
9951             LLVM_DEBUG({
9952               dbgs()
9953                   << "SLP: alternate extension, which should be truncated.\n";
9954               E->dump();
9955             });
9956             return VecCost;
9957           }
9958         }
9959         VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9960                                           TTI::CastContextHint::None, CostKind);
9961         VecCost +=
9962             TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9963                                     TTI::CastContextHint::None, CostKind);
9964       }
9965       SmallVector<int> Mask;
9966       E->buildAltOpShuffleMask(
9967           [E](Instruction *I) {
9968             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9969             return I->getOpcode() == E->getAltOpcode();
9970           },
9971           Mask);
9972       VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
9973                                   FinalVecTy, Mask);
9974       // Patterns like [fadd,fsub] can be combined into a single instruction
9975       // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9976       // need to take into account their order when looking for the most used
9977       // order.
9978       unsigned Opcode0 = E->getOpcode();
9979       unsigned Opcode1 = E->getAltOpcode();
9980       SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
9981       // If this pattern is supported by the target then we consider the
9982       // order.
9983       if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9984         InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9985             VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9986         return AltVecCost < VecCost ? AltVecCost : VecCost;
9987       }
9988       // TODO: Check the reverse order too.
9989       return VecCost;
9990     };
9991     return GetCostDiff(GetScalarCost, GetVectorCost);
9992   }
9993   default:
9994     llvm_unreachable("Unknown instruction");
9995   }
9996 }
9997 
9998 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000                     << VectorizableTree.size() << " is fully vectorizable .\n");
10001 
10002   auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003     SmallVector<int> Mask;
10004     return TE->isGather() &&
10005            !any_of(TE->Scalars,
10006                    [this](Value *V) { return EphValues.contains(V); }) &&
10007            (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10008             TE->Scalars.size() < Limit ||
10009             ((TE->getOpcode() == Instruction::ExtractElement ||
10010               all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10011              isFixedVectorShuffle(TE->Scalars, Mask)) ||
10012             (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013              !TE->isAltShuffle()));
10014   };
10015 
10016   // We only handle trees of heights 1 and 2.
10017   if (VectorizableTree.size() == 1 &&
10018       (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019        (ForReduction &&
10020         AreVectorizableGathers(VectorizableTree[0].get(),
10021                                VectorizableTree[0]->Scalars.size()) &&
10022         VectorizableTree[0]->getVectorFactor() > 2)))
10023     return true;
10024 
10025   if (VectorizableTree.size() != 2)
10026     return false;
10027 
10028   // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029   // with the second gather nodes if they have less scalar operands rather than
10030   // the initial tree element (may be profitable to shuffle the second gather)
10031   // or they are extractelements, which form shuffle.
10032   SmallVector<int> Mask;
10033   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034       AreVectorizableGathers(VectorizableTree[1].get(),
10035                              VectorizableTree[0]->Scalars.size()))
10036     return true;
10037 
10038   // Gathering cost would be too much for tiny trees.
10039   if (VectorizableTree[0]->isGather() ||
10040       (VectorizableTree[1]->isGather() &&
10041        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042        VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043     return false;
10044 
10045   return true;
10046 }
10047 
10048 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049                                        TargetTransformInfo *TTI,
10050                                        bool MustMatchOrInst) {
10051   // Look past the root to find a source value. Arbitrarily follow the
10052   // path through operand 0 of any 'or'. Also, peek through optional
10053   // shift-left-by-multiple-of-8-bits.
10054   Value *ZextLoad = Root;
10055   const APInt *ShAmtC;
10056   bool FoundOr = false;
10057   while (!isa<ConstantExpr>(ZextLoad) &&
10058          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10059           (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10060            ShAmtC->urem(8) == 0))) {
10061     auto *BinOp = cast<BinaryOperator>(ZextLoad);
10062     ZextLoad = BinOp->getOperand(0);
10063     if (BinOp->getOpcode() == Instruction::Or)
10064       FoundOr = true;
10065   }
10066   // Check if the input is an extended load of the required or/shift expression.
10067   Value *Load;
10068   if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069       !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10070     return false;
10071 
10072   // Require that the total load bit width is a legal integer type.
10073   // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074   // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075   Type *SrcTy = Load->getType();
10076   unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077   if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10078     return false;
10079 
10080   // Everything matched - assume that we can fold the whole sequence using
10081   // load combining.
10082   LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083              << *(cast<Instruction>(Root)) << "\n");
10084 
10085   return true;
10086 }
10087 
10088 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089   if (RdxKind != RecurKind::Or)
10090     return false;
10091 
10092   unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094   return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10095                                     /* MatchOr */ false);
10096 }
10097 
10098 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099   // Peek through a final sequence of stores and check if all operations are
10100   // likely to be load-combined.
10101   unsigned NumElts = Stores.size();
10102   for (Value *Scalar : Stores) {
10103     Value *X;
10104     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10105         !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10106       return false;
10107   }
10108   return true;
10109 }
10110 
10111 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112   // No need to vectorize inserts of gathered values.
10113   if (VectorizableTree.size() == 2 &&
10114       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10115       VectorizableTree[1]->isGather() &&
10116       (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117        !(isSplat(VectorizableTree[1]->Scalars) ||
10118          allConstant(VectorizableTree[1]->Scalars))))
10119     return true;
10120 
10121   // If the graph includes only PHI nodes and gathers, it is defnitely not
10122   // profitable for the vectorization, we can skip it, if the cost threshold is
10123   // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
10124   // gathers/buildvectors.
10125   constexpr int Limit = 4;
10126   if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127       !VectorizableTree.empty() &&
10128       all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10129         return (TE->isGather() &&
10130                 TE->getOpcode() != Instruction::ExtractElement &&
10131                 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10132                TE->getOpcode() == Instruction::PHI;
10133       }))
10134     return true;
10135 
10136   // We can vectorize the tree if its size is greater than or equal to the
10137   // minimum size specified by the MinTreeSize command line option.
10138   if (VectorizableTree.size() >= MinTreeSize)
10139     return false;
10140 
10141   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142   // can vectorize it if we can prove it fully vectorizable.
10143   if (isFullyVectorizableTinyTree(ForReduction))
10144     return false;
10145 
10146   // Check if any of the gather node forms an insertelement buildvector
10147   // somewhere.
10148   bool IsAllowedSingleBVNode =
10149       VectorizableTree.size() > 1 ||
10150       (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151        !VectorizableTree.front()->isAltShuffle() &&
10152        VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153        VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154        allSameBlock(VectorizableTree.front()->Scalars));
10155   if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10156         return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10157                  return isa<ExtractElementInst, UndefValue>(V) ||
10158                         (IsAllowedSingleBVNode &&
10159                          !V->hasNUsesOrMore(UsesLimit) &&
10160                          any_of(V->users(), IsaPred<InsertElementInst>));
10161                });
10162       }))
10163     return false;
10164 
10165   assert(VectorizableTree.empty()
10166              ? ExternalUses.empty()
10167              : true && "We shouldn't have any external users");
10168 
10169   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170   // vectorizable.
10171   return true;
10172 }
10173 
10174 InstructionCost BoUpSLP::getSpillCost() const {
10175   // Walk from the bottom of the tree to the top, tracking which values are
10176   // live. When we see a call instruction that is not part of our tree,
10177   // query TTI to see if there is a cost to keeping values live over it
10178   // (for example, if spills and fills are required).
10179   unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180   InstructionCost Cost = 0;
10181 
10182   SmallPtrSet<Instruction *, 4> LiveValues;
10183   Instruction *PrevInst = nullptr;
10184 
10185   // The entries in VectorizableTree are not necessarily ordered by their
10186   // position in basic blocks. Collect them and order them by dominance so later
10187   // instructions are guaranteed to be visited first. For instructions in
10188   // different basic blocks, we only scan to the beginning of the block, so
10189   // their order does not matter, as long as all instructions in a basic block
10190   // are grouped together. Using dominance ensures a deterministic order.
10191   SmallVector<Instruction *, 16> OrderedScalars;
10192   for (const auto &TEPtr : VectorizableTree) {
10193     if (TEPtr->State != TreeEntry::Vectorize)
10194       continue;
10195     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10196     if (!Inst)
10197       continue;
10198     OrderedScalars.push_back(Inst);
10199   }
10200   llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10201     auto *NodeA = DT->getNode(A->getParent());
10202     auto *NodeB = DT->getNode(B->getParent());
10203     assert(NodeA && "Should only process reachable instructions");
10204     assert(NodeB && "Should only process reachable instructions");
10205     assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206            "Different nodes should have different DFS numbers");
10207     if (NodeA != NodeB)
10208       return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209     return B->comesBefore(A);
10210   });
10211 
10212   for (Instruction *Inst : OrderedScalars) {
10213     if (!PrevInst) {
10214       PrevInst = Inst;
10215       continue;
10216     }
10217 
10218     // Update LiveValues.
10219     LiveValues.erase(PrevInst);
10220     for (auto &J : PrevInst->operands()) {
10221       if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10222         LiveValues.insert(cast<Instruction>(&*J));
10223     }
10224 
10225     LLVM_DEBUG({
10226       dbgs() << "SLP: #LV: " << LiveValues.size();
10227       for (auto *X : LiveValues)
10228         dbgs() << " " << X->getName();
10229       dbgs() << ", Looking at ";
10230       Inst->dump();
10231     });
10232 
10233     // Now find the sequence of instructions between PrevInst and Inst.
10234     unsigned NumCalls = 0;
10235     BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236                                  PrevInstIt =
10237                                      PrevInst->getIterator().getReverse();
10238     while (InstIt != PrevInstIt) {
10239       if (PrevInstIt == PrevInst->getParent()->rend()) {
10240         PrevInstIt = Inst->getParent()->rbegin();
10241         continue;
10242       }
10243 
10244       auto NoCallIntrinsic = [this](Instruction *I) {
10245         if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10246           if (II->isAssumeLikeIntrinsic())
10247             return true;
10248           FastMathFlags FMF;
10249           SmallVector<Type *, 4> Tys;
10250           for (auto &ArgOp : II->args())
10251             Tys.push_back(ArgOp->getType());
10252           if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10253             FMF = FPMO->getFastMathFlags();
10254           IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255                                       FMF);
10256           InstructionCost IntrCost =
10257               TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10258           InstructionCost CallCost = TTI->getCallInstrCost(
10259               nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10260           if (IntrCost < CallCost)
10261             return true;
10262         }
10263         return false;
10264       };
10265 
10266       // Debug information does not impact spill cost.
10267       if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268           &*PrevInstIt != PrevInst)
10269         NumCalls++;
10270 
10271       ++PrevInstIt;
10272     }
10273 
10274     if (NumCalls) {
10275       SmallVector<Type *, 4> V;
10276       for (auto *II : LiveValues) {
10277         auto *ScalarTy = II->getType();
10278         if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10279           ScalarTy = VectorTy->getElementType();
10280         V.push_back(getWidenedType(ScalarTy, BundleWidth));
10281       }
10282       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10283     }
10284 
10285     PrevInst = Inst;
10286   }
10287 
10288   return Cost;
10289 }
10290 
10291 /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
10292 /// buildvector sequence.
10293 static bool isFirstInsertElement(const InsertElementInst *IE1,
10294                                  const InsertElementInst *IE2) {
10295   if (IE1 == IE2)
10296     return false;
10297   const auto *I1 = IE1;
10298   const auto *I2 = IE2;
10299   const InsertElementInst *PrevI1;
10300   const InsertElementInst *PrevI2;
10301   unsigned Idx1 = *getElementIndex(IE1);
10302   unsigned Idx2 = *getElementIndex(IE2);
10303   do {
10304     if (I2 == IE1)
10305       return true;
10306     if (I1 == IE2)
10307       return false;
10308     PrevI1 = I1;
10309     PrevI2 = I2;
10310     if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311         getElementIndex(I1).value_or(Idx2) != Idx2)
10312       I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10313     if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314         getElementIndex(I2).value_or(Idx1) != Idx1)
10315       I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10316   } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317   llvm_unreachable("Two different buildvectors not expected.");
10318 }
10319 
10320 namespace {
10321 /// Returns incoming Value *, if the requested type is Value * too, or a default
10322 /// value, otherwise.
10323 struct ValueSelect {
10324   template <typename U>
10325   static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326     return V;
10327   }
10328   template <typename U>
10329   static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330     return U();
10331   }
10332 };
10333 } // namespace
10334 
10335 /// Does the analysis of the provided shuffle masks and performs the requested
10336 /// actions on the vectors with the given shuffle masks. It tries to do it in
10337 /// several steps.
10338 /// 1. If the Base vector is not undef vector, resizing the very first mask to
10339 /// have common VF and perform action for 2 input vectors (including non-undef
10340 /// Base). Other shuffle masks are combined with the resulting after the 1 stage
10341 /// and processed as a shuffle of 2 elements.
10342 /// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
10343 /// action only for 1 vector with the given mask, if it is not the identity
10344 /// mask.
10345 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10346 /// vectors, combing the masks properly between the steps.
10347 template <typename T>
10348 static T *performExtractsShuffleAction(
10349     MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350     function_ref<unsigned(T *)> GetVF,
10351     function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10352     function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10353   assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354   SmallVector<int> Mask(ShuffleMask.begin()->second);
10355   auto VMIt = std::next(ShuffleMask.begin());
10356   T *Prev = nullptr;
10357   SmallBitVector UseMask =
10358       buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10359   SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10360   if (!IsBaseUndef.all()) {
10361     // Base is not undef, need to combine it with the next subvectors.
10362     std::pair<T *, bool> Res =
10363         ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364     SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10365     for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366       if (Mask[Idx] == PoisonMaskElem)
10367         Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368       else
10369         Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370     }
10371     auto *V = ValueSelect::get<T *>(Base);
10372     (void)V;
10373     assert((!V || GetVF(V) == Mask.size()) &&
10374            "Expected base vector of VF number of elements.");
10375     Prev = Action(Mask, {nullptr, Res.first});
10376   } else if (ShuffleMask.size() == 1) {
10377     // Base is undef and only 1 vector is shuffled - perform the action only for
10378     // single vector, if the mask is not the identity mask.
10379     std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380                                             /*ForSingleMask=*/true);
10381     if (Res.second)
10382       // Identity mask is found.
10383       Prev = Res.first;
10384     else
10385       Prev = Action(Mask, {ShuffleMask.begin()->first});
10386   } else {
10387     // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388     // shuffles step by step, combining shuffle between the steps.
10389     unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390     unsigned Vec2VF = GetVF(VMIt->first);
10391     if (Vec1VF == Vec2VF) {
10392       // No need to resize the input vectors since they are of the same size, we
10393       // can shuffle them directly.
10394       ArrayRef<int> SecMask = VMIt->second;
10395       for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396         if (SecMask[I] != PoisonMaskElem) {
10397           assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398           Mask[I] = SecMask[I] + Vec1VF;
10399         }
10400       }
10401       Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402     } else {
10403       // Vectors of different sizes - resize and reshuffle.
10404       std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405                                                /*ForSingleMask=*/false);
10406       std::pair<T *, bool> Res2 =
10407           ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408       ArrayRef<int> SecMask = VMIt->second;
10409       for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410         if (Mask[I] != PoisonMaskElem) {
10411           assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412           if (Res1.second)
10413             Mask[I] = I;
10414         } else if (SecMask[I] != PoisonMaskElem) {
10415           assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416           Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417         }
10418       }
10419       Prev = Action(Mask, {Res1.first, Res2.first});
10420     }
10421     VMIt = std::next(VMIt);
10422   }
10423   bool IsBaseNotUndef = !IsBaseUndef.all();
10424   (void)IsBaseNotUndef;
10425   // Perform requested actions for the remaining masks/vectors.
10426   for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427     // Shuffle other input vectors, if any.
10428     std::pair<T *, bool> Res =
10429         ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430     ArrayRef<int> SecMask = VMIt->second;
10431     for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432       if (SecMask[I] != PoisonMaskElem) {
10433         assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434                "Multiple uses of scalars.");
10435         Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436       } else if (Mask[I] != PoisonMaskElem) {
10437         Mask[I] = I;
10438       }
10439     }
10440     Prev = Action(Mask, {Prev, Res.first});
10441   }
10442   return Prev;
10443 }
10444 
10445 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446   InstructionCost Cost = 0;
10447   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448                     << VectorizableTree.size() << ".\n");
10449 
10450   unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451 
10452   SmallPtrSet<Value *, 4> CheckedExtracts;
10453   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454     TreeEntry &TE = *VectorizableTree[I];
10455     if (TE.isGather()) {
10456       if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10457           E && E->getVectorFactor() == TE.getVectorFactor() &&
10458           E->isSame(TE.Scalars)) {
10459         // Some gather nodes might be absolutely the same as some vectorizable
10460         // nodes after reordering, need to handle it.
10461         LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462                           << shortBundleName(TE.Scalars) << ".\n"
10463                           << "SLP: Current total cost = " << Cost << "\n");
10464         continue;
10465       }
10466     }
10467 
10468     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10469     Cost += C;
10470     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471                       << shortBundleName(TE.Scalars) << ".\n"
10472                       << "SLP: Current total cost = " << Cost << "\n");
10473   }
10474 
10475   SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476   InstructionCost ExtractCost = 0;
10477   SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478   SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479   SmallVector<APInt> DemandedElts;
10480   SmallDenseSet<Value *, 4> UsedInserts;
10481   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483   for (ExternalUser &EU : ExternalUses) {
10484     // We only add extract cost once for the same scalar.
10485     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10486         !ExtractCostCalculated.insert(EU.Scalar).second)
10487       continue;
10488 
10489     // Uses by ephemeral values are free (because the ephemeral value will be
10490     // removed prior to code generation, and so the extraction will be
10491     // removed as well).
10492     if (EphValues.count(EU.User))
10493       continue;
10494 
10495     // No extract cost for vector "scalar"
10496     if (isa<FixedVectorType>(EU.Scalar->getType()))
10497       continue;
10498 
10499     // If found user is an insertelement, do not calculate extract cost but try
10500     // to detect it as a final shuffled/identity match.
10501     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10502         VU && VU->getOperand(1) == EU.Scalar) {
10503       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10504         if (!UsedInserts.insert(VU).second)
10505           continue;
10506         std::optional<unsigned> InsertIdx = getElementIndex(VU);
10507         if (InsertIdx) {
10508           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10509           auto *It = find_if(
10510               FirstUsers,
10511               [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512                 return areTwoInsertFromSameBuildVector(
10513                     VU, cast<InsertElementInst>(Pair.first),
10514                     [this](InsertElementInst *II) -> Value * {
10515                       Value *Op0 = II->getOperand(0);
10516                       if (getTreeEntry(II) && !getTreeEntry(Op0))
10517                         return nullptr;
10518                       return Op0;
10519                     });
10520               });
10521           int VecId = -1;
10522           if (It == FirstUsers.end()) {
10523             (void)ShuffleMasks.emplace_back();
10524             SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525             if (Mask.empty())
10526               Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10527             // Find the insertvector, vectorized in tree, if any.
10528             Value *Base = VU;
10529             while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10530               if (IEBase != EU.User &&
10531                   (!IEBase->hasOneUse() ||
10532                    getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10533                 break;
10534               // Build the mask for the vectorized insertelement instructions.
10535               if (const TreeEntry *E = getTreeEntry(IEBase)) {
10536                 VU = IEBase;
10537                 do {
10538                   IEBase = cast<InsertElementInst>(Base);
10539                   int Idx = *getElementIndex(IEBase);
10540                   assert(Mask[Idx] == PoisonMaskElem &&
10541                          "InsertElementInstruction used already.");
10542                   Mask[Idx] = Idx;
10543                   Base = IEBase->getOperand(0);
10544                 } while (E == getTreeEntry(Base));
10545                 break;
10546               }
10547               Base = cast<InsertElementInst>(Base)->getOperand(0);
10548             }
10549             FirstUsers.emplace_back(VU, ScalarTE);
10550             DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10551             VecId = FirstUsers.size() - 1;
10552             auto It = MinBWs.find(ScalarTE);
10553             if (It != MinBWs.end() &&
10554                 VectorCasts
10555                     .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10556                     .second) {
10557               unsigned BWSz = It->second.first;
10558               unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10559               unsigned VecOpcode;
10560               if (DstBWSz < BWSz)
10561                 VecOpcode = Instruction::Trunc;
10562               else
10563                 VecOpcode =
10564                     It->second.second ? Instruction::SExt : Instruction::ZExt;
10565               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10566               InstructionCost C = TTI->getCastInstrCost(
10567                   VecOpcode, FTy,
10568                   getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10569                                  FTy->getNumElements()),
10570                   TTI::CastContextHint::None, CostKind);
10571               LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572                                 << " for extending externally used vector with "
10573                                    "non-equal minimum bitwidth.\n");
10574               Cost += C;
10575             }
10576           } else {
10577             if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10578               It->first = VU;
10579             VecId = std::distance(FirstUsers.begin(), It);
10580           }
10581           int InIdx = *InsertIdx;
10582           SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583           if (Mask.empty())
10584             Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10585           Mask[InIdx] = EU.Lane;
10586           DemandedElts[VecId].setBit(InIdx);
10587           continue;
10588         }
10589       }
10590     }
10591     // Leave the GEPs as is, they are free in most cases and better to keep them
10592     // as GEPs.
10593     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10594     if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10595       if (!ValueToExtUses) {
10596         ValueToExtUses.emplace();
10597         for_each(enumerate(ExternalUses), [&](const auto &P) {
10598           ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599         });
10600       }
10601       // Can use original GEP, if no operands vectorized or they are marked as
10602       // externally used already.
10603       bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10604         if (!getTreeEntry(V))
10605           return true;
10606         auto It = ValueToExtUses->find(V);
10607         if (It != ValueToExtUses->end()) {
10608           // Replace all uses to avoid compiler crash.
10609           ExternalUses[It->second].User = nullptr;
10610           return true;
10611         }
10612         return false;
10613       });
10614       if (CanBeUsedAsGEP) {
10615         ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10616         ExternalUsesAsGEPs.insert(EU.Scalar);
10617         continue;
10618       }
10619     }
10620 
10621     // If we plan to rewrite the tree in a smaller type, we will need to sign
10622     // extend the extracted value back to the original type. Here, we account
10623     // for the extract and the added cost of the sign extend if needed.
10624     auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10625     auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10626     if (It != MinBWs.end()) {
10627       auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10628       unsigned Extend =
10629           It->second.second ? Instruction::SExt : Instruction::ZExt;
10630       VecTy = getWidenedType(MinTy, BundleWidth);
10631       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10632                                                    VecTy, EU.Lane);
10633     } else {
10634       ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10635                                              CostKind, EU.Lane);
10636     }
10637   }
10638   // Add reduced value cost, if resized.
10639   if (!VectorizedVals.empty()) {
10640     const TreeEntry &Root = *VectorizableTree.front();
10641     auto BWIt = MinBWs.find(&Root);
10642     if (BWIt != MinBWs.end()) {
10643       Type *DstTy = Root.Scalars.front()->getType();
10644       unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10645       unsigned SrcSz =
10646           ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647       if (OriginalSz != SrcSz) {
10648         unsigned Opcode = Instruction::Trunc;
10649         if (OriginalSz > SrcSz)
10650           Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651         Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10652         Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10653                                       TTI::CastContextHint::None,
10654                                       TTI::TCK_RecipThroughput);
10655       }
10656     }
10657   }
10658 
10659   InstructionCost SpillCost = getSpillCost();
10660   Cost += SpillCost + ExtractCost;
10661   auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662                                     bool) {
10663     InstructionCost C = 0;
10664     unsigned VF = Mask.size();
10665     unsigned VecVF = TE->getVectorFactor();
10666     if (VF != VecVF &&
10667         (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668          !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10669       SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670       std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10671                 OrigMask.begin());
10672       C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc,
10673                               getWidenedType(TE->getMainOp()->getType(), VecVF),
10674                               OrigMask);
10675       LLVM_DEBUG(
10676           dbgs() << "SLP: Adding cost " << C
10677                  << " for final shuffle of insertelement external users.\n";
10678           TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679       Cost += C;
10680       return std::make_pair(TE, true);
10681     }
10682     return std::make_pair(TE, false);
10683   };
10684   // Calculate the cost of the reshuffled vectors, if any.
10685   for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686     Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10687     auto Vector = ShuffleMasks[I].takeVector();
10688     unsigned VF = 0;
10689     auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690                                     ArrayRef<const TreeEntry *> TEs) {
10691       assert((TEs.size() == 1 || TEs.size() == 2) &&
10692              "Expected exactly 1 or 2 tree entries.");
10693       if (TEs.size() == 1) {
10694         if (VF == 0)
10695           VF = TEs.front()->getVectorFactor();
10696         auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10697         if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10698             !all_of(enumerate(Mask), [=](const auto &Data) {
10699               return Data.value() == PoisonMaskElem ||
10700                      (Data.index() < VF &&
10701                       static_cast<int>(Data.index()) == Data.value());
10702             })) {
10703           InstructionCost C =
10704               TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
10705           LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706                             << " for final shuffle of insertelement "
10707                                "external users.\n";
10708                      TEs.front()->dump();
10709                      dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710           Cost += C;
10711         }
10712       } else {
10713         if (VF == 0) {
10714           if (TEs.front() &&
10715               TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716             VF = TEs.front()->getVectorFactor();
10717           else
10718             VF = Mask.size();
10719         }
10720         auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10721         InstructionCost C =
10722             ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10723         LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724                           << " for final shuffle of vector node and external "
10725                              "insertelement users.\n";
10726                    if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727                    dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728         Cost += C;
10729       }
10730       VF = Mask.size();
10731       return TEs.back();
10732     };
10733     (void)performExtractsShuffleAction<const TreeEntry>(
10734         MutableArrayRef(Vector.data(), Vector.size()), Base,
10735         [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10736         EstimateShufflesCost);
10737     InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738         cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10739         /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10740     Cost -= InsertCost;
10741   }
10742 
10743   // Add the cost for reduced value resize (if required).
10744   if (ReductionBitWidth != 0) {
10745     assert(UserIgnoreList && "Expected reduction tree.");
10746     const TreeEntry &E = *VectorizableTree.front();
10747     auto It = MinBWs.find(&E);
10748     if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749       unsigned SrcSize = It->second.first;
10750       unsigned DstSize = ReductionBitWidth;
10751       unsigned Opcode = Instruction::Trunc;
10752       if (SrcSize < DstSize)
10753         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754       auto *SrcVecTy =
10755           getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10756       auto *DstVecTy =
10757           getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
10758       TTI::CastContextHint CCH = getCastContextHint(E);
10759       InstructionCost CastCost;
10760       switch (E.getOpcode()) {
10761       case Instruction::SExt:
10762       case Instruction::ZExt:
10763       case Instruction::Trunc: {
10764         const TreeEntry *OpTE = getOperandEntry(&E, 0);
10765         CCH = getCastContextHint(*OpTE);
10766         break;
10767       }
10768       default:
10769         break;
10770       }
10771       CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10772                                         TTI::TCK_RecipThroughput);
10773       Cost += CastCost;
10774       LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775                         << " for final resize for reduction from " << SrcVecTy
10776                         << " to " << DstVecTy << "\n";
10777                  dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778     }
10779   }
10780 
10781 #ifndef NDEBUG
10782   SmallString<256> Str;
10783   {
10784     raw_svector_ostream OS(Str);
10785     OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786        << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787        << "SLP: Total Cost = " << Cost << ".\n";
10788   }
10789   LLVM_DEBUG(dbgs() << Str);
10790   if (ViewSLPTree)
10791     ViewGraph(this, "SLP" + F->getName(), false, Str);
10792 #endif
10793 
10794   return Cost;
10795 }
10796 
10797 /// Tries to find extractelement instructions with constant indices from fixed
10798 /// vector type and gather such instructions into a bunch, which highly likely
10799 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10800 /// successful, the matched scalars are replaced by poison values in \p VL for
10801 /// future analysis.
10802 std::optional<TTI::ShuffleKind>
10803 BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804     MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805   // Scan list of gathered scalars for extractelements that can be represented
10806   // as shuffles.
10807   MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808   SmallVector<int> UndefVectorExtracts;
10809   for (int I = 0, E = VL.size(); I < E; ++I) {
10810     auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10811     if (!EI) {
10812       if (isa<UndefValue>(VL[I]))
10813         UndefVectorExtracts.push_back(I);
10814       continue;
10815     }
10816     auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10817     if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10818       continue;
10819     std::optional<unsigned> Idx = getExtractIndex(EI);
10820     // Undefined index.
10821     if (!Idx) {
10822       UndefVectorExtracts.push_back(I);
10823       continue;
10824     }
10825     SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826     ExtractMask.reset(*Idx);
10827     if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10828       UndefVectorExtracts.push_back(I);
10829       continue;
10830     }
10831     VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10832   }
10833   // Sort the vector operands by the maximum number of uses in extractelements.
10834   SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835       VectorOpToIdx.takeVector();
10836   stable_sort(Vectors, [](const auto &P1, const auto &P2) {
10837     return P1.second.size() > P2.second.size();
10838   });
10839   // Find the best pair of the vectors or a single vector.
10840   const int UndefSz = UndefVectorExtracts.size();
10841   unsigned SingleMax = 0;
10842   unsigned PairMax = 0;
10843   if (!Vectors.empty()) {
10844     SingleMax = Vectors.front().second.size() + UndefSz;
10845     if (Vectors.size() > 1) {
10846       auto *ItNext = std::next(Vectors.begin());
10847       PairMax = SingleMax + ItNext->second.size();
10848     }
10849   }
10850   if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851     return std::nullopt;
10852   // Check if better to perform a shuffle of 2 vectors or just of a single
10853   // vector.
10854   SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855   SmallVector<Value *> GatheredExtracts(
10856       VL.size(), PoisonValue::get(VL.front()->getType()));
10857   if (SingleMax >= PairMax && SingleMax) {
10858     for (int Idx : Vectors.front().second)
10859       std::swap(GatheredExtracts[Idx], VL[Idx]);
10860   } else if (!Vectors.empty()) {
10861     for (unsigned Idx : {0, 1})
10862       for (int Idx : Vectors[Idx].second)
10863         std::swap(GatheredExtracts[Idx], VL[Idx]);
10864   }
10865   // Add extracts from undefs too.
10866   for (int Idx : UndefVectorExtracts)
10867     std::swap(GatheredExtracts[Idx], VL[Idx]);
10868   // Check that gather of extractelements can be represented as just a
10869   // shuffle of a single/two vectors the scalars are extracted from.
10870   std::optional<TTI::ShuffleKind> Res =
10871       isFixedVectorShuffle(GatheredExtracts, Mask);
10872   if (!Res) {
10873     // TODO: try to check other subsets if possible.
10874     // Restore the original VL if attempt was not successful.
10875     copy(SavedVL, VL.begin());
10876     return std::nullopt;
10877   }
10878   // Restore unused scalars from mask, if some of the extractelements were not
10879   // selected for shuffle.
10880   for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881     if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10882         isa<UndefValue>(GatheredExtracts[I])) {
10883       std::swap(VL[I], GatheredExtracts[I]);
10884       continue;
10885     }
10886     auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10887     if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10888         !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10889         is_contained(UndefVectorExtracts, I))
10890       continue;
10891   }
10892   return Res;
10893 }
10894 
10895 /// Tries to find extractelement instructions with constant indices from fixed
10896 /// vector type and gather such instructions into a bunch, which highly likely
10897 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10898 /// successful, the matched scalars are replaced by poison values in \p VL for
10899 /// future analysis.
10900 SmallVector<std::optional<TTI::ShuffleKind>>
10901 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902                                     SmallVectorImpl<int> &Mask,
10903                                     unsigned NumParts) const {
10904   assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10905   SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906   Mask.assign(VL.size(), PoisonMaskElem);
10907   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10908   for (unsigned Part : seq<unsigned>(NumParts)) {
10909     // Scan list of gathered scalars for extractelements that can be represented
10910     // as shuffles.
10911     MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912         Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
10913     SmallVector<int> SubMask;
10914     std::optional<TTI::ShuffleKind> Res =
10915         tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10916     ShufflesRes[Part] = Res;
10917     copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10918   }
10919   if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10920         return Res.has_value();
10921       }))
10922     ShufflesRes.clear();
10923   return ShufflesRes;
10924 }
10925 
10926 std::optional<TargetTransformInfo::ShuffleKind>
10927 BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928     const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929     SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930   Entries.clear();
10931   // TODO: currently checking only for Scalars in the tree entry, need to count
10932   // reused elements too for better cost estimation.
10933   const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934   const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10935   const BasicBlock *TEInsertBlock = nullptr;
10936   // Main node of PHI entries keeps the correct order of operands/incoming
10937   // blocks.
10938   if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10939     TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10940     TEInsertPt = TEInsertBlock->getTerminator();
10941   } else {
10942     TEInsertBlock = TEInsertPt->getParent();
10943   }
10944   if (!DT->isReachableFromEntry(TEInsertBlock))
10945     return std::nullopt;
10946   auto *NodeUI = DT->getNode(TEInsertBlock);
10947   assert(NodeUI && "Should only process reachable instructions");
10948   SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949   auto CheckOrdering = [&](const Instruction *InsertPt) {
10950     // Argument InsertPt is an instruction where vector code for some other
10951     // tree entry (one that shares one or more scalars with TE) is going to be
10952     // generated. This lambda returns true if insertion point of vector code
10953     // for the TE dominates that point (otherwise dependency is the other way
10954     // around). The other node is not limited to be of a gather kind. Gather
10955     // nodes are not scheduled and their vector code is inserted before their
10956     // first user. If user is PHI, that is supposed to be at the end of a
10957     // predecessor block. Otherwise it is the last instruction among scalars of
10958     // the user node. So, instead of checking dependency between instructions
10959     // themselves, we check dependency between their insertion points for vector
10960     // code (since each scalar instruction ends up as a lane of a vector
10961     // instruction).
10962     const BasicBlock *InsertBlock = InsertPt->getParent();
10963     auto *NodeEUI = DT->getNode(InsertBlock);
10964     if (!NodeEUI)
10965       return false;
10966     assert((NodeUI == NodeEUI) ==
10967                (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968            "Different nodes should have different DFS numbers");
10969     // Check the order of the gather nodes users.
10970     if (TEInsertPt->getParent() != InsertBlock &&
10971         (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10972       return false;
10973     if (TEInsertPt->getParent() == InsertBlock &&
10974         TEInsertPt->comesBefore(InsertPt))
10975       return false;
10976     return true;
10977   };
10978   // Find all tree entries used by the gathered values. If no common entries
10979   // found - not a shuffle.
10980   // Here we build a set of tree nodes for each gathered value and trying to
10981   // find the intersection between these sets. If we have at least one common
10982   // tree node for each gathered value - we have just a permutation of the
10983   // single vector. If we have 2 different sets, we're in situation where we
10984   // have a permutation of 2 input vectors.
10985   SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986   DenseMap<Value *, int> UsedValuesEntry;
10987   for (Value *V : VL) {
10988     if (isConstant(V))
10989       continue;
10990     // Build a list of tree entries where V is used.
10991     SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992     for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10993       if (TEPtr == TE)
10994         continue;
10995       assert(any_of(TEPtr->Scalars,
10996                     [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997              "Must contain at least single gathered value.");
10998       assert(TEPtr->UserTreeIndices.size() == 1 &&
10999              "Expected only single user of a gather node.");
11000       const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001 
11002       PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11003       const Instruction *InsertPt =
11004           UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11005                   : &getLastInstructionInBundle(UseEI.UserTE);
11006       if (TEInsertPt == InsertPt) {
11007         // If 2 gathers are operands of the same entry (regardless of whether
11008         // user is PHI or else), compare operands indices, use the earlier one
11009         // as the base.
11010         if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011           continue;
11012         // If the user instruction is used for some reason in different
11013         // vectorized nodes - make it depend on index.
11014         if (TEUseEI.UserTE != UseEI.UserTE &&
11015             TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016           continue;
11017       }
11018 
11019       // Check if the user node of the TE comes after user node of TEPtr,
11020       // otherwise TEPtr depends on TE.
11021       if ((TEInsertBlock != InsertPt->getParent() ||
11022            TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023           !CheckOrdering(InsertPt))
11024         continue;
11025       VToTEs.insert(TEPtr);
11026     }
11027     if (const TreeEntry *VTE = getTreeEntry(V)) {
11028       if (ForOrder) {
11029         if (VTE->State != TreeEntry::Vectorize) {
11030           auto It = MultiNodeScalars.find(V);
11031           if (It == MultiNodeScalars.end())
11032             continue;
11033           VTE = *It->getSecond().begin();
11034           // Iterate through all vectorized nodes.
11035           auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11036             return MTE->State == TreeEntry::Vectorize;
11037           });
11038           if (MIt == It->getSecond().end())
11039             continue;
11040           VTE = *MIt;
11041         }
11042       }
11043       Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11044       if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045         continue;
11046       VToTEs.insert(VTE);
11047     }
11048     if (VToTEs.empty())
11049       continue;
11050     if (UsedTEs.empty()) {
11051       // The first iteration, just insert the list of nodes to vector.
11052       UsedTEs.push_back(VToTEs);
11053       UsedValuesEntry.try_emplace(V, 0);
11054     } else {
11055       // Need to check if there are any previously used tree nodes which use V.
11056       // If there are no such nodes, consider that we have another one input
11057       // vector.
11058       SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059       unsigned Idx = 0;
11060       for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061         // Do we have a non-empty intersection of previously listed tree entries
11062         // and tree entries using current V?
11063         set_intersect(VToTEs, Set);
11064         if (!VToTEs.empty()) {
11065           // Yes, write the new subset and continue analysis for the next
11066           // scalar.
11067           Set.swap(VToTEs);
11068           break;
11069         }
11070         VToTEs = SavedVToTEs;
11071         ++Idx;
11072       }
11073       // No non-empty intersection found - need to add a second set of possible
11074       // source vectors.
11075       if (Idx == UsedTEs.size()) {
11076         // If the number of input vectors is greater than 2 - not a permutation,
11077         // fallback to the regular gather.
11078         // TODO: support multiple reshuffled nodes.
11079         if (UsedTEs.size() == 2)
11080           continue;
11081         UsedTEs.push_back(SavedVToTEs);
11082         Idx = UsedTEs.size() - 1;
11083       }
11084       UsedValuesEntry.try_emplace(V, Idx);
11085     }
11086   }
11087 
11088   if (UsedTEs.empty()) {
11089     Entries.clear();
11090     return std::nullopt;
11091   }
11092 
11093   unsigned VF = 0;
11094   if (UsedTEs.size() == 1) {
11095     // Keep the order to avoid non-determinism.
11096     SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097                                                 UsedTEs.front().end());
11098     sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099       return TE1->Idx < TE2->Idx;
11100     });
11101     // Try to find the perfect match in another gather node at first.
11102     auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11103       return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11104     });
11105     if (It != FirstEntries.end() &&
11106         ((*It)->getVectorFactor() == VL.size() ||
11107          ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108           TE->ReuseShuffleIndices.size() == VL.size() &&
11109           (*It)->isSame(TE->Scalars)))) {
11110       Entries.push_back(*It);
11111       if ((*It)->getVectorFactor() == VL.size()) {
11112         std::iota(std::next(Mask.begin(), Part * VL.size()),
11113                   std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11114       } else {
11115         SmallVector<int> CommonMask = TE->getCommonMask();
11116         copy(CommonMask, Mask.begin());
11117       }
11118       // Clear undef scalars.
11119       for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120         if (isa<PoisonValue>(VL[I]))
11121           Mask[I] = PoisonMaskElem;
11122       return TargetTransformInfo::SK_PermuteSingleSrc;
11123     }
11124     // No perfect match, just shuffle, so choose the first tree node from the
11125     // tree.
11126     Entries.push_back(FirstEntries.front());
11127   } else {
11128     // Try to find nodes with the same vector factor.
11129     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130     // Keep the order of tree nodes to avoid non-determinism.
11131     DenseMap<int, const TreeEntry *> VFToTE;
11132     for (const TreeEntry *TE : UsedTEs.front()) {
11133       unsigned VF = TE->getVectorFactor();
11134       auto It = VFToTE.find(VF);
11135       if (It != VFToTE.end()) {
11136         if (It->second->Idx > TE->Idx)
11137           It->getSecond() = TE;
11138         continue;
11139       }
11140       VFToTE.try_emplace(VF, TE);
11141     }
11142     // Same, keep the order to avoid non-determinism.
11143     SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144                                                  UsedTEs.back().end());
11145     sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146       return TE1->Idx < TE2->Idx;
11147     });
11148     for (const TreeEntry *TE : SecondEntries) {
11149       auto It = VFToTE.find(TE->getVectorFactor());
11150       if (It != VFToTE.end()) {
11151         VF = It->first;
11152         Entries.push_back(It->second);
11153         Entries.push_back(TE);
11154         break;
11155       }
11156     }
11157     // No 2 source vectors with the same vector factor - just choose 2 with max
11158     // index.
11159     if (Entries.empty()) {
11160       Entries.push_back(*llvm::max_element(
11161           UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162             return TE1->Idx < TE2->Idx;
11163           }));
11164       Entries.push_back(SecondEntries.front());
11165       VF = std::max(Entries.front()->getVectorFactor(),
11166                     Entries.back()->getVectorFactor());
11167     }
11168   }
11169 
11170   bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11171   // Checks if the 2 PHIs are compatible in terms of high possibility to be
11172   // vectorized.
11173   auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174     auto *PHI = cast<PHINode>(V);
11175     auto *PHI1 = cast<PHINode>(V1);
11176     // Check that all incoming values are compatible/from same parent (if they
11177     // are instructions).
11178     // The incoming values are compatible if they all are constants, or
11179     // instruction with the same/alternate opcodes from the same basic block.
11180     for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181       Value *In = PHI->getIncomingValue(I);
11182       Value *In1 = PHI1->getIncomingValue(I);
11183       if (isConstant(In) && isConstant(In1))
11184         continue;
11185       if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11186         return false;
11187       if (cast<Instruction>(In)->getParent() !=
11188           cast<Instruction>(In1)->getParent())
11189         return false;
11190     }
11191     return true;
11192   };
11193   // Check if the value can be ignored during analysis for shuffled gathers.
11194   // We suppose it is better to ignore instruction, which do not form splats,
11195   // are not vectorized/not extractelements (these instructions will be handled
11196   // by extractelements processing) or may form vector node in future.
11197   auto MightBeIgnored = [=](Value *V) {
11198     auto *I = dyn_cast<Instruction>(V);
11199     return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11200            !isVectorLikeInstWithConstOps(I) &&
11201            !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11202   };
11203   // Check that the neighbor instruction may form a full vector node with the
11204   // current instruction V. It is possible, if they have same/alternate opcode
11205   // and same parent basic block.
11206   auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207     Value *V1 = VL[Idx];
11208     bool UsedInSameVTE = false;
11209     auto It = UsedValuesEntry.find(V1);
11210     if (It != UsedValuesEntry.end())
11211       UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11212     return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213            getSameOpcode({V, V1}, *TLI).getOpcode() &&
11214            cast<Instruction>(V)->getParent() ==
11215                cast<Instruction>(V1)->getParent() &&
11216            (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11217   };
11218   // Build a shuffle mask for better cost estimation and vector emission.
11219   SmallBitVector UsedIdxs(Entries.size());
11220   SmallVector<std::pair<unsigned, int>> EntryLanes;
11221   for (int I = 0, E = VL.size(); I < E; ++I) {
11222     Value *V = VL[I];
11223     auto It = UsedValuesEntry.find(V);
11224     if (It == UsedValuesEntry.end())
11225       continue;
11226     // Do not try to shuffle scalars, if they are constants, or instructions
11227     // that can be vectorized as a result of the following vector build
11228     // vectorization.
11229     if (isConstant(V) || (MightBeIgnored(V) &&
11230                           ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231                            (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232       continue;
11233     unsigned Idx = It->second;
11234     EntryLanes.emplace_back(Idx, I);
11235     UsedIdxs.set(Idx);
11236   }
11237   // Iterate through all shuffled scalars and select entries, which can be used
11238   // for final shuffle.
11239   SmallVector<const TreeEntry *> TempEntries;
11240   for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241     if (!UsedIdxs.test(I))
11242       continue;
11243     // Fix the entry number for the given scalar. If it is the first entry, set
11244     // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11245     // These indices are used when calculating final shuffle mask as the vector
11246     // offset.
11247     for (std::pair<unsigned, int> &Pair : EntryLanes)
11248       if (Pair.first == I)
11249         Pair.first = TempEntries.size();
11250     TempEntries.push_back(Entries[I]);
11251   }
11252   Entries.swap(TempEntries);
11253   if (EntryLanes.size() == Entries.size() &&
11254       !VL.equals(ArrayRef(TE->Scalars)
11255                      .slice(Part * VL.size(),
11256                             std::min<int>(VL.size(), TE->Scalars.size())))) {
11257     // We may have here 1 or 2 entries only. If the number of scalars is equal
11258     // to the number of entries, no need to do the analysis, it is not very
11259     // profitable. Since VL is not the same as TE->Scalars, it means we already
11260     // have some shuffles before. Cut off not profitable case.
11261     Entries.clear();
11262     return std::nullopt;
11263   }
11264   // Build the final mask, check for the identity shuffle, if possible.
11265   bool IsIdentity = Entries.size() == 1;
11266   // Pair.first is the offset to the vector, while Pair.second is the index of
11267   // scalar in the list.
11268   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269     unsigned Idx = Part * VL.size() + Pair.second;
11270     Mask[Idx] =
11271         Pair.first * VF +
11272         (ForOrder ? std::distance(
11273                         Entries[Pair.first]->Scalars.begin(),
11274                         find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11275                   : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11276     IsIdentity &= Mask[Idx] == Pair.second;
11277   }
11278   switch (Entries.size()) {
11279   case 1:
11280     if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281       return TargetTransformInfo::SK_PermuteSingleSrc;
11282     break;
11283   case 2:
11284     if (EntryLanes.size() > 2 || VL.size() <= 2)
11285       return TargetTransformInfo::SK_PermuteTwoSrc;
11286     break;
11287   default:
11288     break;
11289   }
11290   Entries.clear();
11291   // Clear the corresponding mask elements.
11292   std::fill(std::next(Mask.begin(), Part * VL.size()),
11293             std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11294   return std::nullopt;
11295 }
11296 
11297 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11298 BoUpSLP::isGatherShuffledEntry(
11299     const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300     SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301     bool ForOrder) {
11302   assert(NumParts > 0 && NumParts < VL.size() &&
11303          "Expected positive number of registers.");
11304   Entries.clear();
11305   // No need to check for the topmost gather node.
11306   if (TE == VectorizableTree.front().get())
11307     return {};
11308   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309   if (TE->isNonPowOf2Vec())
11310     return {};
11311   Mask.assign(VL.size(), PoisonMaskElem);
11312   assert(TE->UserTreeIndices.size() == 1 &&
11313          "Expected only single user of the gather node.");
11314   assert(VL.size() % NumParts == 0 &&
11315          "Number of scalars must be divisible by NumParts.");
11316   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11317   SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318   for (unsigned Part : seq<unsigned>(NumParts)) {
11319     ArrayRef<Value *> SubVL =
11320         VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11321     SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322     std::optional<TTI::ShuffleKind> SubRes =
11323         isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11324                                             ForOrder);
11325     if (!SubRes)
11326       SubEntries.clear();
11327     Res.push_back(SubRes);
11328     if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329         SubEntries.front()->getVectorFactor() == VL.size() &&
11330         (SubEntries.front()->isSame(TE->Scalars) ||
11331          SubEntries.front()->isSame(VL))) {
11332       SmallVector<const TreeEntry *> LocalSubEntries;
11333       LocalSubEntries.swap(SubEntries);
11334       Entries.clear();
11335       Res.clear();
11336       std::iota(Mask.begin(), Mask.end(), 0);
11337       // Clear undef scalars.
11338       for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339         if (isa<PoisonValue>(VL[I]))
11340           Mask[I] = PoisonMaskElem;
11341       Entries.emplace_back(1, LocalSubEntries.front());
11342       Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
11343       return Res;
11344     }
11345   }
11346   if (all_of(Res,
11347              [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348     Entries.clear();
11349     return {};
11350   }
11351   return Res;
11352 }
11353 
11354 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355                                        Type *ScalarTy) const {
11356   auto *VecTy = getWidenedType(ScalarTy, VL.size());
11357   bool DuplicateNonConst = false;
11358   // Find the cost of inserting/extracting values from the vector.
11359   // Check if the same elements are inserted several times and count them as
11360   // shuffle candidates.
11361   APInt ShuffledElements = APInt::getZero(VL.size());
11362   DenseMap<Value *, unsigned> UniqueElements;
11363   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364   InstructionCost Cost;
11365   auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366     if (V->getType() != ScalarTy) {
11367       Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11368                                     TTI::CastContextHint::None, CostKind);
11369       V = nullptr;
11370     }
11371     if (!ForPoisonSrc)
11372       Cost +=
11373           TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11374                                   I, Constant::getNullValue(VecTy), V);
11375   };
11376   SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378     Value *V = VL[I];
11379     // No need to shuffle duplicates for constants.
11380     if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11381       ShuffledElements.setBit(I);
11382       ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11383       continue;
11384     }
11385 
11386     auto Res = UniqueElements.try_emplace(V, I);
11387     if (Res.second) {
11388       EstimateInsertCost(I, V);
11389       ShuffleMask[I] = I;
11390       continue;
11391     }
11392 
11393     DuplicateNonConst = true;
11394     ShuffledElements.setBit(I);
11395     ShuffleMask[I] = Res.first->second;
11396   }
11397   if (ForPoisonSrc)
11398     Cost =
11399         TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11400                                       /*Extract*/ false, CostKind);
11401   if (DuplicateNonConst)
11402     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11403                                 VecTy, ShuffleMask);
11404   return Cost;
11405 }
11406 
11407 // Perform operand reordering on the instructions in VL and return the reordered
11408 // operands in Left and Right.
11409 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410                                              SmallVectorImpl<Value *> &Left,
11411                                              SmallVectorImpl<Value *> &Right,
11412                                              const BoUpSLP &R) {
11413   if (VL.empty())
11414     return;
11415   VLOperands Ops(VL, R);
11416   // Reorder the operands in place.
11417   Ops.reorder();
11418   Left = Ops.getVL(0);
11419   Right = Ops.getVL(1);
11420 }
11421 
11422 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423   auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11424   if (Res.second)
11425     return *Res.second;
11426   // Get the basic block this bundle is in. All instructions in the bundle
11427   // should be in this block (except for extractelement-like instructions with
11428   // constant indeces).
11429   auto *Front = E->getMainOp();
11430   auto *BB = Front->getParent();
11431   assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432     if (E->getOpcode() == Instruction::GetElementPtr &&
11433         !isa<GetElementPtrInst>(V))
11434       return true;
11435     auto *I = cast<Instruction>(V);
11436     return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437            isVectorLikeInstWithConstOps(I);
11438   }));
11439 
11440   auto FindLastInst = [&]() {
11441     Instruction *LastInst = Front;
11442     for (Value *V : E->Scalars) {
11443       auto *I = dyn_cast<Instruction>(V);
11444       if (!I)
11445         continue;
11446       if (LastInst->getParent() == I->getParent()) {
11447         if (LastInst->comesBefore(I))
11448           LastInst = I;
11449         continue;
11450       }
11451       assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452                !isa<GetElementPtrInst>(I)) ||
11453               (isVectorLikeInstWithConstOps(LastInst) &&
11454                isVectorLikeInstWithConstOps(I))) &&
11455              "Expected vector-like or non-GEP in GEP node insts only.");
11456       if (!DT->isReachableFromEntry(LastInst->getParent())) {
11457         LastInst = I;
11458         continue;
11459       }
11460       if (!DT->isReachableFromEntry(I->getParent()))
11461         continue;
11462       auto *NodeA = DT->getNode(LastInst->getParent());
11463       auto *NodeB = DT->getNode(I->getParent());
11464       assert(NodeA && "Should only process reachable instructions");
11465       assert(NodeB && "Should only process reachable instructions");
11466       assert((NodeA == NodeB) ==
11467                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468              "Different nodes should have different DFS numbers");
11469       if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470         LastInst = I;
11471     }
11472     BB = LastInst->getParent();
11473     return LastInst;
11474   };
11475 
11476   auto FindFirstInst = [&]() {
11477     Instruction *FirstInst = Front;
11478     for (Value *V : E->Scalars) {
11479       auto *I = dyn_cast<Instruction>(V);
11480       if (!I)
11481         continue;
11482       if (FirstInst->getParent() == I->getParent()) {
11483         if (I->comesBefore(FirstInst))
11484           FirstInst = I;
11485         continue;
11486       }
11487       assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488               !isa<GetElementPtrInst>(I)) ||
11489              (isVectorLikeInstWithConstOps(FirstInst) &&
11490               isVectorLikeInstWithConstOps(I))) &&
11491                  "Expected vector-like or non-GEP in GEP node insts only.");
11492       if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11493         FirstInst = I;
11494         continue;
11495       }
11496       if (!DT->isReachableFromEntry(I->getParent()))
11497         continue;
11498       auto *NodeA = DT->getNode(FirstInst->getParent());
11499       auto *NodeB = DT->getNode(I->getParent());
11500       assert(NodeA && "Should only process reachable instructions");
11501       assert(NodeB && "Should only process reachable instructions");
11502       assert((NodeA == NodeB) ==
11503                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504              "Different nodes should have different DFS numbers");
11505       if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506         FirstInst = I;
11507     }
11508     return FirstInst;
11509   };
11510 
11511   // Set the insert point to the beginning of the basic block if the entry
11512   // should not be scheduled.
11513   if (doesNotNeedToSchedule(E->Scalars) ||
11514       (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11515     if ((E->getOpcode() == Instruction::GetElementPtr &&
11516          any_of(E->Scalars,
11517                 [](Value *V) {
11518                   return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11519                 })) ||
11520         all_of(E->Scalars,
11521                [](Value *V) {
11522                  return !isVectorLikeInstWithConstOps(V) &&
11523                         isUsedOutsideBlock(V);
11524                }) ||
11525         (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11526            return isa<ExtractElementInst, UndefValue>(V) ||
11527                   areAllOperandsNonInsts(V);
11528          })))
11529       Res.second = FindLastInst();
11530     else
11531       Res.second = FindFirstInst();
11532     return *Res.second;
11533   }
11534 
11535   // Find the last instruction. The common case should be that BB has been
11536   // scheduled, and the last instruction is VL.back(). So we start with
11537   // VL.back() and iterate over schedule data until we reach the end of the
11538   // bundle. The end of the bundle is marked by null ScheduleData.
11539   if (BlocksSchedules.count(BB)) {
11540     Value *V = E->isOneOf(E->Scalars.back());
11541     if (doesNotNeedToBeScheduled(V))
11542       V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11543     auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544     if (Bundle && Bundle->isPartOfBundle())
11545       for (; Bundle; Bundle = Bundle->NextInBundle)
11546         if (Bundle->OpValue == Bundle->Inst)
11547           Res.second = Bundle->Inst;
11548   }
11549 
11550   // LastInst can still be null at this point if there's either not an entry
11551   // for BB in BlocksSchedules or there's no ScheduleData available for
11552   // VL.back(). This can be the case if buildTree_rec aborts for various
11553   // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554   // size is reached, etc.). ScheduleData is initialized in the scheduling
11555   // "dry-run".
11556   //
11557   // If this happens, we can still find the last instruction by brute force. We
11558   // iterate forwards from Front (inclusive) until we either see all
11559   // instructions in the bundle or reach the end of the block. If Front is the
11560   // last instruction in program order, LastInst will be set to Front, and we
11561   // will visit all the remaining instructions in the block.
11562   //
11563   // One of the reasons we exit early from buildTree_rec is to place an upper
11564   // bound on compile-time. Thus, taking an additional compile-time hit here is
11565   // not ideal. However, this should be exceedingly rare since it requires that
11566   // we both exit early from buildTree_rec and that the bundle be out-of-order
11567   // (causing us to iterate all the way to the end of the block).
11568   if (!Res.second)
11569     Res.second = FindLastInst();
11570   assert(Res.second && "Failed to find last instruction in bundle");
11571   return *Res.second;
11572 }
11573 
11574 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575   auto *Front = E->getMainOp();
11576   Instruction *LastInst = &getLastInstructionInBundle(E);
11577   assert(LastInst && "Failed to find last instruction in bundle");
11578   BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579   // If the instruction is PHI, set the insert point after all the PHIs.
11580   bool IsPHI = isa<PHINode>(LastInst);
11581   if (IsPHI)
11582     LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583   if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11584     Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11585   } else {
11586     // Set the insertion point after the last instruction in the bundle. Set the
11587     // debug location to Front.
11588     Builder.SetInsertPoint(
11589         LastInst->getParent(),
11590         LastInst->getNextNonDebugInstruction()->getIterator());
11591   }
11592   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593 }
11594 
11595 Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596   // List of instructions/lanes from current block and/or the blocks which are
11597   // part of the current loop. These instructions will be inserted at the end to
11598   // make it possible to optimize loops and hoist invariant instructions out of
11599   // the loops body with better chances for success.
11600   SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11601   SmallSet<int, 4> PostponedIndices;
11602   Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11603   auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11604     SmallPtrSet<BasicBlock *, 4> Visited;
11605     while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11606       InsertBB = InsertBB->getSinglePredecessor();
11607     return InsertBB && InsertBB == InstBB;
11608   };
11609   for (int I = 0, E = VL.size(); I < E; ++I) {
11610     if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11611       if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612            getTreeEntry(Inst) ||
11613            (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11614           PostponedIndices.insert(I).second)
11615         PostponedInsts.emplace_back(Inst, I);
11616   }
11617 
11618   auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619                                       Type *Ty) {
11620     Value *Scalar = V;
11621     if (Scalar->getType() != Ty) {
11622       assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623              "Expected integer types only.");
11624       Value *V = Scalar;
11625       if (auto *CI = dyn_cast<CastInst>(Scalar);
11626           isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11627         Value *Op = CI->getOperand(0);
11628         if (auto *IOp = dyn_cast<Instruction>(Op);
11629             !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11630           V = Op;
11631       }
11632       Scalar = Builder.CreateIntCast(
11633           V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11634     }
11635 
11636     Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11637     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11638     if (!InsElt)
11639       return Vec;
11640     GatherShuffleExtractSeq.insert(InsElt);
11641     CSEBlocks.insert(InsElt->getParent());
11642     // Add to our 'need-to-extract' list.
11643     if (isa<Instruction>(V)) {
11644       if (TreeEntry *Entry = getTreeEntry(V)) {
11645         // Find which lane we need to extract.
11646         User *UserOp = nullptr;
11647         if (Scalar != V) {
11648           if (auto *SI = dyn_cast<Instruction>(Scalar))
11649             UserOp = SI;
11650         } else {
11651           UserOp = InsElt;
11652         }
11653         if (UserOp) {
11654           unsigned FoundLane = Entry->findLaneForValue(V);
11655           ExternalUses.emplace_back(V, UserOp, FoundLane);
11656         }
11657       }
11658     }
11659     return Vec;
11660   };
11661   auto *VecTy = getWidenedType(ScalarTy, VL.size());
11662   Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11663   SmallVector<int> NonConsts;
11664   // Insert constant values at first.
11665   for (int I = 0, E = VL.size(); I < E; ++I) {
11666     if (PostponedIndices.contains(I))
11667       continue;
11668     if (!isConstant(VL[I])) {
11669       NonConsts.push_back(I);
11670       continue;
11671     }
11672     if (Root) {
11673       if (!isa<UndefValue>(VL[I])) {
11674         NonConsts.push_back(I);
11675         continue;
11676       }
11677       if (isa<PoisonValue>(VL[I]))
11678         continue;
11679       if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11680         if (SV->getMaskValue(I) == PoisonMaskElem)
11681           continue;
11682       }
11683     }
11684     Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685   }
11686   // Insert non-constant values.
11687   for (int I : NonConsts)
11688     Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689   // Append instructions, which are/may be part of the loop, in the end to make
11690   // it possible to hoist non-loop-based instructions.
11691   for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692     Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693 
11694   return Vec;
11695 }
11696 
11697 /// Merges shuffle masks and emits final shuffle instruction, if required. It
11698 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
11699 /// when the actual shuffle instruction is generated only if this is actually
11700 /// required. Otherwise, the shuffle instruction emission is delayed till the
11701 /// end of the process, to reduce the number of emitted instructions and further
11702 /// analysis/transformations.
11703 /// The class also will look through the previously emitted shuffle instructions
11704 /// and properly mark indices in mask as undef.
11705 /// For example, given the code
11706 /// \code
11707 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11708 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11709 /// \endcode
11710 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11711 /// look through %s1 and %s2 and emit
11712 /// \code
11713 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11714 /// \endcode
11715 /// instead.
11716 /// If 2 operands are of different size, the smallest one will be resized and
11717 /// the mask recalculated properly.
11718 /// For example, given the code
11719 /// \code
11720 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11721 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11722 /// \endcode
11723 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11724 /// look through %s1 and %s2 and emit
11725 /// \code
11726 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11727 /// \endcode
11728 /// instead.
11729 class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730   bool IsFinalized = false;
11731   /// Combined mask for all applied operands and masks. It is built during
11732   /// analysis and actual emission of shuffle vector instructions.
11733   SmallVector<int> CommonMask;
11734   /// List of operands for the shuffle vector instruction. It hold at max 2
11735   /// operands, if the 3rd is going to be added, the first 2 are combined into
11736   /// shuffle with \p CommonMask mask, the first operand sets to be the
11737   /// resulting shuffle and the second operand sets to be the newly added
11738   /// operand. The \p CommonMask is transformed in the proper way after that.
11739   SmallVector<Value *, 2> InVectors;
11740   Type *ScalarTy = nullptr;
11741   IRBuilderBase &Builder;
11742   BoUpSLP &R;
11743 
11744   class ShuffleIRBuilder {
11745     IRBuilderBase &Builder;
11746     /// Holds all of the instructions that we gathered.
11747     SetVector<Instruction *> &GatherShuffleExtractSeq;
11748     /// A list of blocks that we are going to CSE.
11749     DenseSet<BasicBlock *> &CSEBlocks;
11750     /// Data layout.
11751     const DataLayout &DL;
11752 
11753   public:
11754     ShuffleIRBuilder(IRBuilderBase &Builder,
11755                      SetVector<Instruction *> &GatherShuffleExtractSeq,
11756                      DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757         : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758           CSEBlocks(CSEBlocks), DL(DL) {}
11759     ~ShuffleIRBuilder() = default;
11760     /// Creates shufflevector for the 2 operands with the given mask.
11761     Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762       if (V1->getType() != V2->getType()) {
11763         assert(V1->getType()->isIntOrIntVectorTy() &&
11764                V1->getType()->isIntOrIntVectorTy() &&
11765                "Expected integer vector types only.");
11766         if (V1->getType() != V2->getType()) {
11767           if (cast<VectorType>(V2->getType())
11768                   ->getElementType()
11769                   ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11770                                                ->getElementType()
11771                                                ->getIntegerBitWidth())
11772             V2 = Builder.CreateIntCast(
11773                 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11774           else
11775             V1 = Builder.CreateIntCast(
11776                 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11777         }
11778       }
11779       Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780       if (auto *I = dyn_cast<Instruction>(Vec)) {
11781         GatherShuffleExtractSeq.insert(I);
11782         CSEBlocks.insert(I->getParent());
11783       }
11784       return Vec;
11785     }
11786     /// Creates permutation of the single vector operand with the given mask, if
11787     /// it is not identity mask.
11788     Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789       if (Mask.empty())
11790         return V1;
11791       unsigned VF = Mask.size();
11792       unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11793       if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11794         return V1;
11795       Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11796       if (auto *I = dyn_cast<Instruction>(Vec)) {
11797         GatherShuffleExtractSeq.insert(I);
11798         CSEBlocks.insert(I->getParent());
11799       }
11800       return Vec;
11801     }
11802     Value *createIdentity(Value *V) { return V; }
11803     Value *createPoison(Type *Ty, unsigned VF) {
11804       return PoisonValue::get(getWidenedType(Ty, VF));
11805     }
11806     /// Resizes 2 input vector to match the sizes, if the they are not equal
11807     /// yet. The smallest vector is resized to the size of the larger vector.
11808     void resizeToMatch(Value *&V1, Value *&V2) {
11809       if (V1->getType() == V2->getType())
11810         return;
11811       int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11812       int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11813       int VF = std::max(V1VF, V2VF);
11814       int MinVF = std::min(V1VF, V2VF);
11815       SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816       std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11817                 0);
11818       Value *&Op = MinVF == V1VF ? V1 : V2;
11819       Op = Builder.CreateShuffleVector(Op, IdentityMask);
11820       if (auto *I = dyn_cast<Instruction>(Op)) {
11821         GatherShuffleExtractSeq.insert(I);
11822         CSEBlocks.insert(I->getParent());
11823       }
11824       if (MinVF == V1VF)
11825         V1 = Op;
11826       else
11827         V2 = Op;
11828     }
11829   };
11830 
11831   /// Smart shuffle instruction emission, walks through shuffles trees and
11832   /// tries to find the best matching vector for the actual shuffle
11833   /// instruction.
11834   Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835     assert(V1 && "Expected at least one vector value.");
11836     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837                                     R.CSEBlocks, *R.DL);
11838     return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839                                                        ShuffleBuilder);
11840   }
11841 
11842   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11843   /// shuffle emission.
11844   static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845                                         ArrayRef<int> Mask) {
11846     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847       if (Mask[Idx] != PoisonMaskElem)
11848         CommonMask[Idx] = Idx;
11849   }
11850 
11851   /// Cast value \p V to the vector type with the same number of elements, but
11852   /// the base type \p ScalarTy.
11853   Value *castToScalarTyElem(Value *V,
11854                             std::optional<bool> IsSigned = std::nullopt) {
11855     auto *VecTy = cast<VectorType>(V->getType());
11856     assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857     if (VecTy->getElementType() == ScalarTy->getScalarType())
11858       return V;
11859     return Builder.CreateIntCast(
11860         V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
11861         IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11862   }
11863 
11864 public:
11865   ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866       : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867 
11868   /// Adjusts extractelements after reusing them.
11869   Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870                         ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871                         unsigned NumParts, bool &UseVecBaseAsInput) {
11872     UseVecBaseAsInput = false;
11873     SmallPtrSet<Value *, 4> UniqueBases;
11874     Value *VecBase = nullptr;
11875     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876       int Idx = Mask[I];
11877       if (Idx == PoisonMaskElem)
11878         continue;
11879       auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11880       VecBase = EI->getVectorOperand();
11881       if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11882         VecBase = TE->VectorizedValue;
11883       assert(VecBase && "Expected vectorized value.");
11884       UniqueBases.insert(VecBase);
11885       // If the only one use is vectorized - can delete the extractelement
11886       // itself.
11887       if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11888           any_of(EI->users(), [&](User *U) {
11889             const TreeEntry *UTE = R.getTreeEntry(U);
11890             return !UTE || R.MultiNodeScalars.contains(U) ||
11891                    (isa<GetElementPtrInst>(U) &&
11892                     !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11893                    count_if(R.VectorizableTree,
11894                             [&](const std::unique_ptr<TreeEntry> &TE) {
11895                               return any_of(TE->UserTreeIndices,
11896                                             [&](const EdgeInfo &Edge) {
11897                                               return Edge.UserTE == UTE;
11898                                             }) &&
11899                                      is_contained(TE->Scalars, EI);
11900                             }) != 1;
11901           }))
11902         continue;
11903       R.eraseInstruction(EI);
11904     }
11905     if (NumParts == 1 || UniqueBases.size() == 1) {
11906       assert(VecBase && "Expected vectorized value.");
11907       return castToScalarTyElem(VecBase);
11908     }
11909     UseVecBaseAsInput = true;
11910     auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911       for (auto [I, Idx] : enumerate(Mask))
11912         if (Idx != PoisonMaskElem)
11913           Idx = I;
11914     };
11915     // Perform multi-register vector shuffle, joining them into a single virtual
11916     // long vector.
11917     // Need to shuffle each part independently and then insert all this parts
11918     // into a long virtual vector register, forming the original vector.
11919     Value *Vec = nullptr;
11920     SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921     unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
11922     for (unsigned Part : seq<unsigned>(NumParts)) {
11923       unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
11924       ArrayRef<Value *> VL =
11925           ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
11926       MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
11927       constexpr int MaxBases = 2;
11928       SmallVector<Value *, MaxBases> Bases(MaxBases);
11929       auto VLMask = zip(VL, SubMask);
11930       const unsigned VF = std::accumulate(
11931           VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
11932             if (std::get<1>(D) == PoisonMaskElem)
11933               return S;
11934             Value *VecOp =
11935                 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936             if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11937               VecOp = TE->VectorizedValue;
11938             assert(VecOp && "Expected vectorized value.");
11939             const unsigned Size =
11940                 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11941             return std::max(S, Size);
11942           });
11943       for (const auto [V, I] : VLMask) {
11944         if (I == PoisonMaskElem)
11945           continue;
11946         Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11947         if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11948           VecOp = TE->VectorizedValue;
11949         assert(VecOp && "Expected vectorized value.");
11950         VecOp = castToScalarTyElem(VecOp);
11951         Bases[I / VF] = VecOp;
11952       }
11953       if (!Bases.front())
11954         continue;
11955       Value *SubVec;
11956       if (Bases.back()) {
11957         SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11958         TransformToIdentity(SubMask);
11959       } else {
11960         SubVec = Bases.front();
11961       }
11962       if (!Vec) {
11963         Vec = SubVec;
11964         assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965                                     [&](unsigned P) {
11966                                       ArrayRef<int> SubMask =
11967                                           Mask.slice(P * SliceSize,
11968                                                      getNumElems(Mask.size(),
11969                                                                  SliceSize, P));
11970                                       return all_of(SubMask, [](int Idx) {
11971                                         return Idx == PoisonMaskElem;
11972                                       });
11973                                     })) &&
11974                "Expected first part or all previous parts masked.");
11975         copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11976       } else {
11977         unsigned NewVF =
11978             cast<FixedVectorType>(Vec->getType())->getNumElements();
11979         if (Vec->getType() != SubVec->getType()) {
11980           unsigned SubVecVF =
11981               cast<FixedVectorType>(SubVec->getType())->getNumElements();
11982           NewVF = std::max(NewVF, SubVecVF);
11983         }
11984         // Adjust SubMask.
11985         for (int &Idx : SubMask)
11986           if (Idx != PoisonMaskElem)
11987             Idx += NewVF;
11988         copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11989         Vec = createShuffle(Vec, SubVec, VecMask);
11990         TransformToIdentity(VecMask);
11991       }
11992     }
11993     copy(VecMask, Mask.begin());
11994     return Vec;
11995   }
11996   /// Checks if the specified entry \p E needs to be delayed because of its
11997   /// dependency nodes.
11998   std::optional<Value *>
11999   needToDelay(const TreeEntry *E,
12000               ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001     // No need to delay emission if all deps are ready.
12002     if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12003           return all_of(
12004               TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005         }))
12006       return std::nullopt;
12007     // Postpone gather emission, will be emitted after the end of the
12008     // process to keep correct order.
12009     auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
12010     return Builder.CreateAlignedLoad(
12011         ResVecTy,
12012         PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12013         MaybeAlign());
12014   }
12015   /// Adds 2 input vectors (in form of tree entries) and the mask for their
12016   /// shuffling.
12017   void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12018     Value *V1 = E1.VectorizedValue;
12019     if (V1->getType()->isIntOrIntVectorTy())
12020       V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12021                                 return !isKnownNonNegative(
12022                                     V, SimplifyQuery(*R.DL));
12023                               }));
12024     Value *V2 = E2.VectorizedValue;
12025     if (V2->getType()->isIntOrIntVectorTy())
12026       V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12027                                 return !isKnownNonNegative(
12028                                     V, SimplifyQuery(*R.DL));
12029                               }));
12030     add(V1, V2, Mask);
12031   }
12032   /// Adds single input vector (in form of tree entry) and the mask for its
12033   /// shuffling.
12034   void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035     Value *V1 = E1.VectorizedValue;
12036     if (V1->getType()->isIntOrIntVectorTy())
12037       V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12038                                 return !isKnownNonNegative(
12039                                     V, SimplifyQuery(*R.DL));
12040                               }));
12041     add(V1, Mask);
12042   }
12043   /// Adds 2 input vectors and the mask for their shuffling.
12044   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12045     assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046     V1 = castToScalarTyElem(V1);
12047     V2 = castToScalarTyElem(V2);
12048     if (InVectors.empty()) {
12049       InVectors.push_back(V1);
12050       InVectors.push_back(V2);
12051       CommonMask.assign(Mask.begin(), Mask.end());
12052       return;
12053     }
12054     Value *Vec = InVectors.front();
12055     if (InVectors.size() == 2) {
12056       Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12057       transformMaskAfterShuffle(CommonMask, CommonMask);
12058     } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12059                Mask.size()) {
12060       Vec = createShuffle(Vec, nullptr, CommonMask);
12061       transformMaskAfterShuffle(CommonMask, CommonMask);
12062     }
12063     V1 = createShuffle(V1, V2, Mask);
12064     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12065       if (Mask[Idx] != PoisonMaskElem)
12066         CommonMask[Idx] = Idx + Sz;
12067     InVectors.front() = Vec;
12068     if (InVectors.size() == 2)
12069       InVectors.back() = V1;
12070     else
12071       InVectors.push_back(V1);
12072   }
12073   /// Adds another one input vector and the mask for the shuffling.
12074   void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12075     V1 = castToScalarTyElem(V1);
12076     if (InVectors.empty()) {
12077       if (!isa<FixedVectorType>(V1->getType())) {
12078         V1 = createShuffle(V1, nullptr, CommonMask);
12079         CommonMask.assign(Mask.size(), PoisonMaskElem);
12080         transformMaskAfterShuffle(CommonMask, Mask);
12081       }
12082       InVectors.push_back(V1);
12083       CommonMask.assign(Mask.begin(), Mask.end());
12084       return;
12085     }
12086     const auto *It = find(InVectors, V1);
12087     if (It == InVectors.end()) {
12088       if (InVectors.size() == 2 ||
12089           InVectors.front()->getType() != V1->getType() ||
12090           !isa<FixedVectorType>(V1->getType())) {
12091         Value *V = InVectors.front();
12092         if (InVectors.size() == 2) {
12093           V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12094           transformMaskAfterShuffle(CommonMask, CommonMask);
12095         } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12096                    CommonMask.size()) {
12097           V = createShuffle(InVectors.front(), nullptr, CommonMask);
12098           transformMaskAfterShuffle(CommonMask, CommonMask);
12099         }
12100         for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12101           if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12102             CommonMask[Idx] =
12103                 V->getType() != V1->getType()
12104                     ? Idx + Sz
12105                     : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12106                                       ->getNumElements();
12107         if (V->getType() != V1->getType())
12108           V1 = createShuffle(V1, nullptr, Mask);
12109         InVectors.front() = V;
12110         if (InVectors.size() == 2)
12111           InVectors.back() = V1;
12112         else
12113           InVectors.push_back(V1);
12114         return;
12115       }
12116       // Check if second vector is required if the used elements are already
12117       // used from the first one.
12118       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12119         if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12120           InVectors.push_back(V1);
12121           break;
12122         }
12123     }
12124     int VF = CommonMask.size();
12125     if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12126       VF = FTy->getNumElements();
12127     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12128       if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12129         CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12130   }
12131   /// Adds another one input vector and the mask for the shuffling.
12132   void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12133     SmallVector<int> NewMask;
12134     inversePermutation(Order, NewMask);
12135     add(V1, NewMask);
12136   }
12137   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12138                 Value *Root = nullptr) {
12139     return R.gather(VL, Root, ScalarTy);
12140   }
12141   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12142   /// Finalize emission of the shuffles.
12143   /// \param Action the action (if any) to be performed before final applying of
12144   /// the \p ExtMask mask.
12145   Value *
12146   finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12147            function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12148     IsFinalized = true;
12149     if (Action) {
12150       Value *Vec = InVectors.front();
12151       if (InVectors.size() == 2) {
12152         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12153         InVectors.pop_back();
12154       } else {
12155         Vec = createShuffle(Vec, nullptr, CommonMask);
12156       }
12157       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12158         if (CommonMask[Idx] != PoisonMaskElem)
12159           CommonMask[Idx] = Idx;
12160       assert(VF > 0 &&
12161              "Expected vector length for the final value before action.");
12162       unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12163       if (VecVF < VF) {
12164         SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12165         std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12166         Vec = createShuffle(Vec, nullptr, ResizeMask);
12167       }
12168       Action(Vec, CommonMask);
12169       InVectors.front() = Vec;
12170     }
12171     if (!ExtMask.empty()) {
12172       if (CommonMask.empty()) {
12173         CommonMask.assign(ExtMask.begin(), ExtMask.end());
12174       } else {
12175         SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12176         for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12177           if (ExtMask[I] == PoisonMaskElem)
12178             continue;
12179           NewMask[I] = CommonMask[ExtMask[I]];
12180         }
12181         CommonMask.swap(NewMask);
12182       }
12183     }
12184     if (CommonMask.empty()) {
12185       assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12186       return InVectors.front();
12187     }
12188     if (InVectors.size() == 2)
12189       return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12190     return createShuffle(InVectors.front(), nullptr, CommonMask);
12191   }
12192 
12193   ~ShuffleInstructionBuilder() {
12194     assert((IsFinalized || CommonMask.empty()) &&
12195            "Shuffle construction must be finalized.");
12196   }
12197 };
12198 
12199 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12200                                  bool PostponedPHIs) {
12201   ValueList &VL = E->getOperand(NodeIdx);
12202   const unsigned VF = VL.size();
12203   InstructionsState S = getSameOpcode(VL, *TLI);
12204   // Special processing for GEPs bundle, which may include non-gep values.
12205   if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12206     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12207     if (It != VL.end())
12208       S = getSameOpcode(*It, *TLI);
12209   }
12210   if (S.getOpcode()) {
12211     auto CheckSameVE = [&](const TreeEntry *VE) {
12212       return VE->isSame(VL) &&
12213              (any_of(VE->UserTreeIndices,
12214                      [E, NodeIdx](const EdgeInfo &EI) {
12215                        return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12216                      }) ||
12217               any_of(VectorizableTree,
12218                      [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12219                        return TE->isOperandGatherNode({E, NodeIdx}) &&
12220                               VE->isSame(TE->Scalars);
12221                      }));
12222     };
12223     TreeEntry *VE = getTreeEntry(S.OpValue);
12224     bool IsSameVE = VE && CheckSameVE(VE);
12225     if (!IsSameVE) {
12226       auto It = MultiNodeScalars.find(S.OpValue);
12227       if (It != MultiNodeScalars.end()) {
12228         auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12229           return TE != VE && CheckSameVE(TE);
12230         });
12231         if (I != It->getSecond().end()) {
12232           VE = *I;
12233           IsSameVE = true;
12234         }
12235       }
12236     }
12237     if (IsSameVE) {
12238       auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12239         ShuffleInstructionBuilder ShuffleBuilder(
12240             cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12241         ShuffleBuilder.add(V, Mask);
12242         return ShuffleBuilder.finalize(std::nullopt);
12243       };
12244       Value *V = vectorizeTree(VE, PostponedPHIs);
12245       if (VF * getNumElements(VL[0]->getType()) !=
12246           cast<FixedVectorType>(V->getType())->getNumElements()) {
12247         if (!VE->ReuseShuffleIndices.empty()) {
12248           // Reshuffle to get only unique values.
12249           // If some of the scalars are duplicated in the vectorization
12250           // tree entry, we do not vectorize them but instead generate a
12251           // mask for the reuses. But if there are several users of the
12252           // same entry, they may have different vectorization factors.
12253           // This is especially important for PHI nodes. In this case, we
12254           // need to adapt the resulting instruction for the user
12255           // vectorization factor and have to reshuffle it again to take
12256           // only unique elements of the vector. Without this code the
12257           // function incorrectly returns reduced vector instruction with
12258           // the same elements, not with the unique ones.
12259 
12260           // block:
12261           // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12262           // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12263           // ... (use %2)
12264           // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12265           // br %block
12266           SmallVector<int> Mask(VF, PoisonMaskElem);
12267           for (auto [I, V] : enumerate(VL)) {
12268             if (isa<PoisonValue>(V))
12269               continue;
12270             Mask[I] = VE->findLaneForValue(V);
12271           }
12272           V = FinalShuffle(V, Mask);
12273         } else {
12274           assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12275                  "Expected vectorization factor less "
12276                  "than original vector size.");
12277           SmallVector<int> UniformMask(VF, 0);
12278           std::iota(UniformMask.begin(), UniformMask.end(), 0);
12279           V = FinalShuffle(V, UniformMask);
12280         }
12281       }
12282       // Need to update the operand gather node, if actually the operand is not a
12283       // vectorized node, but the buildvector/gather node, which matches one of
12284       // the vectorized nodes.
12285       if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12286             return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12287           }) == VE->UserTreeIndices.end()) {
12288         auto *It = find_if(
12289             VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12290               return TE->isGather() &&
12291                      TE->UserTreeIndices.front().UserTE == E &&
12292                      TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12293             });
12294         assert(It != VectorizableTree.end() && "Expected gather node operand.");
12295         (*It)->VectorizedValue = V;
12296       }
12297       return V;
12298     }
12299   }
12300 
12301   // Find the corresponding gather entry and vectorize it.
12302   // Allows to be more accurate with tree/graph transformations, checks for the
12303   // correctness of the transformations in many cases.
12304   auto *I = find_if(VectorizableTree,
12305                     [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12306                       return TE->isOperandGatherNode({E, NodeIdx});
12307                     });
12308   assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12309   assert(I->get()->UserTreeIndices.size() == 1 &&
12310          "Expected only single user for the gather node.");
12311   assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12312   return vectorizeTree(I->get(), PostponedPHIs);
12313 }
12314 
12315 template <typename BVTy, typename ResTy, typename... Args>
12316 ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12317                                   Args &...Params) {
12318   assert(E->isGather() && "Expected gather node.");
12319   unsigned VF = E->getVectorFactor();
12320 
12321   bool NeedFreeze = false;
12322   SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12323                                        E->ReuseShuffleIndices.end());
12324   SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12325   // Build a mask out of the reorder indices and reorder scalars per this
12326   // mask.
12327   SmallVector<int> ReorderMask;
12328   inversePermutation(E->ReorderIndices, ReorderMask);
12329   if (!ReorderMask.empty())
12330     reorderScalars(GatheredScalars, ReorderMask);
12331   auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12332                              unsigned I, unsigned SliceSize) {
12333     if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12334           return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12335         }))
12336       return false;
12337     TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12338     unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12339     if (UserTE->getNumOperands() != 2)
12340       return false;
12341     auto *It =
12342         find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12343           return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12344                    return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12345                  }) != TE->UserTreeIndices.end();
12346         });
12347     if (It == VectorizableTree.end())
12348       return false;
12349     int Idx;
12350     if ((Mask.size() < InputVF &&
12351          ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12352          Idx == 0) ||
12353         (Mask.size() == InputVF &&
12354          ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12355       std::iota(
12356           std::next(Mask.begin(), I * SliceSize),
12357           std::next(Mask.begin(),
12358                     I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12359           0);
12360     } else {
12361       unsigned IVal =
12362           *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12363       std::fill(
12364           std::next(Mask.begin(), I * SliceSize),
12365           std::next(Mask.begin(),
12366                     I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12367           IVal);
12368     }
12369     return true;
12370   };
12371   BVTy ShuffleBuilder(ScalarTy, Params...);
12372   ResTy Res = ResTy();
12373   SmallVector<int> Mask;
12374   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12375   SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12376   Value *ExtractVecBase = nullptr;
12377   bool UseVecBaseAsInput = false;
12378   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
12379   SmallVector<SmallVector<const TreeEntry *>> Entries;
12380   Type *OrigScalarTy = GatheredScalars.front()->getType();
12381   auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12382   unsigned NumParts = TTI->getNumberOfParts(VecTy);
12383   if (NumParts == 0 || NumParts >= GatheredScalars.size())
12384     NumParts = 1;
12385   if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12386     // Check for gathered extracts.
12387     bool Resized = false;
12388     ExtractShuffles =
12389         tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12390     if (!ExtractShuffles.empty()) {
12391       SmallVector<const TreeEntry *> ExtractEntries;
12392       for (auto [Idx, I] : enumerate(ExtractMask)) {
12393         if (I == PoisonMaskElem)
12394           continue;
12395         if (const auto *TE = getTreeEntry(
12396                 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12397           ExtractEntries.push_back(TE);
12398       }
12399       if (std::optional<ResTy> Delayed =
12400               ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12401         // Delay emission of gathers which are not ready yet.
12402         PostponedGathers.insert(E);
12403         // Postpone gather emission, will be emitted after the end of the
12404         // process to keep correct order.
12405         return *Delayed;
12406       }
12407       if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12408               E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12409         ExtractVecBase = VecBase;
12410         if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12411           if (VF == VecBaseTy->getNumElements() &&
12412               GatheredScalars.size() != VF) {
12413             Resized = true;
12414             GatheredScalars.append(VF - GatheredScalars.size(),
12415                                    PoisonValue::get(OrigScalarTy));
12416           }
12417       }
12418     }
12419     // Gather extracts after we check for full matched gathers only.
12420     if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12421         E->isAltShuffle() ||
12422         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12423         isSplat(E->Scalars) ||
12424         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12425       GatherShuffles =
12426           isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12427     }
12428     if (!GatherShuffles.empty()) {
12429       if (std::optional<ResTy> Delayed =
12430               ShuffleBuilder.needToDelay(E, Entries)) {
12431         // Delay emission of gathers which are not ready yet.
12432         PostponedGathers.insert(E);
12433         // Postpone gather emission, will be emitted after the end of the
12434         // process to keep correct order.
12435         return *Delayed;
12436       }
12437       if (GatherShuffles.size() == 1 &&
12438           *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12439           Entries.front().front()->isSame(E->Scalars)) {
12440         // Perfect match in the graph, will reuse the previously vectorized
12441         // node. Cost is 0.
12442         LLVM_DEBUG(
12443             dbgs()
12444             << "SLP: perfect diamond match for gather bundle "
12445             << shortBundleName(E->Scalars) << ".\n");
12446         // Restore the mask for previous partially matched values.
12447         Mask.resize(E->Scalars.size());
12448         const TreeEntry *FrontTE = Entries.front().front();
12449         if (FrontTE->ReorderIndices.empty() &&
12450             ((FrontTE->ReuseShuffleIndices.empty() &&
12451               E->Scalars.size() == FrontTE->Scalars.size()) ||
12452              (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12453           std::iota(Mask.begin(), Mask.end(), 0);
12454         } else {
12455           for (auto [I, V] : enumerate(E->Scalars)) {
12456             if (isa<PoisonValue>(V)) {
12457               Mask[I] = PoisonMaskElem;
12458               continue;
12459             }
12460             Mask[I] = FrontTE->findLaneForValue(V);
12461           }
12462         }
12463         ShuffleBuilder.add(*FrontTE, Mask);
12464         Res = ShuffleBuilder.finalize(E->getCommonMask());
12465         return Res;
12466       }
12467       if (!Resized) {
12468         if (GatheredScalars.size() != VF &&
12469             any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12470               return any_of(TEs, [&](const TreeEntry *TE) {
12471                 return TE->getVectorFactor() == VF;
12472               });
12473             }))
12474           GatheredScalars.append(VF - GatheredScalars.size(),
12475                                  PoisonValue::get(OrigScalarTy));
12476       }
12477       // Remove shuffled elements from list of gathers.
12478       for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12479         if (Mask[I] != PoisonMaskElem)
12480           GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12481       }
12482     }
12483   }
12484   auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12485                             SmallVectorImpl<int> &ReuseMask,
12486                             bool IsRootPoison) {
12487     // For splats with can emit broadcasts instead of gathers, so try to find
12488     // such sequences.
12489     bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12490                    (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12491     Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12492     SmallVector<int> UndefPos;
12493     DenseMap<Value *, unsigned> UniquePositions;
12494     // Gather unique non-const values and all constant values.
12495     // For repeated values, just shuffle them.
12496     int NumNonConsts = 0;
12497     int SinglePos = 0;
12498     for (auto [I, V] : enumerate(Scalars)) {
12499       if (isa<UndefValue>(V)) {
12500         if (!isa<PoisonValue>(V)) {
12501           ReuseMask[I] = I;
12502           UndefPos.push_back(I);
12503         }
12504         continue;
12505       }
12506       if (isConstant(V)) {
12507         ReuseMask[I] = I;
12508         continue;
12509       }
12510       ++NumNonConsts;
12511       SinglePos = I;
12512       Value *OrigV = V;
12513       Scalars[I] = PoisonValue::get(OrigScalarTy);
12514       if (IsSplat) {
12515         Scalars.front() = OrigV;
12516         ReuseMask[I] = 0;
12517       } else {
12518         const auto Res = UniquePositions.try_emplace(OrigV, I);
12519         Scalars[Res.first->second] = OrigV;
12520         ReuseMask[I] = Res.first->second;
12521       }
12522     }
12523     if (NumNonConsts == 1) {
12524       // Restore single insert element.
12525       if (IsSplat) {
12526         ReuseMask.assign(VF, PoisonMaskElem);
12527         std::swap(Scalars.front(), Scalars[SinglePos]);
12528         if (!UndefPos.empty() && UndefPos.front() == 0)
12529           Scalars.front() = UndefValue::get(OrigScalarTy);
12530       }
12531       ReuseMask[SinglePos] = SinglePos;
12532     } else if (!UndefPos.empty() && IsSplat) {
12533       // For undef values, try to replace them with the simple broadcast.
12534       // We can do it if the broadcasted value is guaranteed to be
12535       // non-poisonous, or by freezing the incoming scalar value first.
12536       auto *It = find_if(Scalars, [this, E](Value *V) {
12537         return !isa<UndefValue>(V) &&
12538                (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12539                 (E->UserTreeIndices.size() == 1 &&
12540                  any_of(V->uses(), [E](const Use &U) {
12541                    // Check if the value already used in the same operation in
12542                    // one of the nodes already.
12543                    return E->UserTreeIndices.front().EdgeIdx !=
12544                               U.getOperandNo() &&
12545                           is_contained(
12546                               E->UserTreeIndices.front().UserTE->Scalars,
12547                               U.getUser());
12548                  })));
12549       });
12550       if (It != Scalars.end()) {
12551         // Replace undefs by the non-poisoned scalars and emit broadcast.
12552         int Pos = std::distance(Scalars.begin(), It);
12553         for (int I : UndefPos) {
12554           // Set the undef position to the non-poisoned scalar.
12555           ReuseMask[I] = Pos;
12556           // Replace the undef by the poison, in the mask it is replaced by
12557           // non-poisoned scalar already.
12558           if (I != Pos)
12559             Scalars[I] = PoisonValue::get(OrigScalarTy);
12560         }
12561       } else {
12562         // Replace undefs by the poisons, emit broadcast and then emit
12563         // freeze.
12564         for (int I : UndefPos) {
12565           ReuseMask[I] = PoisonMaskElem;
12566           if (isa<UndefValue>(Scalars[I]))
12567             Scalars[I] = PoisonValue::get(OrigScalarTy);
12568         }
12569         NeedFreeze = true;
12570       }
12571     }
12572   };
12573   if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12574     bool IsNonPoisoned = true;
12575     bool IsUsedInExpr = true;
12576     Value *Vec1 = nullptr;
12577     if (!ExtractShuffles.empty()) {
12578       // Gather of extractelements can be represented as just a shuffle of
12579       // a single/two vectors the scalars are extracted from.
12580       // Find input vectors.
12581       Value *Vec2 = nullptr;
12582       for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12583         if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12584           ExtractMask[I] = PoisonMaskElem;
12585       }
12586       if (UseVecBaseAsInput) {
12587         Vec1 = ExtractVecBase;
12588       } else {
12589         for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12590           if (ExtractMask[I] == PoisonMaskElem)
12591             continue;
12592           if (isa<UndefValue>(E->Scalars[I]))
12593             continue;
12594           auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12595           Value *VecOp = EI->getVectorOperand();
12596           if (const auto *TE = getTreeEntry(VecOp))
12597             if (TE->VectorizedValue)
12598               VecOp = TE->VectorizedValue;
12599           if (!Vec1) {
12600             Vec1 = VecOp;
12601           } else if (Vec1 != VecOp) {
12602             assert((!Vec2 || Vec2 == VecOp) &&
12603                    "Expected only 1 or 2 vectors shuffle.");
12604             Vec2 = VecOp;
12605           }
12606         }
12607       }
12608       if (Vec2) {
12609         IsUsedInExpr = false;
12610         IsNonPoisoned &=
12611             isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12612         ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12613       } else if (Vec1) {
12614         IsUsedInExpr &= FindReusedSplat(
12615             ExtractMask,
12616             cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12617             ExtractMask.size());
12618         ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12619         IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12620       } else {
12621         IsUsedInExpr = false;
12622         ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12623                            /*ForExtracts=*/true);
12624       }
12625     }
12626     if (!GatherShuffles.empty()) {
12627       unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12628       SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12629       for (const auto [I, TEs] : enumerate(Entries)) {
12630         if (TEs.empty()) {
12631           assert(!GatherShuffles[I] &&
12632                  "No shuffles with empty entries list expected.");
12633           continue;
12634         }
12635         assert((TEs.size() == 1 || TEs.size() == 2) &&
12636                "Expected shuffle of 1 or 2 entries.");
12637         unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12638         auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12639         VecMask.assign(VecMask.size(), PoisonMaskElem);
12640         copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12641         if (TEs.size() == 1) {
12642           IsUsedInExpr &= FindReusedSplat(
12643               VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12644           ShuffleBuilder.add(*TEs.front(), VecMask);
12645           if (TEs.front()->VectorizedValue)
12646             IsNonPoisoned &=
12647                 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12648         } else {
12649           IsUsedInExpr = false;
12650           ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12651           if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12652             IsNonPoisoned &=
12653                 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12654                 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12655         }
12656       }
12657     }
12658     // Try to figure out best way to combine values: build a shuffle and insert
12659     // elements or just build several shuffles.
12660     // Insert non-constant scalars.
12661     SmallVector<Value *> NonConstants(GatheredScalars);
12662     int EMSz = ExtractMask.size();
12663     int MSz = Mask.size();
12664     // Try to build constant vector and shuffle with it only if currently we
12665     // have a single permutation and more than 1 scalar constants.
12666     bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12667     bool IsIdentityShuffle =
12668         ((UseVecBaseAsInput ||
12669           all_of(ExtractShuffles,
12670                  [](const std::optional<TTI::ShuffleKind> &SK) {
12671                    return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12672                           TTI::SK_PermuteSingleSrc;
12673                  })) &&
12674          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12675          ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12676         (!GatherShuffles.empty() &&
12677          all_of(GatherShuffles,
12678                 [](const std::optional<TTI::ShuffleKind> &SK) {
12679                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12680                          TTI::SK_PermuteSingleSrc;
12681                 }) &&
12682          none_of(Mask, [&](int I) { return I >= MSz; }) &&
12683          ShuffleVectorInst::isIdentityMask(Mask, MSz));
12684     bool EnoughConstsForShuffle =
12685         IsSingleShuffle &&
12686         (none_of(GatheredScalars,
12687                  [](Value *V) {
12688                    return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12689                  }) ||
12690          any_of(GatheredScalars,
12691                 [](Value *V) {
12692                   return isa<Constant>(V) && !isa<UndefValue>(V);
12693                 })) &&
12694         (!IsIdentityShuffle ||
12695          (GatheredScalars.size() == 2 &&
12696           any_of(GatheredScalars,
12697                  [](Value *V) { return !isa<UndefValue>(V); })) ||
12698          count_if(GatheredScalars, [](Value *V) {
12699            return isa<Constant>(V) && !isa<PoisonValue>(V);
12700          }) > 1);
12701     // NonConstants array contains just non-constant values, GatheredScalars
12702     // contains only constant to build final vector and then shuffle.
12703     for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12704       if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12705         NonConstants[I] = PoisonValue::get(OrigScalarTy);
12706       else
12707         GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12708     }
12709     // Generate constants for final shuffle and build a mask for them.
12710     if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12711       SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12712       TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12713       Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12714       ShuffleBuilder.add(BV, BVMask);
12715     }
12716     if (all_of(NonConstants, [=](Value *V) {
12717           return isa<PoisonValue>(V) ||
12718                  (IsSingleShuffle && ((IsIdentityShuffle &&
12719                   IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12720         }))
12721       Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12722     else
12723       Res = ShuffleBuilder.finalize(
12724           E->ReuseShuffleIndices, E->Scalars.size(),
12725           [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12726             TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12727             Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12728           });
12729   } else if (!allConstant(GatheredScalars)) {
12730     // Gather unique scalars and all constants.
12731     SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12732     TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12733     Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12734     ShuffleBuilder.add(BV, ReuseMask);
12735     Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12736   } else {
12737     // Gather all constants.
12738     SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12739     for (auto [I, V] : enumerate(E->Scalars)) {
12740       if (!isa<PoisonValue>(V))
12741         Mask[I] = I;
12742     }
12743     Value *BV = ShuffleBuilder.gather(E->Scalars);
12744     ShuffleBuilder.add(BV, Mask);
12745     Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12746   }
12747 
12748   if (NeedFreeze)
12749     Res = ShuffleBuilder.createFreeze(Res);
12750   return Res;
12751 }
12752 
12753 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12754   return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12755                                                                 Builder, *this);
12756 }
12757 
12758 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12759   IRBuilderBase::InsertPointGuard Guard(Builder);
12760 
12761   if (E->VectorizedValue &&
12762       (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12763        E->isAltShuffle())) {
12764     LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12765     return E->VectorizedValue;
12766   }
12767 
12768   Value *V = E->Scalars.front();
12769   Type *ScalarTy = V->getType();
12770   if (auto *Store = dyn_cast<StoreInst>(V))
12771     ScalarTy = Store->getValueOperand()->getType();
12772   else if (auto *IE = dyn_cast<InsertElementInst>(V))
12773     ScalarTy = IE->getOperand(1)->getType();
12774   auto It = MinBWs.find(E);
12775   if (It != MinBWs.end())
12776     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12777   auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
12778   if (E->isGather()) {
12779     // Set insert point for non-reduction initial nodes.
12780     if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12781       setInsertPointAfterBundle(E);
12782     Value *Vec = createBuildVector(E, ScalarTy);
12783     E->VectorizedValue = Vec;
12784     return Vec;
12785   }
12786 
12787   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12788   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12789     ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12790     if (E->getOpcode() == Instruction::Store &&
12791         E->State == TreeEntry::Vectorize) {
12792       ArrayRef<int> Mask =
12793           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12794                    E->ReorderIndices.size());
12795       ShuffleBuilder.add(V, Mask);
12796     } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12797       ShuffleBuilder.addOrdered(V, std::nullopt);
12798     } else {
12799       ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12800     }
12801     return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12802   };
12803 
12804   assert((E->State == TreeEntry::Vectorize ||
12805           E->State == TreeEntry::ScatterVectorize ||
12806           E->State == TreeEntry::StridedVectorize) &&
12807          "Unhandled state");
12808   unsigned ShuffleOrOp =
12809       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12810   Instruction *VL0 = E->getMainOp();
12811   auto GetOperandSignedness = [&](unsigned Idx) {
12812     const TreeEntry *OpE = getOperandEntry(E, Idx);
12813     bool IsSigned = false;
12814     auto It = MinBWs.find(OpE);
12815     if (It != MinBWs.end())
12816       IsSigned = It->second.second;
12817     else
12818       IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12819         return !isKnownNonNegative(R, SimplifyQuery(*DL));
12820       });
12821     return IsSigned;
12822   };
12823   switch (ShuffleOrOp) {
12824     case Instruction::PHI: {
12825       assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12826               E != VectorizableTree.front().get() ||
12827               !E->UserTreeIndices.empty()) &&
12828              "PHI reordering is free.");
12829       if (PostponedPHIs && E->VectorizedValue)
12830         return E->VectorizedValue;
12831       auto *PH = cast<PHINode>(VL0);
12832       Builder.SetInsertPoint(PH->getParent(),
12833                              PH->getParent()->getFirstNonPHIIt());
12834       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12835       if (PostponedPHIs || !E->VectorizedValue) {
12836         PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12837         E->PHI = NewPhi;
12838         Value *V = NewPhi;
12839 
12840         // Adjust insertion point once all PHI's have been generated.
12841         Builder.SetInsertPoint(PH->getParent(),
12842                                PH->getParent()->getFirstInsertionPt());
12843         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12844 
12845         V = FinalShuffle(V, E, VecTy);
12846 
12847         E->VectorizedValue = V;
12848         if (PostponedPHIs)
12849           return V;
12850       }
12851       PHINode *NewPhi = cast<PHINode>(E->PHI);
12852       // If phi node is fully emitted - exit.
12853       if (NewPhi->getNumIncomingValues() != 0)
12854         return NewPhi;
12855 
12856       // PHINodes may have multiple entries from the same block. We want to
12857       // visit every block once.
12858       SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12859 
12860       for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12861         ValueList Operands;
12862         BasicBlock *IBB = PH->getIncomingBlock(I);
12863 
12864         // Stop emission if all incoming values are generated.
12865         if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12866           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867           return NewPhi;
12868         }
12869 
12870         if (!VisitedBBs.insert(IBB).second) {
12871           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12872           continue;
12873         }
12874 
12875         Builder.SetInsertPoint(IBB->getTerminator());
12876         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12877         Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12878         if (VecTy != Vec->getType()) {
12879           assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12880                   MinBWs.contains(getOperandEntry(E, I))) &&
12881                  "Expected item in MinBWs.");
12882           Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12883         }
12884         NewPhi->addIncoming(Vec, IBB);
12885       }
12886 
12887       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12888              "Invalid number of incoming values");
12889       return NewPhi;
12890     }
12891 
12892     case Instruction::ExtractElement: {
12893       Value *V = E->getSingleOperand(0);
12894       if (const TreeEntry *TE = getTreeEntry(V))
12895         V = TE->VectorizedValue;
12896       setInsertPointAfterBundle(E);
12897       V = FinalShuffle(V, E, VecTy);
12898       E->VectorizedValue = V;
12899       return V;
12900     }
12901     case Instruction::ExtractValue: {
12902       auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12903       Builder.SetInsertPoint(LI);
12904       Value *Ptr = LI->getPointerOperand();
12905       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12906       Value *NewV = propagateMetadata(V, E->Scalars);
12907       NewV = FinalShuffle(NewV, E, VecTy);
12908       E->VectorizedValue = NewV;
12909       return NewV;
12910     }
12911     case Instruction::InsertElement: {
12912       assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12913       Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12914       Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12915       ArrayRef<Value *> Op = E->getOperand(1);
12916       Type *ScalarTy = Op.front()->getType();
12917       if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12918         assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12919         std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12920         assert(Res.first > 0 && "Expected item in MinBWs.");
12921         V = Builder.CreateIntCast(
12922             V,
12923             getWidenedType(
12924                 ScalarTy,
12925                 cast<FixedVectorType>(V->getType())->getNumElements()),
12926             Res.second);
12927       }
12928 
12929       // Create InsertVector shuffle if necessary
12930       auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12931         return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12932       }));
12933       const unsigned NumElts =
12934           cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12935       const unsigned NumScalars = E->Scalars.size();
12936 
12937       unsigned Offset = *getElementIndex(VL0);
12938       assert(Offset < NumElts && "Failed to find vector index offset");
12939 
12940       // Create shuffle to resize vector
12941       SmallVector<int> Mask;
12942       if (!E->ReorderIndices.empty()) {
12943         inversePermutation(E->ReorderIndices, Mask);
12944         Mask.append(NumElts - NumScalars, PoisonMaskElem);
12945       } else {
12946         Mask.assign(NumElts, PoisonMaskElem);
12947         std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12948       }
12949       // Create InsertVector shuffle if necessary
12950       bool IsIdentity = true;
12951       SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12952       Mask.swap(PrevMask);
12953       for (unsigned I = 0; I < NumScalars; ++I) {
12954         Value *Scalar = E->Scalars[PrevMask[I]];
12955         unsigned InsertIdx = *getElementIndex(Scalar);
12956         IsIdentity &= InsertIdx - Offset == I;
12957         Mask[InsertIdx - Offset] = I;
12958       }
12959       if (!IsIdentity || NumElts != NumScalars) {
12960         Value *V2 = nullptr;
12961         bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12962         SmallVector<int> InsertMask(Mask);
12963         if (NumElts != NumScalars && Offset == 0) {
12964           // Follow all insert element instructions from the current buildvector
12965           // sequence.
12966           InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12967           do {
12968             std::optional<unsigned> InsertIdx = getElementIndex(Ins);
12969             if (!InsertIdx)
12970               break;
12971             if (InsertMask[*InsertIdx] == PoisonMaskElem)
12972               InsertMask[*InsertIdx] = *InsertIdx;
12973             if (!Ins->hasOneUse())
12974               break;
12975             Ins = dyn_cast_or_null<InsertElementInst>(
12976                 Ins->getUniqueUndroppableUser());
12977           } while (Ins);
12978           SmallBitVector UseMask =
12979               buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12980           SmallBitVector IsFirstPoison =
12981               isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12982           SmallBitVector IsFirstUndef =
12983               isUndefVector(FirstInsert->getOperand(0), UseMask);
12984           if (!IsFirstPoison.all()) {
12985             unsigned Idx = 0;
12986             for (unsigned I = 0; I < NumElts; I++) {
12987               if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12988                   IsFirstUndef.test(I)) {
12989                 if (IsVNonPoisonous) {
12990                   InsertMask[I] = I < NumScalars ? I : 0;
12991                   continue;
12992                 }
12993                 if (!V2)
12994                   V2 = UndefValue::get(V->getType());
12995                 if (Idx >= NumScalars)
12996                   Idx = NumScalars - 1;
12997                 InsertMask[I] = NumScalars + Idx;
12998                 ++Idx;
12999               } else if (InsertMask[I] != PoisonMaskElem &&
13000                          Mask[I] == PoisonMaskElem) {
13001                 InsertMask[I] = PoisonMaskElem;
13002               }
13003             }
13004           } else {
13005             InsertMask = Mask;
13006           }
13007         }
13008         if (!V2)
13009           V2 = PoisonValue::get(V->getType());
13010         V = Builder.CreateShuffleVector(V, V2, InsertMask);
13011         if (auto *I = dyn_cast<Instruction>(V)) {
13012           GatherShuffleExtractSeq.insert(I);
13013           CSEBlocks.insert(I->getParent());
13014         }
13015       }
13016 
13017       SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13018       for (unsigned I = 0; I < NumElts; I++) {
13019         if (Mask[I] != PoisonMaskElem)
13020           InsertMask[Offset + I] = I;
13021       }
13022       SmallBitVector UseMask =
13023           buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13024       SmallBitVector IsFirstUndef =
13025           isUndefVector(FirstInsert->getOperand(0), UseMask);
13026       if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13027           NumElts != NumScalars) {
13028         if (IsFirstUndef.all()) {
13029           if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13030             SmallBitVector IsFirstPoison =
13031                 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13032             if (!IsFirstPoison.all()) {
13033               for (unsigned I = 0; I < NumElts; I++) {
13034                 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13035                   InsertMask[I] = I + NumElts;
13036               }
13037             }
13038             V = Builder.CreateShuffleVector(
13039                 V,
13040                 IsFirstPoison.all() ? PoisonValue::get(V->getType())
13041                                     : FirstInsert->getOperand(0),
13042                 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13043             if (auto *I = dyn_cast<Instruction>(V)) {
13044               GatherShuffleExtractSeq.insert(I);
13045               CSEBlocks.insert(I->getParent());
13046             }
13047           }
13048         } else {
13049           SmallBitVector IsFirstPoison =
13050               isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13051           for (unsigned I = 0; I < NumElts; I++) {
13052             if (InsertMask[I] == PoisonMaskElem)
13053               InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13054             else
13055               InsertMask[I] += NumElts;
13056           }
13057           V = Builder.CreateShuffleVector(
13058               FirstInsert->getOperand(0), V, InsertMask,
13059               cast<Instruction>(E->Scalars.back())->getName());
13060           if (auto *I = dyn_cast<Instruction>(V)) {
13061             GatherShuffleExtractSeq.insert(I);
13062             CSEBlocks.insert(I->getParent());
13063           }
13064         }
13065       }
13066 
13067       ++NumVectorInstructions;
13068       E->VectorizedValue = V;
13069       return V;
13070     }
13071     case Instruction::ZExt:
13072     case Instruction::SExt:
13073     case Instruction::FPToUI:
13074     case Instruction::FPToSI:
13075     case Instruction::FPExt:
13076     case Instruction::PtrToInt:
13077     case Instruction::IntToPtr:
13078     case Instruction::SIToFP:
13079     case Instruction::UIToFP:
13080     case Instruction::Trunc:
13081     case Instruction::FPTrunc:
13082     case Instruction::BitCast: {
13083       setInsertPointAfterBundle(E);
13084 
13085       Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13086       if (E->VectorizedValue) {
13087         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13088         return E->VectorizedValue;
13089       }
13090 
13091       auto *CI = cast<CastInst>(VL0);
13092       Instruction::CastOps VecOpcode = CI->getOpcode();
13093       Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13094       auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
13095       if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13096           (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13097            SrcScalarTy != CI->getOperand(0)->getType())) {
13098         // Check if the values are candidates to demote.
13099         unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13100         if (SrcIt != MinBWs.end())
13101           SrcBWSz = SrcIt->second.first;
13102         unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13103         if (BWSz == SrcBWSz) {
13104           VecOpcode = Instruction::BitCast;
13105         } else if (BWSz < SrcBWSz) {
13106           VecOpcode = Instruction::Trunc;
13107         } else if (It != MinBWs.end()) {
13108           assert(BWSz > SrcBWSz && "Invalid cast!");
13109           VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13110         } else if (SrcIt != MinBWs.end()) {
13111           assert(BWSz > SrcBWSz && "Invalid cast!");
13112           VecOpcode =
13113               SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13114         }
13115       } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13116                  !SrcIt->second.second) {
13117         VecOpcode = Instruction::UIToFP;
13118       }
13119       Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13120                      ? InVec
13121                      : Builder.CreateCast(VecOpcode, InVec, VecTy);
13122       V = FinalShuffle(V, E, VecTy);
13123 
13124       E->VectorizedValue = V;
13125       ++NumVectorInstructions;
13126       return V;
13127     }
13128     case Instruction::FCmp:
13129     case Instruction::ICmp: {
13130       setInsertPointAfterBundle(E);
13131 
13132       Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13133       if (E->VectorizedValue) {
13134         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13135         return E->VectorizedValue;
13136       }
13137       Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13138       if (E->VectorizedValue) {
13139         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13140         return E->VectorizedValue;
13141       }
13142       if (L->getType() != R->getType()) {
13143         assert((getOperandEntry(E, 0)->isGather() ||
13144                 getOperandEntry(E, 1)->isGather() ||
13145                 MinBWs.contains(getOperandEntry(E, 0)) ||
13146                 MinBWs.contains(getOperandEntry(E, 1))) &&
13147                "Expected item in MinBWs.");
13148         if (cast<VectorType>(L->getType())
13149                 ->getElementType()
13150                 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13151                                              ->getElementType()
13152                                              ->getIntegerBitWidth()) {
13153           Type *CastTy = R->getType();
13154           L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13155         } else {
13156           Type *CastTy = L->getType();
13157           R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13158         }
13159       }
13160 
13161       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13162       Value *V = Builder.CreateCmp(P0, L, R);
13163       propagateIRFlags(V, E->Scalars, VL0);
13164       // Do not cast for cmps.
13165       VecTy = cast<FixedVectorType>(V->getType());
13166       V = FinalShuffle(V, E, VecTy);
13167 
13168       E->VectorizedValue = V;
13169       ++NumVectorInstructions;
13170       return V;
13171     }
13172     case Instruction::Select: {
13173       setInsertPointAfterBundle(E);
13174 
13175       Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13176       if (E->VectorizedValue) {
13177         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13178         return E->VectorizedValue;
13179       }
13180       Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13181       if (E->VectorizedValue) {
13182         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13183         return E->VectorizedValue;
13184       }
13185       Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13186       if (E->VectorizedValue) {
13187         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188         return E->VectorizedValue;
13189       }
13190       if (True->getType() != VecTy || False->getType() != VecTy) {
13191         assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13192                 getOperandEntry(E, 2)->isGather() ||
13193                 MinBWs.contains(getOperandEntry(E, 1)) ||
13194                 MinBWs.contains(getOperandEntry(E, 2))) &&
13195                "Expected item in MinBWs.");
13196         if (True->getType() != VecTy)
13197           True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13198         if (False->getType() != VecTy)
13199           False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13200       }
13201 
13202       Value *V = Builder.CreateSelect(Cond, True, False);
13203       V = FinalShuffle(V, E, VecTy);
13204 
13205       E->VectorizedValue = V;
13206       ++NumVectorInstructions;
13207       return V;
13208     }
13209     case Instruction::FNeg: {
13210       setInsertPointAfterBundle(E);
13211 
13212       Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13213 
13214       if (E->VectorizedValue) {
13215         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13216         return E->VectorizedValue;
13217       }
13218 
13219       Value *V = Builder.CreateUnOp(
13220           static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13221       propagateIRFlags(V, E->Scalars, VL0);
13222       if (auto *I = dyn_cast<Instruction>(V))
13223         V = propagateMetadata(I, E->Scalars);
13224 
13225       V = FinalShuffle(V, E, VecTy);
13226 
13227       E->VectorizedValue = V;
13228       ++NumVectorInstructions;
13229 
13230       return V;
13231     }
13232     case Instruction::Add:
13233     case Instruction::FAdd:
13234     case Instruction::Sub:
13235     case Instruction::FSub:
13236     case Instruction::Mul:
13237     case Instruction::FMul:
13238     case Instruction::UDiv:
13239     case Instruction::SDiv:
13240     case Instruction::FDiv:
13241     case Instruction::URem:
13242     case Instruction::SRem:
13243     case Instruction::FRem:
13244     case Instruction::Shl:
13245     case Instruction::LShr:
13246     case Instruction::AShr:
13247     case Instruction::And:
13248     case Instruction::Or:
13249     case Instruction::Xor: {
13250       setInsertPointAfterBundle(E);
13251 
13252       Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13253       if (E->VectorizedValue) {
13254         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13255         return E->VectorizedValue;
13256       }
13257       Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13258       if (E->VectorizedValue) {
13259         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260         return E->VectorizedValue;
13261       }
13262       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13263         for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13264           ArrayRef<Value *> Ops = E->getOperand(I);
13265           if (all_of(Ops, [&](Value *Op) {
13266                 auto *CI = dyn_cast<ConstantInt>(Op);
13267                 return CI && CI->getValue().countr_one() >= It->second.first;
13268               })) {
13269             V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13270             E->VectorizedValue = V;
13271             ++NumVectorInstructions;
13272             return V;
13273           }
13274         }
13275       }
13276       if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13277         assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13278                 getOperandEntry(E, 1)->isGather() ||
13279                 MinBWs.contains(getOperandEntry(E, 0)) ||
13280                 MinBWs.contains(getOperandEntry(E, 1))) &&
13281                "Expected item in MinBWs.");
13282         if (LHS->getType() != VecTy)
13283           LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13284         if (RHS->getType() != VecTy)
13285           RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13286       }
13287 
13288       Value *V = Builder.CreateBinOp(
13289           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13290           RHS);
13291       propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13292       if (auto *I = dyn_cast<Instruction>(V)) {
13293         V = propagateMetadata(I, E->Scalars);
13294         // Drop nuw flags for abs(sub(commutative), true).
13295         if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13296             any_of(E->Scalars, [](Value *V) {
13297               return isCommutative(cast<Instruction>(V));
13298             }))
13299           I->setHasNoUnsignedWrap(/*b=*/false);
13300       }
13301 
13302       V = FinalShuffle(V, E, VecTy);
13303 
13304       E->VectorizedValue = V;
13305       ++NumVectorInstructions;
13306 
13307       return V;
13308     }
13309     case Instruction::Load: {
13310       // Loads are inserted at the head of the tree because we don't want to
13311       // sink them all the way down past store instructions.
13312       setInsertPointAfterBundle(E);
13313 
13314       LoadInst *LI = cast<LoadInst>(VL0);
13315       Instruction *NewLI;
13316       Value *PO = LI->getPointerOperand();
13317       if (E->State == TreeEntry::Vectorize) {
13318         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13319       } else if (E->State == TreeEntry::StridedVectorize) {
13320         Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13321         Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13322         PO = IsReverseOrder ? PtrN : Ptr0;
13323         std::optional<int> Diff = getPointersDiff(
13324             VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13325         Type *StrideTy = DL->getIndexType(PO->getType());
13326         Value *StrideVal;
13327         if (Diff) {
13328           int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13329           StrideVal =
13330               ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13331                                              DL->getTypeAllocSize(ScalarTy));
13332         } else {
13333           SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13334           transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13335             return cast<LoadInst>(V)->getPointerOperand();
13336           });
13337           OrdersType Order;
13338           std::optional<Value *> Stride =
13339               calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13340                                 &*Builder.GetInsertPoint());
13341           Value *NewStride =
13342               Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13343           StrideVal = Builder.CreateMul(
13344               NewStride,
13345               ConstantInt::get(
13346                   StrideTy,
13347                   (IsReverseOrder ? -1 : 1) *
13348                       static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13349         }
13350         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13351         auto *Inst = Builder.CreateIntrinsic(
13352             Intrinsic::experimental_vp_strided_load,
13353             {VecTy, PO->getType(), StrideTy},
13354             {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13355              Builder.getInt32(E->Scalars.size())});
13356         Inst->addParamAttr(
13357             /*ArgNo=*/0,
13358             Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13359         NewLI = Inst;
13360       } else {
13361         assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13362         Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13363         if (E->VectorizedValue) {
13364           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13365           return E->VectorizedValue;
13366         }
13367         // Use the minimum alignment of the gathered loads.
13368         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13369         NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13370       }
13371       Value *V = propagateMetadata(NewLI, E->Scalars);
13372 
13373       V = FinalShuffle(V, E, VecTy);
13374       E->VectorizedValue = V;
13375       ++NumVectorInstructions;
13376       return V;
13377     }
13378     case Instruction::Store: {
13379       auto *SI = cast<StoreInst>(VL0);
13380 
13381       setInsertPointAfterBundle(E);
13382 
13383       Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13384       if (VecValue->getType() != VecTy)
13385         VecValue =
13386             Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13387       VecValue = FinalShuffle(VecValue, E, VecTy);
13388 
13389       Value *Ptr = SI->getPointerOperand();
13390       Instruction *ST;
13391       if (E->State == TreeEntry::Vectorize) {
13392         ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13393       } else {
13394         assert(E->State == TreeEntry::StridedVectorize &&
13395                "Expected either strided or conseutive stores.");
13396         if (!E->ReorderIndices.empty()) {
13397           SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13398           Ptr = SI->getPointerOperand();
13399         }
13400         Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13401         Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13402         auto *Inst = Builder.CreateIntrinsic(
13403             Intrinsic::experimental_vp_strided_store,
13404             {VecTy, Ptr->getType(), StrideTy},
13405             {VecValue, Ptr,
13406              ConstantInt::get(
13407                  StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13408              Builder.getAllOnesMask(VecTy->getElementCount()),
13409              Builder.getInt32(E->Scalars.size())});
13410         Inst->addParamAttr(
13411             /*ArgNo=*/1,
13412             Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13413         ST = Inst;
13414       }
13415 
13416       Value *V = propagateMetadata(ST, E->Scalars);
13417 
13418       E->VectorizedValue = V;
13419       ++NumVectorInstructions;
13420       return V;
13421     }
13422     case Instruction::GetElementPtr: {
13423       auto *GEP0 = cast<GetElementPtrInst>(VL0);
13424       setInsertPointAfterBundle(E);
13425 
13426       Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13427       if (E->VectorizedValue) {
13428         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13429         return E->VectorizedValue;
13430       }
13431 
13432       SmallVector<Value *> OpVecs;
13433       for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13434         Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13435         if (E->VectorizedValue) {
13436           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13437           return E->VectorizedValue;
13438         }
13439         OpVecs.push_back(OpVec);
13440       }
13441 
13442       Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13443       if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13444         SmallVector<Value *> GEPs;
13445         for (Value *V : E->Scalars) {
13446           if (isa<GetElementPtrInst>(V))
13447             GEPs.push_back(V);
13448         }
13449         V = propagateMetadata(I, GEPs);
13450       }
13451 
13452       V = FinalShuffle(V, E, VecTy);
13453 
13454       E->VectorizedValue = V;
13455       ++NumVectorInstructions;
13456 
13457       return V;
13458     }
13459     case Instruction::Call: {
13460       CallInst *CI = cast<CallInst>(VL0);
13461       setInsertPointAfterBundle(E);
13462 
13463       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13464 
13465       SmallVector<Type *> ArgTys =
13466           buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13467                                  It != MinBWs.end() ? It->second.first : 0);
13468       auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13469       bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13470                           VecCallCosts.first <= VecCallCosts.second;
13471 
13472       Value *ScalarArg = nullptr;
13473       SmallVector<Value *> OpVecs;
13474       SmallVector<Type *, 2> TysForDecl;
13475       // Add return type if intrinsic is overloaded on it.
13476       if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13477         TysForDecl.push_back(VecTy);
13478       auto *CEI = cast<CallInst>(VL0);
13479       for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13480         ValueList OpVL;
13481         // Some intrinsics have scalar arguments. This argument should not be
13482         // vectorized.
13483         if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13484           ScalarArg = CEI->getArgOperand(I);
13485           // if decided to reduce bitwidth of abs intrinsic, it second argument
13486           // must be set false (do not return poison, if value issigned min).
13487           if (ID == Intrinsic::abs && It != MinBWs.end() &&
13488               It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13489             ScalarArg = Builder.getFalse();
13490           OpVecs.push_back(ScalarArg);
13491           if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13492             TysForDecl.push_back(ScalarArg->getType());
13493           continue;
13494         }
13495 
13496         Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13497         if (E->VectorizedValue) {
13498           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13499           return E->VectorizedValue;
13500         }
13501         ScalarArg = CEI->getArgOperand(I);
13502         if (cast<VectorType>(OpVec->getType())->getElementType() !=
13503                 ScalarArg->getType()->getScalarType() &&
13504             It == MinBWs.end()) {
13505           auto *CastTy =
13506               getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13507           OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13508         } else if (It != MinBWs.end()) {
13509           OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13510         }
13511         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13512         OpVecs.push_back(OpVec);
13513         if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13514           TysForDecl.push_back(OpVec->getType());
13515       }
13516 
13517       Function *CF;
13518       if (!UseIntrinsic) {
13519         VFShape Shape =
13520             VFShape::get(CI->getFunctionType(),
13521                          ElementCount::getFixed(
13522                              static_cast<unsigned>(VecTy->getNumElements())),
13523                          false /*HasGlobalPred*/);
13524         CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13525       } else {
13526         CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13527       }
13528 
13529       SmallVector<OperandBundleDef, 1> OpBundles;
13530       CI->getOperandBundlesAsDefs(OpBundles);
13531       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13532 
13533       propagateIRFlags(V, E->Scalars, VL0);
13534       V = FinalShuffle(V, E, VecTy);
13535 
13536       E->VectorizedValue = V;
13537       ++NumVectorInstructions;
13538       return V;
13539     }
13540     case Instruction::ShuffleVector: {
13541       assert(E->isAltShuffle() &&
13542              ((Instruction::isBinaryOp(E->getOpcode()) &&
13543                Instruction::isBinaryOp(E->getAltOpcode())) ||
13544               (Instruction::isCast(E->getOpcode()) &&
13545                Instruction::isCast(E->getAltOpcode())) ||
13546               (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13547              "Invalid Shuffle Vector Operand");
13548 
13549       Value *LHS = nullptr, *RHS = nullptr;
13550       if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13551         setInsertPointAfterBundle(E);
13552         LHS = vectorizeOperand(E, 0, PostponedPHIs);
13553         if (E->VectorizedValue) {
13554           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13555           return E->VectorizedValue;
13556         }
13557         RHS = vectorizeOperand(E, 1, PostponedPHIs);
13558       } else {
13559         setInsertPointAfterBundle(E);
13560         LHS = vectorizeOperand(E, 0, PostponedPHIs);
13561       }
13562       if (E->VectorizedValue) {
13563         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13564         return E->VectorizedValue;
13565       }
13566       if (LHS && RHS &&
13567           ((Instruction::isBinaryOp(E->getOpcode()) &&
13568             (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13569            (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13570         assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13571                 getOperandEntry(E, 1)->isGather() ||
13572                 MinBWs.contains(getOperandEntry(E, 0)) ||
13573                 MinBWs.contains(getOperandEntry(E, 1))) &&
13574                "Expected item in MinBWs.");
13575         Type *CastTy = VecTy;
13576         if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13577           if (cast<VectorType>(LHS->getType())
13578                   ->getElementType()
13579                   ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13580                                                ->getElementType()
13581                                                ->getIntegerBitWidth())
13582             CastTy = RHS->getType();
13583           else
13584             CastTy = LHS->getType();
13585         }
13586         if (LHS->getType() != CastTy)
13587           LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13588         if (RHS->getType() != CastTy)
13589           RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13590       }
13591 
13592       Value *V0, *V1;
13593       if (Instruction::isBinaryOp(E->getOpcode())) {
13594         V0 = Builder.CreateBinOp(
13595             static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13596         V1 = Builder.CreateBinOp(
13597             static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13598       } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13599         V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13600         auto *AltCI = cast<CmpInst>(E->getAltOp());
13601         CmpInst::Predicate AltPred = AltCI->getPredicate();
13602         V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13603       } else {
13604         if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13605           unsigned SrcBWSz = DL->getTypeSizeInBits(
13606               cast<VectorType>(LHS->getType())->getElementType());
13607           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13608           if (BWSz <= SrcBWSz) {
13609             if (BWSz < SrcBWSz)
13610               LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13611             assert(LHS->getType() == VecTy && "Expected same type as operand.");
13612             if (auto *I = dyn_cast<Instruction>(LHS))
13613               LHS = propagateMetadata(I, E->Scalars);
13614             E->VectorizedValue = LHS;
13615             ++NumVectorInstructions;
13616             return LHS;
13617           }
13618         }
13619         V0 = Builder.CreateCast(
13620             static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13621         V1 = Builder.CreateCast(
13622             static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13623       }
13624       // Add V0 and V1 to later analysis to try to find and remove matching
13625       // instruction, if any.
13626       for (Value *V : {V0, V1}) {
13627         if (auto *I = dyn_cast<Instruction>(V)) {
13628           GatherShuffleExtractSeq.insert(I);
13629           CSEBlocks.insert(I->getParent());
13630         }
13631       }
13632 
13633       // Create shuffle to take alternate operations from the vector.
13634       // Also, gather up main and alt scalar ops to propagate IR flags to
13635       // each vector operation.
13636       ValueList OpScalars, AltScalars;
13637       SmallVector<int> Mask;
13638       E->buildAltOpShuffleMask(
13639           [E, this](Instruction *I) {
13640             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13641             return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13642                                           *TLI);
13643           },
13644           Mask, &OpScalars, &AltScalars);
13645 
13646       propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13647       propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13648       auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13649         // Drop nuw flags for abs(sub(commutative), true).
13650         if (auto *I = dyn_cast<Instruction>(Vec);
13651             I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13652             any_of(E->Scalars, [](Value *V) {
13653               auto *IV = cast<Instruction>(V);
13654               return IV->getOpcode() == Instruction::Sub &&
13655                      isCommutative(cast<Instruction>(IV));
13656             }))
13657           I->setHasNoUnsignedWrap(/*b=*/false);
13658       };
13659       DropNuwFlag(V0, E->getOpcode());
13660       DropNuwFlag(V1, E->getAltOpcode());
13661 
13662       Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13663       if (auto *I = dyn_cast<Instruction>(V)) {
13664         V = propagateMetadata(I, E->Scalars);
13665         GatherShuffleExtractSeq.insert(I);
13666         CSEBlocks.insert(I->getParent());
13667       }
13668 
13669       E->VectorizedValue = V;
13670       ++NumVectorInstructions;
13671 
13672       return V;
13673     }
13674     default:
13675       llvm_unreachable("unknown inst");
13676   }
13677   return nullptr;
13678 }
13679 
13680 Value *BoUpSLP::vectorizeTree() {
13681   ExtraValueToDebugLocsMap ExternallyUsedValues;
13682   SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13683   return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13684 }
13685 
13686 namespace {
13687 /// Data type for handling buildvector sequences with the reused scalars from
13688 /// other tree entries.
13689 struct ShuffledInsertData {
13690   /// List of insertelements to be replaced by shuffles.
13691   SmallVector<InsertElementInst *> InsertElements;
13692   /// The parent vectors and shuffle mask for the given list of inserts.
13693   MapVector<Value *, SmallVector<int>> ValueMasks;
13694 };
13695 } // namespace
13696 
13697 Value *BoUpSLP::vectorizeTree(
13698     const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13699     SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13700     Instruction *ReductionRoot) {
13701   // All blocks must be scheduled before any instructions are inserted.
13702   for (auto &BSIter : BlocksSchedules) {
13703     scheduleBlock(BSIter.second.get());
13704   }
13705   // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
13706   // need to rebuild it.
13707   EntryToLastInstruction.clear();
13708 
13709   if (ReductionRoot)
13710     Builder.SetInsertPoint(ReductionRoot->getParent(),
13711                            ReductionRoot->getIterator());
13712   else
13713     Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13714 
13715   // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13716   (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13717   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13718     if (TE->State == TreeEntry::Vectorize &&
13719         TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13720         TE->VectorizedValue)
13721       (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13722   // Run through the list of postponed gathers and emit them, replacing the temp
13723   // emitted allocas with actual vector instructions.
13724   ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13725   DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13726   for (const TreeEntry *E : PostponedNodes) {
13727     auto *TE = const_cast<TreeEntry *>(E);
13728     if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13729       if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13730               TE->UserTreeIndices.front().EdgeIdx)) &&
13731           VecTE->isSame(TE->Scalars))
13732         // Found gather node which is absolutely the same as one of the
13733         // vectorized nodes. It may happen after reordering.
13734         continue;
13735     auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13736     TE->VectorizedValue = nullptr;
13737     auto *UserI =
13738         cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13739     // If user is a PHI node, its vector code have to be inserted right before
13740     // block terminator. Since the node was delayed, there were some unresolved
13741     // dependencies at the moment when stab instruction was emitted. In a case
13742     // when any of these dependencies turn out an operand of another PHI, coming
13743     // from this same block, position of a stab instruction will become invalid.
13744     // The is because source vector that supposed to feed this gather node was
13745     // inserted at the end of the block [after stab instruction]. So we need
13746     // to adjust insertion point again to the end of block.
13747     if (isa<PHINode>(UserI)) {
13748       // Insert before all users.
13749       Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13750       for (User *U : PrevVec->users()) {
13751         if (U == UserI)
13752           continue;
13753         auto *UI = dyn_cast<Instruction>(U);
13754         if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13755           continue;
13756         if (UI->comesBefore(InsertPt))
13757           InsertPt = UI;
13758       }
13759       Builder.SetInsertPoint(InsertPt);
13760     } else {
13761       Builder.SetInsertPoint(PrevVec);
13762     }
13763     Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13764     Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13765     if (Vec->getType() != PrevVec->getType()) {
13766       assert(Vec->getType()->isIntOrIntVectorTy() &&
13767              PrevVec->getType()->isIntOrIntVectorTy() &&
13768              "Expected integer vector types only.");
13769       std::optional<bool> IsSigned;
13770       for (Value *V : TE->Scalars) {
13771         if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13772           auto It = MinBWs.find(BaseTE);
13773           if (It != MinBWs.end()) {
13774             IsSigned = IsSigned.value_or(false) || It->second.second;
13775             if (*IsSigned)
13776               break;
13777           }
13778           for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13779             auto It = MinBWs.find(MNTE);
13780             if (It != MinBWs.end()) {
13781               IsSigned = IsSigned.value_or(false) || It->second.second;
13782               if (*IsSigned)
13783                 break;
13784             }
13785           }
13786           if (IsSigned.value_or(false))
13787             break;
13788           // Scan through gather nodes.
13789           for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13790             auto It = MinBWs.find(BVE);
13791             if (It != MinBWs.end()) {
13792               IsSigned = IsSigned.value_or(false) || It->second.second;
13793               if (*IsSigned)
13794                 break;
13795             }
13796           }
13797           if (IsSigned.value_or(false))
13798             break;
13799           if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13800             IsSigned =
13801                 IsSigned.value_or(false) ||
13802                 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13803             continue;
13804           }
13805           if (IsSigned.value_or(false))
13806             break;
13807         }
13808       }
13809       if (IsSigned.value_or(false)) {
13810         // Final attempt - check user node.
13811         auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13812         if (It != MinBWs.end())
13813           IsSigned = It->second.second;
13814       }
13815       assert(IsSigned &&
13816              "Expected user node or perfect diamond match in MinBWs.");
13817       Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13818     }
13819     PrevVec->replaceAllUsesWith(Vec);
13820     PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13821     // Replace the stub vector node, if it was used before for one of the
13822     // buildvector nodes already.
13823     auto It = PostponedValues.find(PrevVec);
13824     if (It != PostponedValues.end()) {
13825       for (TreeEntry *VTE : It->getSecond())
13826         VTE->VectorizedValue = Vec;
13827     }
13828     eraseInstruction(PrevVec);
13829   }
13830 
13831   LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13832                     << " values .\n");
13833 
13834   SmallVector<ShuffledInsertData> ShuffledInserts;
13835   // Maps vector instruction to original insertelement instruction
13836   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13837   // Maps extract Scalar to the corresponding extractelement instruction in the
13838   // basic block. Only one extractelement per block should be emitted.
13839   DenseMap<Value *,
13840            DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13841       ScalarToEEs;
13842   SmallDenseSet<Value *, 4> UsedInserts;
13843   DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13844   SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13845   // Extract all of the elements with the external uses.
13846   for (const auto &ExternalUse : ExternalUses) {
13847     Value *Scalar = ExternalUse.Scalar;
13848     llvm::User *User = ExternalUse.User;
13849 
13850     // Skip users that we already RAUW. This happens when one instruction
13851     // has multiple uses of the same value.
13852     if (User && !is_contained(Scalar->users(), User))
13853       continue;
13854     TreeEntry *E = getTreeEntry(Scalar);
13855     assert(E && "Invalid scalar");
13856     assert(!E->isGather() && "Extracting from a gather list");
13857     // Non-instruction pointers are not deleted, just skip them.
13858     if (E->getOpcode() == Instruction::GetElementPtr &&
13859         !isa<GetElementPtrInst>(Scalar))
13860       continue;
13861 
13862     Value *Vec = E->VectorizedValue;
13863     assert(Vec && "Can't find vectorizable value");
13864 
13865     Value *Lane = Builder.getInt32(ExternalUse.Lane);
13866     auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13867       if (Scalar->getType() != Vec->getType()) {
13868         Value *Ex = nullptr;
13869         Value *ExV = nullptr;
13870         auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13871         bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13872         auto It = ScalarToEEs.find(Scalar);
13873         if (It != ScalarToEEs.end()) {
13874           // No need to emit many extracts, just move the only one in the
13875           // current block.
13876           auto EEIt = It->second.find(Builder.GetInsertBlock());
13877           if (EEIt != It->second.end()) {
13878             Instruction *I = EEIt->second.first;
13879             if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13880                 Builder.GetInsertPoint()->comesBefore(I)) {
13881               I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13882                             Builder.GetInsertPoint());
13883               if (auto *CI = EEIt->second.second)
13884                 CI->moveAfter(I);
13885             }
13886             Ex = I;
13887             ExV = EEIt->second.second ? EEIt->second.second : Ex;
13888           }
13889         }
13890         if (!Ex) {
13891           // "Reuse" the existing extract to improve final codegen.
13892           if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13893             Value *V = ES->getVectorOperand();
13894             if (const TreeEntry *ETE = getTreeEntry(V))
13895               V = ETE->VectorizedValue;
13896             Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13897           } else if (ReplaceGEP) {
13898             // Leave the GEPs as is, they are free in most cases and better to
13899             // keep them as GEPs.
13900             auto *CloneGEP = GEP->clone();
13901             if (isa<Instruction>(Vec))
13902               CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13903                                      Builder.GetInsertPoint());
13904             else
13905               CloneGEP->insertBefore(GEP);
13906             if (GEP->hasName())
13907               CloneGEP->takeName(GEP);
13908             Ex = CloneGEP;
13909           } else {
13910             Ex = Builder.CreateExtractElement(Vec, Lane);
13911           }
13912           // If necessary, sign-extend or zero-extend ScalarRoot
13913           // to the larger type.
13914           ExV = Ex;
13915           if (Scalar->getType() != Ex->getType())
13916             ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13917                                         MinBWs.find(E)->second.second);
13918           if (auto *I = dyn_cast<Instruction>(Ex))
13919             ScalarToEEs[Scalar].try_emplace(
13920                 Builder.GetInsertBlock(),
13921                 std::make_pair(I, cast<Instruction>(ExV)));
13922         }
13923         // The then branch of the previous if may produce constants, since 0
13924         // operand might be a constant.
13925         if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13926           GatherShuffleExtractSeq.insert(ExI);
13927           CSEBlocks.insert(ExI->getParent());
13928         }
13929         return ExV;
13930       }
13931       assert(isa<FixedVectorType>(Scalar->getType()) &&
13932              isa<InsertElementInst>(Scalar) &&
13933              "In-tree scalar of vector type is not insertelement?");
13934       auto *IE = cast<InsertElementInst>(Scalar);
13935       VectorToInsertElement.try_emplace(Vec, IE);
13936       return Vec;
13937     };
13938     // If User == nullptr, the Scalar remains as scalar in vectorized
13939     // instructions or is used as extra arg. Generate ExtractElement instruction
13940     // and update the record for this scalar in ExternallyUsedValues.
13941     if (!User) {
13942       if (!ScalarsWithNullptrUser.insert(Scalar).second)
13943         continue;
13944       assert((ExternallyUsedValues.count(Scalar) ||
13945               Scalar->hasNUsesOrMore(UsesLimit) ||
13946               any_of(Scalar->users(),
13947                      [&](llvm::User *U) {
13948                        if (ExternalUsesAsGEPs.contains(U))
13949                          return true;
13950                        TreeEntry *UseEntry = getTreeEntry(U);
13951                        return UseEntry &&
13952                               (UseEntry->State == TreeEntry::Vectorize ||
13953                                UseEntry->State ==
13954                                    TreeEntry::StridedVectorize) &&
13955                               (E->State == TreeEntry::Vectorize ||
13956                                E->State == TreeEntry::StridedVectorize) &&
13957                               doesInTreeUserNeedToExtract(
13958                                   Scalar,
13959                                   cast<Instruction>(UseEntry->Scalars.front()),
13960                                   TLI);
13961                      })) &&
13962              "Scalar with nullptr User must be registered in "
13963              "ExternallyUsedValues map or remain as scalar in vectorized "
13964              "instructions");
13965       if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13966         if (auto *PHI = dyn_cast<PHINode>(VecI))
13967           Builder.SetInsertPoint(PHI->getParent(),
13968                                  PHI->getParent()->getFirstNonPHIIt());
13969         else
13970           Builder.SetInsertPoint(VecI->getParent(),
13971                                  std::next(VecI->getIterator()));
13972       } else {
13973         Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13974       }
13975       Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13976       // Required to update internally referenced instructions.
13977       Scalar->replaceAllUsesWith(NewInst);
13978       ReplacedExternals.emplace_back(Scalar, NewInst);
13979       continue;
13980     }
13981 
13982     if (auto *VU = dyn_cast<InsertElementInst>(User);
13983         VU && VU->getOperand(1) == Scalar) {
13984       // Skip if the scalar is another vector op or Vec is not an instruction.
13985       if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13986         if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13987           if (!UsedInserts.insert(VU).second)
13988             continue;
13989           // Need to use original vector, if the root is truncated.
13990           auto BWIt = MinBWs.find(E);
13991           if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13992             auto *ScalarTy = FTy->getElementType();
13993             auto Key = std::make_pair(Vec, ScalarTy);
13994             auto VecIt = VectorCasts.find(Key);
13995             if (VecIt == VectorCasts.end()) {
13996               IRBuilderBase::InsertPointGuard Guard(Builder);
13997               if (auto *IVec = dyn_cast<PHINode>(Vec))
13998                 Builder.SetInsertPoint(
13999                     IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14000               else if (auto *IVec = dyn_cast<Instruction>(Vec))
14001                 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14002               Vec = Builder.CreateIntCast(
14003                   Vec,
14004                   getWidenedType(
14005                       ScalarTy,
14006                       cast<FixedVectorType>(Vec->getType())->getNumElements()),
14007                   BWIt->second.second);
14008               VectorCasts.try_emplace(Key, Vec);
14009             } else {
14010               Vec = VecIt->second;
14011             }
14012           }
14013 
14014           std::optional<unsigned> InsertIdx = getElementIndex(VU);
14015           if (InsertIdx) {
14016             auto *It =
14017                 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14018                   // Checks if 2 insertelements are from the same buildvector.
14019                   InsertElementInst *VecInsert = Data.InsertElements.front();
14020                   return areTwoInsertFromSameBuildVector(
14021                       VU, VecInsert,
14022                       [](InsertElementInst *II) { return II->getOperand(0); });
14023                 });
14024             unsigned Idx = *InsertIdx;
14025             if (It == ShuffledInserts.end()) {
14026               (void)ShuffledInserts.emplace_back();
14027               It = std::next(ShuffledInserts.begin(),
14028                              ShuffledInserts.size() - 1);
14029               SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14030               if (Mask.empty())
14031                 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14032               // Find the insertvector, vectorized in tree, if any.
14033               Value *Base = VU;
14034               while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14035                 if (IEBase != User &&
14036                     (!IEBase->hasOneUse() ||
14037                      getElementIndex(IEBase).value_or(Idx) == Idx))
14038                   break;
14039                 // Build the mask for the vectorized insertelement instructions.
14040                 if (const TreeEntry *E = getTreeEntry(IEBase)) {
14041                   do {
14042                     IEBase = cast<InsertElementInst>(Base);
14043                     int IEIdx = *getElementIndex(IEBase);
14044                     assert(Mask[IEIdx] == PoisonMaskElem &&
14045                            "InsertElementInstruction used already.");
14046                     Mask[IEIdx] = IEIdx;
14047                     Base = IEBase->getOperand(0);
14048                   } while (E == getTreeEntry(Base));
14049                   break;
14050                 }
14051                 Base = cast<InsertElementInst>(Base)->getOperand(0);
14052                 // After the vectorization the def-use chain has changed, need
14053                 // to look through original insertelement instructions, if they
14054                 // get replaced by vector instructions.
14055                 auto It = VectorToInsertElement.find(Base);
14056                 if (It != VectorToInsertElement.end())
14057                   Base = It->second;
14058               }
14059             }
14060             SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14061             if (Mask.empty())
14062               Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14063             Mask[Idx] = ExternalUse.Lane;
14064             It->InsertElements.push_back(cast<InsertElementInst>(User));
14065             continue;
14066           }
14067         }
14068       }
14069     }
14070 
14071     // Generate extracts for out-of-tree users.
14072     // Find the insertion point for the extractelement lane.
14073     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14074       if (PHINode *PH = dyn_cast<PHINode>(User)) {
14075         for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14076           if (PH->getIncomingValue(I) == Scalar) {
14077             Instruction *IncomingTerminator =
14078                 PH->getIncomingBlock(I)->getTerminator();
14079             if (isa<CatchSwitchInst>(IncomingTerminator)) {
14080               Builder.SetInsertPoint(VecI->getParent(),
14081                                      std::next(VecI->getIterator()));
14082             } else {
14083               Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14084             }
14085             Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14086             PH->setOperand(I, NewInst);
14087           }
14088         }
14089       } else {
14090         Builder.SetInsertPoint(cast<Instruction>(User));
14091         Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14092         User->replaceUsesOfWith(Scalar, NewInst);
14093       }
14094     } else {
14095       Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14096       Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14097       User->replaceUsesOfWith(Scalar, NewInst);
14098     }
14099 
14100     LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14101   }
14102 
14103   auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14104     SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14105     SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14106     int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14107     for (int I = 0, E = Mask.size(); I < E; ++I) {
14108       if (Mask[I] < VF)
14109         CombinedMask1[I] = Mask[I];
14110       else
14111         CombinedMask2[I] = Mask[I] - VF;
14112     }
14113     ShuffleInstructionBuilder ShuffleBuilder(
14114         cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14115     ShuffleBuilder.add(V1, CombinedMask1);
14116     if (V2)
14117       ShuffleBuilder.add(V2, CombinedMask2);
14118     return ShuffleBuilder.finalize(std::nullopt);
14119   };
14120 
14121   auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14122                                        bool ForSingleMask) {
14123     unsigned VF = Mask.size();
14124     unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14125     if (VF != VecVF) {
14126       if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14127         Vec = CreateShuffle(Vec, nullptr, Mask);
14128         return std::make_pair(Vec, true);
14129       }
14130       if (!ForSingleMask) {
14131         SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14132         for (unsigned I = 0; I < VF; ++I) {
14133           if (Mask[I] != PoisonMaskElem)
14134             ResizeMask[Mask[I]] = Mask[I];
14135         }
14136         Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14137       }
14138     }
14139 
14140     return std::make_pair(Vec, false);
14141   };
14142   // Perform shuffling of the vectorize tree entries for better handling of
14143   // external extracts.
14144   for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14145     // Find the first and the last instruction in the list of insertelements.
14146     sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14147     InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14148     InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14149     Builder.SetInsertPoint(LastInsert);
14150     auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14151     Value *NewInst = performExtractsShuffleAction<Value>(
14152         MutableArrayRef(Vector.data(), Vector.size()),
14153         FirstInsert->getOperand(0),
14154         [](Value *Vec) {
14155           return cast<VectorType>(Vec->getType())
14156               ->getElementCount()
14157               .getKnownMinValue();
14158         },
14159         ResizeToVF,
14160         [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14161                                       ArrayRef<Value *> Vals) {
14162           assert((Vals.size() == 1 || Vals.size() == 2) &&
14163                  "Expected exactly 1 or 2 input values.");
14164           if (Vals.size() == 1) {
14165             // Do not create shuffle if the mask is a simple identity
14166             // non-resizing mask.
14167             if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14168                                    ->getNumElements() ||
14169                 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14170               return CreateShuffle(Vals.front(), nullptr, Mask);
14171             return Vals.front();
14172           }
14173           return CreateShuffle(Vals.front() ? Vals.front()
14174                                             : FirstInsert->getOperand(0),
14175                                Vals.back(), Mask);
14176         });
14177     auto It = ShuffledInserts[I].InsertElements.rbegin();
14178     // Rebuild buildvector chain.
14179     InsertElementInst *II = nullptr;
14180     if (It != ShuffledInserts[I].InsertElements.rend())
14181       II = *It;
14182     SmallVector<Instruction *> Inserts;
14183     while (It != ShuffledInserts[I].InsertElements.rend()) {
14184       assert(II && "Must be an insertelement instruction.");
14185       if (*It == II)
14186         ++It;
14187       else
14188         Inserts.push_back(cast<Instruction>(II));
14189       II = dyn_cast<InsertElementInst>(II->getOperand(0));
14190     }
14191     for (Instruction *II : reverse(Inserts)) {
14192       II->replaceUsesOfWith(II->getOperand(0), NewInst);
14193       if (auto *NewI = dyn_cast<Instruction>(NewInst))
14194         if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14195           II->moveAfter(NewI);
14196       NewInst = II;
14197     }
14198     LastInsert->replaceAllUsesWith(NewInst);
14199     for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14200       IE->replaceUsesOfWith(IE->getOperand(0),
14201                             PoisonValue::get(IE->getOperand(0)->getType()));
14202       IE->replaceUsesOfWith(IE->getOperand(1),
14203                             PoisonValue::get(IE->getOperand(1)->getType()));
14204       eraseInstruction(IE);
14205     }
14206     CSEBlocks.insert(LastInsert->getParent());
14207   }
14208 
14209   SmallVector<Instruction *> RemovedInsts;
14210   // For each vectorized value:
14211   for (auto &TEPtr : VectorizableTree) {
14212     TreeEntry *Entry = TEPtr.get();
14213 
14214     // No need to handle users of gathered values.
14215     if (Entry->isGather())
14216       continue;
14217 
14218     assert(Entry->VectorizedValue && "Can't find vectorizable value");
14219 
14220     // For each lane:
14221     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14222       Value *Scalar = Entry->Scalars[Lane];
14223 
14224       if (Entry->getOpcode() == Instruction::GetElementPtr &&
14225           !isa<GetElementPtrInst>(Scalar))
14226         continue;
14227 #ifndef NDEBUG
14228       Type *Ty = Scalar->getType();
14229       if (!Ty->isVoidTy()) {
14230         for (User *U : Scalar->users()) {
14231           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14232 
14233           // It is legal to delete users in the ignorelist.
14234           assert((getTreeEntry(U) ||
14235                   (UserIgnoreList && UserIgnoreList->contains(U)) ||
14236                   (isa_and_nonnull<Instruction>(U) &&
14237                    isDeleted(cast<Instruction>(U)))) &&
14238                  "Deleting out-of-tree value");
14239         }
14240       }
14241 #endif
14242       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14243       auto *I = cast<Instruction>(Scalar);
14244       RemovedInsts.push_back(I);
14245     }
14246   }
14247 
14248   // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14249   // new vector instruction.
14250   if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14251     V->mergeDIAssignID(RemovedInsts);
14252 
14253   // Clear up reduction references, if any.
14254   if (UserIgnoreList) {
14255     for (Instruction *I : RemovedInsts) {
14256       if (getTreeEntry(I)->Idx != 0)
14257         continue;
14258       SmallVector<SelectInst *> LogicalOpSelects;
14259       I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14260         // Do not replace condition of the logical op in form select <cond>.
14261         bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14262                                     (match(U.getUser(), m_LogicalAnd()) ||
14263                                      match(U.getUser(), m_LogicalOr())) &&
14264                                     U.getOperandNo() == 0;
14265         if (IsPoisoningLogicalOp) {
14266           LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14267           return false;
14268         }
14269         return UserIgnoreList->contains(U.getUser());
14270       });
14271       // Replace conditions of the poisoning logical ops with the non-poison
14272       // constant value.
14273       for (SelectInst *SI : LogicalOpSelects)
14274         SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14275     }
14276   }
14277   // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14278   // cache correctness.
14279   // NOTE: removeInstructionAndOperands only marks the instruction for deletion
14280   // - instructions are not deleted until later.
14281   removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14282 
14283   Builder.ClearInsertionPoint();
14284   InstrElementSize.clear();
14285 
14286   const TreeEntry &RootTE = *VectorizableTree.front();
14287   Value *Vec = RootTE.VectorizedValue;
14288   if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14289                                       It != MinBWs.end() &&
14290                                       ReductionBitWidth != It->second.first) {
14291     IRBuilder<>::InsertPointGuard Guard(Builder);
14292     Builder.SetInsertPoint(ReductionRoot->getParent(),
14293                            ReductionRoot->getIterator());
14294     Vec = Builder.CreateIntCast(
14295         Vec,
14296         VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14297                         cast<VectorType>(Vec->getType())->getElementCount()),
14298         It->second.second);
14299   }
14300   return Vec;
14301 }
14302 
14303 void BoUpSLP::optimizeGatherSequence() {
14304   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14305                     << " gather sequences instructions.\n");
14306   // LICM InsertElementInst sequences.
14307   for (Instruction *I : GatherShuffleExtractSeq) {
14308     if (isDeleted(I))
14309       continue;
14310 
14311     // Check if this block is inside a loop.
14312     Loop *L = LI->getLoopFor(I->getParent());
14313     if (!L)
14314       continue;
14315 
14316     // Check if it has a preheader.
14317     BasicBlock *PreHeader = L->getLoopPreheader();
14318     if (!PreHeader)
14319       continue;
14320 
14321     // If the vector or the element that we insert into it are
14322     // instructions that are defined in this basic block then we can't
14323     // hoist this instruction.
14324     if (any_of(I->operands(), [L](Value *V) {
14325           auto *OpI = dyn_cast<Instruction>(V);
14326           return OpI && L->contains(OpI);
14327         }))
14328       continue;
14329 
14330     // We can hoist this instruction. Move it to the pre-header.
14331     I->moveBefore(PreHeader->getTerminator());
14332     CSEBlocks.insert(PreHeader);
14333   }
14334 
14335   // Make a list of all reachable blocks in our CSE queue.
14336   SmallVector<const DomTreeNode *, 8> CSEWorkList;
14337   CSEWorkList.reserve(CSEBlocks.size());
14338   for (BasicBlock *BB : CSEBlocks)
14339     if (DomTreeNode *N = DT->getNode(BB)) {
14340       assert(DT->isReachableFromEntry(N));
14341       CSEWorkList.push_back(N);
14342     }
14343 
14344   // Sort blocks by domination. This ensures we visit a block after all blocks
14345   // dominating it are visited.
14346   llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14347     assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14348            "Different nodes should have different DFS numbers");
14349     return A->getDFSNumIn() < B->getDFSNumIn();
14350   });
14351 
14352   // Less defined shuffles can be replaced by the more defined copies.
14353   // Between two shuffles one is less defined if it has the same vector operands
14354   // and its mask indeces are the same as in the first one or undefs. E.g.
14355   // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
14356   // poison, <0, 0, 0, 0>.
14357   auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14358                                            SmallVectorImpl<int> &NewMask) {
14359     if (I1->getType() != I2->getType())
14360       return false;
14361     auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14362     auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14363     if (!SI1 || !SI2)
14364       return I1->isIdenticalTo(I2);
14365     if (SI1->isIdenticalTo(SI2))
14366       return true;
14367     for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14368       if (SI1->getOperand(I) != SI2->getOperand(I))
14369         return false;
14370     // Check if the second instruction is more defined than the first one.
14371     NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14372     ArrayRef<int> SM1 = SI1->getShuffleMask();
14373     // Count trailing undefs in the mask to check the final number of used
14374     // registers.
14375     unsigned LastUndefsCnt = 0;
14376     for (int I = 0, E = NewMask.size(); I < E; ++I) {
14377       if (SM1[I] == PoisonMaskElem)
14378         ++LastUndefsCnt;
14379       else
14380         LastUndefsCnt = 0;
14381       if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14382           NewMask[I] != SM1[I])
14383         return false;
14384       if (NewMask[I] == PoisonMaskElem)
14385         NewMask[I] = SM1[I];
14386     }
14387     // Check if the last undefs actually change the final number of used vector
14388     // registers.
14389     return SM1.size() - LastUndefsCnt > 1 &&
14390            TTI->getNumberOfParts(SI1->getType()) ==
14391                TTI->getNumberOfParts(
14392                    getWidenedType(SI1->getType()->getElementType(),
14393                                   SM1.size() - LastUndefsCnt));
14394   };
14395   // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14396   // instructions. TODO: We can further optimize this scan if we split the
14397   // instructions into different buckets based on the insert lane.
14398   SmallVector<Instruction *, 16> Visited;
14399   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14400     assert(*I &&
14401            (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14402            "Worklist not sorted properly!");
14403     BasicBlock *BB = (*I)->getBlock();
14404     // For all instructions in blocks containing gather sequences:
14405     for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14406       if (isDeleted(&In))
14407         continue;
14408       if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14409           !GatherShuffleExtractSeq.contains(&In))
14410         continue;
14411 
14412       // Check if we can replace this instruction with any of the
14413       // visited instructions.
14414       bool Replaced = false;
14415       for (Instruction *&V : Visited) {
14416         SmallVector<int> NewMask;
14417         if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14418             DT->dominates(V->getParent(), In.getParent())) {
14419           In.replaceAllUsesWith(V);
14420           eraseInstruction(&In);
14421           if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14422             if (!NewMask.empty())
14423               SI->setShuffleMask(NewMask);
14424           Replaced = true;
14425           break;
14426         }
14427         if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14428             GatherShuffleExtractSeq.contains(V) &&
14429             IsIdenticalOrLessDefined(V, &In, NewMask) &&
14430             DT->dominates(In.getParent(), V->getParent())) {
14431           In.moveAfter(V);
14432           V->replaceAllUsesWith(&In);
14433           eraseInstruction(V);
14434           if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14435             if (!NewMask.empty())
14436               SI->setShuffleMask(NewMask);
14437           V = &In;
14438           Replaced = true;
14439           break;
14440         }
14441       }
14442       if (!Replaced) {
14443         assert(!is_contained(Visited, &In));
14444         Visited.push_back(&In);
14445       }
14446     }
14447   }
14448   CSEBlocks.clear();
14449   GatherShuffleExtractSeq.clear();
14450 }
14451 
14452 BoUpSLP::ScheduleData *
14453 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14454   ScheduleData *Bundle = nullptr;
14455   ScheduleData *PrevInBundle = nullptr;
14456   for (Value *V : VL) {
14457     if (doesNotNeedToBeScheduled(V))
14458       continue;
14459     ScheduleData *BundleMember = getScheduleData(V);
14460     assert(BundleMember &&
14461            "no ScheduleData for bundle member "
14462            "(maybe not in same basic block)");
14463     assert(BundleMember->isSchedulingEntity() &&
14464            "bundle member already part of other bundle");
14465     if (PrevInBundle) {
14466       PrevInBundle->NextInBundle = BundleMember;
14467     } else {
14468       Bundle = BundleMember;
14469     }
14470 
14471     // Group the instructions to a bundle.
14472     BundleMember->FirstInBundle = Bundle;
14473     PrevInBundle = BundleMember;
14474   }
14475   assert(Bundle && "Failed to find schedule bundle");
14476   return Bundle;
14477 }
14478 
14479 // Groups the instructions to a bundle (which is then a single scheduling entity)
14480 // and schedules instructions until the bundle gets ready.
14481 std::optional<BoUpSLP::ScheduleData *>
14482 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14483                                             const InstructionsState &S) {
14484   // No need to schedule PHIs, insertelement, extractelement and extractvalue
14485   // instructions.
14486   if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14487       doesNotNeedToSchedule(VL))
14488     return nullptr;
14489 
14490   // Initialize the instruction bundle.
14491   Instruction *OldScheduleEnd = ScheduleEnd;
14492   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");
14493 
14494   auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14495                                                          ScheduleData *Bundle) {
14496     // The scheduling region got new instructions at the lower end (or it is a
14497     // new region for the first bundle). This makes it necessary to
14498     // recalculate all dependencies.
14499     // It is seldom that this needs to be done a second time after adding the
14500     // initial bundle to the region.
14501     if (ScheduleEnd != OldScheduleEnd) {
14502       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14503         doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14504       ReSchedule = true;
14505     }
14506     if (Bundle) {
14507       LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14508                         << " in block " << BB->getName() << "\n");
14509       calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14510     }
14511 
14512     if (ReSchedule) {
14513       resetSchedule();
14514       initialFillReadyList(ReadyInsts);
14515     }
14516 
14517     // Now try to schedule the new bundle or (if no bundle) just calculate
14518     // dependencies. As soon as the bundle is "ready" it means that there are no
14519     // cyclic dependencies and we can schedule it. Note that's important that we
14520     // don't "schedule" the bundle yet (see cancelScheduling).
14521     while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14522            !ReadyInsts.empty()) {
14523       ScheduleData *Picked = ReadyInsts.pop_back_val();
14524       assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14525              "must be ready to schedule");
14526       schedule(Picked, ReadyInsts);
14527     }
14528   };
14529 
14530   // Make sure that the scheduling region contains all
14531   // instructions of the bundle.
14532   for (Value *V : VL) {
14533     if (doesNotNeedToBeScheduled(V))
14534       continue;
14535     if (!extendSchedulingRegion(V, S)) {
14536       // If the scheduling region got new instructions at the lower end (or it
14537       // is a new region for the first bundle). This makes it necessary to
14538       // recalculate all dependencies.
14539       // Otherwise the compiler may crash trying to incorrectly calculate
14540       // dependencies and emit instruction in the wrong order at the actual
14541       // scheduling.
14542       TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14543       return std::nullopt;
14544     }
14545   }
14546 
14547   bool ReSchedule = false;
14548   for (Value *V : VL) {
14549     if (doesNotNeedToBeScheduled(V))
14550       continue;
14551     ScheduleData *BundleMember = getScheduleData(V);
14552     assert(BundleMember &&
14553            "no ScheduleData for bundle member (maybe not in same basic block)");
14554 
14555     // Make sure we don't leave the pieces of the bundle in the ready list when
14556     // whole bundle might not be ready.
14557     ReadyInsts.remove(BundleMember);
14558 
14559     if (!BundleMember->IsScheduled)
14560       continue;
14561     // A bundle member was scheduled as single instruction before and now
14562     // needs to be scheduled as part of the bundle. We just get rid of the
14563     // existing schedule.
14564     LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
14565                       << " was already scheduled\n");
14566     ReSchedule = true;
14567   }
14568 
14569   auto *Bundle = buildBundle(VL);
14570   TryScheduleBundleImpl(ReSchedule, Bundle);
14571   if (!Bundle->isReady()) {
14572     cancelScheduling(VL, S.OpValue);
14573     return std::nullopt;
14574   }
14575   return Bundle;
14576 }
14577 
14578 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14579                                                 Value *OpValue) {
14580   if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14581       doesNotNeedToSchedule(VL))
14582     return;
14583 
14584   if (doesNotNeedToBeScheduled(OpValue))
14585     OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14586   ScheduleData *Bundle = getScheduleData(OpValue);
14587   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
14588   assert(!Bundle->IsScheduled &&
14589          "Can't cancel bundle which is already scheduled");
14590   assert(Bundle->isSchedulingEntity() &&
14591          (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14592          "tried to unbundle something which is not a bundle");
14593 
14594   // Remove the bundle from the ready list.
14595   if (Bundle->isReady())
14596     ReadyInsts.remove(Bundle);
14597 
14598   // Un-bundle: make single instructions out of the bundle.
14599   ScheduleData *BundleMember = Bundle;
14600   while (BundleMember) {
14601     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14602     BundleMember->FirstInBundle = BundleMember;
14603     ScheduleData *Next = BundleMember->NextInBundle;
14604     BundleMember->NextInBundle = nullptr;
14605     BundleMember->TE = nullptr;
14606     if (BundleMember->unscheduledDepsInBundle() == 0) {
14607       ReadyInsts.insert(BundleMember);
14608     }
14609     BundleMember = Next;
14610   }
14611 }
14612 
14613 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14614   // Allocate a new ScheduleData for the instruction.
14615   if (ChunkPos >= ChunkSize) {
14616     ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14617     ChunkPos = 0;
14618   }
14619   return &(ScheduleDataChunks.back()[ChunkPos++]);
14620 }
14621 
14622 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14623                                                       const InstructionsState &S) {
14624   if (getScheduleData(V, isOneOf(S, V)))
14625     return true;
14626   Instruction *I = dyn_cast<Instruction>(V);
14627   assert(I && "bundle member must be an instruction");
14628   assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14629          !doesNotNeedToBeScheduled(I) &&
14630          "phi nodes/insertelements/extractelements/extractvalues don't need to "
14631          "be scheduled");
14632   auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14633     ScheduleData *ISD = getScheduleData(I);
14634     if (!ISD)
14635       return false;
14636     assert(isInSchedulingRegion(ISD) &&
14637            "ScheduleData not in scheduling region");
14638     ScheduleData *SD = allocateScheduleDataChunks();
14639     SD->Inst = I;
14640     SD->init(SchedulingRegionID, S.OpValue);
14641     ExtraScheduleDataMap[I][S.OpValue] = SD;
14642     return true;
14643   };
14644   if (CheckScheduleForI(I))
14645     return true;
14646   if (!ScheduleStart) {
14647     // It's the first instruction in the new region.
14648     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14649     ScheduleStart = I;
14650     ScheduleEnd = I->getNextNode();
14651     if (isOneOf(S, I) != I)
14652       CheckScheduleForI(I);
14653     assert(ScheduleEnd && "tried to vectorize a terminator?");
14654     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
14655     return true;
14656   }
14657   // Search up and down at the same time, because we don't know if the new
14658   // instruction is above or below the existing scheduling region.
14659   // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
14660   // against the budget. Otherwise debug info could affect codegen.
14661   BasicBlock::reverse_iterator UpIter =
14662       ++ScheduleStart->getIterator().getReverse();
14663   BasicBlock::reverse_iterator UpperEnd = BB->rend();
14664   BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14665   BasicBlock::iterator LowerEnd = BB->end();
14666   auto IsAssumeLikeIntr = [](const Instruction &I) {
14667     if (auto *II = dyn_cast<IntrinsicInst>(&I))
14668       return II->isAssumeLikeIntrinsic();
14669     return false;
14670   };
14671   UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14672   DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14673   while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14674          &*DownIter != I) {
14675     if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14676       LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
14677       return false;
14678     }
14679 
14680     ++UpIter;
14681     ++DownIter;
14682 
14683     UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14684     DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14685   }
14686   if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14687     assert(I->getParent() == ScheduleStart->getParent() &&
14688            "Instruction is in wrong basic block.");
14689     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14690     ScheduleStart = I;
14691     if (isOneOf(S, I) != I)
14692       CheckScheduleForI(I);
14693     LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
14694                       << "\n");
14695     return true;
14696   }
14697   assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14698          "Expected to reach top of the basic block or instruction down the "
14699          "lower end.");
14700   assert(I->getParent() == ScheduleEnd->getParent() &&
14701          "Instruction is in wrong basic block.");
14702   initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14703                    nullptr);
14704   ScheduleEnd = I->getNextNode();
14705   if (isOneOf(S, I) != I)
14706     CheckScheduleForI(I);
14707   assert(ScheduleEnd && "tried to vectorize a terminator?");
14708   LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
14709   return true;
14710 }
14711 
14712 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14713                                                 Instruction *ToI,
14714                                                 ScheduleData *PrevLoadStore,
14715                                                 ScheduleData *NextLoadStore) {
14716   ScheduleData *CurrentLoadStore = PrevLoadStore;
14717   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14718     // No need to allocate data for non-schedulable instructions.
14719     if (doesNotNeedToBeScheduled(I))
14720       continue;
14721     ScheduleData *SD = ScheduleDataMap.lookup(I);
14722     if (!SD) {
14723       SD = allocateScheduleDataChunks();
14724       ScheduleDataMap[I] = SD;
14725       SD->Inst = I;
14726     }
14727     assert(!isInSchedulingRegion(SD) &&
14728            "new ScheduleData already in scheduling region");
14729     SD->init(SchedulingRegionID, I);
14730 
14731     if (I->mayReadOrWriteMemory() &&
14732         (!isa<IntrinsicInst>(I) ||
14733          (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734           cast<IntrinsicInst>(I)->getIntrinsicID() !=
14735               Intrinsic::pseudoprobe))) {
14736       // Update the linked list of memory accessing instructions.
14737       if (CurrentLoadStore) {
14738         CurrentLoadStore->NextLoadStore = SD;
14739       } else {
14740         FirstLoadStoreInRegion = SD;
14741       }
14742       CurrentLoadStore = SD;
14743     }
14744 
14745     if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14746         match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14747       RegionHasStackSave = true;
14748   }
14749   if (NextLoadStore) {
14750     if (CurrentLoadStore)
14751       CurrentLoadStore->NextLoadStore = NextLoadStore;
14752   } else {
14753     LastLoadStoreInRegion = CurrentLoadStore;
14754   }
14755 }
14756 
14757 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14758                                                      bool InsertInReadyList,
14759                                                      BoUpSLP *SLP) {
14760   assert(SD->isSchedulingEntity());
14761 
14762   SmallVector<ScheduleData *, 10> WorkList;
14763   WorkList.push_back(SD);
14764 
14765   while (!WorkList.empty()) {
14766     ScheduleData *SD = WorkList.pop_back_val();
14767     for (ScheduleData *BundleMember = SD; BundleMember;
14768          BundleMember = BundleMember->NextInBundle) {
14769       assert(isInSchedulingRegion(BundleMember));
14770       if (BundleMember->hasValidDependencies())
14771         continue;
14772 
14773       LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
14774                  << "\n");
14775       BundleMember->Dependencies = 0;
14776       BundleMember->resetUnscheduledDeps();
14777 
14778       // Handle def-use chain dependencies.
14779       if (BundleMember->OpValue != BundleMember->Inst) {
14780         if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14781           BundleMember->Dependencies++;
14782           ScheduleData *DestBundle = UseSD->FirstInBundle;
14783           if (!DestBundle->IsScheduled)
14784             BundleMember->incrementUnscheduledDeps(1);
14785           if (!DestBundle->hasValidDependencies())
14786             WorkList.push_back(DestBundle);
14787         }
14788       } else {
14789         for (User *U : BundleMember->Inst->users()) {
14790           if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14791             BundleMember->Dependencies++;
14792             ScheduleData *DestBundle = UseSD->FirstInBundle;
14793             if (!DestBundle->IsScheduled)
14794               BundleMember->incrementUnscheduledDeps(1);
14795             if (!DestBundle->hasValidDependencies())
14796               WorkList.push_back(DestBundle);
14797           }
14798         }
14799       }
14800 
14801       auto MakeControlDependent = [&](Instruction *I) {
14802         auto *DepDest = getScheduleData(I);
14803         assert(DepDest && "must be in schedule window");
14804         DepDest->ControlDependencies.push_back(BundleMember);
14805         BundleMember->Dependencies++;
14806         ScheduleData *DestBundle = DepDest->FirstInBundle;
14807         if (!DestBundle->IsScheduled)
14808           BundleMember->incrementUnscheduledDeps(1);
14809         if (!DestBundle->hasValidDependencies())
14810           WorkList.push_back(DestBundle);
14811       };
14812 
14813       // Any instruction which isn't safe to speculate at the beginning of the
14814       // block is control dependend on any early exit or non-willreturn call
14815       // which proceeds it.
14816       if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14817         for (Instruction *I = BundleMember->Inst->getNextNode();
14818              I != ScheduleEnd; I = I->getNextNode()) {
14819           if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14820             continue;
14821 
14822           // Add the dependency
14823           MakeControlDependent(I);
14824 
14825           if (!isGuaranteedToTransferExecutionToSuccessor(I))
14826             // Everything past here must be control dependent on I.
14827             break;
14828         }
14829       }
14830 
14831       if (RegionHasStackSave) {
14832         // If we have an inalloc alloca instruction, it needs to be scheduled
14833         // after any preceeding stacksave.  We also need to prevent any alloca
14834         // from reordering above a preceeding stackrestore.
14835         if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14836             match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14837           for (Instruction *I = BundleMember->Inst->getNextNode();
14838                I != ScheduleEnd; I = I->getNextNode()) {
14839             if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14840                 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14841               // Any allocas past here must be control dependent on I, and I
14842               // must be memory dependend on BundleMember->Inst.
14843               break;
14844 
14845             if (!isa<AllocaInst>(I))
14846               continue;
14847 
14848             // Add the dependency
14849             MakeControlDependent(I);
14850           }
14851         }
14852 
14853         // In addition to the cases handle just above, we need to prevent
14854         // allocas and loads/stores from moving below a stacksave or a
14855         // stackrestore. Avoiding moving allocas below stackrestore is currently
14856         // thought to be conservatism. Moving loads/stores below a stackrestore
14857         // can lead to incorrect code.
14858         if (isa<AllocaInst>(BundleMember->Inst) ||
14859             BundleMember->Inst->mayReadOrWriteMemory()) {
14860           for (Instruction *I = BundleMember->Inst->getNextNode();
14861                I != ScheduleEnd; I = I->getNextNode()) {
14862             if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14863                 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14864               continue;
14865 
14866             // Add the dependency
14867             MakeControlDependent(I);
14868             break;
14869           }
14870         }
14871       }
14872 
14873       // Handle the memory dependencies (if any).
14874       ScheduleData *DepDest = BundleMember->NextLoadStore;
14875       if (!DepDest)
14876         continue;
14877       Instruction *SrcInst = BundleMember->Inst;
14878       assert(SrcInst->mayReadOrWriteMemory() &&
14879              "NextLoadStore list for non memory effecting bundle?");
14880       MemoryLocation SrcLoc = getLocation(SrcInst);
14881       bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14882       unsigned NumAliased = 0;
14883       unsigned DistToSrc = 1;
14884 
14885       for (; DepDest; DepDest = DepDest->NextLoadStore) {
14886         assert(isInSchedulingRegion(DepDest));
14887 
14888         // We have two limits to reduce the complexity:
14889         // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14890         //    SLP->isAliased (which is the expensive part in this loop).
14891         // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14892         //    the whole loop (even if the loop is fast, it's quadratic).
14893         //    It's important for the loop break condition (see below) to
14894         //    check this limit even between two read-only instructions.
14895         if (DistToSrc >= MaxMemDepDistance ||
14896             ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14897              (NumAliased >= AliasedCheckLimit ||
14898               SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14899 
14900           // We increment the counter only if the locations are aliased
14901           // (instead of counting all alias checks). This gives a better
14902           // balance between reduced runtime and accurate dependencies.
14903           NumAliased++;
14904 
14905           DepDest->MemoryDependencies.push_back(BundleMember);
14906           BundleMember->Dependencies++;
14907           ScheduleData *DestBundle = DepDest->FirstInBundle;
14908           if (!DestBundle->IsScheduled) {
14909             BundleMember->incrementUnscheduledDeps(1);
14910           }
14911           if (!DestBundle->hasValidDependencies()) {
14912             WorkList.push_back(DestBundle);
14913           }
14914         }
14915 
14916         // Example, explaining the loop break condition: Let's assume our
14917         // starting instruction is i0 and MaxMemDepDistance = 3.
14918         //
14919         //                      +--------v--v--v
14920         //             i0,i1,i2,i3,i4,i5,i6,i7,i8
14921         //             +--------^--^--^
14922         //
14923         // MaxMemDepDistance let us stop alias-checking at i3 and we add
14924         // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14925         // Previously we already added dependencies from i3 to i6,i7,i8
14926         // (because of MaxMemDepDistance). As we added a dependency from
14927         // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14928         // and we can abort this loop at i6.
14929         if (DistToSrc >= 2 * MaxMemDepDistance)
14930           break;
14931         DistToSrc++;
14932       }
14933     }
14934     if (InsertInReadyList && SD->isReady()) {
14935       ReadyInsts.insert(SD);
14936       LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
14937                         << "\n");
14938     }
14939   }
14940 }
14941 
14942 void BoUpSLP::BlockScheduling::resetSchedule() {
14943   assert(ScheduleStart &&
14944          "tried to reset schedule on block which has not been scheduled");
14945   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14946     doForAllOpcodes(I, [&](ScheduleData *SD) {
14947       assert(isInSchedulingRegion(SD) &&
14948              "ScheduleData not in scheduling region");
14949       SD->IsScheduled = false;
14950       SD->resetUnscheduledDeps();
14951     });
14952   }
14953   ReadyInsts.clear();
14954 }
14955 
14956 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14957   if (!BS->ScheduleStart)
14958     return;
14959 
14960   LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14961 
14962   // A key point - if we got here, pre-scheduling was able to find a valid
14963   // scheduling of the sub-graph of the scheduling window which consists
14964   // of all vector bundles and their transitive users.  As such, we do not
14965   // need to reschedule anything *outside of* that subgraph.
14966 
14967   BS->resetSchedule();
14968 
14969   // For the real scheduling we use a more sophisticated ready-list: it is
14970   // sorted by the original instruction location. This lets the final schedule
14971   // be as  close as possible to the original instruction order.
14972   // WARNING: If changing this order causes a correctness issue, that means
14973   // there is some missing dependence edge in the schedule data graph.
14974   struct ScheduleDataCompare {
14975     bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14976       return SD2->SchedulingPriority < SD1->SchedulingPriority;
14977     }
14978   };
14979   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14980 
14981   // Ensure that all dependency data is updated (for nodes in the sub-graph)
14982   // and fill the ready-list with initial instructions.
14983   int Idx = 0;
14984   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14985        I = I->getNextNode()) {
14986     BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14987       TreeEntry *SDTE = getTreeEntry(SD->Inst);
14988       (void)SDTE;
14989       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14990               SD->isPartOfBundle() ==
14991                   (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14992              "scheduler and vectorizer bundle mismatch");
14993       SD->FirstInBundle->SchedulingPriority = Idx++;
14994 
14995       if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14996         BS->calculateDependencies(SD, false, this);
14997     });
14998   }
14999   BS->initialFillReadyList(ReadyInsts);
15000 
15001   Instruction *LastScheduledInst = BS->ScheduleEnd;
15002 
15003   // Do the "real" scheduling.
15004   while (!ReadyInsts.empty()) {
15005     ScheduleData *Picked = *ReadyInsts.begin();
15006     ReadyInsts.erase(ReadyInsts.begin());
15007 
15008     // Move the scheduled instruction(s) to their dedicated places, if not
15009     // there yet.
15010     for (ScheduleData *BundleMember = Picked; BundleMember;
15011          BundleMember = BundleMember->NextInBundle) {
15012       Instruction *PickedInst = BundleMember->Inst;
15013       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15014         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
15015       LastScheduledInst = PickedInst;
15016     }
15017 
15018     BS->schedule(Picked, ReadyInsts);
15019   }
15020 
15021   // Check that we didn't break any of our invariants.
15022 #ifdef EXPENSIVE_CHECKS
15023   BS->verify();
15024 #endif
15025 
15026 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15027   // Check that all schedulable entities got scheduled
15028   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
15029     BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15030       if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15031         assert(SD->IsScheduled && "must be scheduled at this point");
15032       }
15033     });
15034   }
15035 #endif
15036 
15037   // Avoid duplicate scheduling of the block.
15038   BS->ScheduleStart = nullptr;
15039 }
15040 
15041 unsigned BoUpSLP::getVectorElementSize(Value *V) {
15042   // If V is a store, just return the width of the stored value (or value
15043   // truncated just before storing) without traversing the expression tree.
15044   // This is the common case.
15045   if (auto *Store = dyn_cast<StoreInst>(V))
15046     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
15047 
15048   if (auto *IEI = dyn_cast<InsertElementInst>(V))
15049     return getVectorElementSize(IEI->getOperand(1));
15050 
15051   auto E = InstrElementSize.find(V);
15052   if (E != InstrElementSize.end())
15053     return E->second;
15054 
15055   // If V is not a store, we can traverse the expression tree to find loads
15056   // that feed it. The type of the loaded value may indicate a more suitable
15057   // width than V's type. We want to base the vector element size on the width
15058   // of memory operations where possible.
15059   SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15060   SmallPtrSet<Instruction *, 16> Visited;
15061   if (auto *I = dyn_cast<Instruction>(V)) {
15062     Worklist.emplace_back(I, I->getParent(), 0);
15063     Visited.insert(I);
15064   }
15065 
15066   // Traverse the expression tree in bottom-up order looking for loads. If we
15067   // encounter an instruction we don't yet handle, we give up.
15068   auto Width = 0u;
15069   Value *FirstNonBool = nullptr;
15070   while (!Worklist.empty()) {
15071     auto [I, Parent, Level] = Worklist.pop_back_val();
15072 
15073     // We should only be looking at scalar instructions here. If the current
15074     // instruction has a vector type, skip.
15075     auto *Ty = I->getType();
15076     if (isa<VectorType>(Ty))
15077       continue;
15078     if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15079       FirstNonBool = I;
15080     if (Level > RecursionMaxDepth)
15081       continue;
15082 
15083     // If the current instruction is a load, update MaxWidth to reflect the
15084     // width of the loaded value.
15085     if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15086       Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15087 
15088     // Otherwise, we need to visit the operands of the instruction. We only
15089     // handle the interesting cases from buildTree here. If an operand is an
15090     // instruction we haven't yet visited and from the same basic block as the
15091     // user or the use is a PHI node, we add it to the worklist.
15092     else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15093                  BinaryOperator, UnaryOperator>(I)) {
15094       for (Use &U : I->operands()) {
15095         if (auto *J = dyn_cast<Instruction>(U.get()))
15096           if (Visited.insert(J).second &&
15097               (isa<PHINode>(I) || J->getParent() == Parent)) {
15098             Worklist.emplace_back(J, J->getParent(), Level + 1);
15099             continue;
15100           }
15101         if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15102           FirstNonBool = U.get();
15103       }
15104     } else {
15105       break;
15106     }
15107   }
15108 
15109   // If we didn't encounter a memory access in the expression tree, or if we
15110   // gave up for some reason, just return the width of V. Otherwise, return the
15111   // maximum width we found.
15112   if (!Width) {
15113     if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15114       V = FirstNonBool;
15115     Width = DL->getTypeSizeInBits(V->getType());
15116   }
15117 
15118   for (Instruction *I : Visited)
15119     InstrElementSize[I] = Width;
15120 
15121   return Width;
15122 }
15123 
15124 bool BoUpSLP::collectValuesToDemote(
15125     const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15126     SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15127     unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15128     bool IsTruncRoot) const {
15129   // We can always demote constants.
15130   if (all_of(E.Scalars, IsaPred<Constant>))
15131     return true;
15132 
15133   unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
15134   if (OrigBitWidth == BitWidth) {
15135     MaxDepthLevel = 1;
15136     return true;
15137   }
15138 
15139   // If the value is not a vectorized instruction in the expression and not used
15140   // by the insertelement instruction and not used in multiple vector nodes, it
15141   // cannot be demoted.
15142   bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15143     return !isKnownNonNegative(R, SimplifyQuery(*DL));
15144   });
15145   auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15146     if (MultiNodeScalars.contains(V))
15147       return false;
15148     // For lat shuffle of sext/zext with many uses need to check the extra bit
15149     // for unsigned values, otherwise may have incorrect casting for reused
15150     // scalars.
15151     bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
15152     if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15153       APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15154       if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15155         return true;
15156     }
15157     unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15158     unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15159     if (IsSignedNode)
15160       ++BitWidth1;
15161     if (auto *I = dyn_cast<Instruction>(V)) {
15162       APInt Mask = DB->getDemandedBits(I);
15163       unsigned BitWidth2 =
15164           std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
15165       while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15166         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
15167         if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15168           break;
15169         BitWidth2 *= 2;
15170       }
15171       BitWidth1 = std::min(BitWidth1, BitWidth2);
15172     }
15173     BitWidth = std::max(BitWidth, BitWidth1);
15174     return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15175   };
15176   using namespace std::placeholders;
15177   auto FinalAnalysis = [&]() {
15178     if (!IsProfitableToDemote)
15179       return false;
15180     bool Res = all_of(
15181         E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
15182     // Demote gathers.
15183     if (Res && E.isGather()) {
15184       // Check possible extractelement instructions bases and final vector
15185       // length.
15186       SmallPtrSet<Value *, 4> UniqueBases;
15187       for (Value *V : E.Scalars) {
15188         auto *EE = dyn_cast<ExtractElementInst>(V);
15189         if (!EE)
15190           continue;
15191         UniqueBases.insert(EE->getVectorOperand());
15192       }
15193       const unsigned VF = E.Scalars.size();
15194       Type *OrigScalarTy = E.Scalars.front()->getType();
15195       if (UniqueBases.size() <= 2 ||
15196           TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
15197               TTI->getNumberOfParts(getWidenedType(
15198                   IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
15199         ToDemote.push_back(E.Idx);
15200     }
15201     return Res;
15202   };
15203   if (E.isGather() || !Visited.insert(&E).second ||
15204       any_of(E.Scalars, [&](Value *V) {
15205         return all_of(V->users(), [&](User *U) {
15206           return isa<InsertElementInst>(U) && !getTreeEntry(U);
15207         });
15208       }))
15209     return FinalAnalysis();
15210 
15211   if (any_of(E.Scalars, [&](Value *V) {
15212         return !all_of(V->users(), [=](User *U) {
15213           return getTreeEntry(U) ||
15214                  (E.Idx == 0 && UserIgnoreList &&
15215                   UserIgnoreList->contains(U)) ||
15216                  (!isa<CmpInst>(U) && U->getType()->isSized() &&
15217                   !U->getType()->isScalableTy() &&
15218                   DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15219         }) && !IsPotentiallyTruncated(V, BitWidth);
15220       }))
15221     return false;
15222 
15223   auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15224                              bool &NeedToExit) {
15225     NeedToExit = false;
15226     unsigned InitLevel = MaxDepthLevel;
15227     for (const TreeEntry *Op : Operands) {
15228       unsigned Level = InitLevel;
15229       if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15230                                  ToDemote, Visited, Level, IsProfitableToDemote,
15231                                  IsTruncRoot)) {
15232         if (!IsProfitableToDemote)
15233           return false;
15234         NeedToExit = true;
15235         if (!FinalAnalysis())
15236           return false;
15237         continue;
15238       }
15239       MaxDepthLevel = std::max(MaxDepthLevel, Level);
15240     }
15241     return true;
15242   };
15243   auto AttemptCheckBitwidth =
15244       [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15245         // Try all bitwidth < OrigBitWidth.
15246         NeedToExit = false;
15247         unsigned BestFailBitwidth = 0;
15248         for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15249           if (Checker(BitWidth, OrigBitWidth))
15250             return true;
15251           if (BestFailBitwidth == 0 && FinalAnalysis())
15252             BestFailBitwidth = BitWidth;
15253         }
15254         if (BitWidth >= OrigBitWidth) {
15255           if (BestFailBitwidth == 0) {
15256             BitWidth = OrigBitWidth;
15257             return false;
15258           }
15259           MaxDepthLevel = 1;
15260           BitWidth = BestFailBitwidth;
15261           NeedToExit = true;
15262           return true;
15263         }
15264         return false;
15265       };
15266   auto TryProcessInstruction =
15267       [&](unsigned &BitWidth,
15268           ArrayRef<const TreeEntry *> Operands = std::nullopt,
15269           function_ref<bool(unsigned, unsigned)> Checker = {}) {
15270         if (Operands.empty()) {
15271           if (!IsTruncRoot)
15272             MaxDepthLevel = 1;
15273           (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15274                                               std::ref(BitWidth)));
15275         } else {
15276           // Several vectorized uses? Check if we can truncate it, otherwise -
15277           // exit.
15278           if (E.UserTreeIndices.size() > 1 &&
15279               !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15280                                            std::ref(BitWidth))))
15281             return false;
15282           bool NeedToExit = false;
15283           if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15284             return false;
15285           if (NeedToExit)
15286             return true;
15287           if (!ProcessOperands(Operands, NeedToExit))
15288             return false;
15289           if (NeedToExit)
15290             return true;
15291         }
15292 
15293         ++MaxDepthLevel;
15294         // Record the entry that we can demote.
15295         ToDemote.push_back(E.Idx);
15296         return IsProfitableToDemote;
15297       };
15298   switch (E.getOpcode()) {
15299 
15300   // We can always demote truncations and extensions. Since truncations can
15301   // seed additional demotion, we save the truncated value.
15302   case Instruction::Trunc:
15303     if (IsProfitableToDemoteRoot)
15304       IsProfitableToDemote = true;
15305     return TryProcessInstruction(BitWidth);
15306   case Instruction::ZExt:
15307   case Instruction::SExt:
15308     IsProfitableToDemote = true;
15309     return TryProcessInstruction(BitWidth);
15310 
15311   // We can demote certain binary operations if we can demote both of their
15312   // operands.
15313   case Instruction::Add:
15314   case Instruction::Sub:
15315   case Instruction::Mul:
15316   case Instruction::And:
15317   case Instruction::Or:
15318   case Instruction::Xor: {
15319     return TryProcessInstruction(
15320         BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15321   }
15322   case Instruction::Shl: {
15323     // If we are truncating the result of this SHL, and if it's a shift of an
15324     // inrange amount, we can always perform a SHL in a smaller type.
15325     auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15326       return all_of(E.Scalars, [&](Value *V) {
15327         auto *I = cast<Instruction>(V);
15328         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15329         return AmtKnownBits.getMaxValue().ult(BitWidth);
15330       });
15331     };
15332     return TryProcessInstruction(
15333         BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15334   }
15335   case Instruction::LShr: {
15336     // If this is a truncate of a logical shr, we can truncate it to a smaller
15337     // lshr iff we know that the bits we would otherwise be shifting in are
15338     // already zeros.
15339     auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15340       return all_of(E.Scalars, [&](Value *V) {
15341         auto *I = cast<Instruction>(V);
15342         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15343         APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15344         return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15345                MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15346                                  SimplifyQuery(*DL));
15347       });
15348     };
15349     return TryProcessInstruction(
15350         BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15351         LShrChecker);
15352   }
15353   case Instruction::AShr: {
15354     // If this is a truncate of an arithmetic shr, we can truncate it to a
15355     // smaller ashr iff we know that all the bits from the sign bit of the
15356     // original type and the sign bit of the truncate type are similar.
15357     auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15358       return all_of(E.Scalars, [&](Value *V) {
15359         auto *I = cast<Instruction>(V);
15360         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15361         unsigned ShiftedBits = OrigBitWidth - BitWidth;
15362         return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15363                ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15364                                                 nullptr, DT);
15365       });
15366     };
15367     return TryProcessInstruction(
15368         BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15369         AShrChecker);
15370   }
15371   case Instruction::UDiv:
15372   case Instruction::URem: {
15373     // UDiv and URem can be truncated if all the truncated bits are zero.
15374     auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15375       assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15376       return all_of(E.Scalars, [&](Value *V) {
15377         auto *I = cast<Instruction>(V);
15378         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15379         return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15380                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15381       });
15382     };
15383     return TryProcessInstruction(
15384         BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15385   }
15386 
15387   // We can demote selects if we can demote their true and false values.
15388   case Instruction::Select: {
15389     return TryProcessInstruction(
15390         BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15391   }
15392 
15393   // We can demote phis if we can demote all their incoming operands. Note that
15394   // we don't need to worry about cycles since we ensure single use above.
15395   case Instruction::PHI: {
15396     const unsigned NumOps = E.getNumOperands();
15397     SmallVector<const TreeEntry *> Ops(NumOps);
15398     transform(seq<unsigned>(0, NumOps), Ops.begin(),
15399               std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15400 
15401     return TryProcessInstruction(BitWidth, Ops);
15402   }
15403 
15404   case Instruction::Call: {
15405     auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15406     if (!IC)
15407       break;
15408     Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
15409     if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15410         ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15411       break;
15412     SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15413     function_ref<bool(unsigned, unsigned)> CallChecker;
15414     auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15415       assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15416       return all_of(E.Scalars, [&](Value *V) {
15417         auto *I = cast<Instruction>(V);
15418         if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15419           APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15420           return MaskedValueIsZero(I->getOperand(0), Mask,
15421                                    SimplifyQuery(*DL)) &&
15422                  MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15423         }
15424         assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15425                "Expected min/max intrinsics only.");
15426         unsigned SignBits = OrigBitWidth - BitWidth;
15427         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15428         unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15429                                               nullptr, DT);
15430         unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15431                                               nullptr, DT);
15432         return SignBits <= Op0SignBits &&
15433                ((SignBits != Op0SignBits &&
15434                  !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15435                 MaskedValueIsZero(I->getOperand(0), Mask,
15436                                   SimplifyQuery(*DL))) &&
15437                SignBits <= Op1SignBits &&
15438                ((SignBits != Op1SignBits &&
15439                  !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15440                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15441       });
15442     };
15443     auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15444       assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15445       return all_of(E.Scalars, [&](Value *V) {
15446         auto *I = cast<Instruction>(V);
15447         unsigned SignBits = OrigBitWidth - BitWidth;
15448         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15449         unsigned Op0SignBits =
15450             ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
15451         return SignBits <= Op0SignBits &&
15452                ((SignBits != Op0SignBits &&
15453                  !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15454                 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
15455       });
15456     };
15457     if (ID != Intrinsic::abs) {
15458       Operands.push_back(getOperandEntry(&E, 1));
15459       CallChecker = CompChecker;
15460     } else {
15461       CallChecker = AbsChecker;
15462     }
15463     InstructionCost BestCost =
15464         std::numeric_limits<InstructionCost::CostType>::max();
15465     unsigned BestBitWidth = BitWidth;
15466     unsigned VF = E.Scalars.size();
15467     // Choose the best bitwidth based on cost estimations.
15468     auto Checker = [&](unsigned BitWidth, unsigned) {
15469       unsigned MinBW = PowerOf2Ceil(BitWidth);
15470       SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15471       auto VecCallCosts = getVectorCallCosts(
15472           IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
15473           TTI, TLI, ArgTys);
15474       InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15475       if (Cost < BestCost) {
15476         BestCost = Cost;
15477         BestBitWidth = BitWidth;
15478       }
15479       return false;
15480     };
15481     [[maybe_unused]] bool NeedToExit;
15482     (void)AttemptCheckBitwidth(Checker, NeedToExit);
15483     BitWidth = BestBitWidth;
15484     return TryProcessInstruction(BitWidth, Operands, CallChecker);
15485   }
15486 
15487   // Otherwise, conservatively give up.
15488   default:
15489     break;
15490   }
15491   MaxDepthLevel = 1;
15492   return FinalAnalysis();
15493 }
15494 
15495 static RecurKind getRdxKind(Value *V);
15496 
15497 void BoUpSLP::computeMinimumValueSizes() {
15498   // We only attempt to truncate integer expressions.
15499   bool IsStoreOrInsertElt =
15500       VectorizableTree.front()->getOpcode() == Instruction::Store ||
15501       VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15502   if ((IsStoreOrInsertElt || UserIgnoreList) &&
15503       ExtraBitWidthNodes.size() <= 1 &&
15504       (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15505        CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15506     return;
15507 
15508   unsigned NodeIdx = 0;
15509   if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15510     NodeIdx = 1;
15511 
15512   // Ensure the roots of the vectorizable tree don't form a cycle.
15513   if (VectorizableTree[NodeIdx]->isGather() ||
15514       (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15515       (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15516                               [NodeIdx](const EdgeInfo &EI) {
15517                                 return EI.UserTE->Idx >
15518                                        static_cast<int>(NodeIdx);
15519                               })))
15520     return;
15521 
15522   // The first value node for store/insertelement is sext/zext/trunc? Skip it,
15523   // resize to the final type.
15524   bool IsTruncRoot = false;
15525   bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15526   SmallVector<unsigned> RootDemotes;
15527   if (NodeIdx != 0 &&
15528       VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15529       VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15530     assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15531     IsTruncRoot = true;
15532     RootDemotes.push_back(NodeIdx);
15533     IsProfitableToDemoteRoot = true;
15534     ++NodeIdx;
15535   }
15536 
15537   // Analyzed the reduction already and not profitable - exit.
15538   if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15539     return;
15540 
15541   SmallVector<unsigned> ToDemote;
15542   auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15543                                 bool IsProfitableToDemoteRoot, unsigned Opcode,
15544                                 unsigned Limit, bool IsTruncRoot,
15545                                 bool IsSignedCmp) -> unsigned {
15546     ToDemote.clear();
15547     // Check if the root is trunc and the next node is gather/buildvector, then
15548     // keep trunc in scalars, which is free in most cases.
15549     if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15550         E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15551         all_of(E.Scalars, [&](Value *V) {
15552           return V->hasOneUse() || isa<Constant>(V) ||
15553                  (!V->hasNUsesOrMore(UsesLimit) &&
15554                   none_of(V->users(), [&](User *U) {
15555                     const TreeEntry *TE = getTreeEntry(U);
15556                     const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15557                     if (TE == UserTE || !TE)
15558                       return false;
15559                     if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15560                              SelectInst>(U) ||
15561                         !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15562                              SelectInst>(UserTE->getMainOp()))
15563                       return true;
15564                     unsigned UserTESz = DL->getTypeSizeInBits(
15565                         UserTE->Scalars.front()->getType());
15566                     auto It = MinBWs.find(TE);
15567                     if (It != MinBWs.end() && It->second.first > UserTESz)
15568                       return true;
15569                     return DL->getTypeSizeInBits(U->getType()) > UserTESz;
15570                   }));
15571         })) {
15572       ToDemote.push_back(E.Idx);
15573       const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15574       auto It = MinBWs.find(UserTE);
15575       if (It != MinBWs.end())
15576         return It->second.first;
15577       unsigned MaxBitWidth =
15578           DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
15579       MaxBitWidth = bit_ceil(MaxBitWidth);
15580       if (MaxBitWidth < 8 && MaxBitWidth > 1)
15581         MaxBitWidth = 8;
15582       return MaxBitWidth;
15583     }
15584 
15585     unsigned VF = E.getVectorFactor();
15586     auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15587     if (!TreeRootIT || !Opcode)
15588       return 0u;
15589 
15590     if (any_of(E.Scalars,
15591                [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15592       return 0u;
15593 
15594     unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
15595 
15596     // The maximum bit width required to represent all the values that can be
15597     // demoted without loss of precision. It would be safe to truncate the roots
15598     // of the expression to this width.
15599     unsigned MaxBitWidth = 1u;
15600 
15601     // True if the roots can be zero-extended back to their original type,
15602     // rather than sign-extended. We know that if the leading bits are not
15603     // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15604     // True.
15605     // Determine if the sign bit of all the roots is known to be zero. If not,
15606     // IsKnownPositive is set to False.
15607     bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15608       KnownBits Known = computeKnownBits(R, *DL);
15609       return Known.isNonNegative();
15610     });
15611 
15612     // We first check if all the bits of the roots are demanded. If they're not,
15613     // we can truncate the roots to this narrower type.
15614     for (Value *Root : E.Scalars) {
15615       unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15616       TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15617       unsigned BitWidth1 = NumTypeBits - NumSignBits;
15618       // If we can't prove that the sign bit is zero, we must add one to the
15619       // maximum bit width to account for the unknown sign bit. This preserves
15620       // the existing sign bit so we can safely sign-extend the root back to the
15621       // original type. Otherwise, if we know the sign bit is zero, we will
15622       // zero-extend the root instead.
15623       //
15624       // FIXME: This is somewhat suboptimal, as there will be cases where adding
15625       //        one to the maximum bit width will yield a larger-than-necessary
15626       //        type. In general, we need to add an extra bit only if we can't
15627       //        prove that the upper bit of the original type is equal to the
15628       //        upper bit of the proposed smaller type. If these two bits are
15629       //        the same (either zero or one) we know that sign-extending from
15630       //        the smaller type will result in the same value. Here, since we
15631       //        can't yet prove this, we are just making the proposed smaller
15632       //        type larger to ensure correctness.
15633       if (!IsKnownPositive)
15634         ++BitWidth1;
15635 
15636       APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15637       unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15638       MaxBitWidth =
15639           std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15640     }
15641 
15642     if (MaxBitWidth < 8 && MaxBitWidth > 1)
15643       MaxBitWidth = 8;
15644 
15645     // If the original type is large, but reduced type does not improve the reg
15646     // use - ignore it.
15647     if (NumParts > 1 &&
15648         NumParts ==
15649             TTI->getNumberOfParts(getWidenedType(
15650                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15651       return 0u;
15652 
15653     bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15654                                 Opcode == Instruction::SExt ||
15655                                 Opcode == Instruction::ZExt || NumParts > 1;
15656     // Conservatively determine if we can actually truncate the roots of the
15657     // expression. Collect the values that can be demoted in ToDemote and
15658     // additional roots that require investigating in Roots.
15659     DenseSet<const TreeEntry *> Visited;
15660     unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15661     bool NeedToDemote = IsProfitableToDemote;
15662 
15663     if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15664                                ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15665                                IsTruncRoot) ||
15666         (MaxDepthLevel <= Limit &&
15667          !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15668             (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15669              DL->getTypeSizeInBits(TreeRootIT) /
15670                      DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15671                                                ->getOperand(0)
15672                                                ->getType()) >
15673                  2)))))
15674       return 0u;
15675     // Round MaxBitWidth up to the next power-of-two.
15676     MaxBitWidth = bit_ceil(MaxBitWidth);
15677 
15678     return MaxBitWidth;
15679   };
15680 
15681   // If we can truncate the root, we must collect additional values that might
15682   // be demoted as a result. That is, those seeded by truncations we will
15683   // modify.
15684   // Add reduction ops sizes, if any.
15685   if (UserIgnoreList &&
15686       isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15687     for (Value *V : *UserIgnoreList) {
15688       auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15689       auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15690       unsigned BitWidth1 = NumTypeBits - NumSignBits;
15691       if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15692         ++BitWidth1;
15693       unsigned BitWidth2 = BitWidth1;
15694       if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15695         auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15696         BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15697       }
15698       ReductionBitWidth =
15699           std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15700     }
15701     if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15702       ReductionBitWidth = 8;
15703 
15704     ReductionBitWidth = bit_ceil(ReductionBitWidth);
15705   }
15706   bool IsTopRoot = NodeIdx == 0;
15707   while (NodeIdx < VectorizableTree.size() &&
15708          VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15709          VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15710     RootDemotes.push_back(NodeIdx);
15711     ++NodeIdx;
15712     IsTruncRoot = true;
15713   }
15714   bool IsSignedCmp = false;
15715   while (NodeIdx < VectorizableTree.size()) {
15716     ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15717     unsigned Limit = 2;
15718     unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15719     if (IsTopRoot &&
15720         ReductionBitWidth ==
15721             DL->getTypeSizeInBits(
15722                 VectorizableTree.front()->Scalars.front()->getType()))
15723       Limit = 3;
15724     unsigned MaxBitWidth = ComputeMaxBitWidth(
15725         *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15726         Limit, IsTruncRoot, IsSignedCmp);
15727     if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15728       if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15729         ReductionBitWidth = bit_ceil(MaxBitWidth);
15730       else if (MaxBitWidth == 0)
15731         ReductionBitWidth = 0;
15732     }
15733 
15734     for (unsigned Idx : RootDemotes) {
15735       if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15736             uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15737             if (OrigBitWidth > MaxBitWidth) {
15738               APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15739               return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15740             }
15741             return false;
15742           }))
15743         ToDemote.push_back(Idx);
15744     }
15745     RootDemotes.clear();
15746     IsTopRoot = false;
15747     IsProfitableToDemoteRoot = true;
15748 
15749     if (ExtraBitWidthNodes.empty()) {
15750       NodeIdx = VectorizableTree.size();
15751     } else {
15752       unsigned NewIdx = 0;
15753       do {
15754         NewIdx = *ExtraBitWidthNodes.begin();
15755         ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15756       } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15757       NodeIdx = NewIdx;
15758       IsTruncRoot =
15759           NodeIdx < VectorizableTree.size() &&
15760           any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15761                  [](const EdgeInfo &EI) {
15762                    return EI.EdgeIdx == 0 &&
15763                           EI.UserTE->getOpcode() == Instruction::Trunc &&
15764                           !EI.UserTE->isAltShuffle();
15765                  });
15766       IsSignedCmp =
15767           NodeIdx < VectorizableTree.size() &&
15768           any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15769                  [&](const EdgeInfo &EI) {
15770                    return EI.UserTE->getOpcode() == Instruction::ICmp &&
15771                           any_of(EI.UserTE->Scalars, [&](Value *V) {
15772                             auto *IC = dyn_cast<ICmpInst>(V);
15773                             return IC &&
15774                                    (IC->isSigned() ||
15775                                     !isKnownNonNegative(IC->getOperand(0),
15776                                                         SimplifyQuery(*DL)) ||
15777                                     !isKnownNonNegative(IC->getOperand(1),
15778                                                         SimplifyQuery(*DL)));
15779                           });
15780                  });
15781     }
15782 
15783     // If the maximum bit width we compute is less than the with of the roots'
15784     // type, we can proceed with the narrowing. Otherwise, do nothing.
15785     if (MaxBitWidth == 0 ||
15786         MaxBitWidth >=
15787             cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15788       if (UserIgnoreList)
15789         AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15790       continue;
15791     }
15792 
15793     // Finally, map the values we can demote to the maximum bit with we
15794     // computed.
15795     for (unsigned Idx : ToDemote) {
15796       TreeEntry *TE = VectorizableTree[Idx].get();
15797       if (MinBWs.contains(TE))
15798         continue;
15799       bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15800                         return !isKnownNonNegative(R, SimplifyQuery(*DL));
15801                       });
15802       MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15803     }
15804   }
15805 }
15806 
15807 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15808   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15809   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15810   auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15811   auto *AA = &AM.getResult<AAManager>(F);
15812   auto *LI = &AM.getResult<LoopAnalysis>(F);
15813   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15814   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15815   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15816   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15817 
15818   bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15819   if (!Changed)
15820     return PreservedAnalyses::all();
15821 
15822   PreservedAnalyses PA;
15823   PA.preserveSet<CFGAnalyses>();
15824   return PA;
15825 }
15826 
15827 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15828                                 TargetTransformInfo *TTI_,
15829                                 TargetLibraryInfo *TLI_, AAResults *AA_,
15830                                 LoopInfo *LI_, DominatorTree *DT_,
15831                                 AssumptionCache *AC_, DemandedBits *DB_,
15832                                 OptimizationRemarkEmitter *ORE_) {
15833   if (!RunSLPVectorization)
15834     return false;
15835   SE = SE_;
15836   TTI = TTI_;
15837   TLI = TLI_;
15838   AA = AA_;
15839   LI = LI_;
15840   DT = DT_;
15841   AC = AC_;
15842   DB = DB_;
15843   DL = &F.getDataLayout();
15844 
15845   Stores.clear();
15846   GEPs.clear();
15847   bool Changed = false;
15848 
15849   // If the target claims to have no vector registers don't attempt
15850   // vectorization.
15851   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15852     LLVM_DEBUG(
15853         dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15854     return false;
15855   }
15856 
15857   // Don't vectorize when the attribute NoImplicitFloat is used.
15858   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15859     return false;
15860 
15861   LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15862 
15863   // Use the bottom up slp vectorizer to construct chains that start with
15864   // store instructions.
15865   BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15866 
15867   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15868   // delete instructions.
15869 
15870   // Update DFS numbers now so that we can use them for ordering.
15871   DT->updateDFSNumbers();
15872 
15873   // Scan the blocks in the function in post order.
15874   for (auto *BB : post_order(&F.getEntryBlock())) {
15875     // Start new block - clear the list of reduction roots.
15876     R.clearReductionData();
15877     collectSeedInstructions(BB);
15878 
15879     // Vectorize trees that end at stores.
15880     if (!Stores.empty()) {
15881       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15882                         << " underlying objects.\n");
15883       Changed |= vectorizeStoreChains(R);
15884     }
15885 
15886     // Vectorize trees that end at reductions.
15887     Changed |= vectorizeChainsInBlock(BB, R);
15888 
15889     // Vectorize the index computations of getelementptr instructions. This
15890     // is primarily intended to catch gather-like idioms ending at
15891     // non-consecutive loads.
15892     if (!GEPs.empty()) {
15893       LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15894                         << " underlying objects.\n");
15895       Changed |= vectorizeGEPIndices(BB, R);
15896     }
15897   }
15898 
15899   if (Changed) {
15900     R.optimizeGatherSequence();
15901     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15902   }
15903   return Changed;
15904 }
15905 
15906 std::optional<bool>
15907 SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15908                                        unsigned Idx, unsigned MinVF,
15909                                        unsigned &Size) {
15910   Size = 0;
15911   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15912                     << "\n");
15913   const unsigned Sz = R.getVectorElementSize(Chain[0]);
15914   unsigned VF = Chain.size();
15915 
15916   if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15917     // Check if vectorizing with a non-power-of-2 VF should be considered. At
15918     // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15919     // all vector lanes are used.
15920     if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15921       return false;
15922   }
15923 
15924   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15925                     << "\n");
15926 
15927   SetVector<Value *> ValOps;
15928   for (Value *V : Chain)
15929     ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15930   // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
15931   InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15932   if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15933     DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15934     bool IsPowerOf2 =
15935         isPowerOf2_32(ValOps.size()) ||
15936         (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15937     if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15938          (!S.MainOp->isSafeToRemove() ||
15939           any_of(ValOps.getArrayRef(),
15940                  [&](Value *V) {
15941                    return !isa<ExtractElementInst>(V) &&
15942                           (V->getNumUses() > Chain.size() ||
15943                            any_of(V->users(), [&](User *U) {
15944                              return !Stores.contains(U);
15945                            }));
15946                  }))) ||
15947         (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15948       Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15949       return false;
15950     }
15951   }
15952   if (R.isLoadCombineCandidate(Chain))
15953     return true;
15954   R.buildTree(Chain);
15955   // Check if tree tiny and store itself or its value is not vectorized.
15956   if (R.isTreeTinyAndNotFullyVectorizable()) {
15957     if (R.isGathered(Chain.front()) ||
15958         R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15959       return std::nullopt;
15960     Size = R.getTreeSize();
15961     return false;
15962   }
15963   R.reorderTopToBottom();
15964   R.reorderBottomToTop();
15965   R.buildExternalUses();
15966 
15967   R.computeMinimumValueSizes();
15968   R.transformNodes();
15969 
15970   Size = R.getTreeSize();
15971   if (S.getOpcode() == Instruction::Load)
15972     Size = 2; // cut off masked gather small trees
15973   InstructionCost Cost = R.getTreeCost();
15974 
15975   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15976   if (Cost < -SLPCostThreshold) {
15977     LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15978 
15979     using namespace ore;
15980 
15981     R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15982                                         cast<StoreInst>(Chain[0]))
15983                      << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15984                      << " and with tree size "
15985                      << NV("TreeSize", R.getTreeSize()));
15986 
15987     R.vectorizeTree();
15988     return true;
15989   }
15990 
15991   return false;
15992 }
15993 
15994 /// Checks if the quadratic mean deviation is less than 90% of the mean size.
15995 static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15996                            bool First) {
15997   unsigned Num = 0;
15998   uint64_t Sum = std::accumulate(
15999       Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16000       [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16001         unsigned Size = First ? Val.first : Val.second;
16002         if (Size == 1)
16003           return V;
16004         ++Num;
16005         return V + Size;
16006       });
16007   if (Num == 0)
16008     return true;
16009   uint64_t Mean = Sum / Num;
16010   if (Mean == 0)
16011     return true;
16012   uint64_t Dev = std::accumulate(
16013                      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16014                      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16015                        unsigned P = First ? Val.first : Val.second;
16016                        if (P == 1)
16017                          return V;
16018                        return V + (P - Mean) * (P - Mean);
16019                      }) /
16020                  Num;
16021   return Dev * 81 / (Mean * Mean) == 0;
16022 }
16023 
16024 bool SLPVectorizerPass::vectorizeStores(
16025     ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16026     DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16027         &Visited) {
16028   // We may run into multiple chains that merge into a single chain. We mark the
16029   // stores that we vectorized so that we don't visit the same store twice.
16030   BoUpSLP::ValueSet VectorizedStores;
16031   bool Changed = false;
16032 
16033   struct StoreDistCompare {
16034     bool operator()(const std::pair<unsigned, int> &Op1,
16035                     const std::pair<unsigned, int> &Op2) const {
16036       return Op1.second < Op2.second;
16037     }
16038   };
16039   // A set of pairs (index of store in Stores array ref, Distance of the store
16040   // address relative to base store address in units).
16041   using StoreIndexToDistSet =
16042       std::set<std::pair<unsigned, int>, StoreDistCompare>;
16043   auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16044     int PrevDist = -1;
16045     BoUpSLP::ValueList Operands;
16046     // Collect the chain into a list.
16047     for (auto [Idx, Data] : enumerate(Set)) {
16048       if (Operands.empty() || Data.second - PrevDist == 1) {
16049         Operands.push_back(Stores[Data.first]);
16050         PrevDist = Data.second;
16051         if (Idx != Set.size() - 1)
16052           continue;
16053       }
16054       auto E = make_scope_exit([&, &DataVar = Data]() {
16055         Operands.clear();
16056         Operands.push_back(Stores[DataVar.first]);
16057         PrevDist = DataVar.second;
16058       });
16059 
16060       if (Operands.size() <= 1 ||
16061           !Visited
16062                .insert({Operands.front(),
16063                         cast<StoreInst>(Operands.front())->getValueOperand(),
16064                         Operands.back(),
16065                         cast<StoreInst>(Operands.back())->getValueOperand(),
16066                         Operands.size()})
16067                .second)
16068         continue;
16069 
16070       unsigned MaxVecRegSize = R.getMaxVecRegSize();
16071       unsigned EltSize = R.getVectorElementSize(Operands[0]);
16072       unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16073 
16074       unsigned MaxVF =
16075           std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16076       unsigned MaxRegVF = MaxVF;
16077       auto *Store = cast<StoreInst>(Operands[0]);
16078       Type *StoreTy = Store->getValueOperand()->getType();
16079       Type *ValueTy = StoreTy;
16080       if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16081         ValueTy = Trunc->getSrcTy();
16082       if (ValueTy == StoreTy &&
16083           R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16084         MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16085       unsigned MinVF = std::max<unsigned>(
16086           2, PowerOf2Ceil(TTI->getStoreMinimumVF(
16087                  R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16088                  ValueTy)));
16089 
16090       if (MaxVF < MinVF) {
16091         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16092                           << ") < "
16093                           << "MinVF (" << MinVF << ")\n");
16094         continue;
16095       }
16096 
16097       unsigned NonPowerOf2VF = 0;
16098       if (VectorizeNonPowerOf2) {
16099         // First try vectorizing with a non-power-of-2 VF. At the moment, only
16100         // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16101         // lanes are used.
16102         unsigned CandVF = Operands.size();
16103         if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16104           NonPowerOf2VF = CandVF;
16105       }
16106 
16107       unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16108       SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16109       unsigned Size = MinVF;
16110       for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16111         VF = Size > MaxVF ? NonPowerOf2VF : Size;
16112         Size *= 2;
16113       });
16114       unsigned End = Operands.size();
16115       unsigned Repeat = 0;
16116       constexpr unsigned MaxAttempts = 4;
16117       OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16118       for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16119         P.first = P.second = 1;
16120       });
16121       DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16122       auto IsNotVectorized = [](bool First,
16123                                 const std::pair<unsigned, unsigned> &P) {
16124         return First ? P.first > 0 : P.second > 0;
16125       };
16126       auto IsVectorized = [](bool First,
16127                              const std::pair<unsigned, unsigned> &P) {
16128         return First ? P.first == 0 : P.second == 0;
16129       };
16130       auto VFIsProfitable = [](bool First, unsigned Size,
16131                                const std::pair<unsigned, unsigned> &P) {
16132         return First ? Size >= P.first : Size >= P.second;
16133       };
16134       auto FirstSizeSame = [](unsigned Size,
16135                               const std::pair<unsigned, unsigned> &P) {
16136         return Size == P.first;
16137       };
16138       while (true) {
16139         ++Repeat;
16140         bool RepeatChanged = false;
16141         bool AnyProfitableGraph = false;
16142         for (unsigned Size : CandidateVFs) {
16143           AnyProfitableGraph = false;
16144           unsigned StartIdx = std::distance(
16145               RangeSizes.begin(),
16146               find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16147                                             std::placeholders::_1)));
16148           while (StartIdx < End) {
16149             unsigned EndIdx =
16150                 std::distance(RangeSizes.begin(),
16151                               find_if(RangeSizes.drop_front(StartIdx),
16152                                       std::bind(IsVectorized, Size >= MaxRegVF,
16153                                                 std::placeholders::_1)));
16154             unsigned Sz = EndIdx >= End ? End : EndIdx;
16155             for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16156               if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16157                                   Size >= MaxRegVF)) {
16158                 ++Cnt;
16159                 continue;
16160               }
16161               ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
16162               assert(all_of(Slice,
16163                             [&](Value *V) {
16164                               return cast<StoreInst>(V)
16165                                          ->getValueOperand()
16166                                          ->getType() ==
16167                                      cast<StoreInst>(Slice.front())
16168                                          ->getValueOperand()
16169                                          ->getType();
16170                             }) &&
16171                      "Expected all operands of same type.");
16172               if (!NonSchedulable.empty()) {
16173                 auto [NonSchedSizeMax, NonSchedSizeMin] =
16174                     NonSchedulable.lookup(Slice.front());
16175                 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16176                   Cnt += NonSchedSizeMax;
16177                   continue;
16178                 }
16179               }
16180               unsigned TreeSize;
16181               std::optional<bool> Res =
16182                   vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16183               if (!Res) {
16184                 NonSchedulable
16185                     .try_emplace(Slice.front(), std::make_pair(Size, Size))
16186                     .first->getSecond()
16187                     .second = Size;
16188               } else if (*Res) {
16189                 // Mark the vectorized stores so that we don't vectorize them
16190                 // again.
16191                 VectorizedStores.insert(Slice.begin(), Slice.end());
16192                 // Mark the vectorized stores so that we don't vectorize them
16193                 // again.
16194                 AnyProfitableGraph = RepeatChanged = Changed = true;
16195                 // If we vectorized initial block, no need to try to vectorize
16196                 // it again.
16197                 for_each(RangeSizes.slice(Cnt, Size),
16198                          [](std::pair<unsigned, unsigned> &P) {
16199                            P.first = P.second = 0;
16200                          });
16201                 if (Cnt < StartIdx + MinVF) {
16202                   for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16203                            [](std::pair<unsigned, unsigned> &P) {
16204                              P.first = P.second = 0;
16205                            });
16206                   StartIdx = Cnt + Size;
16207                 }
16208                 if (Cnt > Sz - Size - MinVF) {
16209                   for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16210                            [](std::pair<unsigned, unsigned> &P) {
16211                              P.first = P.second = 0;
16212                            });
16213                   if (Sz == End)
16214                     End = Cnt;
16215                   Sz = Cnt;
16216                 }
16217                 Cnt += Size;
16218                 continue;
16219               }
16220               if (Size > 2 && Res &&
16221                   !all_of(RangeSizes.slice(Cnt, Size),
16222                           std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16223                                     std::placeholders::_1))) {
16224                 Cnt += Size;
16225                 continue;
16226               }
16227               // Check for the very big VFs that we're not rebuilding same
16228               // trees, just with larger number of elements.
16229               if (Size > MaxRegVF && TreeSize > 1 &&
16230                   all_of(RangeSizes.slice(Cnt, Size),
16231                          std::bind(FirstSizeSame, TreeSize,
16232                                    std::placeholders::_1))) {
16233                 Cnt += Size;
16234                 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16235                   ++Cnt;
16236                 continue;
16237               }
16238               if (TreeSize > 1)
16239                 for_each(RangeSizes.slice(Cnt, Size),
16240                          [&](std::pair<unsigned, unsigned> &P) {
16241                            if (Size >= MaxRegVF)
16242                              P.second = std::max(P.second, TreeSize);
16243                            else
16244                              P.first = std::max(P.first, TreeSize);
16245                          });
16246               ++Cnt;
16247               AnyProfitableGraph = true;
16248             }
16249             if (StartIdx >= End)
16250               break;
16251             if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16252               AnyProfitableGraph = true;
16253             StartIdx = std::distance(
16254                 RangeSizes.begin(),
16255                 find_if(RangeSizes.drop_front(Sz),
16256                         std::bind(IsNotVectorized, Size >= MaxRegVF,
16257                                   std::placeholders::_1)));
16258           }
16259           if (!AnyProfitableGraph && Size >= MaxRegVF)
16260             break;
16261         }
16262         // All values vectorized - exit.
16263         if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16264               return P.first == 0 && P.second == 0;
16265             }))
16266           break;
16267         // Check if tried all attempts or no need for the last attempts at all.
16268         if (Repeat >= MaxAttempts ||
16269             (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16270           break;
16271         constexpr unsigned StoresLimit = 64;
16272         const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16273             Operands.size(),
16274             static_cast<unsigned>(
16275                 End -
16276                 std::distance(
16277                     RangeSizes.begin(),
16278                     find_if(RangeSizes, std::bind(IsNotVectorized, true,
16279                                                   std::placeholders::_1))) +
16280                 1)));
16281         unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16282         if (VF > MaxTotalNum || VF >= StoresLimit)
16283           break;
16284         for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16285           if (P.first != 0)
16286             P.first = std::max(P.second, P.first);
16287         });
16288         // Last attempt to vectorize max number of elements, if all previous
16289         // attempts were unsuccessful because of the cost issues.
16290         CandidateVFs.clear();
16291         CandidateVFs.push_back(VF);
16292       }
16293     }
16294   };
16295 
16296   // Stores pair (first: index of the store into Stores array ref, address of
16297   // which taken as base, second: sorted set of pairs {index, dist}, which are
16298   // indices of stores in the set and their store location distances relative to
16299   // the base address).
16300 
16301   // Need to store the index of the very first store separately, since the set
16302   // may be reordered after the insertion and the first store may be moved. This
16303   // container allows to reduce number of calls of getPointersDiff() function.
16304   SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16305   // Inserts the specified store SI with the given index Idx to the set of the
16306   // stores. If the store with the same distance is found already - stop
16307   // insertion, try to vectorize already found stores. If some stores from this
16308   // sequence were not vectorized - try to vectorize them with the new store
16309   // later. But this logic is applied only to the stores, that come before the
16310   // previous store with the same distance.
16311   // Example:
16312   // 1. store x, %p
16313   // 2. store y, %p+1
16314   // 3. store z, %p+2
16315   // 4. store a, %p
16316   // 5. store b, %p+3
16317   // - Scan this from the last to first store. The very first bunch of stores is
16318   // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16319   // vector).
16320   // - The next store in the list - #1 - has the same distance from store #5 as
16321   // the store #4.
16322   // - Try to vectorize sequence of stores 4,2,3,5.
16323   // - If all these stores are vectorized - just drop them.
16324   // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16325   // - Start new stores sequence.
16326   // The new bunch of stores is {1, {1, 0}}.
16327   // - Add the stores from previous sequence, that were not vectorized.
16328   // Here we consider the stores in the reversed order, rather they are used in
16329   // the IR (Stores are reversed already, see vectorizeStoreChains() function).
16330   // Store #3 can be added -> comes after store #4 with the same distance as
16331   // store #1.
16332   // Store #5 cannot be added - comes before store #4.
16333   // This logic allows to improve the compile time, we assume that the stores
16334   // after previous store with the same distance most likely have memory
16335   // dependencies and no need to waste compile time to try to vectorize them.
16336   // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16337   auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16338     for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16339       std::optional<int> Diff = getPointersDiff(
16340           Stores[Set.first]->getValueOperand()->getType(),
16341           Stores[Set.first]->getPointerOperand(),
16342           SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16343           /*StrictCheck=*/true);
16344       if (!Diff)
16345         continue;
16346       auto It = Set.second.find(std::make_pair(Idx, *Diff));
16347       if (It == Set.second.end()) {
16348         Set.second.emplace(Idx, *Diff);
16349         return;
16350       }
16351       // Try to vectorize the first found set to avoid duplicate analysis.
16352       TryToVectorize(Set.second);
16353       StoreIndexToDistSet PrevSet;
16354       PrevSet.swap(Set.second);
16355       Set.first = Idx;
16356       Set.second.emplace(Idx, 0);
16357       // Insert stores that followed previous match to try to vectorize them
16358       // with this store.
16359       unsigned StartIdx = It->first + 1;
16360       SmallBitVector UsedStores(Idx - StartIdx);
16361       // Distances to previously found dup store (or this store, since they
16362       // store to the same addresses).
16363       SmallVector<int> Dists(Idx - StartIdx, 0);
16364       for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16365         // Do not try to vectorize sequences, we already tried.
16366         if (Pair.first <= It->first ||
16367             VectorizedStores.contains(Stores[Pair.first]))
16368           break;
16369         unsigned BI = Pair.first - StartIdx;
16370         UsedStores.set(BI);
16371         Dists[BI] = Pair.second - It->second;
16372       }
16373       for (unsigned I = StartIdx; I < Idx; ++I) {
16374         unsigned BI = I - StartIdx;
16375         if (UsedStores.test(BI))
16376           Set.second.emplace(I, Dists[BI]);
16377       }
16378       return;
16379     }
16380     auto &Res = SortedStores.emplace_back();
16381     Res.first = Idx;
16382     Res.second.emplace(Idx, 0);
16383   };
16384   Type *PrevValTy = nullptr;
16385   for (auto [I, SI] : enumerate(Stores)) {
16386     if (R.isDeleted(SI))
16387       continue;
16388     if (!PrevValTy)
16389       PrevValTy = SI->getValueOperand()->getType();
16390     // Check that we do not try to vectorize stores of different types.
16391     if (PrevValTy != SI->getValueOperand()->getType()) {
16392       for (auto &Set : SortedStores)
16393         TryToVectorize(Set.second);
16394       SortedStores.clear();
16395       PrevValTy = SI->getValueOperand()->getType();
16396     }
16397     FillStoresSet(I, SI);
16398   }
16399 
16400   // Final vectorization attempt.
16401   for (auto &Set : SortedStores)
16402     TryToVectorize(Set.second);
16403 
16404   return Changed;
16405 }
16406 
16407 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16408   // Initialize the collections. We will make a single pass over the block.
16409   Stores.clear();
16410   GEPs.clear();
16411 
16412   // Visit the store and getelementptr instructions in BB and organize them in
16413   // Stores and GEPs according to the underlying objects of their pointer
16414   // operands.
16415   for (Instruction &I : *BB) {
16416     // Ignore store instructions that are volatile or have a pointer operand
16417     // that doesn't point to a scalar type.
16418     if (auto *SI = dyn_cast<StoreInst>(&I)) {
16419       if (!SI->isSimple())
16420         continue;
16421       if (!isValidElementType(SI->getValueOperand()->getType()))
16422         continue;
16423       Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16424     }
16425 
16426     // Ignore getelementptr instructions that have more than one index, a
16427     // constant index, or a pointer operand that doesn't point to a scalar
16428     // type.
16429     else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16430       if (GEP->getNumIndices() != 1)
16431         continue;
16432       Value *Idx = GEP->idx_begin()->get();
16433       if (isa<Constant>(Idx))
16434         continue;
16435       if (!isValidElementType(Idx->getType()))
16436         continue;
16437       if (GEP->getType()->isVectorTy())
16438         continue;
16439       GEPs[GEP->getPointerOperand()].push_back(GEP);
16440     }
16441   }
16442 }
16443 
16444 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16445                                            bool MaxVFOnly) {
16446   if (VL.size() < 2)
16447     return false;
16448 
16449   LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16450                     << VL.size() << ".\n");
16451 
16452   // Check that all of the parts are instructions of the same type,
16453   // we permit an alternate opcode via InstructionsState.
16454   InstructionsState S = getSameOpcode(VL, *TLI);
16455   if (!S.getOpcode())
16456     return false;
16457 
16458   Instruction *I0 = cast<Instruction>(S.OpValue);
16459   // Make sure invalid types (including vector type) are rejected before
16460   // determining vectorization factor for scalar instructions.
16461   for (Value *V : VL) {
16462     Type *Ty = V->getType();
16463     if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16464       // NOTE: the following will give user internal llvm type name, which may
16465       // not be useful.
16466       R.getORE()->emit([&]() {
16467         std::string TypeStr;
16468         llvm::raw_string_ostream rso(TypeStr);
16469         Ty->print(rso);
16470         return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16471                << "Cannot SLP vectorize list: type "
16472                << TypeStr + " is unsupported by vectorizer";
16473       });
16474       return false;
16475     }
16476   }
16477 
16478   unsigned Sz = R.getVectorElementSize(I0);
16479   unsigned MinVF = R.getMinVF(Sz);
16480   unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16481   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16482   if (MaxVF < 2) {
16483     R.getORE()->emit([&]() {
16484       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16485              << "Cannot SLP vectorize list: vectorization factor "
16486              << "less than 2 is not supported";
16487     });
16488     return false;
16489   }
16490 
16491   bool Changed = false;
16492   bool CandidateFound = false;
16493   InstructionCost MinCost = SLPCostThreshold.getValue();
16494   Type *ScalarTy = VL[0]->getType();
16495   if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16496     ScalarTy = IE->getOperand(1)->getType();
16497 
16498   unsigned NextInst = 0, MaxInst = VL.size();
16499   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16500     // No actual vectorization should happen, if number of parts is the same as
16501     // provided vectorization factor (i.e. the scalar type is used for vector
16502     // code during codegen).
16503     auto *VecTy = getWidenedType(ScalarTy, VF);
16504     if (TTI->getNumberOfParts(VecTy) == VF)
16505       continue;
16506     for (unsigned I = NextInst; I < MaxInst; ++I) {
16507       unsigned ActualVF = std::min(MaxInst - I, VF);
16508 
16509       if (!isPowerOf2_32(ActualVF))
16510         continue;
16511 
16512       if (MaxVFOnly && ActualVF < MaxVF)
16513         break;
16514       if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16515         break;
16516 
16517       ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16518       // Check that a previous iteration of this loop did not delete the Value.
16519       if (llvm::any_of(Ops, [&R](Value *V) {
16520             auto *I = dyn_cast<Instruction>(V);
16521             return I && R.isDeleted(I);
16522           }))
16523         continue;
16524 
16525       LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16526                         << "\n");
16527 
16528       R.buildTree(Ops);
16529       if (R.isTreeTinyAndNotFullyVectorizable())
16530         continue;
16531       R.reorderTopToBottom();
16532       R.reorderBottomToTop(
16533           /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16534           !R.doesRootHaveInTreeUses());
16535       R.buildExternalUses();
16536 
16537       R.computeMinimumValueSizes();
16538       R.transformNodes();
16539       InstructionCost Cost = R.getTreeCost();
16540       CandidateFound = true;
16541       MinCost = std::min(MinCost, Cost);
16542 
16543       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16544                         << " for VF=" << ActualVF << "\n");
16545       if (Cost < -SLPCostThreshold) {
16546         LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16547         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16548                                                     cast<Instruction>(Ops[0]))
16549                                  << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16550                                  << " and with tree size "
16551                                  << ore::NV("TreeSize", R.getTreeSize()));
16552 
16553         R.vectorizeTree();
16554         // Move to the next bundle.
16555         I += VF - 1;
16556         NextInst = I + 1;
16557         Changed = true;
16558       }
16559     }
16560   }
16561 
16562   if (!Changed && CandidateFound) {
16563     R.getORE()->emit([&]() {
16564       return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16565              << "List vectorization was possible but not beneficial with cost "
16566              << ore::NV("Cost", MinCost) << " >= "
16567              << ore::NV("Treshold", -SLPCostThreshold);
16568     });
16569   } else if (!Changed) {
16570     R.getORE()->emit([&]() {
16571       return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16572              << "Cannot SLP vectorize list: vectorization was impossible"
16573              << " with available vectorization factors";
16574     });
16575   }
16576   return Changed;
16577 }
16578 
16579 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16580   if (!I)
16581     return false;
16582 
16583   if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16584     return false;
16585 
16586   Value *P = I->getParent();
16587 
16588   // Vectorize in current basic block only.
16589   auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16590   auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16591   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16592     return false;
16593 
16594   // First collect all possible candidates
16595   SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16596   Candidates.emplace_back(Op0, Op1);
16597 
16598   auto *A = dyn_cast<BinaryOperator>(Op0);
16599   auto *B = dyn_cast<BinaryOperator>(Op1);
16600   // Try to skip B.
16601   if (A && B && B->hasOneUse()) {
16602     auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16603     auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16604     if (B0 && B0->getParent() == P)
16605       Candidates.emplace_back(A, B0);
16606     if (B1 && B1->getParent() == P)
16607       Candidates.emplace_back(A, B1);
16608   }
16609   // Try to skip A.
16610   if (B && A && A->hasOneUse()) {
16611     auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16612     auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16613     if (A0 && A0->getParent() == P)
16614       Candidates.emplace_back(A0, B);
16615     if (A1 && A1->getParent() == P)
16616       Candidates.emplace_back(A1, B);
16617   }
16618 
16619   if (Candidates.size() == 1)
16620     return tryToVectorizeList({Op0, Op1}, R);
16621 
16622   // We have multiple options. Try to pick the single best.
16623   std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16624   if (!BestCandidate)
16625     return false;
16626   return tryToVectorizeList(
16627       {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16628 }
16629 
16630 namespace {
16631 
16632 /// Model horizontal reductions.
16633 ///
16634 /// A horizontal reduction is a tree of reduction instructions that has values
16635 /// that can be put into a vector as its leaves. For example:
16636 ///
16637 /// mul mul mul mul
16638 ///  \  /    \  /
16639 ///   +       +
16640 ///    \     /
16641 ///       +
16642 /// This tree has "mul" as its leaf values and "+" as its reduction
16643 /// instructions. A reduction can feed into a store or a binary operation
16644 /// feeding a phi.
16645 ///    ...
16646 ///    \  /
16647 ///     +
16648 ///     |
16649 ///  phi +=
16650 ///
16651 ///  Or:
16652 ///    ...
16653 ///    \  /
16654 ///     +
16655 ///     |
16656 ///   *p =
16657 ///
16658 class HorizontalReduction {
16659   using ReductionOpsType = SmallVector<Value *, 16>;
16660   using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16661   ReductionOpsListType ReductionOps;
16662   /// List of possibly reduced values.
16663   SmallVector<SmallVector<Value *>> ReducedVals;
16664   /// Maps reduced value to the corresponding reduction operation.
16665   DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16666   // Use map vector to make stable output.
16667   MapVector<Instruction *, Value *> ExtraArgs;
16668   WeakTrackingVH ReductionRoot;
16669   /// The type of reduction operation.
16670   RecurKind RdxKind;
16671   /// Checks if the optimization of original scalar identity operations on
16672   /// matched horizontal reductions is enabled and allowed.
16673   bool IsSupportedHorRdxIdentityOp = false;
16674 
16675   static bool isCmpSelMinMax(Instruction *I) {
16676     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16677            RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16678   }
16679 
16680   // And/or are potentially poison-safe logical patterns like:
16681   // select x, y, false
16682   // select x, true, y
16683   static bool isBoolLogicOp(Instruction *I) {
16684     return isa<SelectInst>(I) &&
16685            (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16686   }
16687 
16688   /// Checks if instruction is associative and can be vectorized.
16689   static bool isVectorizable(RecurKind Kind, Instruction *I) {
16690     if (Kind == RecurKind::None)
16691       return false;
16692 
16693     // Integer ops that map to select instructions or intrinsics are fine.
16694     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16695         isBoolLogicOp(I))
16696       return true;
16697 
16698     if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16699       // FP min/max are associative except for NaN and -0.0. We do not
16700       // have to rule out -0.0 here because the intrinsic semantics do not
16701       // specify a fixed result for it.
16702       return I->getFastMathFlags().noNaNs();
16703     }
16704 
16705     if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16706       return true;
16707 
16708     return I->isAssociative();
16709   }
16710 
16711   static Value *getRdxOperand(Instruction *I, unsigned Index) {
16712     // Poison-safe 'or' takes the form: select X, true, Y
16713     // To make that work with the normal operand processing, we skip the
16714     // true value operand.
16715     // TODO: Change the code and data structures to handle this without a hack.
16716     if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16717       return I->getOperand(2);
16718     return I->getOperand(Index);
16719   }
16720 
16721   /// Creates reduction operation with the current opcode.
16722   static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16723                          Value *RHS, const Twine &Name, bool UseSelect) {
16724     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16725     switch (Kind) {
16726     case RecurKind::Or:
16727       if (UseSelect &&
16728           LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16729         return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16730       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16731                                  Name);
16732     case RecurKind::And:
16733       if (UseSelect &&
16734           LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16735         return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16736       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16737                                  Name);
16738     case RecurKind::Add:
16739     case RecurKind::Mul:
16740     case RecurKind::Xor:
16741     case RecurKind::FAdd:
16742     case RecurKind::FMul:
16743       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16744                                  Name);
16745     case RecurKind::FMax:
16746       return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16747     case RecurKind::FMin:
16748       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16749     case RecurKind::FMaximum:
16750       return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16751     case RecurKind::FMinimum:
16752       return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16753     case RecurKind::SMax:
16754       if (UseSelect) {
16755         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16756         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16757       }
16758       return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16759     case RecurKind::SMin:
16760       if (UseSelect) {
16761         Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16762         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16763       }
16764       return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16765     case RecurKind::UMax:
16766       if (UseSelect) {
16767         Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16768         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16769       }
16770       return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16771     case RecurKind::UMin:
16772       if (UseSelect) {
16773         Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16774         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16775       }
16776       return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16777     default:
16778       llvm_unreachable("Unknown reduction operation.");
16779     }
16780   }
16781 
16782   /// Creates reduction operation with the current opcode with the IR flags
16783   /// from \p ReductionOps, dropping nuw/nsw flags.
16784   static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16785                          Value *RHS, const Twine &Name,
16786                          const ReductionOpsListType &ReductionOps) {
16787     bool UseSelect = ReductionOps.size() == 2 ||
16788                      // Logical or/and.
16789                      (ReductionOps.size() == 1 &&
16790                       any_of(ReductionOps.front(), IsaPred<SelectInst>));
16791     assert((!UseSelect || ReductionOps.size() != 2 ||
16792             isa<SelectInst>(ReductionOps[1][0])) &&
16793            "Expected cmp + select pairs for reduction");
16794     Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16795     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16796       if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16797         propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16798                          /*IncludeWrapFlags=*/false);
16799         propagateIRFlags(Op, ReductionOps[1], nullptr,
16800                          /*IncludeWrapFlags=*/false);
16801         return Op;
16802       }
16803     }
16804     propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16805     return Op;
16806   }
16807 
16808 public:
16809   static RecurKind getRdxKind(Value *V) {
16810     auto *I = dyn_cast<Instruction>(V);
16811     if (!I)
16812       return RecurKind::None;
16813     if (match(I, m_Add(m_Value(), m_Value())))
16814       return RecurKind::Add;
16815     if (match(I, m_Mul(m_Value(), m_Value())))
16816       return RecurKind::Mul;
16817     if (match(I, m_And(m_Value(), m_Value())) ||
16818         match(I, m_LogicalAnd(m_Value(), m_Value())))
16819       return RecurKind::And;
16820     if (match(I, m_Or(m_Value(), m_Value())) ||
16821         match(I, m_LogicalOr(m_Value(), m_Value())))
16822       return RecurKind::Or;
16823     if (match(I, m_Xor(m_Value(), m_Value())))
16824       return RecurKind::Xor;
16825     if (match(I, m_FAdd(m_Value(), m_Value())))
16826       return RecurKind::FAdd;
16827     if (match(I, m_FMul(m_Value(), m_Value())))
16828       return RecurKind::FMul;
16829 
16830     if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16831       return RecurKind::FMax;
16832     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16833       return RecurKind::FMin;
16834 
16835     if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16836       return RecurKind::FMaximum;
16837     if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16838       return RecurKind::FMinimum;
16839     // This matches either cmp+select or intrinsics. SLP is expected to handle
16840     // either form.
16841     // TODO: If we are canonicalizing to intrinsics, we can remove several
16842     //       special-case paths that deal with selects.
16843     if (match(I, m_SMax(m_Value(), m_Value())))
16844       return RecurKind::SMax;
16845     if (match(I, m_SMin(m_Value(), m_Value())))
16846       return RecurKind::SMin;
16847     if (match(I, m_UMax(m_Value(), m_Value())))
16848       return RecurKind::UMax;
16849     if (match(I, m_UMin(m_Value(), m_Value())))
16850       return RecurKind::UMin;
16851 
16852     if (auto *Select = dyn_cast<SelectInst>(I)) {
16853       // Try harder: look for min/max pattern based on instructions producing
16854       // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16855       // During the intermediate stages of SLP, it's very common to have
16856       // pattern like this (since optimizeGatherSequence is run only once
16857       // at the end):
16858       // %1 = extractelement <2 x i32> %a, i32 0
16859       // %2 = extractelement <2 x i32> %a, i32 1
16860       // %cond = icmp sgt i32 %1, %2
16861       // %3 = extractelement <2 x i32> %a, i32 0
16862       // %4 = extractelement <2 x i32> %a, i32 1
16863       // %select = select i1 %cond, i32 %3, i32 %4
16864       CmpInst::Predicate Pred;
16865       Instruction *L1;
16866       Instruction *L2;
16867 
16868       Value *LHS = Select->getTrueValue();
16869       Value *RHS = Select->getFalseValue();
16870       Value *Cond = Select->getCondition();
16871 
16872       // TODO: Support inverse predicates.
16873       if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16874         if (!isa<ExtractElementInst>(RHS) ||
16875             !L2->isIdenticalTo(cast<Instruction>(RHS)))
16876           return RecurKind::None;
16877       } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16878         if (!isa<ExtractElementInst>(LHS) ||
16879             !L1->isIdenticalTo(cast<Instruction>(LHS)))
16880           return RecurKind::None;
16881       } else {
16882         if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16883           return RecurKind::None;
16884         if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16885             !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16886             !L2->isIdenticalTo(cast<Instruction>(RHS)))
16887           return RecurKind::None;
16888       }
16889 
16890       switch (Pred) {
16891       default:
16892         return RecurKind::None;
16893       case CmpInst::ICMP_SGT:
16894       case CmpInst::ICMP_SGE:
16895         return RecurKind::SMax;
16896       case CmpInst::ICMP_SLT:
16897       case CmpInst::ICMP_SLE:
16898         return RecurKind::SMin;
16899       case CmpInst::ICMP_UGT:
16900       case CmpInst::ICMP_UGE:
16901         return RecurKind::UMax;
16902       case CmpInst::ICMP_ULT:
16903       case CmpInst::ICMP_ULE:
16904         return RecurKind::UMin;
16905       }
16906     }
16907     return RecurKind::None;
16908   }
16909 
16910   /// Get the index of the first operand.
16911   static unsigned getFirstOperandIndex(Instruction *I) {
16912     return isCmpSelMinMax(I) ? 1 : 0;
16913   }
16914 
16915 private:
16916   /// Total number of operands in the reduction operation.
16917   static unsigned getNumberOfOperands(Instruction *I) {
16918     return isCmpSelMinMax(I) ? 3 : 2;
16919   }
16920 
16921   /// Checks if the instruction is in basic block \p BB.
16922   /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16923   static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16924     if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16925       auto *Sel = cast<SelectInst>(I);
16926       auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16927       return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16928     }
16929     return I->getParent() == BB;
16930   }
16931 
16932   /// Expected number of uses for reduction operations/reduced values.
16933   static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16934     if (IsCmpSelMinMax) {
16935       // SelectInst must be used twice while the condition op must have single
16936       // use only.
16937       if (auto *Sel = dyn_cast<SelectInst>(I))
16938         return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16939       return I->hasNUses(2);
16940     }
16941 
16942     // Arithmetic reduction operation must be used once only.
16943     return I->hasOneUse();
16944   }
16945 
16946   /// Initializes the list of reduction operations.
16947   void initReductionOps(Instruction *I) {
16948     if (isCmpSelMinMax(I))
16949       ReductionOps.assign(2, ReductionOpsType());
16950     else
16951       ReductionOps.assign(1, ReductionOpsType());
16952   }
16953 
16954   /// Add all reduction operations for the reduction instruction \p I.
16955   void addReductionOps(Instruction *I) {
16956     if (isCmpSelMinMax(I)) {
16957       ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16958       ReductionOps[1].emplace_back(I);
16959     } else {
16960       ReductionOps[0].emplace_back(I);
16961     }
16962   }
16963 
16964   static bool isGoodForReduction(ArrayRef<Value *> Data) {
16965     int Sz = Data.size();
16966     auto *I = dyn_cast<Instruction>(Data.front());
16967     return Sz > 1 || isConstant(Data.front()) ||
16968            (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16969   }
16970 
16971 public:
16972   HorizontalReduction() = default;
16973 
16974   /// Try to find a reduction tree.
16975   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16976                                  ScalarEvolution &SE, const DataLayout &DL,
16977                                  const TargetLibraryInfo &TLI) {
16978     RdxKind = HorizontalReduction::getRdxKind(Root);
16979     if (!isVectorizable(RdxKind, Root))
16980       return false;
16981 
16982     // Analyze "regular" integer/FP types for reductions - no target-specific
16983     // types or pointers.
16984     Type *Ty = Root->getType();
16985     if (!isValidElementType(Ty) || Ty->isPointerTy())
16986       return false;
16987 
16988     // Though the ultimate reduction may have multiple uses, its condition must
16989     // have only single use.
16990     if (auto *Sel = dyn_cast<SelectInst>(Root))
16991       if (!Sel->getCondition()->hasOneUse())
16992         return false;
16993 
16994     ReductionRoot = Root;
16995 
16996     // Iterate through all the operands of the possible reduction tree and
16997     // gather all the reduced values, sorting them by their value id.
16998     BasicBlock *BB = Root->getParent();
16999     bool IsCmpSelMinMax = isCmpSelMinMax(Root);
17000     SmallVector<Instruction *> Worklist(1, Root);
17001     // Checks if the operands of the \p TreeN instruction are also reduction
17002     // operations or should be treated as reduced values or an extra argument,
17003     // which is not part of the reduction.
17004     auto CheckOperands = [&](Instruction *TreeN,
17005                              SmallVectorImpl<Value *> &ExtraArgs,
17006                              SmallVectorImpl<Value *> &PossibleReducedVals,
17007                              SmallVectorImpl<Instruction *> &ReductionOps) {
17008       for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
17009                                     getNumberOfOperands(TreeN)))) {
17010         Value *EdgeVal = getRdxOperand(TreeN, I);
17011         ReducedValsToOps[EdgeVal].push_back(TreeN);
17012         auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
17013         // Edge has wrong parent - mark as an extra argument.
17014         if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
17015             !hasSameParent(EdgeInst, BB)) {
17016           ExtraArgs.push_back(EdgeVal);
17017           continue;
17018         }
17019         // If the edge is not an instruction, or it is different from the main
17020         // reduction opcode or has too many uses - possible reduced value.
17021         // Also, do not try to reduce const values, if the operation is not
17022         // foldable.
17023         if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
17024             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
17025             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
17026             !isVectorizable(RdxKind, EdgeInst) ||
17027             (R.isAnalyzedReductionRoot(EdgeInst) &&
17028              all_of(EdgeInst->operands(), IsaPred<Constant>))) {
17029           PossibleReducedVals.push_back(EdgeVal);
17030           continue;
17031         }
17032         ReductionOps.push_back(EdgeInst);
17033       }
17034     };
17035     // Try to regroup reduced values so that it gets more profitable to try to
17036     // reduce them. Values are grouped by their value ids, instructions - by
17037     // instruction op id and/or alternate op id, plus do extra analysis for
17038     // loads (grouping them by the distabce between pointers) and cmp
17039     // instructions (grouping them by the predicate).
17040     MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17041         PossibleReducedVals;
17042     initReductionOps(Root);
17043     DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17044     SmallSet<size_t, 2> LoadKeyUsed;
17045 
17046     auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17047       Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
17048       if (LoadKeyUsed.contains(Key)) {
17049         auto LIt = LoadsMap.find(Ptr);
17050         if (LIt != LoadsMap.end()) {
17051           for (LoadInst *RLI : LIt->second) {
17052             if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
17053                                 LI->getType(), LI->getPointerOperand(), DL, SE,
17054                                 /*StrictCheck=*/true))
17055               return hash_value(RLI->getPointerOperand());
17056           }
17057           for (LoadInst *RLI : LIt->second) {
17058             if (arePointersCompatible(RLI->getPointerOperand(),
17059                                       LI->getPointerOperand(), TLI)) {
17060               hash_code SubKey = hash_value(RLI->getPointerOperand());
17061               return SubKey;
17062             }
17063           }
17064           if (LIt->second.size() > 2) {
17065             hash_code SubKey =
17066                 hash_value(LIt->second.back()->getPointerOperand());
17067             return SubKey;
17068           }
17069         }
17070       }
17071       LoadKeyUsed.insert(Key);
17072       LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
17073       return hash_value(LI->getPointerOperand());
17074     };
17075 
17076     while (!Worklist.empty()) {
17077       Instruction *TreeN = Worklist.pop_back_val();
17078       SmallVector<Value *> Args;
17079       SmallVector<Value *> PossibleRedVals;
17080       SmallVector<Instruction *> PossibleReductionOps;
17081       CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
17082       // If too many extra args - mark the instruction itself as a reduction
17083       // value, not a reduction operation.
17084       if (Args.size() < 2) {
17085         addReductionOps(TreeN);
17086         // Add extra args.
17087         if (!Args.empty()) {
17088           assert(Args.size() == 1 && "Expected only single argument.");
17089           ExtraArgs[TreeN] = Args.front();
17090         }
17091         // Add reduction values. The values are sorted for better vectorization
17092         // results.
17093         for (Value *V : PossibleRedVals) {
17094           size_t Key, Idx;
17095           std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17096                                                  /*AllowAlternate=*/false);
17097           ++PossibleReducedVals[Key][Idx]
17098                 .insert(std::make_pair(V, 0))
17099                 .first->second;
17100         }
17101         Worklist.append(PossibleReductionOps.rbegin(),
17102                         PossibleReductionOps.rend());
17103       } else {
17104         size_t Key, Idx;
17105         std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
17106                                                /*AllowAlternate=*/false);
17107         ++PossibleReducedVals[Key][Idx]
17108               .insert(std::make_pair(TreeN, 0))
17109               .first->second;
17110       }
17111     }
17112     auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
17113     // Sort values by the total number of values kinds to start the reduction
17114     // from the longest possible reduced values sequences.
17115     for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17116       auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17117       SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17118       for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17119            It != E; ++It) {
17120         PossibleRedValsVect.emplace_back();
17121         auto RedValsVect = It->second.takeVector();
17122         stable_sort(RedValsVect, llvm::less_second());
17123         for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17124           PossibleRedValsVect.back().append(Data.second, Data.first);
17125       }
17126       stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
17127         return P1.size() > P2.size();
17128       });
17129       int NewIdx = -1;
17130       for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17131         if (NewIdx < 0 ||
17132             (!isGoodForReduction(Data) &&
17133              (!isa<LoadInst>(Data.front()) ||
17134               !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
17135               getUnderlyingObject(
17136                   cast<LoadInst>(Data.front())->getPointerOperand()) !=
17137                   getUnderlyingObject(
17138                       cast<LoadInst>(ReducedVals[NewIdx].front())
17139                           ->getPointerOperand())))) {
17140           NewIdx = ReducedVals.size();
17141           ReducedVals.emplace_back();
17142         }
17143         ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
17144       }
17145     }
17146     // Sort the reduced values by number of same/alternate opcode and/or pointer
17147     // operand.
17148     stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17149       return P1.size() > P2.size();
17150     });
17151     return true;
17152   }
17153 
17154   /// Attempt to vectorize the tree found by matchAssociativeReduction.
17155   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17156                      const TargetLibraryInfo &TLI) {
17157     constexpr int ReductionLimit = 4;
17158     constexpr unsigned RegMaxNumber = 4;
17159     constexpr unsigned RedValsMaxNumber = 128;
17160     // If there are a sufficient number of reduction values, reduce
17161     // to a nearby power-of-2. We can safely generate oversized
17162     // vectors and rely on the backend to split them to legal sizes.
17163     unsigned NumReducedVals =
17164         std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17165                         [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17166                           if (!isGoodForReduction(Vals))
17167                             return Num;
17168                           return Num + Vals.size();
17169                         });
17170     if (NumReducedVals < ReductionLimit &&
17171         (!AllowHorRdxIdenityOptimization ||
17172          all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
17173            return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
17174          }))) {
17175       for (ReductionOpsType &RdxOps : ReductionOps)
17176         for (Value *RdxOp : RdxOps)
17177           V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17178       return nullptr;
17179     }
17180 
17181     IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17182                                     TargetFolder(DL));
17183     Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
17184 
17185     // Track the reduced values in case if they are replaced by extractelement
17186     // because of the vectorization.
17187     DenseMap<Value *, WeakTrackingVH> TrackedVals(
17188         ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17189     BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17190     SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17191     ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
17192     // The same extra argument may be used several times, so log each attempt
17193     // to use it.
17194     for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17195       assert(Pair.first && "DebugLoc must be set.");
17196       ExternallyUsedValues[Pair.second].push_back(Pair.first);
17197       TrackedVals.try_emplace(Pair.second, Pair.second);
17198     }
17199 
17200     // The compare instruction of a min/max is the insertion point for new
17201     // instructions and may be replaced with a new compare instruction.
17202     auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17203       assert(isa<SelectInst>(RdxRootInst) &&
17204              "Expected min/max reduction to have select root instruction");
17205       Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17206       assert(isa<Instruction>(ScalarCond) &&
17207              "Expected min/max reduction to have compare condition");
17208       return cast<Instruction>(ScalarCond);
17209     };
17210 
17211     // Return new VectorizedTree, based on previous value.
17212     auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17213       if (VectorizedTree) {
17214         // Update the final value in the reduction.
17215         Builder.SetCurrentDebugLocation(
17216             cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17217         if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17218             (isGuaranteedNotToBePoison(Res) &&
17219              !isGuaranteedNotToBePoison(VectorizedTree))) {
17220           auto It = ReducedValsToOps.find(Res);
17221           if (It != ReducedValsToOps.end() &&
17222               any_of(It->getSecond(),
17223                      [](Instruction *I) { return isBoolLogicOp(I); }))
17224             std::swap(VectorizedTree, Res);
17225         }
17226 
17227         return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17228                         ReductionOps);
17229       }
17230       // Initialize the final value in the reduction.
17231       return Res;
17232     };
17233     bool AnyBoolLogicOp =
17234         any_of(ReductionOps.back(), [](Value *V) {
17235           return isBoolLogicOp(cast<Instruction>(V));
17236         });
17237     // The reduction root is used as the insertion point for new instructions,
17238     // so set it as externally used to prevent it from being deleted.
17239     ExternallyUsedValues[ReductionRoot];
17240     SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17241                                       ReductionOps.front().size());
17242     for (ReductionOpsType &RdxOps : ReductionOps)
17243       for (Value *RdxOp : RdxOps) {
17244         if (!RdxOp)
17245           continue;
17246         IgnoreList.insert(RdxOp);
17247       }
17248     // Intersect the fast-math-flags from all reduction operations.
17249     FastMathFlags RdxFMF;
17250     RdxFMF.set();
17251     for (Value *U : IgnoreList)
17252       if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17253         RdxFMF &= FPMO->getFastMathFlags();
17254     bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17255 
17256     // Need to track reduced vals, they may be changed during vectorization of
17257     // subvectors.
17258     for (ArrayRef<Value *> Candidates : ReducedVals)
17259       for (Value *V : Candidates)
17260         TrackedVals.try_emplace(V, V);
17261 
17262     DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17263     // List of the values that were reduced in other trees as part of gather
17264     // nodes and thus requiring extract if fully vectorized in other trees.
17265     SmallPtrSet<Value *, 4> RequiredExtract;
17266     Value *VectorizedTree = nullptr;
17267     bool CheckForReusedReductionOps = false;
17268     // Try to vectorize elements based on their type.
17269     SmallVector<InstructionsState> States;
17270     for (ArrayRef<Value *> RV : ReducedVals)
17271       States.push_back(getSameOpcode(RV, TLI));
17272     for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17273       ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17274       InstructionsState S = States[I];
17275       SmallVector<Value *> Candidates;
17276       Candidates.reserve(2 * OrigReducedVals.size());
17277       DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17278       for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17279         Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17280         // Check if the reduction value was not overriden by the extractelement
17281         // instruction because of the vectorization and exclude it, if it is not
17282         // compatible with other values.
17283         // Also check if the instruction was folded to constant/other value.
17284         auto *Inst = dyn_cast<Instruction>(RdxVal);
17285         if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17286              (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17287             (S.getOpcode() && !Inst))
17288           continue;
17289         Candidates.push_back(RdxVal);
17290         TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17291       }
17292       bool ShuffledExtracts = false;
17293       // Try to handle shuffled extractelements.
17294       if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17295           I + 1 < E) {
17296         InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
17297         if (NextS.getOpcode() == Instruction::ExtractElement &&
17298             !NextS.isAltShuffle()) {
17299           SmallVector<Value *> CommonCandidates(Candidates);
17300           for (Value *RV : ReducedVals[I + 1]) {
17301             Value *RdxVal = TrackedVals.find(RV)->second;
17302             // Check if the reduction value was not overriden by the
17303             // extractelement instruction because of the vectorization and
17304             // exclude it, if it is not compatible with other values.
17305             if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17306               if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17307                 continue;
17308             CommonCandidates.push_back(RdxVal);
17309             TrackedToOrig.try_emplace(RdxVal, RV);
17310           }
17311           SmallVector<int> Mask;
17312           if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17313             ++I;
17314             Candidates.swap(CommonCandidates);
17315             ShuffledExtracts = true;
17316           }
17317         }
17318       }
17319 
17320       // Emit code for constant values.
17321       if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17322           allConstant(Candidates)) {
17323         Value *Res = Candidates.front();
17324         ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17325         for (Value *VC : ArrayRef(Candidates).drop_front()) {
17326           Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17327           ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17328           if (auto *ResI = dyn_cast<Instruction>(Res))
17329             V.analyzedReductionRoot(ResI);
17330         }
17331         VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17332         continue;
17333       }
17334 
17335       unsigned NumReducedVals = Candidates.size();
17336       if (NumReducedVals < ReductionLimit &&
17337           (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17338            !isSplat(Candidates)))
17339         continue;
17340 
17341       // Check if we support repeated scalar values processing (optimization of
17342       // original scalar identity operations on matched horizontal reductions).
17343       IsSupportedHorRdxIdentityOp =
17344           AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17345           RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17346       // Gather same values.
17347       MapVector<Value *, unsigned> SameValuesCounter;
17348       if (IsSupportedHorRdxIdentityOp)
17349         for (Value *V : Candidates)
17350           ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17351       // Used to check if the reduced values used same number of times. In this
17352       // case the compiler may produce better code. E.g. if reduced values are
17353       // aabbccdd (8 x values), then the first node of the tree will have a node
17354       // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17355       // Plus, the final reduction will be performed on <8 x aabbccdd>.
17356       // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
17357       // x abcd) * 2.
17358       // Currently it only handles add/fadd/xor. and/or/min/max do not require
17359       // this analysis, other operations may require an extra estimation of
17360       // the profitability.
17361       bool SameScaleFactor = false;
17362       bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17363                               SameValuesCounter.size() != Candidates.size();
17364       if (OptReusedScalars) {
17365         SameScaleFactor =
17366             (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17367              RdxKind == RecurKind::Xor) &&
17368             all_of(drop_begin(SameValuesCounter),
17369                    [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17370                      return P.second == SameValuesCounter.front().second;
17371                    });
17372         Candidates.resize(SameValuesCounter.size());
17373         transform(SameValuesCounter, Candidates.begin(),
17374                   [](const auto &P) { return P.first; });
17375         NumReducedVals = Candidates.size();
17376         // Have a reduction of the same element.
17377         if (NumReducedVals == 1) {
17378           Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17379           unsigned Cnt = SameValuesCounter.lookup(OrigV);
17380           Value *RedVal =
17381               emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17382           VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17383           VectorizedVals.try_emplace(OrigV, Cnt);
17384           continue;
17385         }
17386       }
17387 
17388       unsigned MaxVecRegSize = V.getMaxVecRegSize();
17389       unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17390       unsigned MaxElts =
17391           RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17392 
17393       unsigned ReduxWidth = std::min<unsigned>(
17394           llvm::bit_floor(NumReducedVals),
17395           std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17396                                RegMaxNumber * RedValsMaxNumber));
17397       unsigned Start = 0;
17398       unsigned Pos = Start;
17399       // Restarts vectorization attempt with lower vector factor.
17400       unsigned PrevReduxWidth = ReduxWidth;
17401       bool CheckForReusedReductionOpsLocal = false;
17402       auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17403                                   &CheckForReusedReductionOpsLocal,
17404                                   &PrevReduxWidth, &V,
17405                                   &IgnoreList](bool IgnoreVL = false) {
17406         bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17407         if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17408           // Check if any of the reduction ops are gathered. If so, worth
17409           // trying again with less number of reduction ops.
17410           CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17411         }
17412         ++Pos;
17413         if (Pos < NumReducedVals - ReduxWidth + 1)
17414           return IsAnyRedOpGathered;
17415         Pos = Start;
17416         ReduxWidth /= 2;
17417         return IsAnyRedOpGathered;
17418       };
17419       bool AnyVectorized = false;
17420       while (Pos < NumReducedVals - ReduxWidth + 1 &&
17421              ReduxWidth >= ReductionLimit) {
17422         // Dependency in tree of the reduction ops - drop this attempt, try
17423         // later.
17424         if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17425             Start == 0) {
17426           CheckForReusedReductionOps = true;
17427           break;
17428         }
17429         PrevReduxWidth = ReduxWidth;
17430         ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17431         // Beeing analyzed already - skip.
17432         if (V.areAnalyzedReductionVals(VL)) {
17433           (void)AdjustReducedVals(/*IgnoreVL=*/true);
17434           continue;
17435         }
17436         // Early exit if any of the reduction values were deleted during
17437         // previous vectorization attempts.
17438         if (any_of(VL, [&V](Value *RedVal) {
17439               auto *RedValI = dyn_cast<Instruction>(RedVal);
17440               if (!RedValI)
17441                 return false;
17442               return V.isDeleted(RedValI);
17443             }))
17444           break;
17445         V.buildTree(VL, IgnoreList);
17446         if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17447           if (!AdjustReducedVals())
17448             V.analyzedReductionVals(VL);
17449           continue;
17450         }
17451         if (V.isLoadCombineReductionCandidate(RdxKind)) {
17452           if (!AdjustReducedVals())
17453             V.analyzedReductionVals(VL);
17454           continue;
17455         }
17456         V.reorderTopToBottom();
17457         // No need to reorder the root node at all.
17458         V.reorderBottomToTop(/*IgnoreReorder=*/true);
17459         // Keep extracted other reduction values, if they are used in the
17460         // vectorization trees.
17461         BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17462             ExternallyUsedValues);
17463         for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17464           if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17465             continue;
17466           for (Value *V : ReducedVals[Cnt])
17467             if (isa<Instruction>(V))
17468               LocalExternallyUsedValues[TrackedVals[V]];
17469         }
17470         if (!IsSupportedHorRdxIdentityOp) {
17471           // Number of uses of the candidates in the vector of values.
17472           assert(SameValuesCounter.empty() &&
17473                  "Reused values counter map is not empty");
17474           for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17475             if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17476               continue;
17477             Value *V = Candidates[Cnt];
17478             Value *OrigV = TrackedToOrig.find(V)->second;
17479             ++SameValuesCounter[OrigV];
17480           }
17481         }
17482         SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17483         // Gather externally used values.
17484         SmallPtrSet<Value *, 4> Visited;
17485         for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17486           if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17487             continue;
17488           Value *RdxVal = Candidates[Cnt];
17489           if (!Visited.insert(RdxVal).second)
17490             continue;
17491           // Check if the scalar was vectorized as part of the vectorization
17492           // tree but not the top node.
17493           if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17494             LocalExternallyUsedValues[RdxVal];
17495             continue;
17496           }
17497           Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17498           unsigned NumOps =
17499               VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17500           if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17501             LocalExternallyUsedValues[RdxVal];
17502         }
17503         // Do not need the list of reused scalars in regular mode anymore.
17504         if (!IsSupportedHorRdxIdentityOp)
17505           SameValuesCounter.clear();
17506         for (Value *RdxVal : VL)
17507           if (RequiredExtract.contains(RdxVal))
17508             LocalExternallyUsedValues[RdxVal];
17509         // Update LocalExternallyUsedValues for the scalar, replaced by
17510         // extractelement instructions.
17511         DenseMap<Value *, Value *> ReplacementToExternal;
17512         for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17513           ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17514         for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17515           Value *Ext = Pair.first;
17516           auto RIt = ReplacementToExternal.find(Ext);
17517           while (RIt != ReplacementToExternal.end()) {
17518             Ext = RIt->second;
17519             RIt = ReplacementToExternal.find(Ext);
17520           }
17521           auto *It = ExternallyUsedValues.find(Ext);
17522           if (It == ExternallyUsedValues.end())
17523             continue;
17524           LocalExternallyUsedValues[Pair.second].append(It->second);
17525         }
17526         V.buildExternalUses(LocalExternallyUsedValues);
17527 
17528         V.computeMinimumValueSizes();
17529         V.transformNodes();
17530 
17531         // Estimate cost.
17532         InstructionCost TreeCost = V.getTreeCost(VL);
17533         InstructionCost ReductionCost =
17534             getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17535         InstructionCost Cost = TreeCost + ReductionCost;
17536         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17537                           << " for reduction\n");
17538         if (!Cost.isValid())
17539           break;
17540         if (Cost >= -SLPCostThreshold) {
17541           V.getORE()->emit([&]() {
17542             return OptimizationRemarkMissed(
17543                        SV_NAME, "HorSLPNotBeneficial",
17544                        ReducedValsToOps.find(VL[0])->second.front())
17545                    << "Vectorizing horizontal reduction is possible "
17546                    << "but not beneficial with cost " << ore::NV("Cost", Cost)
17547                    << " and threshold "
17548                    << ore::NV("Threshold", -SLPCostThreshold);
17549           });
17550           if (!AdjustReducedVals())
17551             V.analyzedReductionVals(VL);
17552           continue;
17553         }
17554 
17555         LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17556                           << Cost << ". (HorRdx)\n");
17557         V.getORE()->emit([&]() {
17558           return OptimizationRemark(
17559                      SV_NAME, "VectorizedHorizontalReduction",
17560                      ReducedValsToOps.find(VL[0])->second.front())
17561                  << "Vectorized horizontal reduction with cost "
17562                  << ore::NV("Cost", Cost) << " and with tree size "
17563                  << ore::NV("TreeSize", V.getTreeSize());
17564         });
17565 
17566         Builder.setFastMathFlags(RdxFMF);
17567 
17568         // Emit a reduction. If the root is a select (min/max idiom), the insert
17569         // point is the compare condition of that select.
17570         Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17571         Instruction *InsertPt = RdxRootInst;
17572         if (IsCmpSelMinMax)
17573           InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17574 
17575         // Vectorize a tree.
17576         Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17577                                                 ReplacedExternals, InsertPt);
17578 
17579         Builder.SetInsertPoint(InsertPt);
17580 
17581         // To prevent poison from leaking across what used to be sequential,
17582         // safe, scalar boolean logic operations, the reduction operand must be
17583         // frozen.
17584         if ((isBoolLogicOp(RdxRootInst) ||
17585              (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17586             !isGuaranteedNotToBePoison(VectorizedRoot))
17587           VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17588 
17589         // Emit code to correctly handle reused reduced values, if required.
17590         if (OptReusedScalars && !SameScaleFactor) {
17591           VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17592                                          SameValuesCounter, TrackedToOrig);
17593         }
17594 
17595         Value *ReducedSubTree =
17596             emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17597         if (ReducedSubTree->getType() != VL.front()->getType()) {
17598           assert(ReducedSubTree->getType() != VL.front()->getType() &&
17599                  "Expected different reduction type.");
17600           ReducedSubTree =
17601               Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17602                                     V.isSignedMinBitwidthRootNode());
17603         }
17604 
17605         // Improved analysis for add/fadd/xor reductions with same scale factor
17606         // for all operands of reductions. We can emit scalar ops for them
17607         // instead.
17608         if (OptReusedScalars && SameScaleFactor)
17609           ReducedSubTree = emitScaleForReusedOps(
17610               ReducedSubTree, Builder, SameValuesCounter.front().second);
17611 
17612         VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17613         // Count vectorized reduced values to exclude them from final reduction.
17614         for (Value *RdxVal : VL) {
17615           Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17616           if (IsSupportedHorRdxIdentityOp) {
17617             VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17618             continue;
17619           }
17620           ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17621           if (!V.isVectorized(RdxVal))
17622             RequiredExtract.insert(RdxVal);
17623         }
17624         Pos += ReduxWidth;
17625         Start = Pos;
17626         ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17627         AnyVectorized = true;
17628       }
17629       if (OptReusedScalars && !AnyVectorized) {
17630         for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17631           Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17632           VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17633           Value *OrigV = TrackedToOrig.find(P.first)->second;
17634           VectorizedVals.try_emplace(OrigV, P.second);
17635         }
17636         continue;
17637       }
17638     }
17639     if (VectorizedTree) {
17640       // Reorder operands of bool logical op in the natural order to avoid
17641       // possible problem with poison propagation. If not possible to reorder
17642       // (both operands are originally RHS), emit an extra freeze instruction
17643       // for the LHS operand.
17644       // I.e., if we have original code like this:
17645       // RedOp1 = select i1 ?, i1 LHS, i1 false
17646       // RedOp2 = select i1 RHS, i1 ?, i1 false
17647 
17648       // Then, we swap LHS/RHS to create a new op that matches the poison
17649       // semantics of the original code.
17650 
17651       // If we have original code like this and both values could be poison:
17652       // RedOp1 = select i1 ?, i1 LHS, i1 false
17653       // RedOp2 = select i1 ?, i1 RHS, i1 false
17654 
17655       // Then, we must freeze LHS in the new op.
17656       auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17657                                                    Instruction *RedOp1,
17658                                                    Instruction *RedOp2,
17659                                                    bool InitStep) {
17660         if (!AnyBoolLogicOp)
17661           return;
17662         if (isBoolLogicOp(RedOp1) &&
17663             ((!InitStep && LHS == VectorizedTree) ||
17664              getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17665           return;
17666         if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17667                                       getRdxOperand(RedOp2, 0) == RHS ||
17668                                       isGuaranteedNotToBePoison(RHS))) {
17669           std::swap(LHS, RHS);
17670           return;
17671         }
17672         if (LHS != VectorizedTree)
17673           LHS = Builder.CreateFreeze(LHS);
17674       };
17675       // Finish the reduction.
17676       // Need to add extra arguments and not vectorized possible reduction
17677       // values.
17678       // Try to avoid dependencies between the scalar remainders after
17679       // reductions.
17680       auto FinalGen =
17681           [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17682               bool InitStep) {
17683             unsigned Sz = InstVals.size();
17684             SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17685                                                                      Sz % 2);
17686             for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17687               Instruction *RedOp = InstVals[I + 1].first;
17688               Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17689               Value *RdxVal1 = InstVals[I].second;
17690               Value *StableRdxVal1 = RdxVal1;
17691               auto It1 = TrackedVals.find(RdxVal1);
17692               if (It1 != TrackedVals.end())
17693                 StableRdxVal1 = It1->second;
17694               Value *RdxVal2 = InstVals[I + 1].second;
17695               Value *StableRdxVal2 = RdxVal2;
17696               auto It2 = TrackedVals.find(RdxVal2);
17697               if (It2 != TrackedVals.end())
17698                 StableRdxVal2 = It2->second;
17699               // To prevent poison from leaking across what used to be
17700               // sequential, safe, scalar boolean logic operations, the
17701               // reduction operand must be frozen.
17702               FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17703                                 RedOp, InitStep);
17704               Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17705                                          StableRdxVal2, "op.rdx", ReductionOps);
17706               ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17707             }
17708             if (Sz % 2 == 1)
17709               ExtraReds[Sz / 2] = InstVals.back();
17710             return ExtraReds;
17711           };
17712       SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17713       ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17714                                    VectorizedTree);
17715       SmallPtrSet<Value *, 8> Visited;
17716       for (ArrayRef<Value *> Candidates : ReducedVals) {
17717         for (Value *RdxVal : Candidates) {
17718           if (!Visited.insert(RdxVal).second)
17719             continue;
17720           unsigned NumOps = VectorizedVals.lookup(RdxVal);
17721           for (Instruction *RedOp :
17722                ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17723                    .drop_back(NumOps))
17724             ExtraReductions.emplace_back(RedOp, RdxVal);
17725         }
17726       }
17727       for (auto &Pair : ExternallyUsedValues) {
17728         // Add each externally used value to the final reduction.
17729         for (auto *I : Pair.second)
17730           ExtraReductions.emplace_back(I, Pair.first);
17731       }
17732       // Iterate through all not-vectorized reduction values/extra arguments.
17733       bool InitStep = true;
17734       while (ExtraReductions.size() > 1) {
17735         SmallVector<std::pair<Instruction *, Value *>> NewReds =
17736             FinalGen(ExtraReductions, InitStep);
17737         ExtraReductions.swap(NewReds);
17738         InitStep = false;
17739       }
17740       VectorizedTree = ExtraReductions.front().second;
17741 
17742       ReductionRoot->replaceAllUsesWith(VectorizedTree);
17743 
17744       // The original scalar reduction is expected to have no remaining
17745       // uses outside the reduction tree itself.  Assert that we got this
17746       // correct, replace internal uses with undef, and mark for eventual
17747       // deletion.
17748 #ifndef NDEBUG
17749       SmallSet<Value *, 4> IgnoreSet;
17750       for (ArrayRef<Value *> RdxOps : ReductionOps)
17751         IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17752 #endif
17753       for (ArrayRef<Value *> RdxOps : ReductionOps) {
17754         for (Value *Ignore : RdxOps) {
17755           if (!Ignore)
17756             continue;
17757 #ifndef NDEBUG
17758           for (auto *U : Ignore->users()) {
17759             assert(IgnoreSet.count(U) &&
17760                    "All users must be either in the reduction ops list.");
17761           }
17762 #endif
17763           if (!Ignore->use_empty()) {
17764             Value *P = PoisonValue::get(Ignore->getType());
17765             Ignore->replaceAllUsesWith(P);
17766           }
17767         }
17768         V.removeInstructionsAndOperands(RdxOps);
17769       }
17770     } else if (!CheckForReusedReductionOps) {
17771       for (ReductionOpsType &RdxOps : ReductionOps)
17772         for (Value *RdxOp : RdxOps)
17773           V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17774     }
17775     return VectorizedTree;
17776   }
17777 
17778 private:
17779   /// Calculate the cost of a reduction.
17780   InstructionCost getReductionCost(TargetTransformInfo *TTI,
17781                                    ArrayRef<Value *> ReducedVals,
17782                                    bool IsCmpSelMinMax, unsigned ReduxWidth,
17783                                    FastMathFlags FMF) {
17784     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17785     Type *ScalarTy = ReducedVals.front()->getType();
17786     FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
17787     InstructionCost VectorCost = 0, ScalarCost;
17788     // If all of the reduced values are constant, the vector cost is 0, since
17789     // the reduction value can be calculated at the compile time.
17790     bool AllConsts = allConstant(ReducedVals);
17791     auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17792       InstructionCost Cost = 0;
17793       // Scalar cost is repeated for N-1 elements.
17794       int Cnt = ReducedVals.size();
17795       for (Value *RdxVal : ReducedVals) {
17796         if (Cnt == 1)
17797           break;
17798         --Cnt;
17799         if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17800           Cost += GenCostFn();
17801           continue;
17802         }
17803         InstructionCost ScalarCost = 0;
17804         for (User *U : RdxVal->users()) {
17805           auto *RdxOp = cast<Instruction>(U);
17806           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17807             ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17808             continue;
17809           }
17810           ScalarCost = InstructionCost::getInvalid();
17811           break;
17812         }
17813         if (ScalarCost.isValid())
17814           Cost += ScalarCost;
17815         else
17816           Cost += GenCostFn();
17817       }
17818       return Cost;
17819     };
17820     switch (RdxKind) {
17821     case RecurKind::Add:
17822     case RecurKind::Mul:
17823     case RecurKind::Or:
17824     case RecurKind::And:
17825     case RecurKind::Xor:
17826     case RecurKind::FAdd:
17827     case RecurKind::FMul: {
17828       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17829       if (!AllConsts)
17830         VectorCost =
17831             TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17832       ScalarCost = EvaluateScalarCost([&]() {
17833         return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17834       });
17835       break;
17836     }
17837     case RecurKind::FMax:
17838     case RecurKind::FMin:
17839     case RecurKind::FMaximum:
17840     case RecurKind::FMinimum:
17841     case RecurKind::SMax:
17842     case RecurKind::SMin:
17843     case RecurKind::UMax:
17844     case RecurKind::UMin: {
17845       Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17846       if (!AllConsts)
17847         VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17848       ScalarCost = EvaluateScalarCost([&]() {
17849         IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17850         return TTI->getIntrinsicInstrCost(ICA, CostKind);
17851       });
17852       break;
17853     }
17854     default:
17855       llvm_unreachable("Expected arithmetic or min/max reduction operation");
17856     }
17857 
17858     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17859                       << " for reduction of " << shortBundleName(ReducedVals)
17860                       << " (It is a splitting reduction)\n");
17861     return VectorCost - ScalarCost;
17862   }
17863 
17864   /// Emit a horizontal reduction of the vectorized value.
17865   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17866                        unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17867     assert(VectorizedValue && "Need to have a vectorized tree node");
17868     assert(isPowerOf2_32(ReduxWidth) &&
17869            "We only handle power-of-two reductions for now");
17870     assert(RdxKind != RecurKind::FMulAdd &&
17871            "A call to the llvm.fmuladd intrinsic is not handled yet");
17872 
17873     ++NumVectorInstructions;
17874     return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17875   }
17876 
17877   /// Emits optimized code for unique scalar value reused \p Cnt times.
17878   Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17879                                unsigned Cnt) {
17880     assert(IsSupportedHorRdxIdentityOp &&
17881            "The optimization of matched scalar identity horizontal reductions "
17882            "must be supported.");
17883     switch (RdxKind) {
17884     case RecurKind::Add: {
17885       // res = mul vv, n
17886       Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17887       LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17888                         << VectorizedValue << ". (HorRdx)\n");
17889       return Builder.CreateMul(VectorizedValue, Scale);
17890     }
17891     case RecurKind::Xor: {
17892       // res = n % 2 ? 0 : vv
17893       LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17894                         << ". (HorRdx)\n");
17895       if (Cnt % 2 == 0)
17896         return Constant::getNullValue(VectorizedValue->getType());
17897       return VectorizedValue;
17898     }
17899     case RecurKind::FAdd: {
17900       // res = fmul v, n
17901       Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17902       LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17903                         << VectorizedValue << ". (HorRdx)\n");
17904       return Builder.CreateFMul(VectorizedValue, Scale);
17905     }
17906     case RecurKind::And:
17907     case RecurKind::Or:
17908     case RecurKind::SMax:
17909     case RecurKind::SMin:
17910     case RecurKind::UMax:
17911     case RecurKind::UMin:
17912     case RecurKind::FMax:
17913     case RecurKind::FMin:
17914     case RecurKind::FMaximum:
17915     case RecurKind::FMinimum:
17916       // res = vv
17917       return VectorizedValue;
17918     case RecurKind::Mul:
17919     case RecurKind::FMul:
17920     case RecurKind::FMulAdd:
17921     case RecurKind::IAnyOf:
17922     case RecurKind::FAnyOf:
17923     case RecurKind::None:
17924       llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17925     }
17926     return nullptr;
17927   }
17928 
17929   /// Emits actual operation for the scalar identity values, found during
17930   /// horizontal reduction analysis.
17931   Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17932                        BoUpSLP &R,
17933                        const MapVector<Value *, unsigned> &SameValuesCounter,
17934                        const DenseMap<Value *, Value *> &TrackedToOrig) {
17935     assert(IsSupportedHorRdxIdentityOp &&
17936            "The optimization of matched scalar identity horizontal reductions "
17937            "must be supported.");
17938     ArrayRef<Value *> VL = R.getRootNodeScalars();
17939     auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17940     if (VTy->getElementType() != VL.front()->getType()) {
17941       VectorizedValue = Builder.CreateIntCast(
17942           VectorizedValue,
17943           getWidenedType(VL.front()->getType(), VTy->getNumElements()),
17944           R.isSignedMinBitwidthRootNode());
17945     }
17946     switch (RdxKind) {
17947     case RecurKind::Add: {
17948       // root = mul prev_root, <1, 1, n, 1>
17949       SmallVector<Constant *> Vals;
17950       for (Value *V : VL) {
17951         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17952         Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17953       }
17954       auto *Scale = ConstantVector::get(Vals);
17955       LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17956                         << VectorizedValue << ". (HorRdx)\n");
17957       return Builder.CreateMul(VectorizedValue, Scale);
17958     }
17959     case RecurKind::And:
17960     case RecurKind::Or:
17961       // No need for multiple or/and(s).
17962       LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17963                         << ". (HorRdx)\n");
17964       return VectorizedValue;
17965     case RecurKind::SMax:
17966     case RecurKind::SMin:
17967     case RecurKind::UMax:
17968     case RecurKind::UMin:
17969     case RecurKind::FMax:
17970     case RecurKind::FMin:
17971     case RecurKind::FMaximum:
17972     case RecurKind::FMinimum:
17973       // No need for multiple min/max(s) of the same value.
17974       LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17975                         << ". (HorRdx)\n");
17976       return VectorizedValue;
17977     case RecurKind::Xor: {
17978       // Replace values with even number of repeats with 0, since
17979       // x xor x = 0.
17980       // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
17981       // 7>, if elements 4th and 6th elements have even number of repeats.
17982       SmallVector<int> Mask(
17983           cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17984           PoisonMaskElem);
17985       std::iota(Mask.begin(), Mask.end(), 0);
17986       bool NeedShuffle = false;
17987       for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17988         Value *V = VL[I];
17989         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17990         if (Cnt % 2 == 0) {
17991           Mask[I] = VF;
17992           NeedShuffle = true;
17993         }
17994       }
17995       LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17996                                               : Mask) dbgs()
17997                                          << I << " ";
17998                  dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17999       if (NeedShuffle)
18000         VectorizedValue = Builder.CreateShuffleVector(
18001             VectorizedValue,
18002             ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
18003       return VectorizedValue;
18004     }
18005     case RecurKind::FAdd: {
18006       // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
18007       SmallVector<Constant *> Vals;
18008       for (Value *V : VL) {
18009         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18010         Vals.push_back(ConstantFP::get(V->getType(), Cnt));
18011       }
18012       auto *Scale = ConstantVector::get(Vals);
18013       return Builder.CreateFMul(VectorizedValue, Scale);
18014     }
18015     case RecurKind::Mul:
18016     case RecurKind::FMul:
18017     case RecurKind::FMulAdd:
18018     case RecurKind::IAnyOf:
18019     case RecurKind::FAnyOf:
18020     case RecurKind::None:
18021       llvm_unreachable("Unexpected reduction kind for reused scalars.");
18022     }
18023     return nullptr;
18024   }
18025 };
18026 } // end anonymous namespace
18027 
18028 /// Gets recurrence kind from the specified value.
18029 static RecurKind getRdxKind(Value *V) {
18030   return HorizontalReduction::getRdxKind(V);
18031 }
18032 static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18033   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18034     return cast<FixedVectorType>(IE->getType())->getNumElements();
18035 
18036   unsigned AggregateSize = 1;
18037   auto *IV = cast<InsertValueInst>(InsertInst);
18038   Type *CurrentType = IV->getType();
18039   do {
18040     if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18041       for (auto *Elt : ST->elements())
18042         if (Elt != ST->getElementType(0)) // check homogeneity
18043           return std::nullopt;
18044       AggregateSize *= ST->getNumElements();
18045       CurrentType = ST->getElementType(0);
18046     } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18047       AggregateSize *= AT->getNumElements();
18048       CurrentType = AT->getElementType();
18049     } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18050       AggregateSize *= VT->getNumElements();
18051       return AggregateSize;
18052     } else if (CurrentType->isSingleValueType()) {
18053       return AggregateSize;
18054     } else {
18055       return std::nullopt;
18056     }
18057   } while (true);
18058 }
18059 
18060 static void findBuildAggregate_rec(Instruction *LastInsertInst,
18061                                    TargetTransformInfo *TTI,
18062                                    SmallVectorImpl<Value *> &BuildVectorOpds,
18063                                    SmallVectorImpl<Value *> &InsertElts,
18064                                    unsigned OperandOffset) {
18065   do {
18066     Value *InsertedOperand = LastInsertInst->getOperand(1);
18067     std::optional<unsigned> OperandIndex =
18068         getElementIndex(LastInsertInst, OperandOffset);
18069     if (!OperandIndex)
18070       return;
18071     if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18072       findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18073                              BuildVectorOpds, InsertElts, *OperandIndex);
18074 
18075     } else {
18076       BuildVectorOpds[*OperandIndex] = InsertedOperand;
18077       InsertElts[*OperandIndex] = LastInsertInst;
18078     }
18079     LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18080   } while (LastInsertInst != nullptr &&
18081            isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18082            LastInsertInst->hasOneUse());
18083 }
18084 
18085 /// Recognize construction of vectors like
18086 ///  %ra = insertelement <4 x float> poison, float %s0, i32 0
18087 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
18088 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
18089 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
18090 ///  starting from the last insertelement or insertvalue instruction.
18091 ///
18092 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18093 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18094 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18095 ///
18096 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18097 ///
18098 /// \return true if it matches.
18099 static bool findBuildAggregate(Instruction *LastInsertInst,
18100                                TargetTransformInfo *TTI,
18101                                SmallVectorImpl<Value *> &BuildVectorOpds,
18102                                SmallVectorImpl<Value *> &InsertElts) {
18103 
18104   assert((isa<InsertElementInst>(LastInsertInst) ||
18105           isa<InsertValueInst>(LastInsertInst)) &&
18106          "Expected insertelement or insertvalue instruction!");
18107 
18108   assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18109          "Expected empty result vectors!");
18110 
18111   std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18112   if (!AggregateSize)
18113     return false;
18114   BuildVectorOpds.resize(*AggregateSize);
18115   InsertElts.resize(*AggregateSize);
18116 
18117   findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18118   llvm::erase(BuildVectorOpds, nullptr);
18119   llvm::erase(InsertElts, nullptr);
18120   if (BuildVectorOpds.size() >= 2)
18121     return true;
18122 
18123   return false;
18124 }
18125 
18126 /// Try and get a reduction instruction from a phi node.
18127 ///
18128 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18129 /// if they come from either \p ParentBB or a containing loop latch.
18130 ///
18131 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
18132 /// if not possible.
18133 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18134                                       BasicBlock *ParentBB, LoopInfo *LI) {
18135   // There are situations where the reduction value is not dominated by the
18136   // reduction phi. Vectorizing such cases has been reported to cause
18137   // miscompiles. See PR25787.
18138   auto DominatedReduxValue = [&](Value *R) {
18139     return isa<Instruction>(R) &&
18140            DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18141   };
18142 
18143   Instruction *Rdx = nullptr;
18144 
18145   // Return the incoming value if it comes from the same BB as the phi node.
18146   if (P->getIncomingBlock(0) == ParentBB) {
18147     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18148   } else if (P->getIncomingBlock(1) == ParentBB) {
18149     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18150   }
18151 
18152   if (Rdx && DominatedReduxValue(Rdx))
18153     return Rdx;
18154 
18155   // Otherwise, check whether we have a loop latch to look at.
18156   Loop *BBL = LI->getLoopFor(ParentBB);
18157   if (!BBL)
18158     return nullptr;
18159   BasicBlock *BBLatch = BBL->getLoopLatch();
18160   if (!BBLatch)
18161     return nullptr;
18162 
18163   // There is a loop latch, return the incoming value if it comes from
18164   // that. This reduction pattern occasionally turns up.
18165   if (P->getIncomingBlock(0) == BBLatch) {
18166     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18167   } else if (P->getIncomingBlock(1) == BBLatch) {
18168     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18169   }
18170 
18171   if (Rdx && DominatedReduxValue(Rdx))
18172     return Rdx;
18173 
18174   return nullptr;
18175 }
18176 
18177 static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18178   if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18179     return true;
18180   if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18181     return true;
18182   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18183     return true;
18184   if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18185     return true;
18186   if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18187     return true;
18188   if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18189     return true;
18190   if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18191     return true;
18192   if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18193     return true;
18194   if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18195     return true;
18196   return false;
18197 }
18198 
18199 /// We could have an initial reduction that is not an add.
18200 ///  r *= v1 + v2 + v3 + v4
18201 /// In such a case start looking for a tree rooted in the first '+'.
18202 /// \Returns the new root if found, which may be nullptr if not an instruction.
18203 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18204                                                  Instruction *Root) {
18205   assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18206           isa<IntrinsicInst>(Root)) &&
18207          "Expected binop, select, or intrinsic for reduction matching");
18208   Value *LHS =
18209       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18210   Value *RHS =
18211       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18212   if (LHS == Phi)
18213     return dyn_cast<Instruction>(RHS);
18214   if (RHS == Phi)
18215     return dyn_cast<Instruction>(LHS);
18216   return nullptr;
18217 }
18218 
18219 /// \p Returns the first operand of \p I that does not match \p Phi. If
18220 /// operand is not an instruction it returns nullptr.
18221 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18222   Value *Op0 = nullptr;
18223   Value *Op1 = nullptr;
18224   if (!matchRdxBop(I, Op0, Op1))
18225     return nullptr;
18226   return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18227 }
18228 
18229 /// \Returns true if \p I is a candidate instruction for reduction vectorization.
18230 static bool isReductionCandidate(Instruction *I) {
18231   bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18232   Value *B0 = nullptr, *B1 = nullptr;
18233   bool IsBinop = matchRdxBop(I, B0, B1);
18234   return IsBinop || IsSelect;
18235 }
18236 
18237 bool SLPVectorizerPass::vectorizeHorReduction(
18238     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
18239     SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18240   if (!ShouldVectorizeHor)
18241     return false;
18242   bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18243 
18244   if (Root->getParent() != BB || isa<PHINode>(Root))
18245     return false;
18246 
18247   // If we can find a secondary reduction root, use that instead.
18248   auto SelectRoot = [&]() {
18249     if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18250         HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18251       if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18252         return NewRoot;
18253     return Root;
18254   };
18255 
18256   // Start analysis starting from Root instruction. If horizontal reduction is
18257   // found, try to vectorize it. If it is not a horizontal reduction or
18258   // vectorization is not possible or not effective, and currently analyzed
18259   // instruction is a binary operation, try to vectorize the operands, using
18260   // pre-order DFS traversal order. If the operands were not vectorized, repeat
18261   // the same procedure considering each operand as a possible root of the
18262   // horizontal reduction.
18263   // Interrupt the process if the Root instruction itself was vectorized or all
18264   // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
18265   // If a horizintal reduction was not matched or vectorized we collect
18266   // instructions for possible later attempts for vectorization.
18267   std::queue<std::pair<Instruction *, unsigned>> Stack;
18268   Stack.emplace(SelectRoot(), 0);
18269   SmallPtrSet<Value *, 8> VisitedInstrs;
18270   bool Res = false;
18271   auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18272     if (R.isAnalyzedReductionRoot(Inst))
18273       return nullptr;
18274     if (!isReductionCandidate(Inst))
18275       return nullptr;
18276     HorizontalReduction HorRdx;
18277     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18278       return nullptr;
18279     return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18280   };
18281   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18282     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18283       FutureSeed = getNonPhiOperand(Root, P);
18284       if (!FutureSeed)
18285         return false;
18286     }
18287     // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18288     // analysis is done separately.
18289     if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18290       PostponedInsts.push_back(FutureSeed);
18291     return true;
18292   };
18293 
18294   while (!Stack.empty()) {
18295     Instruction *Inst;
18296     unsigned Level;
18297     std::tie(Inst, Level) = Stack.front();
18298     Stack.pop();
18299     // Do not try to analyze instruction that has already been vectorized.
18300     // This may happen when we vectorize instruction operands on a previous
18301     // iteration while stack was populated before that happened.
18302     if (R.isDeleted(Inst))
18303       continue;
18304     if (Value *VectorizedV = TryToReduce(Inst)) {
18305       Res = true;
18306       if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18307         // Try to find another reduction.
18308         Stack.emplace(I, Level);
18309         continue;
18310       }
18311       if (R.isDeleted(Inst))
18312         continue;
18313     } else {
18314       // We could not vectorize `Inst` so try to use it as a future seed.
18315       if (!TryAppendToPostponedInsts(Inst)) {
18316         assert(Stack.empty() && "Expected empty stack");
18317         break;
18318       }
18319     }
18320 
18321     // Try to vectorize operands.
18322     // Continue analysis for the instruction from the same basic block only to
18323     // save compile time.
18324     if (++Level < RecursionMaxDepth)
18325       for (auto *Op : Inst->operand_values())
18326         if (VisitedInstrs.insert(Op).second)
18327           if (auto *I = dyn_cast<Instruction>(Op))
18328             // Do not try to vectorize CmpInst operands,  this is done
18329             // separately.
18330             if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18331                 !R.isDeleted(I) && I->getParent() == BB)
18332               Stack.emplace(I, Level);
18333   }
18334   return Res;
18335 }
18336 
18337 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18338                                                  BasicBlock *BB, BoUpSLP &R,
18339                                                  TargetTransformInfo *TTI) {
18340   SmallVector<WeakTrackingVH> PostponedInsts;
18341   bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18342   Res |= tryToVectorize(PostponedInsts, R);
18343   return Res;
18344 }
18345 
18346 bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18347                                        BoUpSLP &R) {
18348   bool Res = false;
18349   for (Value *V : Insts)
18350     if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18351       Res |= tryToVectorize(Inst, R);
18352   return Res;
18353 }
18354 
18355 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18356                                                  BasicBlock *BB, BoUpSLP &R,
18357                                                  bool MaxVFOnly) {
18358   if (!R.canMapToVector(IVI->getType()))
18359     return false;
18360 
18361   SmallVector<Value *, 16> BuildVectorOpds;
18362   SmallVector<Value *, 16> BuildVectorInsts;
18363   if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18364     return false;
18365 
18366   if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18367     R.getORE()->emit([&]() {
18368       return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18369              << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18370                 "trying reduction first.";
18371     });
18372     return false;
18373   }
18374   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18375   // Aggregate value is unlikely to be processed in vector register.
18376   return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18377 }
18378 
18379 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18380                                                    BasicBlock *BB, BoUpSLP &R,
18381                                                    bool MaxVFOnly) {
18382   SmallVector<Value *, 16> BuildVectorInsts;
18383   SmallVector<Value *, 16> BuildVectorOpds;
18384   SmallVector<int> Mask;
18385   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18386       (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18387        isFixedVectorShuffle(BuildVectorOpds, Mask)))
18388     return false;
18389 
18390   if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18391     R.getORE()->emit([&]() {
18392       return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18393              << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18394                 "trying reduction first.";
18395     });
18396     return false;
18397   }
18398   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18399   return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18400 }
18401 
18402 template <typename T>
18403 static bool tryToVectorizeSequence(
18404     SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18405     function_ref<bool(T *, T *)> AreCompatible,
18406     function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18407     bool MaxVFOnly, BoUpSLP &R) {
18408   bool Changed = false;
18409   // Sort by type, parent, operands.
18410   stable_sort(Incoming, Comparator);
18411 
18412   // Try to vectorize elements base on their type.
18413   SmallVector<T *> Candidates;
18414   SmallVector<T *> VL;
18415   for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18416        VL.clear()) {
18417     // Look for the next elements with the same type, parent and operand
18418     // kinds.
18419     auto *I = dyn_cast<Instruction>(*IncIt);
18420     if (!I || R.isDeleted(I)) {
18421       ++IncIt;
18422       continue;
18423     }
18424     auto *SameTypeIt = IncIt;
18425     while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18426                                R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18427                                AreCompatible(*SameTypeIt, *IncIt))) {
18428       auto *I = dyn_cast<Instruction>(*SameTypeIt);
18429       ++SameTypeIt;
18430       if (I && !R.isDeleted(I))
18431         VL.push_back(cast<T>(I));
18432     }
18433 
18434     // Try to vectorize them.
18435     unsigned NumElts = VL.size();
18436     LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18437                       << NumElts << ")\n");
18438     // The vectorization is a 3-state attempt:
18439     // 1. Try to vectorize instructions with the same/alternate opcodes with the
18440     // size of maximal register at first.
18441     // 2. Try to vectorize remaining instructions with the same type, if
18442     // possible. This may result in the better vectorization results rather than
18443     // if we try just to vectorize instructions with the same/alternate opcodes.
18444     // 3. Final attempt to try to vectorize all instructions with the
18445     // same/alternate ops only, this may result in some extra final
18446     // vectorization.
18447     if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18448       // Success start over because instructions might have been changed.
18449       Changed = true;
18450       VL.swap(Candidates);
18451       Candidates.clear();
18452       for (T *V : VL) {
18453         if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18454           Candidates.push_back(V);
18455       }
18456     } else {
18457       /// \Returns the minimum number of elements that we will attempt to
18458       /// vectorize.
18459       auto GetMinNumElements = [&R](Value *V) {
18460         unsigned EltSize = R.getVectorElementSize(V);
18461         return std::max(2U, R.getMaxVecRegSize() / EltSize);
18462       };
18463       if (NumElts < GetMinNumElements(*IncIt) &&
18464           (Candidates.empty() ||
18465            Candidates.front()->getType() == (*IncIt)->getType())) {
18466         for (T *V : VL) {
18467           if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18468             Candidates.push_back(V);
18469         }
18470       }
18471     }
18472     // Final attempt to vectorize instructions with the same types.
18473     if (Candidates.size() > 1 &&
18474         (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18475       if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18476         // Success start over because instructions might have been changed.
18477         Changed = true;
18478       } else if (MaxVFOnly) {
18479         // Try to vectorize using small vectors.
18480         SmallVector<T *> VL;
18481         for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18482              VL.clear()) {
18483           auto *I = dyn_cast<Instruction>(*It);
18484           if (!I || R.isDeleted(I)) {
18485             ++It;
18486             continue;
18487           }
18488           auto *SameTypeIt = It;
18489           while (SameTypeIt != End &&
18490                  (!isa<Instruction>(*SameTypeIt) ||
18491                   R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18492                   AreCompatible(*SameTypeIt, *It))) {
18493             auto *I = dyn_cast<Instruction>(*SameTypeIt);
18494             ++SameTypeIt;
18495             if (I && !R.isDeleted(I))
18496               VL.push_back(cast<T>(I));
18497           }
18498           unsigned NumElts = VL.size();
18499           if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18500                                                   /*MaxVFOnly=*/false))
18501             Changed = true;
18502           It = SameTypeIt;
18503         }
18504       }
18505       Candidates.clear();
18506     }
18507 
18508     // Start over at the next instruction of a different type (or the end).
18509     IncIt = SameTypeIt;
18510   }
18511   return Changed;
18512 }
18513 
18514 /// Compare two cmp instructions. If IsCompatibility is true, function returns
18515 /// true if 2 cmps have same/swapped predicates and mos compatible corresponding
18516 /// operands. If IsCompatibility is false, function implements strict weak
18517 /// ordering relation between two cmp instructions, returning true if the first
18518 /// instruction is "less" than the second, i.e. its predicate is less than the
18519 /// predicate of the second or the operands IDs are less than the operands IDs
18520 /// of the second cmp instruction.
18521 template <bool IsCompatibility>
18522 static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18523                        const DominatorTree &DT) {
18524   assert(isValidElementType(V->getType()) &&
18525          isValidElementType(V2->getType()) &&
18526          "Expected valid element types only.");
18527   if (V == V2)
18528     return IsCompatibility;
18529   auto *CI1 = cast<CmpInst>(V);
18530   auto *CI2 = cast<CmpInst>(V2);
18531   if (CI1->getOperand(0)->getType()->getTypeID() <
18532       CI2->getOperand(0)->getType()->getTypeID())
18533     return !IsCompatibility;
18534   if (CI1->getOperand(0)->getType()->getTypeID() >
18535       CI2->getOperand(0)->getType()->getTypeID())
18536     return false;
18537   CmpInst::Predicate Pred1 = CI1->getPredicate();
18538   CmpInst::Predicate Pred2 = CI2->getPredicate();
18539   CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18540   CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18541   CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18542   CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18543   if (BasePred1 < BasePred2)
18544     return !IsCompatibility;
18545   if (BasePred1 > BasePred2)
18546     return false;
18547   // Compare operands.
18548   bool CI1Preds = Pred1 == BasePred1;
18549   bool CI2Preds = Pred2 == BasePred1;
18550   for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18551     auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18552     auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18553     if (Op1 == Op2)
18554       continue;
18555     if (Op1->getValueID() < Op2->getValueID())
18556       return !IsCompatibility;
18557     if (Op1->getValueID() > Op2->getValueID())
18558       return false;
18559     if (auto *I1 = dyn_cast<Instruction>(Op1))
18560       if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18561         if (IsCompatibility) {
18562           if (I1->getParent() != I2->getParent())
18563             return false;
18564         } else {
18565           // Try to compare nodes with same parent.
18566           DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18567           DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18568           if (!NodeI1)
18569             return NodeI2 != nullptr;
18570           if (!NodeI2)
18571             return false;
18572           assert((NodeI1 == NodeI2) ==
18573                      (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18574                  "Different nodes should have different DFS numbers");
18575           if (NodeI1 != NodeI2)
18576             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18577         }
18578         InstructionsState S = getSameOpcode({I1, I2}, TLI);
18579         if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18580           continue;
18581         if (IsCompatibility)
18582           return false;
18583         if (I1->getOpcode() != I2->getOpcode())
18584           return I1->getOpcode() < I2->getOpcode();
18585       }
18586   }
18587   return IsCompatibility;
18588 }
18589 
18590 template <typename ItT>
18591 bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18592                                           BasicBlock *BB, BoUpSLP &R) {
18593   bool Changed = false;
18594   // Try to find reductions first.
18595   for (CmpInst *I : CmpInsts) {
18596     if (R.isDeleted(I))
18597       continue;
18598     for (Value *Op : I->operands())
18599       if (auto *RootOp = dyn_cast<Instruction>(Op)) {
18600         Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18601         if (R.isDeleted(I))
18602           break;
18603       }
18604   }
18605   // Try to vectorize operands as vector bundles.
18606   for (CmpInst *I : CmpInsts) {
18607     if (R.isDeleted(I))
18608       continue;
18609     Changed |= tryToVectorize(I, R);
18610   }
18611   // Try to vectorize list of compares.
18612   // Sort by type, compare predicate, etc.
18613   auto CompareSorter = [&](Value *V, Value *V2) {
18614     if (V == V2)
18615       return false;
18616     return compareCmp<false>(V, V2, *TLI, *DT);
18617   };
18618 
18619   auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18620     if (V1 == V2)
18621       return true;
18622     return compareCmp<true>(V1, V2, *TLI, *DT);
18623   };
18624 
18625   SmallVector<Value *> Vals;
18626   for (Instruction *V : CmpInsts)
18627     if (!R.isDeleted(V) && isValidElementType(V->getType()))
18628       Vals.push_back(V);
18629   if (Vals.size() <= 1)
18630     return Changed;
18631   Changed |= tryToVectorizeSequence<Value>(
18632       Vals, CompareSorter, AreCompatibleCompares,
18633       [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18634         // Exclude possible reductions from other blocks.
18635         bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18636           return any_of(V->users(), [V](User *U) {
18637             auto *Select = dyn_cast<SelectInst>(U);
18638             return Select &&
18639                    Select->getParent() != cast<Instruction>(V)->getParent();
18640           });
18641         });
18642         if (ArePossiblyReducedInOtherBlock)
18643           return false;
18644         return tryToVectorizeList(Candidates, R, MaxVFOnly);
18645       },
18646       /*MaxVFOnly=*/true, R);
18647   return Changed;
18648 }
18649 
18650 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18651                                          BasicBlock *BB, BoUpSLP &R) {
18652   assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18653          "This function only accepts Insert instructions");
18654   bool OpsChanged = false;
18655   SmallVector<WeakTrackingVH> PostponedInsts;
18656   for (auto *I : reverse(Instructions)) {
18657     // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18658     if (R.isDeleted(I) || isa<CmpInst>(I))
18659       continue;
18660     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18661       OpsChanged |=
18662           vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18663     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18664       OpsChanged |=
18665           vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18666     }
18667     // pass2 - try to vectorize reductions only
18668     if (R.isDeleted(I))
18669       continue;
18670     OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18671     if (R.isDeleted(I) || isa<CmpInst>(I))
18672       continue;
18673     // pass3 - try to match and vectorize a buildvector sequence.
18674     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18675       OpsChanged |=
18676           vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18677     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18678       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18679                                                /*MaxVFOnly=*/false);
18680     }
18681   }
18682   // Now try to vectorize postponed instructions.
18683   OpsChanged |= tryToVectorize(PostponedInsts, R);
18684 
18685   Instructions.clear();
18686   return OpsChanged;
18687 }
18688 
18689 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18690   bool Changed = false;
18691   SmallVector<Value *, 4> Incoming;
18692   SmallPtrSet<Value *, 16> VisitedInstrs;
18693   // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18694   // node. Allows better to identify the chains that can be vectorized in the
18695   // better way.
18696   DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18697   auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18698     assert(isValidElementType(V1->getType()) &&
18699            isValidElementType(V2->getType()) &&
18700            "Expected vectorizable types only.");
18701     // It is fine to compare type IDs here, since we expect only vectorizable
18702     // types, like ints, floats and pointers, we don't care about other type.
18703     if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18704       return true;
18705     if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18706       return false;
18707     ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18708     ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18709     if (Opcodes1.size() < Opcodes2.size())
18710       return true;
18711     if (Opcodes1.size() > Opcodes2.size())
18712       return false;
18713     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18714       {
18715         // Instructions come first.
18716         auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18717         auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18718         if (I1 && I2) {
18719           DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18720           DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18721           if (!NodeI1)
18722             return NodeI2 != nullptr;
18723           if (!NodeI2)
18724             return false;
18725           assert((NodeI1 == NodeI2) ==
18726                      (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18727                  "Different nodes should have different DFS numbers");
18728           if (NodeI1 != NodeI2)
18729             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18730           InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18731           if (S.getOpcode() && !S.isAltShuffle())
18732             continue;
18733           return I1->getOpcode() < I2->getOpcode();
18734         }
18735         if (I1)
18736           return true;
18737         if (I2)
18738           return false;
18739       }
18740       {
18741         // Non-undef constants come next.
18742         bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18743         bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18744         if (C1 && C2)
18745           continue;
18746         if (C1)
18747           return true;
18748         if (C2)
18749           return false;
18750       }
18751       bool U1 = isa<UndefValue>(Opcodes1[I]);
18752       bool U2 = isa<UndefValue>(Opcodes2[I]);
18753       {
18754         // Non-constant non-instructions come next.
18755         if (!U1 && !U2) {
18756           auto ValID1 = Opcodes1[I]->getValueID();
18757           auto ValID2 = Opcodes2[I]->getValueID();
18758           if (ValID1 == ValID2)
18759             continue;
18760           if (ValID1 < ValID2)
18761             return true;
18762           if (ValID1 > ValID2)
18763             return false;
18764         }
18765         if (!U1)
18766           return true;
18767         if (!U2)
18768           return false;
18769       }
18770       // Undefs come last.
18771       assert(U1 && U2 && "The only thing left should be undef & undef.");
18772       continue;
18773     }
18774     return false;
18775   };
18776   auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18777     if (V1 == V2)
18778       return true;
18779     if (V1->getType() != V2->getType())
18780       return false;
18781     ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18782     ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18783     if (Opcodes1.size() != Opcodes2.size())
18784       return false;
18785     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18786       // Undefs are compatible with any other value.
18787       if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18788         continue;
18789       if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18790         if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18791           if (R.isDeleted(I1) || R.isDeleted(I2))
18792             return false;
18793           if (I1->getParent() != I2->getParent())
18794             return false;
18795           InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18796           if (S.getOpcode())
18797             continue;
18798           return false;
18799         }
18800       if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18801         continue;
18802       if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18803         return false;
18804     }
18805     return true;
18806   };
18807 
18808   bool HaveVectorizedPhiNodes = false;
18809   do {
18810     // Collect the incoming values from the PHIs.
18811     Incoming.clear();
18812     for (Instruction &I : *BB) {
18813       auto *P = dyn_cast<PHINode>(&I);
18814       if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18815         break;
18816 
18817       // No need to analyze deleted, vectorized and non-vectorizable
18818       // instructions.
18819       if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18820           isValidElementType(P->getType()))
18821         Incoming.push_back(P);
18822     }
18823 
18824     if (Incoming.size() <= 1)
18825       break;
18826 
18827     // Find the corresponding non-phi nodes for better matching when trying to
18828     // build the tree.
18829     for (Value *V : Incoming) {
18830       SmallVectorImpl<Value *> &Opcodes =
18831           PHIToOpcodes.try_emplace(V).first->getSecond();
18832       if (!Opcodes.empty())
18833         continue;
18834       SmallVector<Value *, 4> Nodes(1, V);
18835       SmallPtrSet<Value *, 4> Visited;
18836       while (!Nodes.empty()) {
18837         auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18838         if (!Visited.insert(PHI).second)
18839           continue;
18840         for (Value *V : PHI->incoming_values()) {
18841           if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18842             Nodes.push_back(PHI1);
18843             continue;
18844           }
18845           Opcodes.emplace_back(V);
18846         }
18847       }
18848     }
18849 
18850     HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18851         Incoming, PHICompare, AreCompatiblePHIs,
18852         [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18853           return tryToVectorizeList(Candidates, R, MaxVFOnly);
18854         },
18855         /*MaxVFOnly=*/true, R);
18856     Changed |= HaveVectorizedPhiNodes;
18857     if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
18858           auto *PHI = dyn_cast<PHINode>(P.first);
18859           return !PHI || R.isDeleted(PHI);
18860         }))
18861       PHIToOpcodes.clear();
18862     VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18863   } while (HaveVectorizedPhiNodes);
18864 
18865   VisitedInstrs.clear();
18866 
18867   InstSetVector PostProcessInserts;
18868   SmallSetVector<CmpInst *, 8> PostProcessCmps;
18869   // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
18870   // also vectorizes `PostProcessCmps`.
18871   auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18872     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18873     if (VectorizeCmps) {
18874       Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18875       PostProcessCmps.clear();
18876     }
18877     PostProcessInserts.clear();
18878     return Changed;
18879   };
18880   // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18881   auto IsInPostProcessInstrs = [&](Instruction *I) {
18882     if (auto *Cmp = dyn_cast<CmpInst>(I))
18883       return PostProcessCmps.contains(Cmp);
18884     return isa<InsertElementInst, InsertValueInst>(I) &&
18885            PostProcessInserts.contains(I);
18886   };
18887   // Returns true if `I` is an instruction without users, like terminator, or
18888   // function call with ignored return value, store. Ignore unused instructions
18889   // (basing on instruction type, except for CallInst and InvokeInst).
18890   auto HasNoUsers = [](Instruction *I) {
18891     return I->use_empty() &&
18892            (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18893   };
18894   for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18895     // Skip instructions with scalable type. The num of elements is unknown at
18896     // compile-time for scalable type.
18897     if (isa<ScalableVectorType>(It->getType()))
18898       continue;
18899 
18900     // Skip instructions marked for the deletion.
18901     if (R.isDeleted(&*It))
18902       continue;
18903     // We may go through BB multiple times so skip the one we have checked.
18904     if (!VisitedInstrs.insert(&*It).second) {
18905       if (HasNoUsers(&*It) &&
18906           VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18907         // We would like to start over since some instructions are deleted
18908         // and the iterator may become invalid value.
18909         Changed = true;
18910         It = BB->begin();
18911         E = BB->end();
18912       }
18913       continue;
18914     }
18915 
18916     if (isa<DbgInfoIntrinsic>(It))
18917       continue;
18918 
18919     // Try to vectorize reductions that use PHINodes.
18920     if (PHINode *P = dyn_cast<PHINode>(It)) {
18921       // Check that the PHI is a reduction PHI.
18922       if (P->getNumIncomingValues() == 2) {
18923         // Try to match and vectorize a horizontal reduction.
18924         Instruction *Root = getReductionInstr(DT, P, BB, LI);
18925         if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18926           Changed = true;
18927           It = BB->begin();
18928           E = BB->end();
18929           continue;
18930         }
18931       }
18932       // Try to vectorize the incoming values of the PHI, to catch reductions
18933       // that feed into PHIs.
18934       for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
18935         // Skip if the incoming block is the current BB for now. Also, bypass
18936         // unreachable IR for efficiency and to avoid crashing.
18937         // TODO: Collect the skipped incoming values and try to vectorize them
18938         // after processing BB.
18939         if (BB == P->getIncomingBlock(I) ||
18940             !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18941           continue;
18942 
18943         // Postponed instructions should not be vectorized here, delay their
18944         // vectorization.
18945         if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18946             PI && !IsInPostProcessInstrs(PI)) {
18947           bool Res = vectorizeRootInstruction(nullptr, PI,
18948                                               P->getIncomingBlock(I), R, TTI);
18949           Changed |= Res;
18950           if (Res && R.isDeleted(P)) {
18951             It = BB->begin();
18952             E = BB->end();
18953             break;
18954           }
18955         }
18956       }
18957       continue;
18958     }
18959 
18960     if (HasNoUsers(&*It)) {
18961       bool OpsChanged = false;
18962       auto *SI = dyn_cast<StoreInst>(It);
18963       bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18964       if (SI) {
18965         auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18966         // Try to vectorize chain in store, if this is the only store to the
18967         // address in the block.
18968         // TODO: This is just a temporarily solution to save compile time. Need
18969         // to investigate if we can safely turn on slp-vectorize-hor-store
18970         // instead to allow lookup for reduction chains in all non-vectorized
18971         // stores (need to check side effects and compile time).
18972         TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18973                               SI->getValueOperand()->hasOneUse();
18974       }
18975       if (TryToVectorizeRoot) {
18976         for (auto *V : It->operand_values()) {
18977           // Postponed instructions should not be vectorized here, delay their
18978           // vectorization.
18979           if (auto *VI = dyn_cast<Instruction>(V);
18980               VI && !IsInPostProcessInstrs(VI))
18981             // Try to match and vectorize a horizontal reduction.
18982             OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18983         }
18984       }
18985       // Start vectorization of post-process list of instructions from the
18986       // top-tree instructions to try to vectorize as many instructions as
18987       // possible.
18988       OpsChanged |=
18989           VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18990       if (OpsChanged) {
18991         // We would like to start over since some instructions are deleted
18992         // and the iterator may become invalid value.
18993         Changed = true;
18994         It = BB->begin();
18995         E = BB->end();
18996         continue;
18997       }
18998     }
18999 
19000     if (isa<InsertElementInst, InsertValueInst>(It))
19001       PostProcessInserts.insert(&*It);
19002     else if (isa<CmpInst>(It))
19003       PostProcessCmps.insert(cast<CmpInst>(&*It));
19004   }
19005 
19006   return Changed;
19007 }
19008 
19009 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
19010   auto Changed = false;
19011   for (auto &Entry : GEPs) {
19012     // If the getelementptr list has fewer than two elements, there's nothing
19013     // to do.
19014     if (Entry.second.size() < 2)
19015       continue;
19016 
19017     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
19018                       << Entry.second.size() << ".\n");
19019 
19020     // Process the GEP list in chunks suitable for the target's supported
19021     // vector size. If a vector register can't hold 1 element, we are done. We
19022     // are trying to vectorize the index computations, so the maximum number of
19023     // elements is based on the size of the index expression, rather than the
19024     // size of the GEP itself (the target's pointer size).
19025     auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
19026       return !R.isDeleted(GEP);
19027     });
19028     if (It == Entry.second.end())
19029       continue;
19030     unsigned MaxVecRegSize = R.getMaxVecRegSize();
19031     unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19032     if (MaxVecRegSize < EltSize)
19033       continue;
19034 
19035     unsigned MaxElts = MaxVecRegSize / EltSize;
19036     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19037       auto Len = std::min<unsigned>(BE - BI, MaxElts);
19038       ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19039 
19040       // Initialize a set a candidate getelementptrs. Note that we use a
19041       // SetVector here to preserve program order. If the index computations
19042       // are vectorizable and begin with loads, we want to minimize the chance
19043       // of having to reorder them later.
19044       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19045 
19046       // Some of the candidates may have already been vectorized after we
19047       // initially collected them or their index is optimized to constant value.
19048       // If so, they are marked as deleted, so remove them from the set of
19049       // candidates.
19050       Candidates.remove_if([&R](Value *I) {
19051         return R.isDeleted(cast<Instruction>(I)) ||
19052                isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19053       });
19054 
19055       // Remove from the set of candidates all pairs of getelementptrs with
19056       // constant differences. Such getelementptrs are likely not good
19057       // candidates for vectorization in a bottom-up phase since one can be
19058       // computed from the other. We also ensure all candidate getelementptr
19059       // indices are unique.
19060       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19061         auto *GEPI = GEPList[I];
19062         if (!Candidates.count(GEPI))
19063           continue;
19064         auto *SCEVI = SE->getSCEV(GEPList[I]);
19065         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19066           auto *GEPJ = GEPList[J];
19067           auto *SCEVJ = SE->getSCEV(GEPList[J]);
19068           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19069             Candidates.remove(GEPI);
19070             Candidates.remove(GEPJ);
19071           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19072             Candidates.remove(GEPJ);
19073           }
19074         }
19075       }
19076 
19077       // We break out of the above computation as soon as we know there are
19078       // fewer than two candidates remaining.
19079       if (Candidates.size() < 2)
19080         continue;
19081 
19082       // Add the single, non-constant index of each candidate to the bundle. We
19083       // ensured the indices met these constraints when we originally collected
19084       // the getelementptrs.
19085       SmallVector<Value *, 16> Bundle(Candidates.size());
19086       auto BundleIndex = 0u;
19087       for (auto *V : Candidates) {
19088         auto *GEP = cast<GetElementPtrInst>(V);
19089         auto *GEPIdx = GEP->idx_begin()->get();
19090         assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19091         Bundle[BundleIndex++] = GEPIdx;
19092       }
19093 
19094       // Try and vectorize the indices. We are currently only interested in
19095       // gather-like cases of the form:
19096       //
19097       // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19098       //
19099       // where the loads of "a", the loads of "b", and the subtractions can be
19100       // performed in parallel. It's likely that detecting this pattern in a
19101       // bottom-up phase will be simpler and less costly than building a
19102       // full-blown top-down phase beginning at the consecutive loads.
19103       Changed |= tryToVectorizeList(Bundle, R);
19104     }
19105   }
19106   return Changed;
19107 }
19108 
19109 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19110   bool Changed = false;
19111   // Sort by type, base pointers and values operand. Value operands must be
19112   // compatible (have the same opcode, same parent), otherwise it is
19113   // definitely not profitable to try to vectorize them.
19114   auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19115     if (V->getValueOperand()->getType()->getTypeID() <
19116         V2->getValueOperand()->getType()->getTypeID())
19117       return true;
19118     if (V->getValueOperand()->getType()->getTypeID() >
19119         V2->getValueOperand()->getType()->getTypeID())
19120       return false;
19121     if (V->getPointerOperandType()->getTypeID() <
19122         V2->getPointerOperandType()->getTypeID())
19123       return true;
19124     if (V->getPointerOperandType()->getTypeID() >
19125         V2->getPointerOperandType()->getTypeID())
19126       return false;
19127     // UndefValues are compatible with all other values.
19128     if (isa<UndefValue>(V->getValueOperand()) ||
19129         isa<UndefValue>(V2->getValueOperand()))
19130       return false;
19131     if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19132       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19133         DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
19134             DT->getNode(I1->getParent());
19135         DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
19136             DT->getNode(I2->getParent());
19137         assert(NodeI1 && "Should only process reachable instructions");
19138         assert(NodeI2 && "Should only process reachable instructions");
19139         assert((NodeI1 == NodeI2) ==
19140                    (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19141                "Different nodes should have different DFS numbers");
19142         if (NodeI1 != NodeI2)
19143           return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19144         InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19145         if (S.getOpcode())
19146           return false;
19147         return I1->getOpcode() < I2->getOpcode();
19148       }
19149     if (isa<Constant>(V->getValueOperand()) &&
19150         isa<Constant>(V2->getValueOperand()))
19151       return false;
19152     return V->getValueOperand()->getValueID() <
19153            V2->getValueOperand()->getValueID();
19154   };
19155 
19156   auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19157     if (V1 == V2)
19158       return true;
19159     if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19160       return false;
19161     if (V1->getPointerOperandType() != V2->getPointerOperandType())
19162       return false;
19163     // Undefs are compatible with any other value.
19164     if (isa<UndefValue>(V1->getValueOperand()) ||
19165         isa<UndefValue>(V2->getValueOperand()))
19166       return true;
19167     if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
19168       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19169         if (I1->getParent() != I2->getParent())
19170           return false;
19171         InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19172         return S.getOpcode() > 0;
19173       }
19174     if (isa<Constant>(V1->getValueOperand()) &&
19175         isa<Constant>(V2->getValueOperand()))
19176       return true;
19177     return V1->getValueOperand()->getValueID() ==
19178            V2->getValueOperand()->getValueID();
19179   };
19180 
19181   // Attempt to sort and vectorize each of the store-groups.
19182   DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
19183   for (auto &Pair : Stores) {
19184     if (Pair.second.size() < 2)
19185       continue;
19186 
19187     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19188                       << Pair.second.size() << ".\n");
19189 
19190     if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
19191       continue;
19192 
19193     // Reverse stores to do bottom-to-top analysis. This is important if the
19194     // values are stores to the same addresses several times, in this case need
19195     // to follow the stores order (reversed to meet the memory dependecies).
19196     SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19197                                             Pair.second.rend());
19198     Changed |= tryToVectorizeSequence<StoreInst>(
19199         ReversedStores, StoreSorter, AreCompatibleStores,
19200         [&](ArrayRef<StoreInst *> Candidates, bool) {
19201           return vectorizeStores(Candidates, R, Attempted);
19202         },
19203         /*MaxVFOnly=*/false, R);
19204   }
19205   return Changed;
19206 }
19207