//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs strength reduction on array references inside loops that
// use the loop induction variable as one or more of their components. It
// rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>; however,
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated and filtered.
//
// TODO: Handle multiple loops at a time.
//
// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
//       of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <limits>
#include <map>
#include <numeric>
#include <optional>
#include <utility>

using namespace llvm;
using namespace SCEVPatternMatch;

#define DEBUG_TYPE "loop-reduce"

/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;

/// Limit the size of expression that SCEV-based salvaging will attempt to
/// translate into a DIExpression.
/// Choose a maximum size such that debuginfo is not excessively increased and
/// the salvaging is not too expensive for the compiler.
static const unsigned MaxSCEVSalvageExpressionSize = 64;

// Clean up congruent phis after LSR phi expansion.
static cl::opt<bool> EnablePhiElim(
  "enable-lsr-phielim", cl::Hidden, cl::init(true),
  cl::desc("Enable LSR phi elimination"));

// This flag adds the instruction count to the solution cost comparison.
static cl::opt<bool> InsnsCost(
  "lsr-insns-cost", cl::Hidden, cl::init(true),
  cl::desc("Add instruction count to a LSR cost model"));

// Flag to choose how to narrow a complex LSR solution.
static cl::opt<bool> LSRExpNarrow(
  "lsr-exp-narrow", cl::Hidden, cl::init(false),
  cl::desc("Narrow LSR complex solution using"
           " expectation of registers number"));

// Flag to narrow the search space by filtering non-optimal formulae with
// the same ScaledReg and Scale.
static cl::opt<bool> FilterSameScaledReg(
    "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
    cl::desc("Narrow LSR search space by filtering non-optimal formulae"
             " with the same ScaledReg and Scale"));

static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
  "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
   cl::desc("A flag that overrides the target's preferred addressing mode."),
   cl::values(clEnumValN(TTI::AMK_None,
                         "none",
                         "Don't prefer any addressing mode"),
              clEnumValN(TTI::AMK_PreIndexed,
                         "preindexed",
                         "Prefer pre-indexed addressing mode"),
              clEnumValN(TTI::AMK_PostIndexed,
                         "postindexed",
                         "Prefer post-indexed addressing mode")));

static cl::opt<unsigned> ComplexityLimit(
  "lsr-complexity-limit", cl::Hidden,
  cl::init(std::numeric_limits<uint16_t>::max()),
  cl::desc("LSR search space complexity limit"));

static cl::opt<unsigned> SetupCostDepthLimit(
    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
    cl::desc("The limit on recursion depth for LSRs setup cost"));

static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
    "lsr-drop-solution", cl::Hidden,
    cl::desc("Attempt to drop solution if it is less profitable"));

static cl::opt<bool> EnableVScaleImmediates(
    "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
    cl::desc("Enable analysis of vscale-relative immediates in LSR"));

static cl::opt<bool> DropScaledForVScale(
    "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
    cl::desc("Avoid using scaled registers with vscale-relative addressing"));

#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
  "stress-ivchain", cl::Hidden, cl::init(false),
  cl::desc("Stress test LSR IV chains"));
#else
static bool StressIVChain = false;
#endif

namespace {

struct MemAccessTy {
  /// Used in situations where the accessed memory type is unknown.
  static const unsigned UnknownAddressSpace =
      std::numeric_limits<unsigned>::max();

  Type *MemTy = nullptr;
  unsigned AddrSpace = UnknownAddressSpace;

  MemAccessTy() = default;
  MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}

  bool operator==(MemAccessTy Other) const {
    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
  }

  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }

  static MemAccessTy getUnknown(LLVMContext &Ctx,
                                unsigned AS = UnknownAddressSpace) {
    return MemAccessTy(Type::getVoidTy(Ctx), AS);
  }

  Type *getType() { return MemTy; }
};

/// This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  void print(raw_ostream &OS) const;
  void dump() const;
};

// An offset from an address that is either scalable or fixed. Used for
// per-target optimizations of addressing modes.
class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
  constexpr Immediate(ScalarTy MinVal, bool Scalable)
      : FixedOrScalableQuantity(MinVal, Scalable) {}

  constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
      : FixedOrScalableQuantity(V) {}

public:
  constexpr Immediate() = delete;

  static constexpr Immediate getFixed(ScalarTy MinVal) {
    return {MinVal, false};
  }
  static constexpr Immediate getScalable(ScalarTy MinVal) {
    return {MinVal, true};
  }
  static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
    return {MinVal, Scalable};
  }
  static constexpr Immediate getZero() { return {0, false}; }
  static constexpr Immediate getFixedMin() {
    return {std::numeric_limits<int64_t>::min(), false};
  }
  static constexpr Immediate getFixedMax() {
    return {std::numeric_limits<int64_t>::max(), false};
  }
  static constexpr Immediate getScalableMin() {
    return {std::numeric_limits<int64_t>::min(), true};
  }
  static constexpr Immediate getScalableMax() {
    return {std::numeric_limits<int64_t>::max(), true};
  }

  constexpr bool isLessThanZero() const { return Quantity < 0; }

  constexpr bool isGreaterThanZero() const { return Quantity > 0; }

  constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
    return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
  }

  constexpr bool isMin() const {
    return Quantity == std::numeric_limits<ScalarTy>::min();
  }

  constexpr bool isMax() const {
    return Quantity == std::numeric_limits<ScalarTy>::max();
  }

  // Arithmetic 'operators' that cast to unsigned types first.
  constexpr Immediate addUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
    ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
    return {Value, Scalable || RHS.isScalable()};
  }

  constexpr Immediate subUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
    ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
    return {Value, Scalable || RHS.isScalable()};
  }

  // Scale the quantity by a constant without caring about runtime scalability.
  constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
    ScalarTy Value = (uint64_t)Quantity * RHS;
    return {Value, Scalable};
  }

  // Helpers for generating SCEVs with vscale terms where needed.
  const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
    const SCEV *S = SE.getConstant(Ty, Quantity);
    if (Scalable)
      S = SE.getMulExpr(S, SE.getVScale(S->getType()));
    return S;
  }

  const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
    const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
    if (Scalable)
      NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
    return NegS;
  }

  const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
    const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
    if (Scalable)
      SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
    return SU;
  }
};
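
// Illustrative note (not from the original source): a fixed Immediate such as
// Immediate::getFixed(16) expands via getSCEV() to the constant 16, whereas a
// scalable one such as Immediate::getScalable(16) expands to (16 * vscale).
// The *Unsigned helpers only combine immediates of the same kind (or zero),
// e.g. getScalable(8).addUnsigned(getScalable(16)) yields a scalable 24.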

// This is needed for the Compare type of std::map when Immediate is used
// as a key. We don't need it to be fully correct against any value of vscale,
// just to make sure that vscale-related terms in the map are considered against
// each other rather than being mixed up and potentially missing opportunities.
struct KeyOrderTargetImmediate {
  bool operator()(const Immediate &LHS, const Immediate &RHS) const {
    if (LHS.isScalable() && !RHS.isScalable())
      return false;
    if (!LHS.isScalable() && RHS.isScalable())
      return true;
    return LHS.getKnownMinValue() < RHS.getKnownMinValue();
  }
};

// This would be nicer if we could be generic instead of directly using size_t,
// but there doesn't seem to be a type trait for is_orderable or
// is_lessthan_comparable or similar.
struct KeyOrderSizeTAndImmediate {
  bool operator()(const std::pair<size_t, Immediate> &LHS,
                  const std::pair<size_t, Immediate> &RHS) const {
    size_t LSize = LHS.first;
    size_t RSize = RHS.first;
    if (LSize != RSize)
      return LSize < RSize;
    return KeyOrderTargetImmediate()(LHS.second, RHS.second);
  }
};
} // end anonymous namespace

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

LLVM_DUMP_METHOD void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

/// Map register candidates to information about how they are used.
class RegUseTracker {
  using RegUsesTy = DenseMap<const SCEV *, RegSortData>;

  RegUsesTy RegUsesMap;
  SmallVector<const SCEV *, 16> RegSequence;

public:
  void countRegister(const SCEV *Reg, size_t LUIdx);
  void dropRegister(const SCEV *Reg, size_t LUIdx);
  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;

  void clear();

  using iterator = SmallVectorImpl<const SCEV *>::iterator;
  using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;

  iterator begin() { return RegSequence.begin(); }
  iterator end()   { return RegSequence.end(); }
  const_iterator begin() const { return RegSequence.begin(); }
  const_iterator end() const   { return RegSequence.end(); }
};

} // end anonymous namespace

void
RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
  RegSortData &RSD = Pair.first->second;
  if (Pair.second)
    RegSequence.push_back(Reg);
  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
  RSD.UsedByIndices.set(LUIdx);
}

void
RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
  RegUsesTy::iterator It = RegUsesMap.find(Reg);
  assert(It != RegUsesMap.end());
  RegSortData &RSD = It->second;
  assert(RSD.UsedByIndices.size() > LUIdx);
  RSD.UsedByIndices.reset(LUIdx);
}

void
RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
  assert(LUIdx <= LastLUIdx);

  // Update RegUses. The data structure is not optimized for this purpose;
  // we must iterate through it and update each of the bit vectors.
  for (auto &Pair : RegUsesMap) {
    SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
    if (LUIdx < UsedByIndices.size())
      UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
  }
}

bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  if (I == RegUsesMap.end())
    return false;
  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
  int i = UsedByIndices.find_first();
  if (i == -1) return false;
  if ((size_t)i != LUIdx) return true;
  return UsedByIndices.find_next(i) != -1;
}

const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  assert(I != RegUsesMap.end() && "Unknown register!");
  return I->second.UsedByIndices;
}

void RegUseTracker::clear() {
  RegUsesMap.clear();
  RegSequence.clear();
}

namespace {

/// This class holds information that describes a formula for computing a value
/// that satisfies a use. It may include broken-out immediates and scaled
/// registers.
struct Formula {
  /// Global base address used for complex addressing.
  GlobalValue *BaseGV = nullptr;

  /// Base offset for complex addressing.
  Immediate BaseOffset = Immediate::getZero();

  /// Whether any complex addressing has a base register.
  bool HasBaseReg = false;

  /// The scale of any complex addressing.
  int64_t Scale = 0;

  /// The list of "base" registers for this use. When this is non-empty, the
  /// canonical representation of a formula is:
  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
  /// 3. The reg containing the recurrent expr related to the current loop in
  ///    the formula should be put in the ScaledReg.
  /// #1 enforces that the scaled register is always used when at least two
  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
  /// #2 enforces that 1 * reg is reg.
  /// #3 ensures invariant regs with respect to the current loop can be
  ///    combined together in LSR codegen.
  /// This invariant can be temporarily broken while building a formula.
  /// However, every formula inserted into the LSRInstance must be in canonical
  /// form.
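  ///
  /// Illustrative example (not part of the original comment): for a loop L
  /// with IV {0,+,1}<L> and a loop-invariant register %n, the canonical form
  /// of "%n + {0,+,1}<L>" keeps %n in BaseRegs and puts {0,+,1}<L> in
  /// ScaledReg with Scale = 1, per rules #1 and #3 above.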
  SmallVector<const SCEV *, 4> BaseRegs;

  /// The 'scaled' register for this use. This should be non-null when Scale is
  /// not zero.
  const SCEV *ScaledReg = nullptr;

  /// An additional constant offset which is added near the use. This requires
  /// a temporary register, but the offset itself can live in an add immediate
  /// field rather than a register.
  Immediate UnfoldedOffset = Immediate::getZero();

  Formula() = default;

  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  bool isCanonical(const Loop &L) const;

  void canonicalize(const Loop &L);

  bool unscale();

  bool hasZeroEnd() const;

  bool countsDownToZero() const;

  size_t getNumRegs() const;
  Type *getType() const;

  void deleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace

/// Recursion helper for initialMatch.
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (const SCEV *S : Add->operands())
      DoInitialMatch(S, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands.
  const SCEV *Start, *Step;
  const Loop *ARLoop;
  if (match(S,
            m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
      !Start->isZero()) {
    DoInitialMatch(Start, L, Good, Bad, SE);
    DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
                                    // FIXME: AR->getNoWrapFlags()
                                    ARLoop, SCEV::FlagAnyWrap),
                   L, Good, Bad, SE);
    return;
  }

  // Handle a multiplication by -1 (negation) if it didn't fold.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    if (Mul->getOperand(0)->isAllOnesValue()) {
      SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
      const SCEV *NewMul = SE.getMulExpr(Ops);

      SmallVector<const SCEV *, 4> MyGood;
      SmallVector<const SCEV *, 4> MyBad;
      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
        SE.getEffectiveSCEVType(NewMul->getType())));
      for (const SCEV *S : MyGood)
        Good.push_back(SE.getMulExpr(NegOne, S));
      for (const SCEV *S : MyBad)
        Bad.push_back(SE.getMulExpr(NegOne, S));
      return;
    }

  // Ok, we can't do anything interesting. Just stuff the whole thing into a
  // register and hope for the best.
  Bad.push_back(S);
}

/// Incorporate loop-variant parts of S into this Formula, attempting to keep
/// all loop-invariant and loop-computable values in a single base register.
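///
/// Illustrative example (not part of the original comment): for
/// S = {%base,+,4}<L> where %base dominates the loop header, DoInitialMatch
/// splits S into %base (loop-invariant, "Good") and {0,+,4}<L> ("Bad"), so the
/// resulting formula ends up as reg(%base) + 1*reg({0,+,4}<L>) after
/// canonicalization.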
void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  SmallVector<const SCEV *, 4> Good;
  SmallVector<const SCEV *, 4> Bad;
  DoInitialMatch(S, L, Good, Bad, SE);
  if (!Good.empty()) {
    const SCEV *Sum = SE.getAddExpr(Good);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  if (!Bad.empty()) {
    const SCEV *Sum = SE.getAddExpr(Bad);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  canonicalize(*L);
}

static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
  return SCEVExprContains(S, [&L](const SCEV *S) {
    return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
  });
}

/// Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical(const Loop &L) const {
  assert((Scale == 0 || ScaledReg) &&
         "ScaledReg must be non-null if Scale is non-zero");

  if (!ScaledReg)
    return BaseRegs.size() <= 1;

  if (Scale != 1)
    return true;

  if (Scale == 1 && BaseRegs.empty())
    return false;

  if (containsAddRecDependentOnLoop(ScaledReg, L))
    return true;

  // If ScaledReg is not a recurrent expr, or it is but its loop is not the
  // current loop, while BaseRegs contains a recurrent expr reg related to the
  // current loop, we want to swap that reg in BaseRegs with ScaledReg.
  return none_of(BaseRegs, [&L](const SCEV *S) {
    return containsAddRecDependentOnLoop(S, L);
  });
}

/// Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register must use the ScaledReg
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
void Formula::canonicalize(const Loop &L) {
  if (isCanonical(L))
    return;

  if (BaseRegs.empty()) {
    // No base reg? Use the scaled reg (with scale = 1) as the base reg.
    assert(ScaledReg && "Expected 1*reg => reg");
    assert(Scale == 1 && "Expected 1*reg => reg");
    BaseRegs.push_back(ScaledReg);
    Scale = 0;
    ScaledReg = nullptr;
    return;
  }

  // Keep the invariant sum in BaseRegs and one of the variant parts in
  // ScaledReg.
  if (!ScaledReg) {
    ScaledReg = BaseRegs.pop_back_val();
    Scale = 1;
  }

  // If ScaledReg is an invariant with respect to L, find the reg from
  // BaseRegs containing the recurrent expr related to Loop L. Swap that
  // reg with ScaledReg.
  if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
    auto I = find_if(BaseRegs, [&L](const SCEV *S) {
      return containsAddRecDependentOnLoop(S, L);
    });
    if (I != BaseRegs.end())
      std::swap(ScaledReg, *I);
  }
  assert(isCanonical(L) && "Failed to canonicalize?");
}

/// Get rid of the scale in the formula.
/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
bool Formula::unscale() {
  if (Scale != 1)
    return false;
  Scale = 0;
  BaseRegs.push_back(ScaledReg);
  ScaledReg = nullptr;
  return true;
}

bool Formula::hasZeroEnd() const {
  if (UnfoldedOffset || BaseOffset)
    return false;
  if (BaseRegs.size() != 1 || ScaledReg)
    return false;
  return true;
}

bool Formula::countsDownToZero() const {
  if (!hasZeroEnd())
    return false;
  assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
  const APInt *StepInt;
  if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
    return false;
  return StepInt->isNegative();
}

/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
  return !!ScaledReg + BaseRegs.size();
}

/// Return the type of this formula, if it has one, or null otherwise. This type
/// is meaningless except for the bit size.
Type *Formula::getType() const {
  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
         ScaledReg ? ScaledReg->getType() :
         BaseGV ? BaseGV->getType() :
         nullptr;
}

/// Delete the given base reg from the BaseRegs list.
void Formula::deleteBaseReg(const SCEV *&S) {
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}

/// Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
  return S == ScaledReg || is_contained(BaseRegs, S);
}

/// Test whether this formula uses registers which are used by uses other than
/// the use with the given index.
bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                         const RegUseTracker &RegUses) const {
  if (ScaledReg)
    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
      return true;
  for (const SCEV *BaseReg : BaseRegs)
    if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
      return true;
  return false;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::print(raw_ostream &OS) const {
  bool First = true;
  if (BaseGV) {
    if (!First) OS << " + "; else First = false;
    BaseGV->printAsOperand(OS, /*PrintType=*/false);
  }
  if (BaseOffset.isNonZero()) {
    if (!First) OS << " + "; else First = false;
    OS << BaseOffset;
  }
  for (const SCEV *BaseReg : BaseRegs) {
    if (!First) OS << " + "; else First = false;
    OS << "reg(" << *BaseReg << ')';
  }
  if (HasBaseReg && BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: HasBaseReg**";
  } else if (!HasBaseReg && !BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: !HasBaseReg**";
  }
  if (Scale != 0) {
    if (!First) OS << " + "; else First = false;
    OS << Scale << "*reg(";
    if (ScaledReg)
      OS << *ScaledReg;
    else
      OS << "<unknown>";
    OS << ')';
  }
  if (UnfoldedOffset.isNonZero()) {
    if (!First) OS << " + ";
    OS << "imm(" << UnfoldedOffset << ')';
  }
}

LLVM_DUMP_METHOD void Formula::dump() const {
  print(errs()); errs() << '\n';
}
#endif

/// Return true if the given addrec can be sign-extended without changing its
/// value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
}

/// Return true if the given add can be sign-extended without changing its
/// value.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
}

/// Return true if the given mul can be sign-extended without changing its
/// value.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(),
                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
}

/// Return an expression for LHS /s RHS, if it can be determined and if the
/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
/// the multiplication may overflow, which is useful when the result will be
/// used in a context where the most significant bits are ignored.
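///
/// Illustrative example (not part of the original comment): dividing the
/// addrec {0,+,4}<L> by the constant 4 yields {0,+,1}<L>, since both the start
/// and the step divide exactly; if any operand does not divide exactly, the
/// function returns null.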
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
                                ScalarEvolution &SE,
                                bool IgnoreSignificantBits = false) {
  // Handle the trivial case, which works for any SCEV type.
  if (LHS == RHS)
    return SE.getConstant(LHS->getType(), 1);

  // Handle a few RHS special cases.
  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
  if (RC) {
    const APInt &RA = RC->getAPInt();
    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    // some folding.
    if (RA.isAllOnes()) {
      if (LHS->getType()->isPointerTy())
        return nullptr;
      return SE.getMulExpr(LHS, RC);
    }
    // Handle x /s 1 as x.
    if (RA == 1)
      return LHS;
  }

  // Check for a division of a constant by a constant.
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    if (!RC)
      return nullptr;
    const APInt &LA = C->getAPInt();
    const APInt &RA = RC->getAPInt();
    if (LA.srem(RA) != 0)
      return nullptr;
    return SE.getConstant(LA.sdiv(RA));
  }

  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                      IgnoreSignificantBits);
      if (!Step) return nullptr;
      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
                                       IgnoreSignificantBits);
      if (!Start) return nullptr;
      // FlagNW is independent of the start value, step direction, and is
      // preserved with smaller magnitude steps.
      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
    }
    return nullptr;
  }

  // Distribute the sdiv over add operands, if the add doesn't overflow.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
      SmallVector<const SCEV *, 8> Ops;
      for (const SCEV *S : Add->operands()) {
        const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
        if (!Op) return nullptr;
        Ops.push_back(Op);
      }
      return SE.getAddExpr(Ops);
    }
    return nullptr;
  }

  // Check for a multiply operand that we can pull RHS out of.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
      // Handle special case C1*X*Y /s C2*X*Y.
      if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
        if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
          const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
          const SCEVConstant *RC =
              dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
          if (LC && RC) {
            SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
            SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
            if (LOps == ROps)
              return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
          }
        }
      }

      SmallVector<const SCEV *, 4> Ops;
      bool Found = false;
      for (const SCEV *S : Mul->operands()) {
        if (!Found)
          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
                                           IgnoreSignificantBits)) {
            S = Q;
            Found = true;
          }
        Ops.push_back(S);
      }
      return Found ? SE.getMulExpr(Ops) : nullptr;
    }
    return nullptr;
  }

  // Otherwise we don't know.
  return nullptr;
}

/// If S involves the addition of a constant integer value, return that integer
/// value, and mutate S to point to a new SCEV with that value excluded.
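///
/// Illustrative example (not part of the original comment): given
/// S = (4 + %x), this returns Immediate::getFixed(4) and leaves S pointing at
/// %x; with vscale immediates enabled, S = (8 * vscale) returns
/// Immediate::getScalable(8) and leaves S as 0.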
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
  const APInt *C;
  if (match(S, m_scev_APInt(C))) {
    if (C->getSignificantBits() <= 64) {
      S = SE.getConstant(S->getType(), 0);
      return Immediate::getFixed(C->getSExtValue());
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->operands());
    Immediate Result = ExtractImmediate(NewOps.front(), SE);
    if (Result.isNonZero())
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->operands());
    Immediate Result = ExtractImmediate(NewOps.front(), SE);
    if (Result.isNonZero())
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  } else if (EnableVScaleImmediates &&
             match(S, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale()))) {
    S = SE.getConstant(S->getType(), 0);
    return Immediate::getScalable(C->getSExtValue());
  }
  return Immediate::getZero();
}

/// If S involves the addition of a GlobalValue address, return that symbol, and
/// mutate S to point to a new SCEV with that value excluded.
static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
      S = SE.getConstant(GV->getType(), 0);
      return GV;
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->operands());
    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    if (Result)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->operands());
    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    if (Result)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  }
  return nullptr;
}

/// Returns true if the specified instruction is using the specified value as an
/// address.
static bool isAddressUse(const TargetTransformInfo &TTI,
                         Instruction *Inst, Value *OperandVal) {
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    if (SI->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    case Intrinsic::memset:
    case Intrinsic::prefetch:
    case Intrinsic::masked_load:
      if (II->getArgOperand(0) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::masked_store:
      if (II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      if (II->getArgOperand(0) == OperandVal ||
          II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    default: {
      MemIntrinsicInfo IntrInfo;
      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
        if (IntrInfo.PtrVal == OperandVal)
          isAddress = true;
      }
    }
    }
  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
    if (RMW->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    if (CmpX->getPointerOperand() == OperandVal)
      isAddress = true;
  }
  return isAddress;
}

/// Return the type of the memory being accessed.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
                                 Instruction *Inst, Value *OperandVal) {
  MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());

  // First get the type of memory being accessed.
  if (Type *Ty = Inst->getAccessType())
    AccessTy.MemTy = Ty;

  // Then get the pointer address space.
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    AccessTy.AddrSpace = SI->getPointerAddressSpace();
  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
    AccessTy.AddrSpace = LI->getPointerAddressSpace();
  } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
    AccessTy.AddrSpace = RMW->getPointerAddressSpace();
  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::prefetch:
    case Intrinsic::memset:
      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
      AccessTy.MemTy = OperandVal->getType();
      break;
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
      AccessTy.MemTy = OperandVal->getType();
      break;
    case Intrinsic::masked_load:
      AccessTy.AddrSpace =
          II->getArgOperand(0)->getType()->getPointerAddressSpace();
      break;
    case Intrinsic::masked_store:
      AccessTy.AddrSpace =
          II->getArgOperand(1)->getType()->getPointerAddressSpace();
      break;
    default: {
      MemIntrinsicInfo IntrInfo;
      if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
        AccessTy.AddrSpace
          = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
      }

      break;
    }
    }
  }

  return AccessTy;
}

/// Return true if this AddRec is already a phi in its loop.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
    if (SE.isSCEVable(PN.getType()) &&
        (SE.getEffectiveSCEVType(PN.getType()) ==
         SE.getEffectiveSCEVType(AR->getType())) &&
        SE.getSCEV(&PN) == AR)
      return true;
  }
  return false;
}

/// Check if expanding this expression is likely to incur significant cost. This
/// is tricky because SCEV doesn't track which expressions are actually computed
/// by the current IR.
///
/// We currently allow expansion of IV increments that involve adds,
/// multiplication by constants, and AddRecs from existing phis.
///
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
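///
/// Illustrative example (not part of the original comment): an AddRec such as
/// {0,+,%step}<L> that already exists as a header phi is considered cheap,
/// whereas an expression containing a udiv that has no existing computation in
/// the IR is treated as high cost.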
static bool isHighCostExpansion(const SCEV *S,
                                SmallPtrSetImpl<const SCEV*> &Processed,
                                ScalarEvolution &SE) {
  // Zero/One operand expressions
  switch (S->getSCEVType()) {
  case scUnknown:
  case scConstant:
  case scVScale:
    return false;
  case scTruncate:
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
                               Processed, SE);
  case scZeroExtend:
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
                               Processed, SE);
  case scSignExtend:
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
                               Processed, SE);
  default:
    break;
  }

  if (!Processed.insert(S).second)
    return false;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (const SCEV *S : Add->operands()) {
      if (isHighCostExpansion(S, Processed, SE))
        return true;
    }
    return false;
  }

  const SCEV *Op0, *Op1;
  if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
    // Multiplication by a constant is ok
    if (isa<SCEVConstant>(Op0))
      return isHighCostExpansion(Op1, Processed, SE);

    // If we have the value of one operand, check if an existing
    // multiplication already generates this expression.
    if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
      Value *UVal = U->getValue();
      for (User *UR : UVal->users()) {
        // If U is a constant, it may be used by a ConstantExpr.
        Instruction *UI = dyn_cast<Instruction>(UR);
        if (UI && UI->getOpcode() == Instruction::Mul &&
            SE.isSCEVable(UI->getType())) {
          return SE.getSCEV(UI) == S;
        }
      }
    }
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    if (isExistingPhi(AR, SE))
      return false;
  }

  // For now, consider any other type of expression (div/mul/min/max) high cost.
  return true;
}

namespace {

class LSRUse;

} // end anonymous namespace

/// Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
/// defines and up to 1 base + 1 scaled + offset.
/// In other words, if \p F has several base registers, this function may
/// still return true. Therefore, users still need to account for
/// additional base registers and/or unfolded offsets to derive an
/// accurate cost model.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 const LSRUse &LU, const Formula &F);

// Get the cost of the scaling factor used in F for LU.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
                                            const LSRUse &LU, const Formula &F,
                                            const Loop &L);

namespace {

/// This class is used to measure and compare candidate formulae.
class Cost {
  const Loop *L = nullptr;
  ScalarEvolution *SE = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  TargetTransformInfo::LSRCost C;
  TTI::AddressingModeKind AMK = TTI::AMK_None;

public:
  Cost() = delete;
  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
       TTI::AddressingModeKind AMK) :
    L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
    C.Insns = 0;
    C.NumRegs = 0;
    C.AddRecCost = 0;
    C.NumIVMuls = 0;
    C.NumBaseAdds = 0;
    C.ImmCost = 0;
    C.SetupCost = 0;
    C.ScaleCost = 0;
  }

  bool isLess(const Cost &Other) const;

  void Lose();

#ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  bool isValid() {
    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
  }
#endif

  bool isLoser() {
    assert(isValid() && "invalid cost");
    return C.NumRegs == ~0u;
  }

  void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                   const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
                   bool HardwareLoopProfitable,
                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  void RateRegister(const Formula &F, const SCEV *Reg,
                    SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
                    bool HardwareLoopProfitable);
  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                           SmallPtrSetImpl<const SCEV *> &Regs,
                           const LSRUse &LU, bool HardwareLoopProfitable,
                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
};

/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// The instruction which will be updated.
  Instruction *UserInst = nullptr;

  /// The operand of the instruction which will be replaced. The operand may be
  /// used more than once; every instance will be replaced.
  Value *OperandValToReplace = nullptr;

  /// If this user is to use the post-incremented value of an induction
  /// variable, this set is non-empty and holds the loops associated with the
  /// induction variable.
  PostIncLoopSet PostIncLoops;

  /// A constant offset to be added to the LSRUse expression.  This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
  Immediate Offset = Immediate::getZero();

  LSRFixup() = default;

  bool isUseFullyOutsideLoop(const Loop *L) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

/// This class holds the state that LSR keeps for each use in IVUsers, as well
/// as uses invented by LSR itself. It includes information about what kinds of
/// things can be folded into the user, information about the user itself, and
/// information about how the use may be satisfied.  TODO: Represent multiple
/// users of the same expression in common?
class LSRUse {
  DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;

public:
  /// An enum for a kind of use, indicating what types of scaled and immediate
  /// operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;

  KindType Kind;
  MemAccessTy AccessTy;

  /// The list of operands which are to be replaced.
  SmallVector<LSRFixup, 8> Fixups;

  /// Keep track of the min and max offsets of the fixups.
  Immediate MinOffset = Immediate::getFixedMax();
  Immediate MaxOffset = Immediate::getFixedMin();

  /// This records whether all of the fixups using this LSRUse are outside of
  /// the loop, in which case some special-case heuristics may be used.
  bool AllFixupsOutsideLoop = true;

  /// RigidFormula is set to true to guarantee that this use will be associated
  /// with a single formula--the one that initially matched. Some SCEV
  /// expressions cannot be expanded. This allows LSR to consider the registers
  /// used by those expressions without the need to expand them later after
  /// changing the formula.
  bool RigidFormula = false;

  /// This records the widest use type for any fixup using this
  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
  /// fixup widths to be equivalent, because the narrower one may be relying on
  /// the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType = nullptr;

  /// A list of ways to build a value that can satisfy this user.  After the
  /// list is populated, one of these is selected heuristically and used to
  /// formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}

  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  void pushFixup(LSRFixup &f) {
    Fixups.push_back(f);
    if (Immediate::isKnownGT(f.Offset, MaxOffset))
      MaxOffset = f.Offset;
    if (Immediate::isKnownLT(f.Offset, MinOffset))
      MinOffset = f.Offset;
  }

  bool HasFormulaWithSameRegs(const Formula &F) const;
  float getNotSelectedProbability(const SCEV *Reg) const;
  bool InsertFormula(const Formula &F, const Loop &L);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace

static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale,
                                 Instruction *Fixup = nullptr);

static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
  if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
    return 1;
  if (Depth == 0)
    return 0;
  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
    return getSetupCost(S->getStart(), Depth - 1);
  if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
    return getSetupCost(S->getOperand(), Depth - 1);
  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
    return std::accumulate(S->operands().begin(), S->operands().end(), 0,
                           [&](unsigned i, const SCEV *Reg) {
                             return i + getSetupCost(Reg, Depth - 1);
                           });
  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
    return getSetupCost(S->getLHS(), Depth - 1) +
           getSetupCost(S->getRHS(), Depth - 1);
  return 0;
}
1396 
1397 /// Tally up interesting quantities from the given register.
1398 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1399                         SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1400                         bool HardwareLoopProfitable) {
1401   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1402     // If this is an addrec for another loop, it should be an invariant
1403     // with respect to L since L is the innermost loop (at least
1404     // for now LSR only handles innermost loops).
1405     if (AR->getLoop() != L) {
1406       // If the AddRec exists, consider it's register free and leave it alone.
1407       if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1408         return;
1409 
1410       // It is bad to allow LSR for the current loop to add induction variables
1411       // for its sibling loops.
1412       if (!AR->getLoop()->contains(L)) {
1413         Lose();
1414         return;
1415       }
1416 
1417       // Otherwise, it will be an invariant with respect to Loop L.
1418       ++C.NumRegs;
1419       return;
1420     }
1421 
1422     unsigned LoopCost = 1;
1423     if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1424         TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1425       const SCEV *Start;
1426       const SCEVConstant *Step;
1427       if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
1428         // If the step size matches the base offset, we could use pre-indexed
1429         // addressing.
1430         if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
1431              Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
1432             (AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
1433              SE->isLoopInvariant(Start, L)))
1434           LoopCost = 0;
1435     }
1436     // If the loop counts down to zero and we'll be using a hardware loop then
1437     // the addrec will be combined into the hardware loop instruction.
1438     if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1439         HardwareLoopProfitable)
1440       LoopCost = 0;
1441     C.AddRecCost += LoopCost;
1442 
1443     // Add the step value register, if it needs one.
1444     // TODO: The non-affine case isn't precisely modeled here.
1445     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1446       if (!Regs.count(AR->getOperand(1))) {
1447         RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1448         if (isLoser())
1449           return;
1450       }
1451     }
1452   }
1453   ++C.NumRegs;
1454 
1455   // Rough heuristic; favor registers which don't require extra setup
1456   // instructions in the preheader.
1457   C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1458   // Ensure we don't, even with the recursion limit, produce invalid costs.
1459   C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1460 
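       // A register that is a multiply with a computable evolution in this loop
       // counts as an IV multiplication.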
1461   C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1462                SE->hasComputableLoopEvolution(Reg, L);
1463 }
1464 
1465 /// Record this register in the set. If we haven't seen it before, rate
1466 /// it. Optional LoserRegs provides a way to declare any formula that refers to
1467 /// one of those regs an instant loser.
1468 void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1469                                SmallPtrSetImpl<const SCEV *> &Regs,
1470                                const LSRUse &LU, bool HardwareLoopProfitable,
1471                                SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1472   if (LoserRegs && LoserRegs->count(Reg)) {
1473     Lose();
1474     return;
1475   }
1476   if (Regs.insert(Reg).second) {
1477     RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1478     if (LoserRegs && isLoser())
1479       LoserRegs->insert(Reg);
1480   }
1481 }
1482 
1483 void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1484                        const DenseSet<const SCEV *> &VisitedRegs,
1485                        const LSRUse &LU, bool HardwareLoopProfitable,
1486                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1487   if (isLoser())
1488     return;
1489   assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1490   // Tally up the registers.
1491   unsigned PrevAddRecCost = C.AddRecCost;
1492   unsigned PrevNumRegs = C.NumRegs;
1493   unsigned PrevNumBaseAdds = C.NumBaseAdds;
1494   if (const SCEV *ScaledReg = F.ScaledReg) {
1495     if (VisitedRegs.count(ScaledReg)) {
1496       Lose();
1497       return;
1498     }
1499     RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1500                         LoserRegs);
1501     if (isLoser())
1502       return;
1503   }
1504   for (const SCEV *BaseReg : F.BaseRegs) {
1505     if (VisitedRegs.count(BaseReg)) {
1506       Lose();
1507       return;
1508     }
1509     RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1510                         LoserRegs);
1511     if (isLoser())
1512       return;
1513   }
1514 
1515   // Determine how many (unfolded) adds we'll need inside the loop.
1516   size_t NumBaseParts = F.getNumRegs();
1517   if (NumBaseParts > 1)
1518     // Do not count the base and a possible second register if the target
1519     // allows folding 2 registers.
1520     C.NumBaseAdds +=
1521         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
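       // A non-zero unfolded offset requires one extra add to apply it.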
1522   C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1523 
1524   // Accumulate non-free scaling amounts.
1525   C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1526 
1527   // Tally up the non-zero immediates.
1528   for (const LSRFixup &Fixup : LU.Fixups) {
1529     if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1530       Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1531       if (F.BaseGV)
1532         C.ImmCost += 64; // Handle symbolic values conservatively.
1533                          // TODO: This should probably be the pointer size.
1534       else if (Offset.isNonZero())
1535         C.ImmCost +=
1536             APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1537 
1538       // Check with target if this offset with this instruction is
1539       // specifically not supported.
1540       if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1541           !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1542                                 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1543         C.NumBaseAdds++;
1544     } else {
1545       // Incompatible immediate type; increase the cost to avoid using it.
1546       C.ImmCost += 2048;
1547     }
1548   }
1549 
1550   // If we aren't counting instruction costs, exit here.
1551   if (!InsnsCost) {
1552     assert(isValid() && "invalid cost");
1553     return;
1554   }
1555 
1556   // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
1557   // additional instruction (at least a fill).
1558   // TODO: Do we need to distinguish between register classes?
1559   unsigned TTIRegNum = TTI->getNumberOfRegisters(
1560                        TTI->getRegisterClassForType(false, F.getType())) - 1;
1561   if (C.NumRegs > TTIRegNum) {
1562     // If the cost already exceeded TTIRegNum, then only the newly added
1563     // registers can add new instructions.
1564     if (PrevNumRegs > TTIRegNum)
1565       C.Insns += (C.NumRegs - PrevNumRegs);
1566     else
1567       C.Insns += (C.NumRegs - TTIRegNum);
1568   }
1569 
1570   // If an ICmpZero formula does not end in 0, it cannot be replaced by just an
1571   // add or sub; we'll need to compare the final result of the AddRec.
1572   // That means we'll need an additional instruction. But if the target can
1573   // macro-fuse a compare with a branch, don't count this extra instruction.
1574   // For -10 + {0, +, 1}:
1575   // i = i + 1;
1576   // cmp i, 10
1577   //
1578   // For {-10, +, 1}:
1579   // i = i + 1;
1580   if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1581       !TTI->canMacroFuseCmp())
1582     C.Insns++;
1583   // Each new AddRec adds 1 instruction to the calculation.
1584   C.Insns += (C.AddRecCost - PrevAddRecCost);
1585 
1586   // BaseAdds adds instructions for unfolded registers.
1587   if (LU.Kind != LSRUse::ICmpZero)
1588     C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1589   assert(isValid() && "invalid cost");
1590 }
1591 
1592 /// Set this cost to a losing value.
1593 void Cost::Lose() {
1594   C.Insns = std::numeric_limits<unsigned>::max();
1595   C.NumRegs = std::numeric_limits<unsigned>::max();
1596   C.AddRecCost = std::numeric_limits<unsigned>::max();
1597   C.NumIVMuls = std::numeric_limits<unsigned>::max();
1598   C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1599   C.ImmCost = std::numeric_limits<unsigned>::max();
1600   C.SetupCost = std::numeric_limits<unsigned>::max();
1601   C.ScaleCost = std::numeric_limits<unsigned>::max();
1602 }
1603 
1604 /// Choose the lower cost.
1605 bool Cost::isLess(const Cost &Other) const {
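       // If instruction-count costing was explicitly enabled, compare instruction
       // counts first; otherwise defer to the target's LSR cost comparison.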
1606   if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1607       C.Insns != Other.C.Insns)
1608     return C.Insns < Other.C.Insns;
1609   return TTI->isLSRCostLess(C, Other.C);
1610 }
1611 
1612 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1613 void Cost::print(raw_ostream &OS) const {
1614   if (InsnsCost)
1615     OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1616   OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1617   if (C.AddRecCost != 0)
1618     OS << ", with addrec cost " << C.AddRecCost;
1619   if (C.NumIVMuls != 0)
1620     OS << ", plus " << C.NumIVMuls << " IV mul"
1621        << (C.NumIVMuls == 1 ? "" : "s");
1622   if (C.NumBaseAdds != 0)
1623     OS << ", plus " << C.NumBaseAdds << " base add"
1624        << (C.NumBaseAdds == 1 ? "" : "s");
1625   if (C.ScaleCost != 0)
1626     OS << ", plus " << C.ScaleCost << " scale cost";
1627   if (C.ImmCost != 0)
1628     OS << ", plus " << C.ImmCost << " imm cost";
1629   if (C.SetupCost != 0)
1630     OS << ", plus " << C.SetupCost << " setup cost";
1631 }
1632 
1633 LLVM_DUMP_METHOD void Cost::dump() const {
1634   print(errs()); errs() << '\n';
1635 }
1636 #endif
1637 
1638 /// Test whether this fixup always uses its value outside of the given loop.
1639 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1640   // PHI nodes use their value in their incoming blocks.
1641   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1642     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1643       if (PN->getIncomingValue(i) == OperandValToReplace &&
1644           L->contains(PN->getIncomingBlock(i)))
1645         return false;
1646     return true;
1647   }
1648 
1649   return !L->contains(UserInst);
1650 }
1651 
1652 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1653 void LSRFixup::print(raw_ostream &OS) const {
1654   OS << "UserInst=";
1655   // Store is common and interesting enough to be worth special-casing.
1656   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1657     OS << "store ";
1658     Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1659   } else if (UserInst->getType()->isVoidTy())
1660     OS << UserInst->getOpcodeName();
1661   else
1662     UserInst->printAsOperand(OS, /*PrintType=*/false);
1663 
1664   OS << ", OperandValToReplace=";
1665   OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1666 
1667   for (const Loop *PIL : PostIncLoops) {
1668     OS << ", PostIncLoop=";
1669     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1670   }
1671 
1672   if (Offset.isNonZero())
1673     OS << ", Offset=" << Offset;
1674 }
1675 
1676 LLVM_DUMP_METHOD void LSRFixup::dump() const {
1677   print(errs()); errs() << '\n';
1678 }
1679 #endif
1680 
1681 /// Test whether this use has a formula with the same registers as the given
1682 /// formula.
1683 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1684   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1685   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1686   // Unstable sort by host order ok, because this is only used for uniquifying.
1687   llvm::sort(Key);
1688   return Uniquifier.count(Key);
1689 }
1690 
1691 /// Returns the probability of selecting a formula that does not reference Reg.
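     /// For example, if 1 of 4 formulae references Reg, the result is 3/4 = 0.75.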
1692 float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1693   unsigned FNum = 0;
1694   for (const Formula &F : Formulae)
1695     if (F.referencesReg(Reg))
1696       FNum++;
1697   return ((float)(Formulae.size() - FNum)) / Formulae.size();
1698 }
1699 
1700 /// If the given formula has not yet been inserted, add it to the list, and
1701 /// return true. Return false otherwise.  The formula must be in canonical form.
1702 bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1703   assert(F.isCanonical(L) && "Invalid canonical representation");
1704 
1705   if (!Formulae.empty() && RigidFormula)
1706     return false;
1707 
1708   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1709   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1710   // Unstable sort by host order ok, because this is only used for uniquifying.
1711   llvm::sort(Key);
1712 
1713   if (!Uniquifier.insert(Key).second)
1714     return false;
1715 
1716   // Using a register to hold the value of 0 is not profitable.
1717   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1718          "Zero allocated in a scaled register!");
1719 #ifndef NDEBUG
1720   for (const SCEV *BaseReg : F.BaseRegs)
1721     assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1722 #endif
1723 
1724   // Add the formula to the list.
1725   Formulae.push_back(F);
1726 
1727   // Record registers now being used by this use.
1728   Regs.insert_range(F.BaseRegs);
1729   if (F.ScaledReg)
1730     Regs.insert(F.ScaledReg);
1731 
1732   return true;
1733 }
1734 
1735 /// Remove the given formula from this use's list.
1736 void LSRUse::DeleteFormula(Formula &F) {
1737   if (&F != &Formulae.back())
1738     std::swap(F, Formulae.back());
1739   Formulae.pop_back();
1740 }
1741 
1742 /// Recompute the Regs field, and update RegUses.
1743 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1744   // Now that we've filtered out some formulae, recompute the Regs set.
1745   SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1746   Regs.clear();
1747   for (const Formula &F : Formulae) {
1748     if (F.ScaledReg) Regs.insert(F.ScaledReg);
1749     Regs.insert_range(F.BaseRegs);
1750   }
1751 
1752   // Update the RegTracker.
1753   for (const SCEV *S : OldRegs)
1754     if (!Regs.count(S))
1755       RegUses.dropRegister(S, LUIdx);
1756 }
1757 
1758 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1759 void LSRUse::print(raw_ostream &OS) const {
1760   OS << "LSR Use: Kind=";
1761   switch (Kind) {
1762   case Basic:    OS << "Basic"; break;
1763   case Special:  OS << "Special"; break;
1764   case ICmpZero: OS << "ICmpZero"; break;
1765   case Address:
1766     OS << "Address of ";
1767     if (AccessTy.MemTy->isPointerTy())
1768       OS << "pointer"; // the full pointer type could be really verbose
1769     else {
1770       OS << *AccessTy.MemTy;
1771     }
1772 
1773     OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1774   }
1775 
1776   OS << ", Offsets={";
1777   bool NeedComma = false;
1778   for (const LSRFixup &Fixup : Fixups) {
1779     if (NeedComma) OS << ',';
1780     OS << Fixup.Offset;
1781     NeedComma = true;
1782   }
1783   OS << '}';
1784 
1785   if (AllFixupsOutsideLoop)
1786     OS << ", all-fixups-outside-loop";
1787 
1788   if (WidestFixupType)
1789     OS << ", widest fixup type: " << *WidestFixupType;
1790 }
1791 
1792 LLVM_DUMP_METHOD void LSRUse::dump() const {
1793   print(errs()); errs() << '\n';
1794 }
1795 #endif
1796 
1797 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1798                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
1799                                  GlobalValue *BaseGV, Immediate BaseOffset,
1800                                  bool HasBaseReg, int64_t Scale,
1801                                  Instruction *Fixup /* = nullptr */) {
1802   switch (Kind) {
1803   case LSRUse::Address: {
1804     int64_t FixedOffset =
1805         BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1806     int64_t ScalableOffset =
1807         BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1808     return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1809                                      HasBaseReg, Scale, AccessTy.AddrSpace,
1810                                      Fixup, ScalableOffset);
1811   }
1812   case LSRUse::ICmpZero:
1813     // There's not even a target hook for querying whether it would be legal to
1814     // fold a GV into an ICmp.
1815     if (BaseGV)
1816       return false;
1817 
1818     // ICmp only has two operands; don't allow more than two non-trivial parts.
1819     if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1820       return false;
1821 
1822     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1823     // putting the scaled register in the other operand of the icmp.
1824     if (Scale != 0 && Scale != -1)
1825       return false;
1826 
1827     // If we have low-level target information, ask the target if it can fold an
1828     // integer immediate on an icmp.
1829     if (BaseOffset.isNonZero()) {
1830       // We don't have an interface to query whether the target supports
1831       // icmpzero against scalable quantities yet.
1832       if (BaseOffset.isScalable())
1833         return false;
1834 
1835       // We have one of:
1836       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1837       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1838       // BaseOffset (possibly negated) is the ICmp immediate.
1839       if (Scale == 0)
1840         // The cast does the right thing with
1841         // std::numeric_limits<int64_t>::min().
1842         BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1843       return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1844     }
1845 
1846     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1847     return true;
1848 
1849   case LSRUse::Basic:
1850     // Only handle single-register values.
1851     return !BaseGV && Scale == 0 && BaseOffset.isZero();
1852 
1853   case LSRUse::Special:
1854     // Special case Basic to handle -1 scales.
1855     return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1856   }
1857 
1858   llvm_unreachable("Invalid LSRUse Kind!");
1859 }
1860 
1861 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1862                                  Immediate MinOffset, Immediate MaxOffset,
1863                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
1864                                  GlobalValue *BaseGV, Immediate BaseOffset,
1865                                  bool HasBaseReg, int64_t Scale) {
1866   if (BaseOffset.isNonZero() &&
1867       (BaseOffset.isScalable() != MinOffset.isScalable() ||
1868        BaseOffset.isScalable() != MaxOffset.isScalable()))
1869     return false;
1870   // Check for overflow.
1871   int64_t Base = BaseOffset.getKnownMinValue();
1872   int64_t Min = MinOffset.getKnownMinValue();
1873   int64_t Max = MaxOffset.getKnownMinValue();
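       // The unsigned addition wrapped around iff the sum fails to move in the
       // direction implied by the sign of the addend.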
1874   if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1875     return false;
1876   MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1877   if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1878     return false;
1879   MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1880 
1881   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1882                               HasBaseReg, Scale) &&
1883          isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1884                               HasBaseReg, Scale);
1885 }
1886 
1887 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1888                                  Immediate MinOffset, Immediate MaxOffset,
1889                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
1890                                  const Formula &F, const Loop &L) {
1891   // For the purpose of isAMCompletelyFolded either having a canonical formula
1892   // or a scale not equal to zero is correct.
1893   // Problems may arise from non-canonical formulae having a scale == 0.
1894   // Strictly speaking, it would be best to just rely on canonical formulae.
1895   // However, when we generate the scaled formulae, we first check that the
1896   // scaling factor is profitable before computing the actual ScaledReg, for
1897   // the sake of compile time.
1898   assert((F.isCanonical(L) || F.Scale != 0));
1899   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1900                               F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1901 }
1902 
1903 /// Test whether we know how to expand the current formula.
1904 static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1905                        Immediate MaxOffset, LSRUse::KindType Kind,
1906                        MemAccessTy AccessTy, GlobalValue *BaseGV,
1907                        Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1908   // We know how to expand completely foldable formulae.
1909   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1910                               BaseOffset, HasBaseReg, Scale) ||
1911          // Or formulae that use a base register produced by a sum of base
1912          // registers.
1913          (Scale == 1 &&
1914           isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1915                                BaseGV, BaseOffset, true, 0));
1916 }
1917 
1918 static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1919                        Immediate MaxOffset, LSRUse::KindType Kind,
1920                        MemAccessTy AccessTy, const Formula &F) {
1921   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1922                     F.BaseOffset, F.HasBaseReg, F.Scale);
1923 }
1924 
1925 static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1926                                 Immediate Offset) {
1927   if (Offset.isScalable())
1928     return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1929 
1930   return TTI.isLegalAddImmediate(Offset.getFixedValue());
1931 }
1932 
1933 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1934                                  const LSRUse &LU, const Formula &F) {
1935   // Target may want to look at the user instructions.
1936   if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1937     for (const LSRFixup &Fixup : LU.Fixups)
1938       if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1939                                 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1940                                 F.Scale, Fixup.UserInst))
1941         return false;
1942     return true;
1943   }
1944 
1945   return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1946                               LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1947                               F.Scale);
1948 }
1949 
1950 static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1951                                             const LSRUse &LU, const Formula &F,
1952                                             const Loop &L) {
1953   if (!F.Scale)
1954     return 0;
1955 
1956   // If the use is not completely folded in that instruction, we will have to
1957   // pay an extra cost only for scale != 1.
1958   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1959                             LU.AccessTy, F, L))
1960     return F.Scale != 1;
1961 
1962   switch (LU.Kind) {
1963   case LSRUse::Address: {
1964     // Check the scaling factor cost with both the min and max offsets.
1965     int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1966     if (F.BaseOffset.isScalable()) {
1967       ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1968       ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1969     } else {
1970       FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1971       FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1972     }
1973     InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1974         LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1975         F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1976     InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1977         LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1978         F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1979 
1980     assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1981            "Legal addressing mode has an illegal cost!");
1982     return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1983   }
1984   case LSRUse::ICmpZero:
1985   case LSRUse::Basic:
1986   case LSRUse::Special:
1987     // The use is completely folded, i.e., everything is folded into the
1988     // instruction.
1989     return 0;
1990   }
1991 
1992   llvm_unreachable("Invalid LSRUse Kind!");
1993 }
1994 
1995 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1996                              LSRUse::KindType Kind, MemAccessTy AccessTy,
1997                              GlobalValue *BaseGV, Immediate BaseOffset,
1998                              bool HasBaseReg) {
1999   // Fast-path: zero is always foldable.
2000   if (BaseOffset.isZero() && !BaseGV)
2001     return true;
2002 
2003   // Conservatively, create an address with an immediate and a
2004   // base and a scale.
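       // For ICmpZero, only a -1 scale can be folded (the scaled register ends up
       // on the other side of the compare).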
2005   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2006 
2007   // Canonicalize a scale of 1 to a base register if the formula doesn't
2008   // already have a base register.
2009   if (!HasBaseReg && Scale == 1) {
2010     Scale = 0;
2011     HasBaseReg = true;
2012   }
2013 
2014   // FIXME: Try with + without a scale? Maybe based on TTI?
2015   // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2016   // default for many architectures, not just AArch64 SVE. More investigation
2017   // needed later to determine if this should be used more widely than just
2018   // on scalable types.
2019   if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2020       AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2021     Scale = 0;
2022 
2023   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2024                               HasBaseReg, Scale);
2025 }
2026 
2027 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2028                              ScalarEvolution &SE, Immediate MinOffset,
2029                              Immediate MaxOffset, LSRUse::KindType Kind,
2030                              MemAccessTy AccessTy, const SCEV *S,
2031                              bool HasBaseReg) {
2032   // Fast-path: zero is always foldable.
2033   if (S->isZero()) return true;
2034 
2035   // Conservatively, create an address with an immediate and a
2036   // base and a scale.
2037   Immediate BaseOffset = ExtractImmediate(S, SE);
2038   GlobalValue *BaseGV = ExtractSymbol(S, SE);
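       // ExtractImmediate and ExtractSymbol strip those components out of S,
       // leaving only the remainder behind.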
2039 
2040   // If there's anything else involved, it's not foldable.
2041   if (!S->isZero()) return false;
2042 
2043   // Fast-path: zero is always foldable.
2044   if (BaseOffset.isZero() && !BaseGV)
2045     return true;
2046 
2047   if (BaseOffset.isScalable())
2048     return false;
2049 
2050   // Conservatively, create an address with an immediate and a
2051   // base and a scale.
2052   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2053 
2054   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2055                               BaseOffset, HasBaseReg, Scale);
2056 }
2057 
2058 namespace {
2059 
2060 /// An individual increment in a Chain of IV increments.  Relate an IV user to
2061 /// an expression that computes the IV it uses from the IV used by the previous
2062 /// link in the Chain.
2063 ///
2064 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2065 /// original IVOperand. The head of the chain's IVOperand is only valid during
2066 /// chain collection, before LSR replaces IV users. During chain generation,
2067 /// IncExpr can be used to find the new IVOperand that computes the same
2068 /// expression.
2069 struct IVInc {
2070   Instruction *UserInst;
2071   Value* IVOperand;
2072   const SCEV *IncExpr;
2073 
2074   IVInc(Instruction *U, Value *O, const SCEV *E)
2075       : UserInst(U), IVOperand(O), IncExpr(E) {}
2076 };
2077 
2078 // The list of IV increments in program order.  We typically add the head of a
2079 // chain without finding subsequent links.
2080 struct IVChain {
2081   SmallVector<IVInc, 1> Incs;
2082   const SCEV *ExprBase = nullptr;
2083 
2084   IVChain() = default;
2085   IVChain(const IVInc &Head, const SCEV *Base)
2086       : Incs(1, Head), ExprBase(Base) {}
2087 
2088   using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2089 
2090   // Return the first increment in the chain.
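       // Incs[0] is the chain head and is not itself an increment, so skip it.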
2091   const_iterator begin() const {
2092     assert(!Incs.empty());
2093     return std::next(Incs.begin());
2094   }
2095   const_iterator end() const {
2096     return Incs.end();
2097   }
2098 
2099   // Returns true if this chain contains any increments.
2100   bool hasIncs() const { return Incs.size() >= 2; }
2101 
2102   // Add an IVInc to the end of this chain.
2103   void add(const IVInc &X) { Incs.push_back(X); }
2104 
2105   // Returns the last UserInst in the chain.
2106   Instruction *tailUserInst() const { return Incs.back().UserInst; }
2107 
2108   // Returns true if IncExpr can be profitably added to this chain.
2109   bool isProfitableIncrement(const SCEV *OperExpr,
2110                              const SCEV *IncExpr,
2111                              ScalarEvolution&);
2112 };
2113 
2114 /// Helper for CollectChains to track multiple IV increment uses.  Distinguish
2115 /// between FarUsers that definitely cross IV increments and NearUsers that may
2116 /// be used between IV increments.
2117 struct ChainUsers {
2118   SmallPtrSet<Instruction*, 4> FarUsers;
2119   SmallPtrSet<Instruction*, 4> NearUsers;
2120 };
2121 
2122 /// This class holds state for the main loop strength reduction logic.
2123 class LSRInstance {
2124   IVUsers &IU;
2125   ScalarEvolution &SE;
2126   DominatorTree &DT;
2127   LoopInfo &LI;
2128   AssumptionCache &AC;
2129   TargetLibraryInfo &TLI;
2130   const TargetTransformInfo &TTI;
2131   Loop *const L;
2132   MemorySSAUpdater *MSSAU;
2133   TTI::AddressingModeKind AMK;
2134   mutable SCEVExpander Rewriter;
2135   bool Changed = false;
2136   bool HardwareLoopProfitable = false;
2137 
2138   /// This is the insert position at which the current loop's induction variable
2139   /// increment should be placed. In simple loops, this is the latch block's
2140   /// terminator. But in more complicated cases, this is a position which will
2141   /// dominate all the in-loop post-increment users.
2142   Instruction *IVIncInsertPos = nullptr;
2143 
2144   /// Interesting factors between use strides.
2145   ///
2146   /// We explicitly use a SetVector which contains a SmallSet, instead of the
2147   /// default, a SmallDenseSet, because we need to use the full range of
2148   /// int64_ts, and there's currently no good way of doing that with
2149   /// SmallDenseSet.
2150   SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2151 
2152   /// The cost of the current SCEV; the best solution found by LSR will be
2153   /// dropped if it is not profitable.
2154   Cost BaselineCost;
2155 
2156   /// Interesting use types, to facilitate truncation reuse.
2157   SmallSetVector<Type *, 4> Types;
2158 
2159   /// The list of interesting uses.
2160   mutable SmallVector<LSRUse, 16> Uses;
2161 
2162   /// Track which uses use which register candidates.
2163   RegUseTracker RegUses;
2164 
2165   // Limit the number of chains to avoid quadratic behavior. We don't expect to
2166   // have more than a few IV increment chains in a loop. Missing a Chain falls
2167   // back to normal LSR behavior for those uses.
2168   static const unsigned MaxChains = 8;
2169 
2170   /// IV users can form a chain of IV increments.
2171   SmallVector<IVChain, MaxChains> IVChainVec;
2172 
2173   /// IV users that belong to profitable IVChains.
2174   SmallPtrSet<Use*, MaxChains> IVIncSet;
2175 
2176   /// Induction variables that were generated and inserted by the SCEV Expander.
2177   SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2178 
2179   // Inserting instructions in the loop and using them as a PHI's input could
2180   // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2181   // corresponding incoming block is not loop-exiting). So collect all such
2182   // instructions to form LCSSA for them later.
2183   SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2184 
2185   void OptimizeShadowIV();
2186   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2187   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2188   void OptimizeLoopTermCond();
2189 
2190   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2191                         SmallVectorImpl<ChainUsers> &ChainUsersVec);
2192   void FinalizeChain(IVChain &Chain);
2193   void CollectChains();
2194   void GenerateIVChain(const IVChain &Chain,
2195                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2196 
2197   void CollectInterestingTypesAndFactors();
2198   void CollectFixupsAndInitialFormulae();
2199 
2200   // Support for sharing of LSRUses between LSRFixups.
2201   using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2202   UseMapTy UseMap;
2203 
2204   bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2205                           LSRUse::KindType Kind, MemAccessTy AccessTy);
2206 
2207   std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2208                                       MemAccessTy AccessTy);
2209 
2210   void DeleteUse(LSRUse &LU, size_t LUIdx);
2211 
2212   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2213 
2214   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2215   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2216   void CountRegisters(const Formula &F, size_t LUIdx);
2217   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2218 
2219   void CollectLoopInvariantFixupsAndFormulae();
2220 
2221   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2222                               unsigned Depth = 0);
2223 
2224   void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2225                                   const Formula &Base, unsigned Depth,
2226                                   size_t Idx, bool IsScaledReg = false);
2227   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2228   void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2229                                    const Formula &Base, size_t Idx,
2230                                    bool IsScaledReg = false);
2231   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2232   void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2233                                    const Formula &Base,
2234                                    const SmallVectorImpl<Immediate> &Worklist,
2235                                    size_t Idx, bool IsScaledReg = false);
2236   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2237   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2238   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2239   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2240   void GenerateCrossUseConstantOffsets();
2241   void GenerateAllReuseFormulae();
2242 
2243   void FilterOutUndesirableDedicatedRegisters();
2244 
2245   size_t EstimateSearchSpaceComplexity() const;
2246   void NarrowSearchSpaceByDetectingSupersets();
2247   void NarrowSearchSpaceByCollapsingUnrolledCode();
2248   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2249   void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2250   void NarrowSearchSpaceByFilterPostInc();
2251   void NarrowSearchSpaceByDeletingCostlyFormulas();
2252   void NarrowSearchSpaceByPickingWinnerRegs();
2253   void NarrowSearchSpaceUsingHeuristics();
2254 
2255   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2256                     Cost &SolutionCost,
2257                     SmallVectorImpl<const Formula *> &Workspace,
2258                     const Cost &CurCost,
2259                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
2260                     DenseSet<const SCEV *> &VisitedRegs) const;
2261   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2262 
2263   BasicBlock::iterator
2264   HoistInsertPosition(BasicBlock::iterator IP,
2265                       const SmallVectorImpl<Instruction *> &Inputs) const;
2266   BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2267                                                      const LSRFixup &LF,
2268                                                      const LSRUse &LU) const;
2269 
2270   Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2271                 BasicBlock::iterator IP,
2272                 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2273   void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2274                      const Formula &F,
2275                      SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2276   void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2277                SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2278   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2279 
2280 public:
2281   LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2282               LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2283               TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2284 
2285   bool getChanged() const { return Changed; }
2286   const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2287     return ScalarEvolutionIVs;
2288   }
2289 
2290   void print_factors_and_types(raw_ostream &OS) const;
2291   void print_fixups(raw_ostream &OS) const;
2292   void print_uses(raw_ostream &OS) const;
2293   void print(raw_ostream &OS) const;
2294   void dump() const;
2295 };
2296 
2297 } // end anonymous namespace
2298 
2299 /// If IV is used in an int-to-float cast inside the loop then try to eliminate
2300 /// the cast operation.
2301 void LSRInstance::OptimizeShadowIV() {
2302   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2303   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2304     return;
2305 
2306   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2307        UI != E; /* empty */) {
2308     IVUsers::const_iterator CandidateUI = UI;
2309     ++UI;
2310     Instruction *ShadowUse = CandidateUI->getUser();
2311     Type *DestTy = nullptr;
2312     bool IsSigned = false;
2313 
2314     /* If shadow use is a int->float cast then insert a second IV
2315        to eliminate this cast.
2316 
2317          for (unsigned i = 0; i < n; ++i)
2318            foo((double)i);
2319 
2320        is transformed into
2321 
2322          double d = 0.0;
2323          for (unsigned i = 0; i < n; ++i, ++d)
2324            foo(d);
2325     */
2326     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2327       IsSigned = false;
2328       DestTy = UCast->getDestTy();
2329     }
2330     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2331       IsSigned = true;
2332       DestTy = SCast->getDestTy();
2333     }
2334     if (!DestTy) continue;
2335 
2336     // If the target does not support DestTy natively, then do not apply
2337     // this transformation.
2338     if (!TTI.isTypeLegal(DestTy)) continue;
2339 
2340     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2341     if (!PH) continue;
2342     if (PH->getNumIncomingValues() != 2) continue;
2343 
2344     // If the integer calculation overflows, the result in the FP type will
2345     // differ. So we can only do this transformation if we are guaranteed not
2346     // to deal with overflowing values.
2347     const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2348     if (!AR) continue;
2349     if (IsSigned && !AR->hasNoSignedWrap()) continue;
2350     if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2351 
2352     Type *SrcTy = PH->getType();
2353     int Mantissa = DestTy->getFPMantissaWidth();
2354     if (Mantissa == -1) continue;
2355     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2356       continue;
2357 
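         // Determine which incoming value comes from the preheader (Entry) and
         // which comes from the backedge (Latch).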
2358     unsigned Entry, Latch;
2359     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2360       Entry = 0;
2361       Latch = 1;
2362     } else {
2363       Entry = 1;
2364       Latch = 0;
2365     }
2366 
2367     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2368     if (!Init) continue;
2369     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2370                                         (double)Init->getSExtValue() :
2371                                         (double)Init->getZExtValue());
2372 
2373     BinaryOperator *Incr =
2374       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2375     if (!Incr) continue;
2376     if (Incr->getOpcode() != Instruction::Add
2377         && Incr->getOpcode() != Instruction::Sub)
2378       continue;
2379 
2380     /* Initialize new IV, double d = 0.0 in above example. */
2381     ConstantInt *C = nullptr;
2382     if (Incr->getOperand(0) == PH)
2383       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2384     else if (Incr->getOperand(1) == PH)
2385       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2386     else
2387       continue;
2388 
2389     if (!C) continue;
2390 
2391     // Ignore negative constants, as the code below doesn't handle them
2392     // correctly. TODO: Remove this restriction.
2393     if (!C->getValue().isStrictlyPositive())
2394       continue;
2395 
2396     /* Add new PHINode. */
2397     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2398     NewPH->setDebugLoc(PH->getDebugLoc());
2399 
2400     /* create new increment. '++d' in above example. */
2401     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2402     BinaryOperator *NewIncr = BinaryOperator::Create(
2403         Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2404                                               : Instruction::FSub,
2405         NewPH, CFP, "IV.S.next.", Incr->getIterator());
2406     NewIncr->setDebugLoc(Incr->getDebugLoc());
2407 
2408     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2409     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2410 
2411     /* Remove cast operation */
2412     ShadowUse->replaceAllUsesWith(NewPH);
2413     ShadowUse->eraseFromParent();
2414     Changed = true;
2415     break;
2416   }
2417 }
2418 
2419 /// If Cond has an operand that is an expression of an IV, set the IV user and
2420 /// stride information and return true; otherwise return false.
2421 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2422   for (IVStrideUse &U : IU)
2423     if (U.getUser() == Cond) {
2424       // NOTE: we could handle setcc instructions with multiple uses here, but
2425       // InstCombine already does so for simple uses, and it's not clear that
2426       // this occurs often enough in real life to be worth handling.
2427       CondUse = &U;
2428       return true;
2429     }
2430   return false;
2431 }
2432 
2433 /// Rewrite the loop's terminating condition if it uses a max computation.
2434 ///
2435 /// This is a narrow solution to a specific, but acute, problem. For loops
2436 /// like this:
2437 ///
2438 ///   i = 0;
2439 ///   do {
2440 ///     p[i] = 0.0;
2441 ///   } while (++i < n);
2442 ///
2443 /// the trip count isn't just 'n', because 'n' might not be positive. And
2444 /// unfortunately this can come up even for loops where the user didn't use
2445 /// a C do-while loop. For example, seemingly well-behaved top-test loops
2446 /// will commonly be lowered like this:
2447 ///
2448 ///   if (n > 0) {
2449 ///     i = 0;
2450 ///     do {
2451 ///       p[i] = 0.0;
2452 ///     } while (++i < n);
2453 ///   }
2454 ///
2455 /// and then it's possible for subsequent optimization to obscure the if
2456 /// test in such a way that indvars can't find it.
2457 ///
2458 /// When indvars can't find the if test in loops like this, it creates a
2459 /// max expression, which allows it to give the loop a canonical
2460 /// induction variable:
2461 ///
2462 ///   i = 0;
2463 ///   max = n < 1 ? 1 : n;
2464 ///   do {
2465 ///     p[i] = 0.0;
2466 ///   } while (++i != max);
2467 ///
2468 /// Canonical induction variables are necessary because the loop passes
2469 /// are designed around them. The most obvious example of this is the
2470 /// LoopInfo analysis, which doesn't remember trip count values. It
2471 /// expects to be able to rediscover the trip count each time it is
2472 /// needed, and it does this using a simple analysis that only succeeds if
2473 /// the loop has a canonical induction variable.
2474 ///
2475 /// However, when it comes time to generate code, the maximum operation
2476 /// can be quite costly, especially if it's inside of an outer loop.
2477 ///
2478 /// This function solves this problem by detecting this type of loop and
2479 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2480 /// the instructions for the maximum computation.
2481 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2482   // Check that the loop matches the pattern we're looking for.
2483   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2484       Cond->getPredicate() != CmpInst::ICMP_NE)
2485     return Cond;
2486 
2487   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2488   if (!Sel || !Sel->hasOneUse()) return Cond;
2489 
2490   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2491   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2492     return Cond;
2493   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2494 
2495   // Add one to the backedge-taken count to get the trip count.
2496   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2497   if (IterationCount != SE.getSCEV(Sel)) return Cond;
2498 
2499   // Check for a max calculation that matches the pattern. There's no check
2500   // for ICMP_ULE here because the comparison would be with zero, which
2501   // isn't interesting.
2502   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2503   const SCEVNAryExpr *Max = nullptr;
2504   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2505     Pred = ICmpInst::ICMP_SLE;
2506     Max = S;
2507   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2508     Pred = ICmpInst::ICMP_SLT;
2509     Max = S;
2510   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2511     Pred = ICmpInst::ICMP_ULT;
2512     Max = U;
2513   } else {
2514     // No match; bail.
2515     return Cond;
2516   }
2517 
2518   // To handle a max with more than two operands, this optimization would
2519   // require additional checking and setup.
2520   if (Max->getNumOperands() != 2)
2521     return Cond;
2522 
2523   const SCEV *MaxLHS = Max->getOperand(0);
2524   const SCEV *MaxRHS = Max->getOperand(1);
2525 
2526   // ScalarEvolution canonicalizes constants to the left. For < and >, look
2527   // for a comparison with 1. For <= and >=, a comparison with zero.
2528   if (!MaxLHS ||
2529       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2530     return Cond;
2531 
2532   // Check the relevant induction variable for conformance to
2533   // the pattern.
2534   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2535   if (!match(IV,
2536              m_scev_AffineAddRec(m_scev_SpecificInt(1), m_scev_SpecificInt(1))))
2537     return Cond;
2538 
2539   assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2540          "Loop condition operand is an addrec in a different loop!");
2541 
2542   // Check the right operand of the select, and remember it, as it will
2543   // be used in the new comparison instruction.
2544   Value *NewRHS = nullptr;
2545   if (ICmpInst::isTrueWhenEqual(Pred)) {
2546     // Look for n+1, and grab n.
2547     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2548       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2549          if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2550            NewRHS = BO->getOperand(0);
2551     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2552       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2553         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2554           NewRHS = BO->getOperand(0);
2555     if (!NewRHS)
2556       return Cond;
2557   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2558     NewRHS = Sel->getOperand(1);
2559   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2560     NewRHS = Sel->getOperand(2);
2561   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2562     NewRHS = SU->getValue();
2563   else
2564     // Max doesn't match expected pattern.
2565     return Cond;
2566 
2567   // Determine the new comparison opcode. It may be signed or unsigned,
2568   // and the original comparison may be either equality or inequality.
2569   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2570     Pred = CmpInst::getInversePredicate(Pred);
2571 
2572   // Ok, everything looks ok to change the condition into an SLT or SGE and
2573   // delete the max calculation.
2574   ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2575                                    Cond->getOperand(0), NewRHS, "scmp");
2576 
2577   // Delete the max calculation instructions.
2578   NewCond->setDebugLoc(Cond->getDebugLoc());
2579   Cond->replaceAllUsesWith(NewCond);
2580   CondUse->setUser(NewCond);
2581   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2582   Cond->eraseFromParent();
2583   Sel->eraseFromParent();
2584   if (Cmp->use_empty()) {
2585     salvageDebugInfo(*Cmp);
2586     Cmp->eraseFromParent();
2587   }
2588   return NewCond;
2589 }
2590 
2591 /// Change loop terminating condition to use the postinc iv when possible.
2592 void
2593 LSRInstance::OptimizeLoopTermCond() {
2594   SmallPtrSet<Instruction *, 4> PostIncs;
2595 
2596   // We need a different set of heuristics for rotated and non-rotated loops.
2597   // If a loop is rotated then the latch is also the backedge, so inserting
2598   // post-inc expressions just before the latch is ideal. To reduce live ranges
2599   // it also makes sense to rewrite terminating conditions to use post-inc
2600   // expressions.
2601   //
2602   // If the loop is not rotated then the latch is not a backedge; the latch
2603   // check is done in the loop head. Adding post-inc expressions before the
2604   // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2605   // in the loop body. In this case we do *not* want to use post-inc expressions
2606   // in the latch check, and we want to insert post-inc expressions before
2607   // the backedge.
2608   BasicBlock *LatchBlock = L->getLoopLatch();
2609   SmallVector<BasicBlock*, 8> ExitingBlocks;
2610   L->getExitingBlocks(ExitingBlocks);
2611   if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2612     // The backedge doesn't exit the loop; treat this as a head-tested loop.
2613     IVIncInsertPos = LatchBlock->getTerminator();
2614     return;
2615   }
2616 
2617   // Otherwise treat this as a rotated loop.
2618   for (BasicBlock *ExitingBlock : ExitingBlocks) {
2619     // Get the terminating condition for the loop if possible.  If we
2620     // can, we want to change it to use a post-incremented version of its
2621     // induction variable, to allow coalescing the live ranges for the IV into
2622     // one register value.
2623 
2624     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2625     if (!TermBr)
2626       continue;
2627     // FIXME: Overly conservative; the termination condition could be an 'or' etc.
2628     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2629       continue;
2630 
2631     // Search IVUsesByStride to find Cond's IVUse if there is one.
2632     IVStrideUse *CondUse = nullptr;
2633     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2634     if (!FindIVUserForCond(Cond, CondUse))
2635       continue;
2636 
2637     // If the trip count is computed in terms of a max (due to ScalarEvolution
2638     // being unable to find a sufficient guard, for example), change the loop
2639     // comparison to use SLT or ULT instead of NE.
2640     // One consequence of doing this now is that it disrupts the count-down
2641     // optimization. That's not always a bad thing though, because in such
2642     // cases it may still be worthwhile to avoid a max.
2643     Cond = OptimizeMax(Cond, CondUse);
2644 
2645     // If this exiting block dominates the latch block, it may also use
2646     // the post-inc value if it won't be shared with other uses.
2647     // Check for dominance.
2648     if (!DT.dominates(ExitingBlock, LatchBlock))
2649       continue;
2650 
2651     // Conservatively avoid trying to use the post-inc value in non-latch
2652     // exits if there may be pre-inc users in intervening blocks.
2653     if (LatchBlock != ExitingBlock)
2654       for (const IVStrideUse &UI : IU)
2655         // Test if the use is reachable from the exiting block. This dominator
2656         // query is a conservative approximation of reachability.
2657         if (&UI != CondUse &&
2658             !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2659           // Conservatively assume there may be reuse if the quotient of their
2660           // strides could be a legal scale.
2661           const SCEV *A = IU.getStride(*CondUse, L);
2662           const SCEV *B = IU.getStride(UI, L);
2663           if (!A || !B) continue;
2664           if (SE.getTypeSizeInBits(A->getType()) !=
2665               SE.getTypeSizeInBits(B->getType())) {
2666             if (SE.getTypeSizeInBits(A->getType()) >
2667                 SE.getTypeSizeInBits(B->getType()))
2668               B = SE.getSignExtendExpr(B, A->getType());
2669             else
2670               A = SE.getSignExtendExpr(A, B->getType());
2671           }
2672           if (const SCEVConstant *D =
2673                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2674             const ConstantInt *C = D->getValue();
2675             // Stride of one or negative one can have reuse with non-addresses.
2676             if (C->isOne() || C->isMinusOne())
2677               goto decline_post_inc;
2678             // Avoid weird situations (huge strides or the minimum signed value).
2679             if (C->getValue().getSignificantBits() >= 64 ||
2680                 C->getValue().isMinSignedValue())
2681               goto decline_post_inc;
2682             // Check for possible scaled-address reuse.
2683             if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2684               MemAccessTy AccessTy =
2685                   getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2686               int64_t Scale = C->getSExtValue();
2687               if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2688                                             /*BaseOffset=*/0,
2689                                             /*HasBaseReg=*/true, Scale,
2690                                             AccessTy.AddrSpace))
2691                 goto decline_post_inc;
2692               Scale = -Scale;
2693               if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2694                                             /*BaseOffset=*/0,
2695                                             /*HasBaseReg=*/true, Scale,
2696                                             AccessTy.AddrSpace))
2697                 goto decline_post_inc;
2698             }
2699           }
2700         }
2701 
2702     LLVM_DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
2703                       << *Cond << '\n');
2704 
2705     // It's possible for the setcc instruction to be anywhere in the loop, and
2706     // possible for it to have multiple users.  If it is not immediately before
2707     // the exiting block branch, move it.
2708     if (Cond->getNextNonDebugInstruction() != TermBr) {
2709       if (Cond->hasOneUse()) {
2710         Cond->moveBefore(TermBr->getIterator());
2711       } else {
2712         // Clone the terminating condition and insert into the loopend.
2713         ICmpInst *OldCond = Cond;
2714         Cond = cast<ICmpInst>(Cond->clone());
2715         Cond->setName(L->getHeader()->getName() + ".termcond");
2716         Cond->insertInto(ExitingBlock, TermBr->getIterator());
2717 
2718         // Clone the IVUse, as the old use still exists!
2719         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2720         TermBr->replaceUsesOfWith(OldCond, Cond);
2721       }
2722     }
2723 
2724     // If we get to here, we know that we can transform the setcc instruction to
2725     // use the post-incremented version of the IV, allowing us to coalesce the
2726     // live ranges for the IV correctly.
2727     CondUse->transformToPostInc(L);
2728     Changed = true;
2729 
2730     PostIncs.insert(Cond);
2731   decline_post_inc:;
2732   }
2733 
2734   // Determine an insertion point for the loop induction variable increment. It
2735   // must dominate all the post-inc comparisons we just set up, and it must
2736   // dominate the loop latch edge.
2737   IVIncInsertPos = L->getLoopLatch()->getTerminator();
2738   for (Instruction *Inst : PostIncs)
2739     IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2740 }
2741 
2742 /// Determine if the given use can accommodate a fixup at the given offset and
2743 /// other details. If so, update the use and return true.
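     /// For example, if this use currently covers offsets in [0, 8] and a new
     /// fixup needs offset 16, the range can only be widened to [0, 16] if an
     /// immediate of 16 - 0 is still foldable for this kind and access type.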
2744 bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2745                                      bool HasBaseReg, LSRUse::KindType Kind,
2746                                      MemAccessTy AccessTy) {
2747   Immediate NewMinOffset = LU.MinOffset;
2748   Immediate NewMaxOffset = LU.MaxOffset;
2749   MemAccessTy NewAccessTy = AccessTy;
2750 
2751   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2752   // something conservative, however this can pessimize in the case that one of
2753   // the uses will have all its uses outside the loop, for example.
2754   if (LU.Kind != Kind)
2755     return false;
2756 
2757   // Check for a mismatched access type, and fall back conservatively as needed.
2758   // TODO: Be less conservative when the type is similar and can use the same
2759   // addressing modes.
2760   if (Kind == LSRUse::Address) {
2761     if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2762       NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2763                                             AccessTy.AddrSpace);
2764     }
2765   }
2766 
2767   // Conservatively assume HasBaseReg is true for now.
2768   if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2769     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2770                           LU.MaxOffset - NewOffset, HasBaseReg))
2771       return false;
2772     NewMinOffset = NewOffset;
2773   } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2774     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2775                           NewOffset - LU.MinOffset, HasBaseReg))
2776       return false;
2777     NewMaxOffset = NewOffset;
2778   }
2779 
2780   // FIXME: We should be able to handle some level of scalable offset support
2781   // for 'void', but in order to get basic support up and running this is
2782   // being left out.
2783   if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2784       (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2785     return false;
2786 
2787   // Update the use.
2788   LU.MinOffset = NewMinOffset;
2789   LU.MaxOffset = NewMaxOffset;
2790   LU.AccessTy = NewAccessTy;
2791   return true;
2792 }
2793 
2794 /// Return an LSRUse index and an offset value for a fixup which needs the given
2795 /// expression, with the given kind and optional access type.  Either reuse an
2796 /// existing use or create a new one, as needed.
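     /// For example, fixups for (%x + 16) and (%x + 24) both reduce to the base
     /// expression %x after ExtractImmediate, so they can share one LSRUse keyed
     /// on %x with offsets 16 and 24, provided reconcileNewOffset accepts the
     /// widened offset range.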
2797 std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2798                                                  LSRUse::KindType Kind,
2799                                                  MemAccessTy AccessTy) {
2800   const SCEV *Copy = Expr;
2801   Immediate Offset = ExtractImmediate(Expr, SE);
2802 
2803   // Basic uses can't accept any offset, for example.
2804   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2805                         Offset, /*HasBaseReg=*/ true)) {
2806     Expr = Copy;
2807     Offset = Immediate::getFixed(0);
2808   }
2809 
2810   std::pair<UseMapTy::iterator, bool> P =
2811       UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2812   if (!P.second) {
2813     // A use already existed with this base.
2814     size_t LUIdx = P.first->second;
2815     LSRUse &LU = Uses[LUIdx];
2816     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2817       // Reuse this use.
2818       return std::make_pair(LUIdx, Offset);
2819   }
2820 
2821   // Create a new use.
2822   size_t LUIdx = Uses.size();
2823   P.first->second = LUIdx;
2824   Uses.push_back(LSRUse(Kind, AccessTy));
2825   LSRUse &LU = Uses[LUIdx];
2826 
2827   LU.MinOffset = Offset;
2828   LU.MaxOffset = Offset;
2829   return std::make_pair(LUIdx, Offset);
2830 }
2831 
2832 /// Delete the given use from the Uses list.
2833 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2834   if (&LU != &Uses.back())
2835     std::swap(LU, Uses.back());
2836   Uses.pop_back();
2837 
2838   // Update RegUses.
2839   RegUses.swapAndDropUse(LUIdx, Uses.size());
2840 }
2841 
2842 /// Look for a use distinct from OrigLU which has a formula with the same
2843 /// registers as the given formula.
2844 LSRUse *
2845 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2846                                        const LSRUse &OrigLU) {
2847   // Search all uses for the formula. This could be more clever.
2848   for (LSRUse &LU : Uses) {
2849     // Check whether this use is close enough to OrigLU, to see whether it's
2850     // worthwhile looking through its formulae.
2851     // Ignore ICmpZero uses because they may contain formulae generated by
2852     // GenerateICmpZeroScales, in which case adding fixup offsets may
2853     // be invalid.
2854     if (&LU != &OrigLU &&
2855         LU.Kind != LSRUse::ICmpZero &&
2856         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2857         LU.WidestFixupType == OrigLU.WidestFixupType &&
2858         LU.HasFormulaWithSameRegs(OrigF)) {
2859       // Scan through this use's formulae.
2860       for (const Formula &F : LU.Formulae) {
2861         // Check to see if this formula has the same registers and symbols
2862         // as OrigF.
2863         if (F.BaseRegs == OrigF.BaseRegs &&
2864             F.ScaledReg == OrigF.ScaledReg &&
2865             F.BaseGV == OrigF.BaseGV &&
2866             F.Scale == OrigF.Scale &&
2867             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2868           if (F.BaseOffset.isZero())
2869             return &LU;
2870           // This is the formula where all the registers and symbols matched;
2871           // there aren't going to be any others. Since we declined it, we
2872           // can skip the rest of the formulae and proceed to the next LSRUse.
2873           break;
2874         }
2875       }
2876     }
2877   }
2878 
2879   // Nothing looked good.
2880   return nullptr;
2881 }
2882 
2883 void LSRInstance::CollectInterestingTypesAndFactors() {
2884   SmallSetVector<const SCEV *, 4> Strides;
2885 
2886   // Collect interesting types and strides.
2887   SmallVector<const SCEV *, 4> Worklist;
2888   for (const IVStrideUse &U : IU) {
2889     const SCEV *Expr = IU.getExpr(U);
2890     if (!Expr)
2891       continue;
2892 
2893     // Collect interesting types.
2894     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2895 
2896     // Add strides for mentioned loops.
2897     Worklist.push_back(Expr);
2898     do {
2899       const SCEV *S = Worklist.pop_back_val();
2900       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2901         if (AR->getLoop() == L)
2902           Strides.insert(AR->getStepRecurrence(SE));
2903         Worklist.push_back(AR->getStart());
2904       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2905         append_range(Worklist, Add->operands());
2906       }
2907     } while (!Worklist.empty());
2908   }
2909 
2910   // Compute interesting factors from the set of interesting strides.
2911   for (SmallSetVector<const SCEV *, 4>::const_iterator
2912        I = Strides.begin(), E = Strides.end(); I != E; ++I)
2913     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2914          std::next(I); NewStrideIter != E; ++NewStrideIter) {
2915       const SCEV *OldStride = *I;
2916       const SCEV *NewStride = *NewStrideIter;
2917 
2918       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2919           SE.getTypeSizeInBits(NewStride->getType())) {
2920         if (SE.getTypeSizeInBits(OldStride->getType()) >
2921             SE.getTypeSizeInBits(NewStride->getType()))
2922           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2923         else
2924           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2925       }
2926       if (const SCEVConstant *Factor =
2927             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2928                                                         SE, true))) {
2929         if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2930           Factors.insert(Factor->getAPInt().getSExtValue());
2931       } else if (const SCEVConstant *Factor =
2932                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2933                                                                NewStride,
2934                                                                SE, true))) {
2935         if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2936           Factors.insert(Factor->getAPInt().getSExtValue());
2937       }
2938     }
2939 
2940   // If all uses use the same type, don't bother looking for truncation-based
2941   // reuse.
2942   if (Types.size() == 1)
2943     Types.clear();
2944 
2945   LLVM_DEBUG(print_factors_and_types(dbgs()));
2946 }
2947 
2948 /// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2949 /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2950 /// IVStrideUses, we could partially skip this.
2951 static User::op_iterator
2952 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2953               Loop *L, ScalarEvolution &SE) {
2954   for (; OI != OE; ++OI) {
2955     if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2956       if (!SE.isSCEVable(Oper->getType()))
2957         continue;
2958 
2959       if (const SCEVAddRecExpr *AR =
2960           dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2961         if (AR->getLoop() == L)
2962           break;
2963       }
2964     }
2965   }
2966   return OI;
2967 }
2968 
2969 /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2970 /// a convenient helper.
2971 static Value *getWideOperand(Value *Oper) {
2972   if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2973     return Trunc->getOperand(0);
2974   return Oper;
2975 }
2976 
2977 /// Return an approximation of this SCEV expression's "base", or NULL for any
2978 /// constant. Returning the expression itself is conservative. Returning a
2979 /// deeper subexpression is more precise and valid as long as it isn't less
2980 /// complex than another subexpression. For expressions involving multiple
2981 /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2982 /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2983 /// IVInc==b-a.
2984 ///
2985 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2986 /// SCEVUnknown, we simply return the rightmost SCEV operand.
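     /// For example, for {(8 + %base),+,16}<%L> the base is the SCEVUnknown
     /// %base, while a plain constant or vscale has no base and yields null.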
2987 static const SCEV *getExprBase(const SCEV *S) {
2988   switch (S->getSCEVType()) {
2989   default: // including scUnknown.
2990     return S;
2991   case scConstant:
2992   case scVScale:
2993     return nullptr;
2994   case scTruncate:
2995     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2996   case scZeroExtend:
2997     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2998   case scSignExtend:
2999     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3000   case scAddExpr: {
3001     // Skip over scaled operands (scMulExpr) to follow add operands as long as
3002     // there's nothing more complex.
3003     // FIXME: not sure if we want to recognize negation.
3004     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3005     for (const SCEV *SubExpr : reverse(Add->operands())) {
3006       if (SubExpr->getSCEVType() == scAddExpr)
3007         return getExprBase(SubExpr);
3008 
3009       if (SubExpr->getSCEVType() != scMulExpr)
3010         return SubExpr;
3011     }
3012     return S; // all operands are scaled, be conservative.
3013   }
3014   case scAddRecExpr:
3015     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3016   }
3017   llvm_unreachable("Unknown SCEV kind!");
3018 }
3019 
3020 /// Return true if the chain increment is profitable to expand into a loop
3021 /// invariant value, which may require its own register. A profitable chain
3022 /// increment will be an offset relative to the same base. We allow such offsets
3023 /// to potentially be used as chain increment as long as it's not obviously
3024 /// expensive to expand using real instructions.
3025 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3026                                     const SCEV *IncExpr,
3027                                     ScalarEvolution &SE) {
3028   // Aggressively form chains when -stress-ivchain.
3029   if (StressIVChain)
3030     return true;
3031 
3032   // Do not replace a constant offset from IV head with a nonconstant IV
3033   // increment.
3034   if (!isa<SCEVConstant>(IncExpr)) {
3035     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3036     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3037       return false;
3038   }
3039 
3040   SmallPtrSet<const SCEV*, 8> Processed;
3041   return !isHighCostExpansion(IncExpr, Processed, SE);
3042 }
3043 
3044 /// Return true if the number of registers needed for the chain is estimated to
3045 /// be less than the number required for the individual IV users. First prohibit
3046 /// any IV users that keep the IV live across increments (the Users set should
3047 /// be empty). Next count the number and type of increments in the chain.
3048 ///
3049 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3050 /// effectively use postinc addressing modes. Only consider it profitable it the
3051 /// increments can be computed in fewer registers when chained.
3052 ///
3053 /// TODO: Consider IVInc free if it's already used in other chains.
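     /// For example, a complete chain ending in the header phi with several
     /// constant increments scores 1 - 1 - 1 = -1 and is kept, while a chain
     /// needing two distinct variable increments materialized in the preheader
     /// scores 1 + 2 = 3 and is dropped.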
3054 static bool isProfitableChain(IVChain &Chain,
3055                               SmallPtrSetImpl<Instruction *> &Users,
3056                               ScalarEvolution &SE,
3057                               const TargetTransformInfo &TTI) {
3058   if (StressIVChain)
3059     return true;
3060 
3061   if (!Chain.hasIncs())
3062     return false;
3063 
3064   if (!Users.empty()) {
3065     LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3066                for (Instruction *Inst
3067                     : Users) { dbgs() << "  " << *Inst << "\n"; });
3068     return false;
3069   }
3070   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3071 
3072   // The chain itself may require a register, so initialize cost to 1.
3073   int cost = 1;
3074 
3075   // A complete chain likely eliminates the need for keeping the original IV in
3076   // a register. LSR does not currently know how to form a complete chain unless
3077   // the header phi already exists.
3078   if (isa<PHINode>(Chain.tailUserInst())
3079       && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3080     --cost;
3081   }
3082   const SCEV *LastIncExpr = nullptr;
3083   unsigned NumConstIncrements = 0;
3084   unsigned NumVarIncrements = 0;
3085   unsigned NumReusedIncrements = 0;
3086 
3087   if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3088     return true;
3089 
3090   for (const IVInc &Inc : Chain) {
3091     if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3092       return true;
3093     if (Inc.IncExpr->isZero())
3094       continue;
3095 
3096     // Incrementing by zero or some constant is neutral. We assume constants can
3097     // be folded into an addressing mode or an add's immediate operand.
3098     if (isa<SCEVConstant>(Inc.IncExpr)) {
3099       ++NumConstIncrements;
3100       continue;
3101     }
3102 
3103     if (Inc.IncExpr == LastIncExpr)
3104       ++NumReusedIncrements;
3105     else
3106       ++NumVarIncrements;
3107 
3108     LastIncExpr = Inc.IncExpr;
3109   }
3110   // An IV chain with a single increment is handled by LSR's postinc
3111   // uses. However, a chain with multiple increments requires keeping the IV's
3112   // value live longer than it needs to be if chained.
3113   if (NumConstIncrements > 1)
3114     --cost;
3115 
3116   // Materializing increment expressions in the preheader that didn't exist in
3117   // the original code may cost a register. For example, sign-extended array
3118   // indices can produce ridiculous increments like this:
3119   // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3120   cost += NumVarIncrements;
3121 
3122   // Reusing variable increments likely saves a register to hold the multiple of
3123   // the stride.
3124   cost -= NumReusedIncrements;
3125 
3126   LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3127                     << "\n");
3128 
3129   return cost < 0;
3130 }
3131 
3132 /// Add this IV user to an existing chain or make it the head of a new chain.
3133 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3134                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3135   // When IVs are used as types of varying widths, they are generally converted
3136   // to a wider type with some uses remaining narrow under a (free) trunc.
3137   Value *const NextIV = getWideOperand(IVOper);
3138   const SCEV *const OperExpr = SE.getSCEV(NextIV);
3139   const SCEV *const OperExprBase = getExprBase(OperExpr);
3140 
3141   // Visit all existing chains. Check whether IVOper can be computed as a
3142   // profitable loop-invariant increment from the last link in the chain.
3143   unsigned ChainIdx = 0, NChains = IVChainVec.size();
3144   const SCEV *LastIncExpr = nullptr;
3145   for (; ChainIdx < NChains; ++ChainIdx) {
3146     IVChain &Chain = IVChainVec[ChainIdx];
3147 
3148     // Prune the solution space aggressively by checking that both IV operands
3149     // are expressions that operate on the same unscaled SCEVUnknown. This
3150     // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3151     // first avoids creating extra SCEV expressions.
3152     if (!StressIVChain && Chain.ExprBase != OperExprBase)
3153       continue;
3154 
3155     Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3156     if (PrevIV->getType() != NextIV->getType())
3157       continue;
3158 
3159     // A phi node terminates a chain.
3160     if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3161       continue;
3162 
3163     // The increment must be loop-invariant so it can be kept in a register.
3164     const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3165     const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3166     if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3167       continue;
3168 
3169     if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3170       LastIncExpr = IncExpr;
3171       break;
3172     }
3173   }
3174   // If we haven't found a chain, create a new one, unless we hit the max. Don't
3175   // bother for phi nodes, because they must be last in the chain.
3176   if (ChainIdx == NChains) {
3177     if (isa<PHINode>(UserInst))
3178       return;
3179     if (NChains >= MaxChains && !StressIVChain) {
3180       LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3181       return;
3182     }
3183     LastIncExpr = OperExpr;
3184     // IVUsers may have skipped over sign/zero extensions. We don't currently
3185     // attempt to form chains involving extensions unless they can be hoisted
3186     // into this loop's AddRec.
3187     if (!isa<SCEVAddRecExpr>(LastIncExpr))
3188       return;
3189     ++NChains;
3190     IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3191                                  OperExprBase));
3192     ChainUsersVec.resize(NChains);
3193     LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3194                       << ") IV=" << *LastIncExpr << "\n");
3195   } else {
3196     LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
3197                       << ") IV+" << *LastIncExpr << "\n");
3198     // Add this IV user to the end of the chain.
3199     IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3200   }
3201   IVChain &Chain = IVChainVec[ChainIdx];
3202 
3203   SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3204   // This chain's NearUsers become FarUsers.
3205   if (!LastIncExpr->isZero()) {
3206     ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3207     NearUsers.clear();
3208   }
3209 
3210   // All other uses of IVOperand become near uses of the chain.
3211   // We currently ignore intermediate values within SCEV expressions, assuming
3212   // they will eventually be used by the current chain, or can be computed
3213   // from one of the chain increments. To be more precise we could
3214   // transitively follow its users and only add leaf IV users to the set.
3215   for (User *U : IVOper->users()) {
3216     Instruction *OtherUse = dyn_cast<Instruction>(U);
3217     if (!OtherUse)
3218       continue;
3219     // Uses in the chain will no longer be uses if the chain is formed.
3220     // Include the head of the chain in this iteration (not Chain.begin()).
3221     IVChain::const_iterator IncIter = Chain.Incs.begin();
3222     IVChain::const_iterator IncEnd = Chain.Incs.end();
3223     for (; IncIter != IncEnd; ++IncIter) {
3224       if (IncIter->UserInst == OtherUse)
3225         break;
3226     }
3227     if (IncIter != IncEnd)
3228       continue;
3229 
3230     if (SE.isSCEVable(OtherUse->getType())
3231         && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3232         && IU.isIVUserOrOperand(OtherUse)) {
3233       continue;
3234     }
3235     NearUsers.insert(OtherUse);
3236   }
3237 
3238   // Since this user is part of the chain, it's no longer considered a use
3239   // of the chain.
3240   ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3241 }
3242 
3243 /// Populate the vector of Chains.
3244 ///
3245 /// This decreases ILP at the architecture level. Targets with ample registers,
3246 /// multiple memory ports, and no register renaming probably don't want
3247 /// this. However, such targets should probably disable LSR altogether.
3248 ///
3249 /// The job of LSR is to make a reasonable choice of induction variables across
3250 /// the loop. Subsequent passes can easily "unchain" computation exposing more
3251 /// ILP *within the loop* if the target wants it.
3252 ///
3253 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
3254 /// will not reorder memory operations, it will recognize this as a chain, but
3255 /// will generate redundant IV increments. Ideally this would be corrected later
3256 /// by a smart scheduler:
3257 ///        = A[i]
3258 ///        = A[i+x]
3259 /// A[i]   =
3260 /// A[i+x] =
3261 ///
3262 /// TODO: Walk the entire domtree within this loop, not just the path to the
3263 /// loop latch. This will discover chains on side paths, but requires
3264 /// maintaining multiple copies of the Chains state.
3265 void LSRInstance::CollectChains() {
3266   LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3267   SmallVector<ChainUsers, 8> ChainUsersVec;
3268 
3269   SmallVector<BasicBlock *,8> LatchPath;
3270   BasicBlock *LoopHeader = L->getHeader();
3271   for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3272        Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3273     LatchPath.push_back(Rung->getBlock());
3274   }
3275   LatchPath.push_back(LoopHeader);
3276 
3277   // Walk the instruction stream from the loop header to the loop latch.
3278   for (BasicBlock *BB : reverse(LatchPath)) {
3279     for (Instruction &I : *BB) {
3280       // Skip instructions that weren't seen by IVUsers analysis.
3281       if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3282         continue;
3283 
3284       // Ignore users that are part of a SCEV expression. This way we only
3285       // consider leaf IV Users. This effectively rediscovers a portion of
3286       // IVUsers analysis but in program order this time.
3287       if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3288           continue;
3289 
3290       // Remove this instruction from any NearUsers set it may be in.
3291       for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3292            ChainIdx < NChains; ++ChainIdx) {
3293         ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3294       }
3295       // Search for operands that can be chained.
3296       SmallPtrSet<Instruction*, 4> UniqueOperands;
3297       User::op_iterator IVOpEnd = I.op_end();
3298       User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3299       while (IVOpIter != IVOpEnd) {
3300         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3301         if (UniqueOperands.insert(IVOpInst).second)
3302           ChainInstruction(&I, IVOpInst, ChainUsersVec);
3303         IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3304       }
3305     } // Continue walking down the instructions.
3306   } // Continue walking down the domtree.
3307   // Visit phi backedges to determine if the chain can generate the IV postinc.
3308   for (PHINode &PN : L->getHeader()->phis()) {
3309     if (!SE.isSCEVable(PN.getType()))
3310       continue;
3311 
3312     Instruction *IncV =
3313         dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3314     if (IncV)
3315       ChainInstruction(&PN, IncV, ChainUsersVec);
3316   }
3317   // Remove any unprofitable chains.
3318   unsigned ChainIdx = 0;
3319   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3320        UsersIdx < NChains; ++UsersIdx) {
3321     if (!isProfitableChain(IVChainVec[UsersIdx],
3322                            ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3323       continue;
3324     // Preserve the chain at UsersIdx.
3325     if (ChainIdx != UsersIdx)
3326       IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3327     FinalizeChain(IVChainVec[ChainIdx]);
3328     ++ChainIdx;
3329   }
3330   IVChainVec.resize(ChainIdx);
3331 }
3332 
3333 void LSRInstance::FinalizeChain(IVChain &Chain) {
3334   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3335   LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3336 
3337   for (const IVInc &Inc : Chain) {
3338     LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
3339     auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3340     assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3341     IVIncSet.insert(UseI);
3342   }
3343 }
3344 
3345 /// Return true if the IVInc can be folded into an addressing mode.
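     /// For example, a constant increment of 24, or a scalable increment of
     /// (16 * vscale), can fold if the target's addressing mode accepts that
     /// immediate for the user's memory access.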
3346 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3347                              Value *Operand, const TargetTransformInfo &TTI) {
3348   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3349   Immediate IncOffset = Immediate::getZero();
3350   if (IncConst) {
3351     if (IncConst->getAPInt().getSignificantBits() > 64)
3352       return false;
3353     IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3354   } else {
3355     // Look for mul(vscale, constant), to detect a scalable offset.
3356     const APInt *C;
3357     if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3358         C->getSignificantBits() > 64)
3359       return false;
3360     IncOffset = Immediate::getScalable(C->getSExtValue());
3361   }
3362 
3363   if (!isAddressUse(TTI, UserInst, Operand))
3364     return false;
3365 
3366   MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3367   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3368                         IncOffset, /*HasBaseReg=*/false))
3369     return false;
3370 
3371   return true;
3372 }
3373 
3374 /// Generate an add or subtract for each IVInc in a chain to materialize the IV
3375 /// user's operand from the previous IV user's operand.
3376 void LSRInstance::GenerateIVChain(const IVChain &Chain,
3377                                   SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3378   // Find the new IVOperand for the head of the chain. It may have been replaced
3379   // by LSR.
3380   const IVInc &Head = Chain.Incs[0];
3381   User::op_iterator IVOpEnd = Head.UserInst->op_end();
3382   // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3383   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3384                                              IVOpEnd, L, SE);
3385   Value *IVSrc = nullptr;
3386   while (IVOpIter != IVOpEnd) {
3387     IVSrc = getWideOperand(*IVOpIter);
3388 
3389     // If this operand computes the expression that the chain needs, we may use
3390     // it. (Check this after setting IVSrc which is used below.)
3391     //
3392     // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3393     // narrow for the chain, so we can no longer use it. We do allow using a
3394     // wider phi, assuming the LSR checked for free truncation. In that case we
3395     // should already have a truncate on this operand such that
3396     // getSCEV(IVSrc) == IncExpr.
3397     if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3398         || SE.getSCEV(IVSrc) == Head.IncExpr) {
3399       break;
3400     }
3401     IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3402   }
3403   if (IVOpIter == IVOpEnd) {
3404     // Gracefully give up on this chain.
3405     LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3406     return;
3407   }
3408   assert(IVSrc && "Failed to find IV chain source");
3409 
3410   LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3411   Type *IVTy = IVSrc->getType();
3412   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3413   const SCEV *LeftOverExpr = nullptr;
3414   const SCEV *Accum = SE.getZero(IntTy);
3415   SmallVector<std::pair<const SCEV *, Value *>> Bases;
3416   Bases.emplace_back(Accum, IVSrc);
3417 
3418   for (const IVInc &Inc : Chain) {
3419     Instruction *InsertPt = Inc.UserInst;
3420     if (isa<PHINode>(InsertPt))
3421       InsertPt = L->getLoopLatch()->getTerminator();
3422 
3423     // IVOper will replace the current IV User's operand. IVSrc is the IV
3424     // value currently held in a register.
3425     Value *IVOper = IVSrc;
3426     if (!Inc.IncExpr->isZero()) {
3427       // IncExpr was the result of subtraction of two narrow values, so must
3428       // be signed.
3429       const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3430       Accum = SE.getAddExpr(Accum, IncExpr);
3431       LeftOverExpr = LeftOverExpr ?
3432         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3433     }
3434 
3435     // Look through each base to see if any can produce a nice addressing mode.
3436     bool FoundBase = false;
3437     for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3438       const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3439       if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3440         if (!Remainder->isZero()) {
3441           Rewriter.clearPostInc();
3442           Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3443           const SCEV *IVOperExpr =
3444               SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3445           IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3446         } else {
3447           IVOper = MapIVOper;
3448         }
3449 
3450         FoundBase = true;
3451         break;
3452       }
3453     }
3454     if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3455       // Expand the IV increment.
3456       Rewriter.clearPostInc();
3457       Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3458       const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3459                                              SE.getUnknown(IncV));
3460       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3461 
3462       // If an IV increment can't be folded, use it as the next IV value.
3463       if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3464         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3465         Bases.emplace_back(Accum, IVOper);
3466         IVSrc = IVOper;
3467         LeftOverExpr = nullptr;
3468       }
3469     }
3470     Type *OperTy = Inc.IVOperand->getType();
3471     if (IVTy != OperTy) {
3472       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3473              "cannot extend a chained IV");
3474       IRBuilder<> Builder(InsertPt);
3475       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3476     }
3477     Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3478     if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3479       DeadInsts.emplace_back(OperandIsInstr);
3480   }
3481   // If LSR created a new, wider phi, we may also replace its postinc. We only
3482   // do this if we also found a wide value for the head of the chain.
3483   if (isa<PHINode>(Chain.tailUserInst())) {
3484     for (PHINode &Phi : L->getHeader()->phis()) {
3485       if (Phi.getType() != IVSrc->getType())
3486         continue;
3487       Instruction *PostIncV = dyn_cast<Instruction>(
3488           Phi.getIncomingValueForBlock(L->getLoopLatch()));
3489       if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3490         continue;
3491       Value *IVOper = IVSrc;
3492       Type *PostIncTy = PostIncV->getType();
3493       if (IVTy != PostIncTy) {
3494         assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3495         IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3496         Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3497         IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3498       }
3499       Phi.replaceUsesOfWith(PostIncV, IVOper);
3500       DeadInsts.emplace_back(PostIncV);
3501     }
3502   }
3503 }
3504 
3505 void LSRInstance::CollectFixupsAndInitialFormulae() {
3506   BranchInst *ExitBranch = nullptr;
3507   bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3508 
3509   // For calculating baseline cost
3510   SmallPtrSet<const SCEV *, 16> Regs;
3511   DenseSet<const SCEV *> VisitedRegs;
3512   DenseSet<size_t> VisitedLSRUse;
3513 
3514   for (const IVStrideUse &U : IU) {
3515     Instruction *UserInst = U.getUser();
3516     // Skip IV users that are part of profitable IV Chains.
3517     User::op_iterator UseI =
3518         find(UserInst->operands(), U.getOperandValToReplace());
3519     assert(UseI != UserInst->op_end() && "cannot find IV operand");
3520     if (IVIncSet.count(UseI)) {
3521       LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3522       continue;
3523     }
3524 
3525     LSRUse::KindType Kind = LSRUse::Basic;
3526     MemAccessTy AccessTy;
3527     if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3528       Kind = LSRUse::Address;
3529       AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3530     }
3531 
3532     const SCEV *S = IU.getExpr(U);
3533     if (!S)
3534       continue;
3535     PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3536 
3537     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3538     // (N - i == 0), and this allows (N - i) to be the expression that we work
3539     // with rather than just N or i, so we can consider the register
3540     // requirements for both N and i at the same time. Limiting this code to
3541     // equality icmps is not a problem because all interesting loops use
3542     // equality icmps, thanks to IndVarSimplify.
3543     if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3544       // If CI can be saved in some target, like replaced inside hardware loop
3545       // in PowerPC, no need to generate initial formulae for it.
3546       if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3547         continue;
3548       if (CI->isEquality()) {
3549         // Swap the operands if needed to put the OperandValToReplace on the
3550         // left, for consistency.
3551         Value *NV = CI->getOperand(1);
3552         if (NV == U.getOperandValToReplace()) {
3553           CI->setOperand(1, CI->getOperand(0));
3554           CI->setOperand(0, NV);
3555           NV = CI->getOperand(1);
3556           Changed = true;
3557         }
3558 
3559         // x == y  -->  x - y == 0
3560         const SCEV *N = SE.getSCEV(NV);
3561         if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3562             (!NV->getType()->isPointerTy() ||
3563              SE.getPointerBase(N) == SE.getPointerBase(S))) {
3564           // S is normalized, so normalize N before folding it into S
3565           // to keep the result normalized.
3566           N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3567           if (!N)
3568             continue;
3569           Kind = LSRUse::ICmpZero;
3570           S = SE.getMinusSCEV(N, S);
3571         } else if (L->isLoopInvariant(NV) &&
3572                    (!isa<Instruction>(NV) ||
3573                     DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3574                    !NV->getType()->isPointerTy()) {
3575           // If we can't generally expand the expression (e.g. it contains
3576           // a divide), but it is already at a loop invariant point before the
3577           // loop, wrap it in an unknown (to prevent the expander from trying
3578           // to re-expand in a potentially unsafe way.)  The restriction to
3579           // integer types is required because the unknown hides the base, and
3580           // SCEV can't compute the difference of two unknown pointers.
3581           N = SE.getUnknown(NV);
3582           N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3583           if (!N)
3584             continue;
3585           Kind = LSRUse::ICmpZero;
3586           S = SE.getMinusSCEV(N, S);
3587           assert(!isa<SCEVCouldNotCompute>(S));
3588         }
3589 
3590         // -1 and the negations of all interesting strides (except the negation
3591         // of -1) are now also interesting.
3592         for (size_t i = 0, e = Factors.size(); i != e; ++i)
3593           if (Factors[i] != -1)
3594             Factors.insert(-(uint64_t)Factors[i]);
3595         Factors.insert(-1);
3596       }
3597     }
3598 
3599     // Get or create an LSRUse.
3600     std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3601     size_t LUIdx = P.first;
3602     Immediate Offset = P.second;
3603     LSRUse &LU = Uses[LUIdx];
3604 
3605     // Record the fixup.
3606     LSRFixup &LF = LU.getNewFixup();
3607     LF.UserInst = UserInst;
3608     LF.OperandValToReplace = U.getOperandValToReplace();
3609     LF.PostIncLoops = TmpPostIncLoops;
3610     LF.Offset = Offset;
3611     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3612 
3613     // Create SCEV as Formula for calculating baseline cost
3614     if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3615       Formula F;
3616       F.initialMatch(S, L, SE);
3617       BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3618                                HardwareLoopProfitable);
3619       VisitedLSRUse.insert(LUIdx);
3620     }
3621 
3622     if (!LU.WidestFixupType ||
3623         SE.getTypeSizeInBits(LU.WidestFixupType) <
3624         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3625       LU.WidestFixupType = LF.OperandValToReplace->getType();
3626 
3627     // If this is the first use of this LSRUse, give it a formula.
3628     if (LU.Formulae.empty()) {
3629       InsertInitialFormula(S, LU, LUIdx);
3630       CountRegisters(LU.Formulae.back(), LUIdx);
3631     }
3632   }
3633 
3634   LLVM_DEBUG(print_fixups(dbgs()));
3635 }
3636 
3637 /// Insert a formula for the given expression into the given use, separating out
3638 /// loop-variant portions from loop-invariant and loop-computable portions.
3639 void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3640                                        size_t LUIdx) {
3641   // Mark uses whose expressions cannot be expanded.
3642   if (!Rewriter.isSafeToExpand(S))
3643     LU.RigidFormula = true;
3644 
3645   Formula F;
3646   F.initialMatch(S, L, SE);
3647   bool Inserted = InsertFormula(LU, LUIdx, F);
3648   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3649 }
3650 
3651 /// Insert a simple single-register formula for the given expression into the
3652 /// given use.
3653 void
3654 LSRInstance::InsertSupplementalFormula(const SCEV *S,
3655                                        LSRUse &LU, size_t LUIdx) {
3656   Formula F;
3657   F.BaseRegs.push_back(S);
3658   F.HasBaseReg = true;
3659   bool Inserted = InsertFormula(LU, LUIdx, F);
3660   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3661 }
3662 
3663 /// Note which registers are used by the given formula, updating RegUses.
3664 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3665   if (F.ScaledReg)
3666     RegUses.countRegister(F.ScaledReg, LUIdx);
3667   for (const SCEV *BaseReg : F.BaseRegs)
3668     RegUses.countRegister(BaseReg, LUIdx);
3669 }
3670 
3671 /// If the given formula has not yet been inserted, add it to the list, and
3672 /// return true. Return false otherwise.
3673 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3674   // Do not insert formula that we will not be able to expand.
3675   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3676          "Formula is illegal");
3677 
3678   if (!LU.InsertFormula(F, *L))
3679     return false;
3680 
3681   CountRegisters(F, LUIdx);
3682   return true;
3683 }
3684 
3685 /// Check for other uses of loop-invariant values which we're tracking. These
3686 /// other uses will pin these values in registers, making them less profitable
3687 /// for elimination.
3688 /// TODO: This currently misses non-constant addrec step registers.
3689 /// TODO: Should this give more weight to users inside the loop?
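     /// For example, if a tracked loop-invariant value %n is also used by an
     /// instruction after the loop, that outside use keeps %n live in a
     /// register regardless of what LSR chooses, so a fixup and a simple
     /// single-register formula are recorded for it.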
3690 void
3691 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3692   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3693   SmallPtrSet<const SCEV *, 32> Visited;
3694 
3695   // Don't collect outside uses if we are favoring postinc - the instructions in
3696   // the loop are more important than the ones outside of it.
3697   if (AMK == TTI::AMK_PostIndexed)
3698     return;
3699 
3700   while (!Worklist.empty()) {
3701     const SCEV *S = Worklist.pop_back_val();
3702 
3703     // Don't process the same SCEV twice
3704     if (!Visited.insert(S).second)
3705       continue;
3706 
3707     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3708       append_range(Worklist, N->operands());
3709     else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3710       Worklist.push_back(C->getOperand());
3711     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3712       Worklist.push_back(D->getLHS());
3713       Worklist.push_back(D->getRHS());
3714     } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3715       const Value *V = US->getValue();
3716       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3717         // Look for instructions defined outside the loop.
3718         if (L->contains(Inst)) continue;
3719       } else if (isa<Constant>(V))
3720         // Constants can be re-materialized.
3721         continue;
3722       for (const Use &U : V->uses()) {
3723         const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3724         // Ignore non-instructions.
3725         if (!UserInst)
3726           continue;
3727         // Don't bother if the instruction is an EHPad.
3728         if (UserInst->isEHPad())
3729           continue;
3730         // Ignore instructions in other functions (as can happen with
3731         // Constants).
3732         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3733           continue;
3734         // Ignore instructions not dominated by the loop.
3735         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3736           UserInst->getParent() :
3737           cast<PHINode>(UserInst)->getIncomingBlock(
3738             PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3739         if (!DT.dominates(L->getHeader(), UseBB))
3740           continue;
3741         // Don't bother if the instruction is in a BB which ends in an EHPad.
3742         if (UseBB->getTerminator()->isEHPad())
3743           continue;
3744 
3745         // Ignore cases in which the currently-examined value could come from
3746         // a basic block terminated with an EHPad. This checks all incoming
3747         // blocks of the phi node since it is possible that the same incoming
3748         // value comes from multiple basic blocks, only some of which may end
3749         // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3750         // pass would try to insert instructions into an EHPad, hitting an
3751         // assertion.
3752         if (isa<PHINode>(UserInst)) {
3753           const auto *PhiNode = cast<PHINode>(UserInst);
3754           bool HasIncompatibleEHPTerminatedBlock = false;
3755           llvm::Value *ExpectedValue = U;
3756           for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3757             if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3758               if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3759                 HasIncompatibleEHPTerminatedBlock = true;
3760                 break;
3761               }
3762             }
3763           }
3764           if (HasIncompatibleEHPTerminatedBlock) {
3765             continue;
3766           }
3767         }
3768 
3769         // Don't bother rewriting PHIs in catchswitch blocks.
3770         if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3771           continue;
3772         // Ignore uses which are part of other SCEV expressions, to avoid
3773         // analyzing them multiple times.
3774         if (SE.isSCEVable(UserInst->getType())) {
3775           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3776           // If the user is a no-op, look through to its uses.
3777           if (!isa<SCEVUnknown>(UserS))
3778             continue;
3779           if (UserS == US) {
3780             Worklist.push_back(
3781               SE.getUnknown(const_cast<Instruction *>(UserInst)));
3782             continue;
3783           }
3784         }
3785         // Ignore icmp instructions which are already being analyzed.
3786         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3787           unsigned OtherIdx = !U.getOperandNo();
3788           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3789           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3790             continue;
3791         }
3792 
3793         std::pair<size_t, Immediate> P =
3794             getUse(S, LSRUse::Basic, MemAccessTy());
3795         size_t LUIdx = P.first;
3796         Immediate Offset = P.second;
3797         LSRUse &LU = Uses[LUIdx];
3798         LSRFixup &LF = LU.getNewFixup();
3799         LF.UserInst = const_cast<Instruction *>(UserInst);
3800         LF.OperandValToReplace = U;
3801         LF.Offset = Offset;
3802         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3803         if (!LU.WidestFixupType ||
3804             SE.getTypeSizeInBits(LU.WidestFixupType) <
3805             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3806           LU.WidestFixupType = LF.OperandValToReplace->getType();
3807         InsertSupplementalFormula(US, LU, LUIdx);
3808         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3809         break;
3810       }
3811     }
3812   }
3813 }
3814 
3815 /// Split S into subexpressions which can be pulled out into separate
3816 /// registers. If C is non-null, multiply each subexpression by C.
3817 ///
3818 /// Return remainder expression after factoring the subexpressions captured by
3819 /// Ops. If Ops is complete, return NULL.
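     /// For example, ((4 * %x) + %y + {0,+,8}<%L>) yields the subexpressions
     /// (4 * %x), %y and {0,+,8}<%L> in Ops with a null remainder, while
     /// {(%a + %b),+,8}<%L> yields %a and %b in Ops and returns {0,+,8}<%L>
     /// as the remainder.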
3820 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3821                                    SmallVectorImpl<const SCEV *> &Ops,
3822                                    const Loop *L,
3823                                    ScalarEvolution &SE,
3824                                    unsigned Depth = 0) {
3825   // Arbitrarily cap recursion to protect compile time.
3826   if (Depth >= 3)
3827     return S;
3828 
3829   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3830     // Break out add operands.
3831     for (const SCEV *S : Add->operands()) {
3832       const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3833       if (Remainder)
3834         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3835     }
3836     return nullptr;
3837   }
3838   const SCEV *Start, *Step;
3839   const SCEVConstant *Op0;
3840   const SCEV *Op1;
3841   if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3842     // Split a non-zero base out of an addrec.
3843     if (Start->isZero())
3844       return S;
3845 
3846     const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3847     // Split the non-zero AddRec unless it is part of a nested recurrence that
3848     // does not pertain to this loop.
3849     if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3850                       !isa<SCEVAddRecExpr>(Remainder))) {
3851       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3852       Remainder = nullptr;
3853     }
3854     if (Remainder != Start) {
3855       if (!Remainder)
3856         Remainder = SE.getConstant(S->getType(), 0);
3857       return SE.getAddRecExpr(Remainder, Step,
3858                               cast<SCEVAddRecExpr>(S)->getLoop(),
3859                               // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3860                               SCEV::FlagAnyWrap);
3861     }
3862   } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3863     // Break (C * (a + b + c)) into C*a + C*b + C*c.
3864     C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3865     const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3866     if (Remainder)
3867       Ops.push_back(SE.getMulExpr(C, Remainder));
3868     return nullptr;
3869   }
3870   return S;
3871 }
3872 
3873 /// Return true if the SCEV represents a value that may end up as a
3874 /// post-increment operation.
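     /// For example, an address expression {%base,+,4}<%L> with a loop-invariant,
     /// non-constant start and a constant step may be lowered with a post-indexed
     /// load or store on targets where that is legal.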
3875 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3876                               LSRUse &LU, const SCEV *S, const Loop *L,
3877                               ScalarEvolution &SE) {
3878   if (LU.Kind != LSRUse::Address ||
3879       !LU.AccessTy.getType()->isIntOrIntVectorTy())
3880     return false;
3881   const SCEV *Start;
3882   if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3883     return false;
3884   // Check if a post-indexed load/store can be used.
3885   if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3886       TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3887     if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3888       return true;
3889   }
3890   return false;
3891 }
3892 
3893 /// Helper function for LSRInstance::GenerateReassociations.
3894 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3895                                              const Formula &Base,
3896                                              unsigned Depth, size_t Idx,
3897                                              bool IsScaledReg) {
3898   const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3899   // Don't generate reassociations for the base register of a value that
3900   // may generate a post-increment operator. The reason is that the
3901   // reassociations cause extra base+register formulae to be created,
3902   // and possibly chosen, but the post-increment is more efficient.
3903   if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3904     return;
3905   SmallVector<const SCEV *, 8> AddOps;
3906   const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3907   if (Remainder)
3908     AddOps.push_back(Remainder);
3909 
3910   if (AddOps.size() == 1)
3911     return;
3912 
3913   for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3914                                                      JE = AddOps.end();
3915        J != JE; ++J) {
3916     // Loop-variant "unknown" values are uninteresting; we won't be able to
3917     // do anything meaningful with them.
3918     if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3919       continue;
3920 
3921     // Don't pull a constant into a register if the constant could be folded
3922     // into an immediate field.
3923     if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3924                          LU.AccessTy, *J, Base.getNumRegs() > 1))
3925       continue;
3926 
3927     // Collect all operands except *J.
3928     SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3929     InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3930 
3931     // Don't leave just a constant behind in a register if the constant could
3932     // be folded into an immediate field.
3933     if (InnerAddOps.size() == 1 &&
3934         isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3935                          LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3936       continue;
3937 
3938     const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3939     if (InnerSum->isZero())
3940       continue;
3941     Formula F = Base;
3942 
3943     if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3944       continue;
3945 
3946     // Add the remaining pieces of the add back into the new formula.
3947     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3948     if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3949         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3950                                 InnerSumSC->getValue()->getZExtValue())) {
3951       F.UnfoldedOffset =
3952           Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3953                               InnerSumSC->getValue()->getZExtValue());
3954       if (IsScaledReg) {
3955         F.ScaledReg = nullptr;
3956         F.Scale = 0;
3957       } else
3958         F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3959     } else if (IsScaledReg)
3960       F.ScaledReg = InnerSum;
3961     else
3962       F.BaseRegs[Idx] = InnerSum;
3963 
3964     // Add J as its own register, or an unfolded immediate.
3965     const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3966     if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3967         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3968                                 SC->getValue()->getZExtValue()))
3969       F.UnfoldedOffset =
3970           Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3971                               SC->getValue()->getZExtValue());
3972     else
3973       F.BaseRegs.push_back(*J);
3974     // We may have changed the number of registers in the base regs; adjust
3975     // the formula accordingly.
3976     F.canonicalize(*L);
3977 
3978     if (InsertFormula(LU, LUIdx, F))
3979       // If that formula hadn't been seen before, recurse to find more like
3980       // it.
3981       // Charge an extra Log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2,
3982       // against the recursion depth, because Depth alone is not enough to bound
3983       // compile time. This means that every time AddOps.size() exceeds 16^x we
3984       // add x to Depth.
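           // For example, with 256 operands Log2_32(256) >> 2 == 2, so the
           // recursive call is charged three depth levels instead of one.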
3985       GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3986                              Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3987   }
3988 }
3989 
3990 /// Split out subexpressions from adds and the bases of addrecs.
3991 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3992                                          Formula Base, unsigned Depth) {
3993   assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3994   // Arbitrarily cap recursion to protect compile time.
3995   if (Depth >= 3)
3996     return;
3997 
3998   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3999     GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4000 
4001   if (Base.Scale == 1)
4002     GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4003                                /* Idx */ -1, /* IsScaledReg */ true);
4004 }
4005 
4006 /// Generate a formula consisting of all of the loop-dominating registers added
4007 /// into a single register.
4008 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4009                                        Formula Base) {
4010   // This method is only interesting on a plurality of registers.
4011   if (Base.BaseRegs.size() + (Base.Scale == 1) +
4012           (Base.UnfoldedOffset.isNonZero()) <=
4013       1)
4014     return;
4015 
4016   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4017   // processing the formula.
4018   Base.unscale();
4019   SmallVector<const SCEV *, 4> Ops;
4020   Formula NewBase = Base;
4021   NewBase.BaseRegs.clear();
4022   Type *CombinedIntegerType = nullptr;
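       // Partition the base registers: registers whose value dominates the loop
       // and does not evolve in L are collected into Ops to be combined into a
       // single register; everything else stays in NewBase.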
4023   for (const SCEV *BaseReg : Base.BaseRegs) {
4024     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4025         !SE.hasComputableLoopEvolution(BaseReg, L)) {
4026       if (!CombinedIntegerType)
4027         CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4028       Ops.push_back(BaseReg);
4029     }
4030     else
4031       NewBase.BaseRegs.push_back(BaseReg);
4032   }
4033 
4034   // If no register is relevant, we're done.
4035   if (Ops.size() == 0)
4036     return;
4037 
4038   // Utility function for generating the required variants of the combined
4039   // registers.
4040   auto GenerateFormula = [&](const SCEV *Sum) {
4041     Formula F = NewBase;
4042 
4043     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4044     // opportunity to fold something. For now, just ignore such cases
4045     // rather than proceed with zero in a register.
4046     if (Sum->isZero())
4047       return;
4048 
4049     F.BaseRegs.push_back(Sum);
4050     F.canonicalize(*L);
4051     (void)InsertFormula(LU, LUIdx, F);
4052   };
4053 
4054   // If we collected at least two registers, generate a formula combining them.
4055   if (Ops.size() > 1) {
4056     SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4057     GenerateFormula(SE.getAddExpr(OpsCopy));
4058   }
4059 
4060   // If we have an unfolded offset, generate a formula combining it with the
4061   // registers collected.
4062   if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4063     assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4064     Ops.push_back(SE.getConstant(CombinedIntegerType,
4065                                  NewBase.UnfoldedOffset.getFixedValue(), true));
4066     NewBase.UnfoldedOffset = Immediate::getFixed(0);
4067     GenerateFormula(SE.getAddExpr(Ops));
4068   }
4069 }
4070 
4071 /// Helper function for LSRInstance::GenerateSymbolicOffsets.
4072 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4073                                               const Formula &Base, size_t Idx,
4074                                               bool IsScaledReg) {
4075   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4076   GlobalValue *GV = ExtractSymbol(G, SE);
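       // ExtractSymbol peels a GlobalValue addend off of G (updating G in
       // place) and returns it; below, the symbol becomes the formula's BaseGV
       // and the stripped remainder replaces the base or scaled register.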
4077   if (G->isZero() || !GV)
4078     return;
4079   Formula F = Base;
4080   F.BaseGV = GV;
4081   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4082     return;
4083   if (IsScaledReg)
4084     F.ScaledReg = G;
4085   else
4086     F.BaseRegs[Idx] = G;
4087   (void)InsertFormula(LU, LUIdx, F);
4088 }
4089 
4090 /// Generate reuse formulae using symbolic offsets.
4091 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4092                                           Formula Base) {
4093   // We can't add a symbolic offset if the address already contains one.
4094   if (Base.BaseGV) return;
4095 
4096   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4097     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4098   if (Base.Scale == 1)
4099     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4100                                 /* IsScaledReg */ true);
4101 }
4102 
4103 /// Helper function for LSRInstance::GenerateConstantOffsets.
4104 void LSRInstance::GenerateConstantOffsetsImpl(
4105     LSRUse &LU, unsigned LUIdx, const Formula &Base,
4106     const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4107 
4108   auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4109     Formula F = Base;
4110     if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4111       return;
4112     F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
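         // Offset is being moved out of the immediate and into the register
         // below (NewG = G + Offset), so the formula's immediate shrinks by the
         // same amount to keep the overall value unchanged.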
4113 
4114     if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4115       // Add the offset to the base register.
4116       const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4117       const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4118       // If it cancelled out, drop the base register, otherwise update it.
4119       if (NewG->isZero()) {
4120         if (IsScaledReg) {
4121           F.Scale = 0;
4122           F.ScaledReg = nullptr;
4123         } else
4124           F.deleteBaseReg(F.BaseRegs[Idx]);
4125         F.canonicalize(*L);
4126       } else if (IsScaledReg)
4127         F.ScaledReg = NewG;
4128       else
4129         F.BaseRegs[Idx] = NewG;
4130 
4131       (void)InsertFormula(LU, LUIdx, F);
4132     }
4133   };
4134 
4135   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4136 
4137   // With constant offsets and constant steps, we can generate pre-inc
4138   // accesses by having the offset equal the step. So, for access #0 with a
4139   // step of 8, we generate a G - 8 base which would require the first access
4140   // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4141   // for itself and hopefully becomes the base for other accesses. This means
4142   // that a single pre-indexed access can be generated to become the new
4143   // base pointer for each iteration of the loop, resulting in no extra add/sub
4144   // instructions for pointer updating.
4145   if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4146     const APInt *StepInt;
4147     if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4148       int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4149                                            : StepInt->getZExtValue();
4150 
4151       for (Immediate Offset : Worklist) {
4152         if (Offset.isFixed()) {
4153           Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4154           GenerateOffset(G, Offset);
4155         }
4156       }
4157     }
4158   }
4159   for (Immediate Offset : Worklist)
4160     GenerateOffset(G, Offset);
4161 
4162   Immediate Imm = ExtractImmediate(G, SE);
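       // ExtractImmediate peels any constant addend off of G (updating G in
       // place); if there was one, fold it into the formula's base offset and
       // keep the stripped register.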
4163   if (G->isZero() || Imm.isZero() ||
4164       !Base.BaseOffset.isCompatibleImmediate(Imm))
4165     return;
4166   Formula F = Base;
4167   F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4168   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4169     return;
4170   if (IsScaledReg) {
4171     F.ScaledReg = G;
4172   } else {
4173     F.BaseRegs[Idx] = G;
4174     // We may generate a non-canonical Formula if G is a recurrent expression
4175     // register related to the current loop while F.ScaledReg is not.
4176     F.canonicalize(*L);
4177   }
4178   (void)InsertFormula(LU, LUIdx, F);
4179 }
4180 
4181 /// Generate reuse formulae using constant offsets.
4182 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4183                                           Formula Base) {
4184   // TODO: For now, just add the min and max offset, because it usually isn't
4185   // worthwhile looking at everything in between.
4186   SmallVector<Immediate, 2> Worklist;
4187   Worklist.push_back(LU.MinOffset);
4188   if (LU.MaxOffset != LU.MinOffset)
4189     Worklist.push_back(LU.MaxOffset);
4190 
4191   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4192     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4193   if (Base.Scale == 1)
4194     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4195                                 /* IsScaledReg */ true);
4196 }
4197 
4198 /// For ICmpZero, check to see if we can scale up the comparison. For example, x
4199 /// == y -> x*c == y*c.
4200 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4201                                          Formula Base) {
4202   if (LU.Kind != LSRUse::ICmpZero) return;
4203 
4204   // Determine the integer type for the base formula.
4205   Type *IntTy = Base.getType();
4206   if (!IntTy) return;
4207   if (SE.getTypeSizeInBits(IntTy) > 64) return;
4208 
4209   // Don't do this if there is more than one offset.
4210   if (LU.MinOffset != LU.MaxOffset) return;
4211 
4212   // Check if the transformation is valid. It is illegal to multiply a pointer.
4213   if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4214     return;
4215   for (const SCEV *BaseReg : Base.BaseRegs)
4216     if (BaseReg->getType()->isPointerTy())
4217       return;
4218   assert(!Base.BaseGV && "ICmpZero use is not legal!");
4219 
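       // Scaling is sound here because the formula is compared against zero:
       // multiplying by a nonzero Factor (with the overflow checks below)
       // preserves the comparison, and the scaled form may reuse a stride that
       // already exists in the loop.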
4220   // Check each interesting stride.
4221   for (int64_t Factor : Factors) {
4222     // Check that Factor can be represented by IntTy
4223     if (!ConstantInt::isValueValidForType(IntTy, Factor))
4224       continue;
4225     // Check that the multiplication doesn't overflow.
4226     if (Base.BaseOffset.isMin() && Factor == -1)
4227       continue;
4228     // Not supporting scalable immediates.
4229     if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4230       continue;
4231     Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4232     assert(Factor != 0 && "Zero factor not expected!");
4233     if (NewBaseOffset.getFixedValue() / Factor !=
4234         Base.BaseOffset.getFixedValue())
4235       continue;
4236     // If the offset will be truncated at this use, check that it is in bounds.
4237     if (!IntTy->isPointerTy() &&
4238         !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4239       continue;
4240 
4241     // Check that multiplying with the use offset doesn't overflow.
4242     Immediate Offset = LU.MinOffset;
4243     if (Offset.isMin() && Factor == -1)
4244       continue;
4245     Offset = Offset.mulUnsigned(Factor);
4246     if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4247       continue;
4248     // If the offset will be truncated at this use, check that it is in bounds.
4249     if (!IntTy->isPointerTy() &&
4250         !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4251       continue;
4252 
4253     Formula F = Base;
4254     F.BaseOffset = NewBaseOffset;
4255 
4256     // Check that this scale is legal.
4257     if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4258       continue;
4259 
4260     // Compensate for the use having MinOffset built into it.
4261     F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4262 
4263     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4264 
4265     // Check that multiplying with each base register doesn't overflow.
4266     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4267       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4268       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4269         goto next;
4270     }
4271 
4272     // Check that multiplying with the scaled register doesn't overflow.
4273     if (F.ScaledReg) {
4274       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4275       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4276         continue;
4277     }
4278 
4279     // Check that multiplying with the unfolded offset doesn't overflow.
4280     if (F.UnfoldedOffset.isNonZero()) {
4281       if (F.UnfoldedOffset.isMin() && Factor == -1)
4282         continue;
4283       F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4284       if (F.UnfoldedOffset.getFixedValue() / Factor !=
4285           Base.UnfoldedOffset.getFixedValue())
4286         continue;
4287       // If the offset will be truncated, check that it is in bounds.
4288       if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4289                                        IntTy, F.UnfoldedOffset.getFixedValue()))
4290         continue;
4291     }
4292 
4293     // If we make it here and it's legal, add it.
4294     (void)InsertFormula(LU, LUIdx, F);
4295   next:;
4296   }
4297 }
4298 
4299 /// Generate stride factor reuse formulae, for example by making use of
4300 /// scaled-offset addressing modes.
4301 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4302   // Determine the integer type for the base formula.
4303   Type *IntTy = Base.getType();
4304   if (!IntTy) return;
4305 
4306   // If this Formula already has a scaled register, we can't add another one.
4307   // Try to unscale the formula to generate a better scale.
4308   if (Base.Scale != 0 && !Base.unscale())
4309     return;
4310 
4311   assert(Base.Scale == 0 && "unscale did not do its job!");
4312 
4313   // Check each interesting stride.
4314   for (int64_t Factor : Factors) {
4315     Base.Scale = Factor;
4316     Base.HasBaseReg = Base.BaseRegs.size() > 1;
4317     // Check whether this scale is going to be legal.
4318     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4319                     Base)) {
4320       // As a special case, handle out-of-loop Basic users as Special uses.
4321       // TODO: Reconsider this special case.
4322       if (LU.Kind == LSRUse::Basic &&
4323           isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4324                      LU.AccessTy, Base) &&
4325           LU.AllFixupsOutsideLoop)
4326         LU.Kind = LSRUse::Special;
4327       else
4328         continue;
4329     }
4330     // For an ICmpZero, negating a solitary base register won't lead to
4331     // new solutions.
4332     if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4333         Base.BaseOffset.isZero() && !Base.BaseGV)
4334       continue;
4335     // For each addrec base reg, if its loop is the current loop, apply the scale.
4336     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4337       const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4338       if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4339         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4340         if (FactorS->isZero())
4341           continue;
4342         // Divide out the factor, ignoring high bits, since we'll be
4343         // scaling the value back up in the end.
4344         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4345           if (!Quotient->isZero()) {
4346             // TODO: This could be optimized to avoid all the copying.
4347             Formula F = Base;
4348             F.ScaledReg = Quotient;
4349             F.deleteBaseReg(F.BaseRegs[i]);
4350             // The canonical representation of 1*reg is reg, which is already in
4351             // Base. In that case, do not try to insert the formula; it will be
4352             // rejected anyway.
4353             if (F.Scale == 1 && (F.BaseRegs.empty() ||
4354                                  (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4355               continue;
4356             // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
4357             // non-canonical Formula whose ScaledReg's loop is not L.
4358             if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4359               F.canonicalize(*L);
4360             (void)InsertFormula(LU, LUIdx, F);
4361           }
4362       }
4363     }
4364   }
4365 }
4366 
4367 /// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4368 /// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4369 /// perform the extension/truncation and normalize again, as the normalized form
4370 /// can result in folds that are not valid in the post-inc use contexts. The
4371 /// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4372 static const SCEV *
4373 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4374                                    const SCEV *Expr, Type *ToTy,
4375                                    ScalarEvolution &SE) {
4376   const SCEV *Result = nullptr;
4377   for (auto &L : Loops) {
4378     auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4379     const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4380     const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4381     if (!New || (Result && New != Result))
4382       return nullptr;
4383     Result = New;
4384   }
4385 
4386   assert(Result && "failed to create expression");
4387   return Result;
4388 }
4389 
4390 /// Generate reuse formulae from different IV types.
4391 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4392   // Don't bother truncating symbolic values.
4393   if (Base.BaseGV) return;
4394 
4395   // Determine the integer type for the base formula.
4396   Type *DstTy = Base.getType();
4397   if (!DstTy) return;
4398   if (DstTy->isPointerTy())
4399     return;
4400 
4401   // It is invalid to extend a pointer type, so exit early if ScaledReg or
4402   // any of the BaseRegs are pointers.
4403   if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4404     return;
4405   if (any_of(Base.BaseRegs,
4406              [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4407     return;
4408 
4409   SmallVector<PostIncLoopSet> Loops;
4410   for (auto &LF : LU.Fixups)
4411     Loops.push_back(LF.PostIncLoops);
4412 
4413   for (Type *SrcTy : Types) {
4414     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4415       Formula F = Base;
4416 
4417       // Sometimes SCEV is able to prove a register is zero during the ext
4418       // transform. This can happen if SCEV did not apply all possible folds
4419       // while creating the initial node (perhaps due to depth limitations),
4420       // but can apply them while taking the extension.
4421       if (F.ScaledReg) {
4422         const SCEV *NewScaledReg =
4423             getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4424         if (!NewScaledReg || NewScaledReg->isZero())
4425           continue;
4426         F.ScaledReg = NewScaledReg;
4427       }
4428       bool HasZeroBaseReg = false;
4429       for (const SCEV *&BaseReg : F.BaseRegs) {
4430         const SCEV *NewBaseReg =
4431             getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4432         if (!NewBaseReg || NewBaseReg->isZero()) {
4433           HasZeroBaseReg = true;
4434           break;
4435         }
4436         BaseReg = NewBaseReg;
4437       }
4438       if (HasZeroBaseReg)
4439         continue;
4440 
4441       // TODO: This assumes we've done basic processing on all uses and
4442       // have an idea what the register usage is.
4443       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4444         continue;
4445 
4446       F.canonicalize(*L);
4447       (void)InsertFormula(LU, LUIdx, F);
4448     }
4449   }
4450 }
4451 
4452 namespace {
4453 
4454 /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4455 /// modifications so that the search phase doesn't have to worry about the data
4456 /// structures moving underneath it.
4457 struct WorkItem {
4458   size_t LUIdx;
4459   Immediate Imm;
4460   const SCEV *OrigReg;
4461 
4462   WorkItem(size_t LI, Immediate I, const SCEV *R)
4463       : LUIdx(LI), Imm(I), OrigReg(R) {}
4464 
4465   void print(raw_ostream &OS) const;
4466   void dump() const;
4467 };
4468 
4469 } // end anonymous namespace
4470 
4471 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4472 void WorkItem::print(raw_ostream &OS) const {
4473   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4474      << " , add offset " << Imm;
4475 }
4476 
4477 LLVM_DUMP_METHOD void WorkItem::dump() const {
4478   print(errs()); errs() << '\n';
4479 }
4480 #endif
4481 
4482 /// Look for registers which are a constant distance apart and try to form reuse
4483 /// opportunities between them.
4484 void LSRInstance::GenerateCrossUseConstantOffsets() {
4485   // Group the registers by their value without any added constant offset.
4486   using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4487 
4488   DenseMap<const SCEV *, ImmMapTy> Map;
4489   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4490   SmallVector<const SCEV *, 8> Sequence;
4491   for (const SCEV *Use : RegUses) {
4492     const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4493     Immediate Imm = ExtractImmediate(Reg, SE);
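         // Strip the constant addend from Use so that registers which differ
         // only by a constant (e.g. A, A+4, A+8) all group under the same base
         // Reg, each keyed by its own offset.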
4494     auto Pair = Map.try_emplace(Reg);
4495     if (Pair.second)
4496       Sequence.push_back(Reg);
4497     Pair.first->second.insert(std::make_pair(Imm, Use));
4498     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4499   }
4500 
4501   // Now examine each set of registers with the same base value. Build up
4502   // a list of work to do and do the work in a separate step so that we're
4503   // not adding formulae and register counts while we're searching.
4504   SmallVector<WorkItem, 32> WorkItems;
4505   SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4506       UniqueItems;
4507   for (const SCEV *Reg : Sequence) {
4508     const ImmMapTy &Imms = Map.find(Reg)->second;
4509 
4510     // It's not worthwhile looking for reuse if there's only one offset.
4511     if (Imms.size() == 1)
4512       continue;
4513 
4514     LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4515                for (const auto &Entry : Imms)
4516                  dbgs() << ' ' << Entry.first;
4517                dbgs() << '\n');
4519 
4520     // Examine each offset.
4521     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4522          J != JE; ++J) {
4523       const SCEV *OrigReg = J->second;
4524 
4525       Immediate JImm = J->first;
4526       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4527 
4528       if (!isa<SCEVConstant>(OrigReg) &&
4529           UsedByIndicesMap[Reg].count() == 1) {
4530         LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4531                           << '\n');
4532         continue;
4533       }
4534 
4535       // Conservatively examine offsets between this orig reg and a few selected
4536       // other orig regs.
4537       Immediate First = Imms.begin()->first;
4538       Immediate Last = std::prev(Imms.end())->first;
4539       if (!First.isCompatibleImmediate(Last)) {
4540         LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4541                           << "\n");
4542         continue;
4543       }
4544       // Only scalable if both terms are scalable, or if one is scalable and
4545       // the other is 0.
4546       bool Scalable = First.isScalable() || Last.isScalable();
4547       int64_t FI = First.getKnownMinValue();
4548       int64_t LI = Last.getKnownMinValue();
4549       // Compute (First + Last) / 2 without overflow using the fact that
4550       // First + Last = 2 * (First & Last) + (First ^ Last).
4551       int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4552       // If the result is negative and FI is odd and LI even (or vice versa),
4553       // we rounded towards -inf. Add 1 in that case, to round towards 0.
4554       Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
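           // e.g. FI = 6, LI = 11: (6 & 11) + ((6 ^ 11) >> 1) == 2 + 6 == 8.
           // With FI = -7, LI = 2 the raw average is -3 (rounded towards -inf);
           // the correction above adds 1, giving -2 (rounded towards zero).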
4555       ImmMapTy::const_iterator OtherImms[] = {
4556           Imms.begin(), std::prev(Imms.end()),
4557           Imms.lower_bound(Immediate::get(Avg, Scalable))};
4558       for (const auto &M : OtherImms) {
4559         if (M == J || M == JE) continue;
4560         if (!JImm.isCompatibleImmediate(M->first))
4561           continue;
4562 
4563         // Compute the difference between the two.
4564         Immediate Imm = JImm.subUnsigned(M->first);
4565         for (unsigned LUIdx : UsedByIndices.set_bits())
4566           // Make a memo of this use, offset, and register tuple.
4567           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4568             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4569       }
4570     }
4571   }
4572 
4573   Map.clear();
4574   Sequence.clear();
4575   UsedByIndicesMap.clear();
4576   UniqueItems.clear();
4577 
4578   // Now iterate through the worklist and add new formulae.
4579   for (const WorkItem &WI : WorkItems) {
4580     size_t LUIdx = WI.LUIdx;
4581     LSRUse &LU = Uses[LUIdx];
4582     Immediate Imm = WI.Imm;
4583     const SCEV *OrigReg = WI.OrigReg;
4584 
4585     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4586     const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
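         // The transformations below add Imm to the formula's immediate and
         // compensate by using (reg - Imm) as the register, so the computed
         // value is unchanged.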
4587     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4588 
4589     // TODO: Use a more targeted data structure.
4590     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4591       Formula F = LU.Formulae[L];
4592       // FIXME: The code for the scaled and unscaled registers looks
4593       // very similar but slightly different. Investigate if they
4594       // could be merged. That way, we would not have to unscale the
4595       // Formula.
4596       F.unscale();
4597       // Use the immediate in the scaled register.
4598       if (F.ScaledReg == OrigReg) {
4599         if (!F.BaseOffset.isCompatibleImmediate(Imm))
4600           continue;
4601         Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4602         // Don't create 50 + reg(-50).
4603         const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4604         if (F.referencesReg(S))
4605           continue;
4606         Formula NewF = F;
4607         NewF.BaseOffset = Offset;
4608         if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4609                         NewF))
4610           continue;
4611         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4612 
4613         // If the new scaled register is a constant, and adding the constant
4614         // value to the immediate would produce a value closer to zero than the
4615         // immediate itself, then the formula isn't worthwhile.
4616         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4617           // FIXME: Do we need to do something for scalable immediates here?
4618           //        A scalable SCEV won't be constant, but we might still have
4619           //        something in the offset? Bail out for now to be safe.
4620           if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4621             continue;
4622           if (C->getValue()->isNegative() !=
4623                   (NewF.BaseOffset.isLessThanZero()) &&
4624               (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4625                   .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4626             continue;
4627         }
4628 
4629         // OK, looks good.
4630         NewF.canonicalize(*this->L);
4631         (void)InsertFormula(LU, LUIdx, NewF);
4632       } else {
4633         // Use the immediate in a base register.
4634         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4635           const SCEV *BaseReg = F.BaseRegs[N];
4636           if (BaseReg != OrigReg)
4637             continue;
4638           Formula NewF = F;
4639           if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4640               !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4641               !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4642             continue;
4643           NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4644           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4645                           LU.Kind, LU.AccessTy, NewF)) {
4646             if (AMK == TTI::AMK_PostIndexed &&
4647                 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4648               continue;
4649             Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4650             if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4651               continue;
4652             NewF = F;
4653             NewF.UnfoldedOffset = NewUnfoldedOffset;
4654           }
4655           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4656 
4657           // If the new formula has a constant in a register, and adding the
4658           // constant value to the immediate would produce a value closer to
4659           // zero than the immediate itself, then the formula isn't worthwhile.
4660           for (const SCEV *NewReg : NewF.BaseRegs)
4661             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4662               if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4663                 goto skip_formula;
4664               if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4665                       .abs()
4666                       .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4667                   (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4668                           .countr_zero() >=
4669                       (unsigned)llvm::countr_zero<uint64_t>(
4670                           NewF.BaseOffset.getFixedValue()))
4671                 goto skip_formula;
4672             }
4673 
4674           // Ok, looks good.
4675           NewF.canonicalize(*this->L);
4676           (void)InsertFormula(LU, LUIdx, NewF);
4677           break;
4678         skip_formula:;
4679         }
4680       }
4681     }
4682   }
4683 }
4684 
4685 /// Generate formulae for each use.
4686 void
4687 LSRInstance::GenerateAllReuseFormulae() {
4688   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4689   // queries are more precise.
4690   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4691     LSRUse &LU = Uses[LUIdx];
4692     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4693       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4694     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4695       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4696   }
4697   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4698     LSRUse &LU = Uses[LUIdx];
4699     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4700       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4701     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4702       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4703     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4704       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4705     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4706       GenerateScales(LU, LUIdx, LU.Formulae[i]);
4707   }
4708   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4709     LSRUse &LU = Uses[LUIdx];
4710     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4711       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4712   }
4713 
4714   GenerateCrossUseConstantOffsets();
4715 
4716   LLVM_DEBUG(dbgs() << "\n"
4717                        "After generating reuse formulae:\n";
4718              print_uses(dbgs()));
4719 }
4720 
4721 /// If there are multiple formulae with the same set of registers used
4722 /// by other uses, pick the best one and delete the others.
4723 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4724   DenseSet<const SCEV *> VisitedRegs;
4725   SmallPtrSet<const SCEV *, 16> Regs;
4726   SmallPtrSet<const SCEV *, 16> LoserRegs;
4727 #ifndef NDEBUG
4728   bool ChangedFormulae = false;
4729 #endif
4730 
4731   // Collect the best formula for each unique set of shared registers. This
4732   // is reset for each use.
4733   using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4734 
4735   BestFormulaeTy BestFormulae;
4736 
4737   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4738     LSRUse &LU = Uses[LUIdx];
4739     LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4740                dbgs() << '\n');
4741 
4742     bool Any = false;
4743     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4744          FIdx != NumForms; ++FIdx) {
4745       Formula &F = LU.Formulae[FIdx];
4746 
4747       // Some formulas are instant losers. For example, they may depend on
4748       // nonexistent AddRecs from other loops. These need to be filtered
4749       // immediately, otherwise heuristics could choose them over others leading
4750       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4751       // avoids the need to recompute this information across formulae using the
4752       // same bad AddRec. Passing LoserRegs is also essential unless we remove
4753       // the corresponding bad register from the Regs set.
4754       Cost CostF(L, SE, TTI, AMK);
4755       Regs.clear();
4756       CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4757                         &LoserRegs);
4758       if (CostF.isLoser()) {
4759         // During initial formula generation, undesirable formulae are generated
4760         // by uses within other loops that have some non-trivial address mode or
4761         // use the postinc form of the IV. LSR needs to provide these formulae
4762         // as the basis of rediscovering the desired formula that uses an AddRec
4763         // corresponding to the existing phi. Once all formulae have been
4764         // generated, these initial losers may be pruned.
4765         LLVM_DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
4766                    dbgs() << "\n");
4767       }
4768       else {
4769         SmallVector<const SCEV *, 4> Key;
4770         for (const SCEV *Reg : F.BaseRegs) {
4771           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4772             Key.push_back(Reg);
4773         }
4774         if (F.ScaledReg &&
4775             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4776           Key.push_back(F.ScaledReg);
4777         // An unstable sort in host order is fine, because this is only used
4778         // for uniquifying.
4779         llvm::sort(Key);
4780 
4781         std::pair<BestFormulaeTy::const_iterator, bool> P =
4782           BestFormulae.insert(std::make_pair(Key, FIdx));
4783         if (P.second)
4784           continue;
4785 
4786         Formula &Best = LU.Formulae[P.first->second];
4787 
4788         Cost CostBest(L, SE, TTI, AMK);
4789         Regs.clear();
4790         CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4791                              HardwareLoopProfitable);
4792         if (CostF.isLess(CostBest))
4793           std::swap(F, Best);
4794         LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
4795                    dbgs() << "\n"
4796                              "    in favor of formula ";
4797                    Best.print(dbgs()); dbgs() << '\n');
4798       }
4799 #ifndef NDEBUG
4800       ChangedFormulae = true;
4801 #endif
4802       LU.DeleteFormula(F);
4803       --FIdx;
4804       --NumForms;
4805       Any = true;
4806     }
4807 
4808     // Now that we've filtered out some formulae, recompute the Regs set.
4809     if (Any)
4810       LU.RecomputeRegs(LUIdx, RegUses);
4811 
4812     // Reset this to prepare for the next use.
4813     BestFormulae.clear();
4814   }
4815 
4816   LLVM_DEBUG(if (ChangedFormulae) {
4817     dbgs() << "\n"
4818               "After filtering out undesirable candidates:\n";
4819     print_uses(dbgs());
4820   });
4821 }
4822 
4823 /// Estimate the worst-case number of solutions the solver might have to
4824 /// consider. It almost never considers this many solutions because it prune the
4825 /// consider. It almost never considers this many solutions because it prunes the
4826 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4827   size_t Power = 1;
4828   for (const LSRUse &LU : Uses) {
4829     size_t FSize = LU.Formulae.size();
4830     if (FSize >= ComplexityLimit) {
4831       Power = ComplexityLimit;
4832       break;
4833     }
4834     Power *= FSize;
4835     if (Power >= ComplexityLimit)
4836       break;
4837   }
4838   return Power;
4839 }
4840 
4841 /// When one formula uses a superset of the registers of another formula, it
4842 /// won't help reduce register pressure (though it may not necessarily hurt
4843 /// register pressure); remove it to simplify the system.
4844 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4845   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4846     LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4847 
4848     LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4849                          "which use a superset of registers used by other "
4850                          "formulae.\n");
4851 
4852     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4853       LSRUse &LU = Uses[LUIdx];
4854       bool Any = false;
4855       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4856         Formula &F = LU.Formulae[i];
4857         if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4858           continue;
4859         // Look for a formula with a constant or GV in a register. If the use
4860         // also has a formula with that same value in an immediate field,
4861         // delete the one that uses a register.
4862         for (SmallVectorImpl<const SCEV *>::const_iterator
4863              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4864           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4865             Formula NewF = F;
4866             // FIXME: Formulas should store bitwidth to do wrapping properly.
4867             //       See PR41034.
4868             NewF.BaseOffset =
4869                 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4870                                     (uint64_t)C->getValue()->getSExtValue());
4871             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4872                                 (I - F.BaseRegs.begin()));
4873             if (LU.HasFormulaWithSameRegs(NewF)) {
4874               LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4875                          dbgs() << '\n');
4876               LU.DeleteFormula(F);
4877               --i;
4878               --e;
4879               Any = true;
4880               break;
4881             }
4882           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4883             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4884               if (!F.BaseGV) {
4885                 Formula NewF = F;
4886                 NewF.BaseGV = GV;
4887                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4888                                     (I - F.BaseRegs.begin()));
4889                 if (LU.HasFormulaWithSameRegs(NewF)) {
4890                   LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4891                              dbgs() << '\n');
4892                   LU.DeleteFormula(F);
4893                   --i;
4894                   --e;
4895                   Any = true;
4896                   break;
4897                 }
4898               }
4899           }
4900         }
4901       }
4902       if (Any)
4903         LU.RecomputeRegs(LUIdx, RegUses);
4904     }
4905 
4906     LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4907   }
4908 }
4909 
4910 /// When there are many registers for expressions like A, A+1, A+2, etc.,
4911 /// allocate a single register for them.
4912 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4913   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4914     return;
4915 
4916   LLVM_DEBUG(
4917       dbgs() << "The search space is too complex.\n"
4918                 "Narrowing the search space by assuming that uses separated "
4919                 "by a constant offset will use the same registers.\n");
4920 
4921   // This is especially useful for unrolled loops.
4922 
4923   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4924     LSRUse &LU = Uses[LUIdx];
4925     for (const Formula &F : LU.Formulae) {
4926       if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4927         continue;
4928 
4929       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4930       if (!LUThatHas)
4931         continue;
4932 
4933       if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4934                               LU.Kind, LU.AccessTy))
4935         continue;
4936 
4937       LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4938 
4939       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4940 
4941       // Transfer the fixups of LU to LUThatHas.
4942       for (LSRFixup &Fixup : LU.Fixups) {
4943         Fixup.Offset += F.BaseOffset;
4944         LUThatHas->pushFixup(Fixup);
4945         LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4946       }
4947 
4948       // Delete formulae from the new use which are no longer legal.
4949       bool Any = false;
4950       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4951         Formula &F = LUThatHas->Formulae[i];
4952         if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4953                         LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4954           LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4955           LUThatHas->DeleteFormula(F);
4956           --i;
4957           --e;
4958           Any = true;
4959         }
4960       }
4961 
4962       if (Any)
4963         LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4964 
4965       // Delete the old use.
4966       DeleteUse(LU, LUIdx);
4967       --LUIdx;
4968       --NumUses;
4969       break;
4970     }
4971   }
4972 
4973   LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4974 }
4975 
4976 /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4977 /// we've done more filtering, as it may be able to find more formulae to
4978 /// eliminate.
4979 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4980   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4981     LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4982 
4983     LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4984                          "undesirable dedicated registers.\n");
4985 
4986     FilterOutUndesirableDedicatedRegisters();
4987 
4988     LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4989   }
4990 }
4991 
4992 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
4993 /// pick the best one and delete the others.
4994 /// This narrowing heuristic keeps as many formulae with different
4995 /// Scale and ScaledReg pairs as possible while narrowing the search space.
4996 /// The benefit is that a formula set with more Scale and ScaledReg
4997 /// variations is more likely to yield a better solution than a set where
4998 /// they are all the same. The winner-picking-register heuristic often
4999 /// keeps the formulae with the same Scale and ScaledReg and filters out
5000 /// the others, and we want to avoid that if possible.
5001 void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5002   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5003     return;
5004 
5005   LLVM_DEBUG(
5006       dbgs() << "The search space is too complex.\n"
5007                 "Narrowing the search space by choosing the best Formula "
5008                 "from the Formulae with the same Scale and ScaledReg.\n");
5009 
5010   // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5011   using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5012 
5013   BestFormulaeTy BestFormulae;
5014 #ifndef NDEBUG
5015   bool ChangedFormulae = false;
5016 #endif
5017   DenseSet<const SCEV *> VisitedRegs;
5018   SmallPtrSet<const SCEV *, 16> Regs;
5019 
5020   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5021     LSRUse &LU = Uses[LUIdx];
5022     LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5023                dbgs() << '\n');
5024 
5025     // Return true if Formula FA is better than Formula FB.
5026     auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5027       // First we will try to choose the Formula with fewer new registers.
5028       // For a register used by the current Formula, the more the register is
5029       // shared among LSRUses, the less we increase the register number
5030       // counter of the formula.
5031       size_t FARegNum = 0;
5032       for (const SCEV *Reg : FA.BaseRegs) {
5033         const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5034         FARegNum += (NumUses - UsedByIndices.count() + 1);
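             // A register shared by every use contributes 1 here; a register
             // unique to this use contributes NumUses, so formulae built from
             // widely shared registers are preferred.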
5035       }
5036       size_t FBRegNum = 0;
5037       for (const SCEV *Reg : FB.BaseRegs) {
5038         const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5039         FBRegNum += (NumUses - UsedByIndices.count() + 1);
5040       }
5041       if (FARegNum != FBRegNum)
5042         return FARegNum < FBRegNum;
5043 
5044       // If the new register numbers are the same, choose the Formula with
5045       // less Cost.
5046       Cost CostFA(L, SE, TTI, AMK);
5047       Cost CostFB(L, SE, TTI, AMK);
5048       Regs.clear();
5049       CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5050       Regs.clear();
5051       CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5052       return CostFA.isLess(CostFB);
5053     };
5054 
5055     bool Any = false;
5056     for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5057          ++FIdx) {
5058       Formula &F = LU.Formulae[FIdx];
5059       if (!F.ScaledReg)
5060         continue;
5061       auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5062       if (P.second)
5063         continue;
5064 
5065       Formula &Best = LU.Formulae[P.first->second];
5066       if (IsBetterThan(F, Best))
5067         std::swap(F, Best);
5068       LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
5069                  dbgs() << "\n"
5070                            "    in favor of formula ";
5071                  Best.print(dbgs()); dbgs() << '\n');
5072 #ifndef NDEBUG
5073       ChangedFormulae = true;
5074 #endif
5075       LU.DeleteFormula(F);
5076       --FIdx;
5077       --NumForms;
5078       Any = true;
5079     }
5080     if (Any)
5081       LU.RecomputeRegs(LUIdx, RegUses);
5082 
5083     // Reset this to prepare for the next use.
5084     BestFormulae.clear();
5085   }
5086 
5087   LLVM_DEBUG(if (ChangedFormulae) {
5088     dbgs() << "\n"
5089               "After filtering out undesirable candidates:\n";
5090     print_uses(dbgs());
5091   });
5092 }
5093 
5094 /// If we are over the complexity limit, filter address uses that can use
5095 /// post-inc addressing down to only their minimum-register formulae.
5096 void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5097   if (AMK != TTI::AMK_PostIndexed)
5098     return;
5099   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5100     return;
5101 
5102   LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5103                        "Narrowing the search space by choosing the lowest "
5104                        "register Formula for PostInc Uses.\n");
5105 
5106   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5107     LSRUse &LU = Uses[LUIdx];
5108 
5109     if (LU.Kind != LSRUse::Address)
5110       continue;
5111     if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5112         !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5113       continue;
5114 
5115     size_t MinRegs = std::numeric_limits<size_t>::max();
5116     for (const Formula &F : LU.Formulae)
5117       MinRegs = std::min(F.getNumRegs(), MinRegs);
5118 
5119     bool Any = false;
5120     for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5121          ++FIdx) {
5122       Formula &F = LU.Formulae[FIdx];
5123       if (F.getNumRegs() > MinRegs) {
5124         LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
5125                    dbgs() << "\n");
5126         LU.DeleteFormula(F);
5127         --FIdx;
5128         --NumForms;
5129         Any = true;
5130       }
5131     }
5132     if (Any)
5133       LU.RecomputeRegs(LUIdx, RegUses);
5134 
5135     if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5136       break;
5137   }
5138 
5139   LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5140 }
5141 
5142 /// This function deletes formulas with a high expected register count.
5143 /// Assuming we don't know the value of each formula (all inefficient ones
5144 /// have already been deleted), compute the probability of not selecting
5145 /// each register.
5146 /// For example,
5147 /// Use1:
5148 ///  reg(a) + reg({0,+,1})
5149 ///  reg(a) + reg({-1,+,1}) + 1
5150 ///  reg({a,+,1})
5151 /// Use2:
5152 ///  reg(b) + reg({0,+,1})
5153 ///  reg(b) + reg({-1,+,1}) + 1
5154 ///  reg({b,+,1})
5155 /// Use3:
5156 ///  reg(c) + reg(b) + reg({0,+,1})
5157 ///  reg(c) + reg({b,+,1})
5158 ///
5159 /// Probability of not selecting
5160 ///                 Use1   Use2    Use3
5161 /// reg(a)         (1/3) *   1   *   1
5162 /// reg(b)           1   * (1/3) * (1/2)
5163 /// reg({0,+,1})   (2/3) * (2/3) * (1/2)
5164 /// reg({-1,+,1})  (2/3) * (2/3) *   1
5165 /// reg({a,+,1})   (2/3) *   1   *   1
5166 /// reg({b,+,1})     1   * (2/3) * (2/3)
5167 /// reg(c)           1   *   1   *   0
5168 ///
5169 /// Now compute the expected register count for each formula:
5170 /// Note that for each use we exclude the probability of not selecting for
5171 /// that use. For example, for Use1 the probability for reg(a) is just 1 * 1
5172 /// (excluding the probability 1/3 of not selecting for Use1).
5173 /// Use1:
5174 ///  reg(a) + reg({0,+,1})          1 + 1/3       -- to be deleted
5175 ///  reg(a) + reg({-1,+,1}) + 1     1 + 4/9       -- to be deleted
5176 ///  reg({a,+,1})                   1
5177 /// Use2:
5178 ///  reg(b) + reg({0,+,1})          1/2 + 1/3     -- to be deleted
5179 ///  reg(b) + reg({-1,+,1}) + 1     1/2 + 2/3     -- to be deleted
5180 ///  reg({b,+,1})                   2/3
5181 /// Use3:
5182 ///  reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5183 ///  reg(c) + reg({b,+,1})          1 + 2/3
5184 void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5185   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5186     return;
5187   // Ok, we have too many formulae on our hands to conveniently handle.
5188   // Use a rough heuristic to thin out the list.
5189 
5190   // Set of Regs which will definitely be used in the final solution, i.e.
5191   // used in each formula of a solution (in the example above this is reg(c)).
5192   // We can skip them in the calculations.
5193   SmallPtrSet<const SCEV *, 4> UniqRegs;
5194   LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5195 
5196   // Map each register to its probability of not being selected.
5197   DenseMap <const SCEV *, float> RegNumMap;
5198   for (const SCEV *Reg : RegUses) {
5199     if (UniqRegs.count(Reg))
5200       continue;
5201     float PNotSel = 1;
5202     for (const LSRUse &LU : Uses) {
5203       if (!LU.Regs.count(Reg))
5204         continue;
5205       float P = LU.getNotSelectedProbability(Reg);
5206       if (P != 0.0)
5207         PNotSel *= P;
5208       else
5209         UniqRegs.insert(Reg);
5210     }
5211     RegNumMap.insert(std::make_pair(Reg, PNotSel));
5212   }
5213 
5214   LLVM_DEBUG(
5215       dbgs() << "Narrowing the search space by deleting costly formulas\n");
5216 
5217   // Delete formulas whose expected register count is high.
5218   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5219     LSRUse &LU = Uses[LUIdx];
5220     // If there is nothing to delete, continue.
5221     if (LU.Formulae.size() < 2)
5222       continue;
5223     // This is a temporary solution to test performance. Float should be
5224     // replaced with a rounding-independent type (based on integers) to avoid
5225     // different results for different target builds.
5226     float FMinRegNum = LU.Formulae[0].getNumRegs();
5227     float FMinARegNum = LU.Formulae[0].getNumRegs();
5228     size_t MinIdx = 0;
5229     for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5230       Formula &F = LU.Formulae[i];
5231       float FRegNum = 0;
5232       float FARegNum = 0;
5233       for (const SCEV *BaseReg : F.BaseRegs) {
5234         if (UniqRegs.count(BaseReg))
5235           continue;
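             // RegNumMap[BaseReg] is the product over all uses of the probability
             // of not selecting BaseReg; dividing by this use's own factor
             // excludes it, per the expectation described in the function comment.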
5236         FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5237         if (isa<SCEVAddRecExpr>(BaseReg))
5238           FARegNum +=
5239               RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5240       }
5241       if (const SCEV *ScaledReg = F.ScaledReg) {
5242         if (!UniqRegs.count(ScaledReg)) {
5243           FRegNum +=
5244               RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5245           if (isa<SCEVAddRecExpr>(ScaledReg))
5246             FARegNum +=
5247                 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5248         }
5249       }
5250       if (FMinRegNum > FRegNum ||
5251           (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5252         FMinRegNum = FRegNum;
5253         FMinARegNum = FARegNum;
5254         MinIdx = i;
5255       }
5256     }
5257     LLVM_DEBUG(dbgs() << "  The formula "; LU.Formulae[MinIdx].print(dbgs());
5258                dbgs() << " with min reg num " << FMinRegNum << '\n');
5259     if (MinIdx != 0)
5260       std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5261     while (LU.Formulae.size() != 1) {
5262       LLVM_DEBUG(dbgs() << "  Deleting "; LU.Formulae.back().print(dbgs());
5263                  dbgs() << '\n');
5264       LU.Formulae.pop_back();
5265     }
5266     LU.RecomputeRegs(LUIdx, RegUses);
5267     assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5268     Formula &F = LU.Formulae[0];
5269     LLVM_DEBUG(dbgs() << "  Leaving only "; F.print(dbgs()); dbgs() << '\n');
5270     // When we choose the formula, the regs become unique.
5271     UniqRegs.insert_range(F.BaseRegs);
5272     if (F.ScaledReg)
5273       UniqRegs.insert(F.ScaledReg);
5274   }
5275   LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5276 }
5277 
5278 // Check if Best and Reg are SCEVs separated by a constant amount C, and if so,
5279 // whether the addressing offset +C would be legal where the negative offset -C
5280 // would not.
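     // For example, with Best = {x+C,+,1} and Reg = {x,+,1} the constant
     // difference is C; if the target can fold an offset of +C but not -C into
     // its addressing modes, Reg is the simpler base register to prefer.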
5281 static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5282                                        ScalarEvolution &SE, const SCEV *Best,
5283                                        const SCEV *Reg,
5284                                        MemAccessTy AccessType) {
5285   if (Best->getType() != Reg->getType() ||
5286       (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5287        cast<SCEVAddRecExpr>(Best)->getLoop() !=
5288            cast<SCEVAddRecExpr>(Reg)->getLoop()))
5289     return false;
5290   std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5291   if (!Diff)
5292     return false;
5293 
5294   return TTI.isLegalAddressingMode(
5295              AccessType.MemTy, /*BaseGV=*/nullptr,
5296              /*BaseOffset=*/Diff->getSExtValue(),
5297              /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5298          !TTI.isLegalAddressingMode(
5299              AccessType.MemTy, /*BaseGV=*/nullptr,
5300              /*BaseOffset=*/-Diff->getSExtValue(),
5301              /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5302 }
5303 
5304 /// Pick a register which seems likely to be profitable, and then in any use
5305 /// which has any reference to that register, delete all formulae which do not
5306 /// reference that register.
5307 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5308   // With all other options exhausted, loop until the system is simple
5309   // enough to handle.
5310   SmallPtrSet<const SCEV *, 4> Taken;
5311   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5312     // Ok, we have too many formulae on our hands to conveniently handle.
5313     // Use a rough heuristic to thin out the list.
5314     LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5315 
5316     // Pick the register which is used by the most LSRUses, which is likely
5317     // to be a good reuse register candidate.
5318     const SCEV *Best = nullptr;
5319     unsigned BestNum = 0;
5320     for (const SCEV *Reg : RegUses) {
5321       if (Taken.count(Reg))
5322         continue;
5323       if (!Best) {
5324         Best = Reg;
5325         BestNum = RegUses.getUsedByIndices(Reg).count();
5326       } else {
5327         unsigned Count = RegUses.getUsedByIndices(Reg).count();
5328         if (Count > BestNum) {
5329           Best = Reg;
5330           BestNum = Count;
5331         }
5332 
5333         // If the scores are the same, but the Reg is simpler for the target
5334         // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5335         // handle +C but not -C), opt for the simpler formula.
5336         if (Count == BestNum) {
5337           int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5338           if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5339               IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5340                                          Uses[LUIdx].AccessTy)) {
5341             Best = Reg;
5342             BestNum = Count;
5343           }
5344         }
5345       }
5346     }
5347     assert(Best && "Failed to find best LSRUse candidate");
5348 
5349     LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5350                       << " will yield profitable reuse.\n");
5351     Taken.insert(Best);
5352 
5353     // In any use with formulae which reference this register, delete formulae
5354     // which don't reference it.
5355     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5356       LSRUse &LU = Uses[LUIdx];
5357       if (!LU.Regs.count(Best)) continue;
5358 
5359       bool Any = false;
5360       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5361         Formula &F = LU.Formulae[i];
5362         if (!F.referencesReg(Best)) {
5363           LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
5364           LU.DeleteFormula(F);
5365           --e;
5366           --i;
5367           Any = true;
5368           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5369           continue;
5370         }
5371       }
5372 
5373       if (Any)
5374         LU.RecomputeRegs(LUIdx, RegUses);
5375     }
5376 
5377     LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5378   }
5379 }
5380 
5381 /// If there are an extraordinary number of formulae to choose from, use some
5382 /// rough heuristics to prune down the number of formulae. This keeps the main
5383 /// solver from taking an extraordinary amount of time in some worst-case
5384 /// scenarios.
5385 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5386   NarrowSearchSpaceByDetectingSupersets();
5387   NarrowSearchSpaceByCollapsingUnrolledCode();
5388   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5389   if (FilterSameScaledReg)
5390     NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5391   NarrowSearchSpaceByFilterPostInc();
5392   if (LSRExpNarrow)
5393     NarrowSearchSpaceByDeletingCostlyFormulas();
5394   else
5395     NarrowSearchSpaceByPickingWinnerRegs();
5396 }
5397 
5398 /// This is the recursive solver.
5399 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5400                                Cost &SolutionCost,
5401                                SmallVectorImpl<const Formula *> &Workspace,
5402                                const Cost &CurCost,
5403                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
5404                                DenseSet<const SCEV *> &VisitedRegs) const {
5405   // Some ideas:
5406   //  - prune more:
5407   //    - use more aggressive filtering
5408   //    - sort the formula so that the most profitable solutions are found first
5409   //    - sort the uses too
5410   //  - search faster:
5411   //    - don't compute a cost, and then compare. compare while computing a cost
5412   //      and bail early.
5413   //    - track register sets with SmallBitVector
5414 
5415   const LSRUse &LU = Uses[Workspace.size()];
5416 
5417   // If this use references any register that's already a part of the
5418   // in-progress solution, consider it a requirement that a formula must
5419   // reference that register in order to be considered. This prunes out
5420   // unprofitable searching.
5421   SmallSetVector<const SCEV *, 4> ReqRegs;
5422   for (const SCEV *S : CurRegs)
5423     if (LU.Regs.count(S))
5424       ReqRegs.insert(S);
5425 
5426   SmallPtrSet<const SCEV *, 16> NewRegs;
5427   Cost NewCost(L, SE, TTI, AMK);
5428   for (const Formula &F : LU.Formulae) {
5429     // Ignore formulae which may not be ideal in terms of register reuse of
5430     // ReqRegs.  The formula should use all required registers before
5431     // introducing new ones.
5432     // This can sometimes (notably when trying to favour postinc) lead to
5433     // sub-optimal decisions. In those cases it is best left to the cost
5434     // modelling to get right.
5435     if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5436       int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5437       for (const SCEV *Reg : ReqRegs) {
5438         if ((F.ScaledReg && F.ScaledReg == Reg) ||
5439             is_contained(F.BaseRegs, Reg)) {
5440           --NumReqRegsToFind;
5441           if (NumReqRegsToFind == 0)
5442             break;
5443         }
5444       }
5445       if (NumReqRegsToFind != 0) {
5446         // If none of the formulae satisfied the required registers, then we could
5447         // clear ReqRegs and try again. Currently, we simply give up in this case.
5448         continue;
5449       }
5450     }
5451 
5452     // Evaluate the cost of the current formula. If it's already worse than
5453     // the current best, prune the search at that point.
5454     NewCost = CurCost;
5455     NewRegs = CurRegs;
5456     NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5457     if (NewCost.isLess(SolutionCost)) {
5458       Workspace.push_back(&F);
5459       if (Workspace.size() != Uses.size()) {
5460         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5461                      NewRegs, VisitedRegs);
5462         if (F.getNumRegs() == 1 && Workspace.size() == 1)
5463           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5464       } else {
5465         LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5466                    dbgs() << ".\nRegs:\n";
5467                    for (const SCEV *S : NewRegs) dbgs()
5468                       << "- " << *S << "\n";
5469                    dbgs() << '\n');
5470 
5471         SolutionCost = NewCost;
5472         Solution = Workspace;
5473       }
5474       Workspace.pop_back();
5475     }
5476   }
5477 }
5478 
5479 /// Choose one formula from each use. Return the results in the given Solution
5480 /// vector.
5481 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5482   SmallVector<const Formula *, 8> Workspace;
5483   Cost SolutionCost(L, SE, TTI, AMK);
5484   SolutionCost.Lose();
5485   Cost CurCost(L, SE, TTI, AMK);
5486   SmallPtrSet<const SCEV *, 16> CurRegs;
5487   DenseSet<const SCEV *> VisitedRegs;
5488   Workspace.reserve(Uses.size());
5489 
5490   // SolveRecurse does all the work.
5491   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5492                CurRegs, VisitedRegs);
5493   if (Solution.empty()) {
5494     LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5495     return;
5496   }
5497 
5498   // Ok, we've now made all our decisions.
5499   LLVM_DEBUG(dbgs() << "\n"
5500                        "The chosen solution requires ";
5501              SolutionCost.print(dbgs()); dbgs() << ":\n";
5502              for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5503                dbgs() << "  ";
5504                Uses[i].print(dbgs());
5505                dbgs() << "\n"
5506                          "    ";
5507                Solution[i]->print(dbgs());
5508                dbgs() << '\n';
5509              });
5510 
5511   assert(Solution.size() == Uses.size() && "Malformed solution!");
5512 
5513   const bool EnableDropUnprofitableSolution = [&] {
5514     switch (AllowDropSolutionIfLessProfitable) {
5515     case cl::BOU_TRUE:
5516       return true;
5517     case cl::BOU_FALSE:
5518       return false;
5519     case cl::BOU_UNSET:
5520       return TTI.shouldDropLSRSolutionIfLessProfitable();
5521     }
5522     llvm_unreachable("Unhandled cl::boolOrDefault enum");
5523   }();
5524 
5525   if (BaselineCost.isLess(SolutionCost)) {
5526     if (!EnableDropUnprofitableSolution)
5527       LLVM_DEBUG(
5528           dbgs() << "Baseline is more profitable than chosen solution, "
5529                     "add option 'lsr-drop-solution' to drop LSR solution.\n");
5530     else {
5531       LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5532                            "solution, dropping LSR solution.\n";);
5533       Solution.clear();
5534     }
5535   }
5536 }
5537 
5538 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
5539 /// far as we can go while still being dominated by the input positions. This
5540 /// helps canonicalize the insert position, which encourages sharing.
5541 BasicBlock::iterator
5542 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5543                                  const SmallVectorImpl<Instruction *> &Inputs)
5544                                                                          const {
5545   Instruction *Tentative = &*IP;
5546   while (true) {
5547     bool AllDominate = true;
5548     Instruction *BetterPos = nullptr;
5549     // Don't bother attempting to insert before a catchswitch; its basic block
5550     // cannot contain any other non-PHI instructions.
5551     if (isa<CatchSwitchInst>(Tentative))
5552       return IP;
5553 
5554     for (Instruction *Inst : Inputs) {
5555       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5556         AllDominate = false;
5557         break;
5558       }
5559       // Attempt to find an insert position in the middle of the block,
5560       // instead of at the end, so that it can be used for other expansions.
5561       if (Tentative->getParent() == Inst->getParent() &&
5562           (!BetterPos || !DT.dominates(Inst, BetterPos)))
5563         BetterPos = &*std::next(BasicBlock::iterator(Inst));
5564     }
5565     if (!AllDominate)
5566       break;
5567     if (BetterPos)
5568       IP = BetterPos->getIterator();
5569     else
5570       IP = Tentative->getIterator();
5571 
5572     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5573     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5574 
5575     BasicBlock *IDom;
5576     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5577       if (!Rung) return IP;
5578       Rung = Rung->getIDom();
5579       if (!Rung) return IP;
5580       IDom = Rung->getBlock();
5581 
5582       // Don't climb into a loop though.
5583       const Loop *IDomLoop = LI.getLoopFor(IDom);
5584       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5585       if (IDomDepth <= IPLoopDepth &&
5586           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5587         break;
5588     }
5589 
5590     Tentative = IDom->getTerminator();
5591   }
5592 
5593   return IP;
5594 }
5595 
5596 /// Determine an insert position which will be dominated by the operands and
5597 /// which will dominate the result.
5598 BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5599     BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5600   // Collect some instructions which must dominate the position where the
5601   // replacement will be expanded. These include any operands that
5602   // will be required in the expansion.
5603   SmallVector<Instruction *, 4> Inputs;
5604   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5605     Inputs.push_back(I);
5606   if (LU.Kind == LSRUse::ICmpZero)
5607     if (Instruction *I =
5608           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5609       Inputs.push_back(I);
5610   if (LF.PostIncLoops.count(L)) {
5611     if (LF.isUseFullyOutsideLoop(L))
5612       Inputs.push_back(L->getLoopLatch()->getTerminator());
5613     else
5614       Inputs.push_back(IVIncInsertPos);
5615   }
5616   // The expansion must also be dominated by the increment positions of any
5617   // loops for which it is using post-inc mode.
5618   for (const Loop *PIL : LF.PostIncLoops) {
5619     if (PIL == L) continue;
5620 
5621     // Be dominated by the loop exit.
5622     SmallVector<BasicBlock *, 4> ExitingBlocks;
5623     PIL->getExitingBlocks(ExitingBlocks);
5624     if (!ExitingBlocks.empty()) {
5625       BasicBlock *BB = ExitingBlocks[0];
5626       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5627         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5628       Inputs.push_back(BB->getTerminator());
5629     }
5630   }
5631 
5632   assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5633          "Insertion point must be a normal instruction");
5634 
5635   // Then, climb up the immediate dominator tree as far as we can go while
5636   // still being dominated by the input positions.
5637   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5638 
5639   // Don't insert instructions before PHI nodes.
5640   while (isa<PHINode>(IP)) ++IP;
5641 
5642   // Ignore landingpad instructions.
5643   while (IP->isEHPad()) ++IP;
5644 
5645   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5646   // IP consistent across expansions and allows the previously inserted
5647   // instructions to be reused by subsequent expansions.
5648   while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5649     ++IP;
5650 
5651   return IP;
5652 }
5653 
5654 /// Emit instructions for the leading candidate expression for this LSRUse (this
5655 /// is called "expanding").
5656 Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5657                            const Formula &F, BasicBlock::iterator IP,
5658                            SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5659   if (LU.RigidFormula)
5660     return LF.OperandValToReplace;
5661 
5662   // Determine an insert position which will be dominated by the operands and
5663   // which will dominate the result.
5664   IP = AdjustInsertPositionForExpand(IP, LF, LU);
5665   Rewriter.setInsertPoint(&*IP);
5666 
5667   // Inform the Rewriter if we have a post-increment use, so that it can
5668   // perform an advantageous expansion.
5669   Rewriter.setPostInc(LF.PostIncLoops);
5670 
5671   // This is the type that the user actually needs.
5672   Type *OpTy = LF.OperandValToReplace->getType();
5673   // This will be the type that we'll initially expand to.
5674   Type *Ty = F.getType();
5675   if (!Ty)
5676     // No type known; just expand directly to the ultimate type.
5677     Ty = OpTy;
5678   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5679     // Expand directly to the ultimate type if it's the right size.
5680     Ty = OpTy;
5681   // This is the type to do integer arithmetic in.
5682   Type *IntTy = SE.getEffectiveSCEVType(Ty);
5683 
5684   // Build up a list of operands to add together to form the full base.
5685   SmallVector<const SCEV *, 8> Ops;
5686 
5687   // Expand the BaseRegs portion.
5688   for (const SCEV *Reg : F.BaseRegs) {
5689     assert(!Reg->isZero() && "Zero allocated in a base register!");
5690 
5691     // If we're expanding for a post-inc user, make the post-inc adjustment.
5692     Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5693     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5694   }
5695 
5696   // Expand the ScaledReg portion.
5697   Value *ICmpScaledV = nullptr;
5698   if (F.Scale != 0) {
5699     const SCEV *ScaledS = F.ScaledReg;
5700 
5701     // If we're expanding for a post-inc user, make the post-inc adjustment.
5702     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5703     ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5704 
5705     if (LU.Kind == LSRUse::ICmpZero) {
5706       // Expand ScaledReg as if it were part of the base regs.
5707       if (F.Scale == 1)
5708         Ops.push_back(
5709             SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5710       else {
5711         // An interesting way of "folding" with an icmp is to use a negated
5712         // scale, which we'll implement by inserting it into the other operand
5713         // of the icmp.
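             // For example, a formula "Base + (-1 * Reg)" compared against zero
             // can be emitted as "icmp eq Base, Reg" by placing the expanded Reg
             // on the icmp's other operand instead of materializing the sub.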
5714         assert(F.Scale == -1 &&
5715                "The only scale supported by ICmpZero uses is -1!");
5716         ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5717       }
5718     } else {
5719       // Otherwise just expand the scaled register and an explicit scale,
5720       // which is expected to be matched as part of the address.
5721 
5722       // Flush the operand list to suppress SCEVExpander hoisting address modes,
5723       // unless the addressing mode will not be folded.
5724       if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5725           isAMCompletelyFolded(TTI, LU, F)) {
5726         Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5727         Ops.clear();
5728         Ops.push_back(SE.getUnknown(FullV));
5729       }
5730       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5731       if (F.Scale != 1)
5732         ScaledS =
5733             SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5734       Ops.push_back(ScaledS);
5735     }
5736   }
5737 
5738   // Expand the GV portion.
5739   if (F.BaseGV) {
5740     // Flush the operand list to suppress SCEVExpander hoisting.
5741     if (!Ops.empty()) {
5742       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5743       Ops.clear();
5744       Ops.push_back(SE.getUnknown(FullV));
5745     }
5746     Ops.push_back(SE.getUnknown(F.BaseGV));
5747   }
5748 
5749   // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5750   // unfolded offsets. LSR assumes they both live next to their uses.
5751   if (!Ops.empty()) {
5752     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5753     Ops.clear();
5754     Ops.push_back(SE.getUnknown(FullV));
5755   }
5756 
5757   // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5758   // out at this point, or should we generate a SCEV adding together mixed
5759   // offsets?
5760   assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5761          "Expanding mismatched offsets\n");
5762   // Expand the immediate portion.
5763   Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5764   if (Offset.isNonZero()) {
5765     if (LU.Kind == LSRUse::ICmpZero) {
5766       // The other interesting way of "folding" with an ICmpZero is to use a
5767       // negated immediate.
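           // For example, "(X + 4) == 0" can instead be emitted as
           // "icmp eq X, -4", folding the immediate into the comparison.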
5768       if (!ICmpScaledV)
5769         ICmpScaledV =
5770             ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5771       else {
5772         Ops.push_back(SE.getUnknown(ICmpScaledV));
5773         ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5774       }
5775     } else {
5776       // Just add the immediate values. These again are expected to be matched
5777       // as part of the address.
5778       Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5779     }
5780   }
5781 
5782   // Expand the unfolded offset portion.
5783   Immediate UnfoldedOffset = F.UnfoldedOffset;
5784   if (UnfoldedOffset.isNonZero()) {
5785     // Just add the immediate values.
5786     Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5787   }
5788 
5789   // Emit instructions summing all the operands.
5790   const SCEV *FullS = Ops.empty() ?
5791                       SE.getConstant(IntTy, 0) :
5792                       SE.getAddExpr(Ops);
5793   Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5794 
5795   // We're done expanding now, so reset the rewriter.
5796   Rewriter.clearPostInc();
5797 
5798   // An ICmpZero Formula represents an ICmp which we're handling as a
5799   // comparison against zero. Now that we've expanded an expression for that
5800   // form, update the ICmp's other operand.
5801   if (LU.Kind == LSRUse::ICmpZero) {
5802     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5803     if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5804       DeadInsts.emplace_back(OperandIsInstr);
5805     assert(!F.BaseGV && "ICmp does not support folding a global value and "
5806                            "a scale at the same time!");
5807     if (F.Scale == -1) {
5808       if (ICmpScaledV->getType() != OpTy) {
5809         Instruction *Cast = CastInst::Create(
5810             CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5811             ICmpScaledV, OpTy, "tmp", CI->getIterator());
5812         ICmpScaledV = Cast;
5813       }
5814       CI->setOperand(1, ICmpScaledV);
5815     } else {
5816       // A scale of 1 means that the scale has been expanded as part of the
5817       // base regs.
5818       assert((F.Scale == 0 || F.Scale == 1) &&
5819              "Expected a scale of 0 or 1 in an ICmpZero use!");
5821       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5822                                            -(uint64_t)Offset.getFixedValue());
5823       if (C->getType() != OpTy) {
5824         C = ConstantFoldCastOperand(
5825             CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5826             CI->getDataLayout());
5827         assert(C && "Cast of ConstantInt should have folded");
5828       }
5829 
5830       CI->setOperand(1, C);
5831     }
5832   }
5833 
5834   return FullV;
5835 }
5836 
5837 /// Helper for Rewrite. PHI nodes are special because the use of their operands
5838 /// effectively happens in their predecessor blocks, so the expression may need
5839 /// to be expanded in multiple places.
5840 void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5841                                 const LSRFixup &LF, const Formula &F,
5842                                 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5843   DenseMap<BasicBlock *, Value *> Inserted;
5844 
5845   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5846     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5847       bool needUpdateFixups = false;
5848       BasicBlock *BB = PN->getIncomingBlock(i);
5849 
5850       // If this is a critical edge, split the edge so that we do not insert
5851       // the code on all predecessor/successor paths.  We do this unless this
5852       // is the canonical backedge for this loop, which complicates post-inc
5853       // users.
5854       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5855           !isa<IndirectBrInst>(BB->getTerminator()) &&
5856           !isa<CatchSwitchInst>(BB->getTerminator())) {
5857         BasicBlock *Parent = PN->getParent();
5858         Loop *PNLoop = LI.getLoopFor(Parent);
5859         if (!PNLoop || Parent != PNLoop->getHeader()) {
5860           // Split the critical edge.
5861           BasicBlock *NewBB = nullptr;
5862           if (!Parent->isLandingPad()) {
5863             NewBB =
5864                 SplitCriticalEdge(BB, Parent,
5865                                   CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5866                                       .setMergeIdenticalEdges()
5867                                       .setKeepOneInputPHIs());
5868           } else {
5869             SmallVector<BasicBlock*, 2> NewBBs;
5870             DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5871             SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5872             NewBB = NewBBs[0];
5873           }
5874           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5875           // phi predecessors are identical. The simple thing to do is skip
5876           // splitting in this case rather than complicate the API.
5877           if (NewBB) {
5878             // If PN is outside of the loop and BB is in the loop, we want to
5879             // move the block to be immediately before the PHI block, not
5880             // immediately after BB.
5881             if (L->contains(BB) && !L->contains(PN))
5882               NewBB->moveBefore(PN->getParent());
5883 
5884             // Splitting the edge can reduce the number of PHI entries we have.
5885             e = PN->getNumIncomingValues();
5886             BB = NewBB;
5887             i = PN->getBasicBlockIndex(BB);
5888 
5889             needUpdateFixups = true;
5890           }
5891         }
5892       }
5893 
5894       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5895           Inserted.try_emplace(BB);
5896       if (!Pair.second)
5897         PN->setIncomingValue(i, Pair.first->second);
5898       else {
5899         Value *FullV =
5900             Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5901 
5902         // If this is reuse-by-noop-cast, insert the noop cast.
5903         Type *OpTy = LF.OperandValToReplace->getType();
5904         if (FullV->getType() != OpTy)
5905           FullV = CastInst::Create(
5906               CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5907               LF.OperandValToReplace->getType(), "tmp",
5908               BB->getTerminator()->getIterator());
5909 
5910         // If the incoming block for this value is not in the loop, it means the
5911         // current PHI is not in a loop exit, so we must create an LCSSA PHI for
5912         // the inserted value.
5913         if (auto *I = dyn_cast<Instruction>(FullV))
5914           if (L->contains(I) && !L->contains(BB))
5915             InsertedNonLCSSAInsts.insert(I);
5916 
5917         PN->setIncomingValue(i, FullV);
5918         Pair.first->second = FullV;
5919       }
5920 
5921       // If LSR split a critical edge and the phi node has other pending
5922       // fixup operands, we need to update those pending fixups. Otherwise
5923       // the formulae will not be implemented completely and some instructions
5924       // will not be eliminated.
5925       if (needUpdateFixups) {
5926         for (LSRUse &LU : Uses)
5927           for (LSRFixup &Fixup : LU.Fixups)
5928             // If a fixup is supposed to rewrite some operand in the phi
5929             // that was just updated, that operand may already have been moved
5930             // to another phi node. Such a fixup requires an update.
5931             if (Fixup.UserInst == PN) {
5932               // Check if the operand we try to replace still exists in the
5933               // original phi.
5934               bool foundInOriginalPHI = false;
5935               for (const auto &val : PN->incoming_values())
5936                 if (val == Fixup.OperandValToReplace) {
5937                   foundInOriginalPHI = true;
5938                   break;
5939                 }
5940 
5941               // If the fixup operand was found in the original PHI, nothing to do.
5942               if (foundInOriginalPHI)
5943                 continue;
5944 
5945               // Otherwise it might have been moved to another PHI and require an
5946               // update. If the fixup operand is not found in any of the incoming
5947               // blocks, that means we have already rewritten it - nothing to do.
5948               for (const auto &Block : PN->blocks())
5949                 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5950                      ++I) {
5951                   PHINode *NewPN = cast<PHINode>(I);
5952                   for (const auto &val : NewPN->incoming_values())
5953                     if (val == Fixup.OperandValToReplace)
5954                       Fixup.UserInst = NewPN;
5955                 }
5956             }
5957       }
5958     }
5959 }
5960 
5961 /// Emit instructions for the leading candidate expression for this LSRUse (this
5962 /// is called "expanding"), and update the UserInst to reference the newly
5963 /// expanded value.
5964 void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5965                           const Formula &F,
5966                           SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5967   // First, find an insertion point that dominates UserInst. For PHI nodes,
5968   // find the nearest block which dominates all the relevant uses.
5969   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5970     RewriteForPHI(PN, LU, LF, F, DeadInsts);
5971   } else {
5972     Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
5973 
5974     // If this is reuse-by-noop-cast, insert the noop cast.
5975     Type *OpTy = LF.OperandValToReplace->getType();
5976     if (FullV->getType() != OpTy) {
5977       Instruction *Cast =
5978           CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5979                            FullV, OpTy, "tmp", LF.UserInst->getIterator());
5980       FullV = Cast;
5981     }
5982 
5983     // Update the user. ICmpZero is handled specially here (for now) because
5984     // Expand may have updated one of the operands of the icmp already, and
5985     // its new value may happen to be equal to LF.OperandValToReplace, in
5986     // which case doing replaceUsesOfWith leads to replacing both operands
5987     // with the same value. TODO: Reorganize this.
5988     if (LU.Kind == LSRUse::ICmpZero)
5989       LF.UserInst->setOperand(0, FullV);
5990     else
5991       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5992   }
5993 
5994   if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
5995     DeadInsts.emplace_back(OperandIsInstr);
5996 }
5997 
5998 // Try to hoist the IVInc to the loop header if all IVInc users are in
5999 // the loop header. This helps the backend generate post-indexed load/store
6000 // instructions when the latch block is different from the loop header block.
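     // For example, on targets with post-indexed addressing, a load of the
     // current IV value followed by the increment can then be selected as a
     // single post-increment load.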
6001 static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6002                           const LSRUse &LU, Instruction *IVIncInsertPos,
6003                           Loop *L) {
6004   if (LU.Kind != LSRUse::Address)
6005     return false;
6006 
6007   // For now this code performs the conservative optimization and only works
6008   // for the header block. Later we could hoist the IVInc to a block that
6009   // post-dominates all users.
6010   BasicBlock *LHeader = L->getHeader();
6011   if (IVIncInsertPos->getParent() == LHeader)
6012     return false;
6013 
6014   if (!Fixup.OperandValToReplace ||
6015       any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6016         Instruction *UI = cast<Instruction>(U);
6017         return UI->getParent() != LHeader;
6018       }))
6019     return false;
6020 
6021   Instruction *I = Fixup.UserInst;
6022   Type *Ty = I->getType();
6023   return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6024          (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
6025 }
6026 
6027 /// Rewrite all the fixup locations with new values, following the chosen
6028 /// solution.
6029 void LSRInstance::ImplementSolution(
6030     const SmallVectorImpl<const Formula *> &Solution) {
6031   // Keep track of instructions we may have made dead, so that
6032   // we can remove them after we are done working.
6033   SmallVector<WeakTrackingVH, 16> DeadInsts;
6034 
6035   // Mark phi nodes that terminate chains so the expander tries to reuse them.
6036   for (const IVChain &Chain : IVChainVec) {
6037     if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6038       Rewriter.setChainedPhi(PN);
6039   }
6040 
6041   // Expand the new value definitions and update the users.
6042   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6043     for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6044       Instruction *InsertPos =
6045           canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6046               ? L->getHeader()->getTerminator()
6047               : IVIncInsertPos;
6048       Rewriter.setIVIncInsertPos(L, InsertPos);
6049       Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6050       Changed = true;
6051     }
6052 
6053   auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6054   formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6055 
6056   for (const IVChain &Chain : IVChainVec) {
6057     GenerateIVChain(Chain, DeadInsts);
6058     Changed = true;
6059   }
6060 
6061   for (const WeakVH &IV : Rewriter.getInsertedIVs())
6062     if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6063       ScalarEvolutionIVs.push_back(IV);
6064 
6065   // Clean up after ourselves. This must be done before deleting any
6066   // instructions.
6067   Rewriter.clear();
6068 
6069   Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6070                                                                   &TLI, MSSAU);
6071 
6072   // In our cost analysis above, we assume that each addrec consumes exactly
6073   // one register, and arrange to have increments inserted just before the
6074   // latch to maximize the chance this is true.  However, if we reused
6075   // existing IVs, we now need to move the increments to match our
6076   // expectations.  Otherwise, our cost modeling results in us having
6077   // chosen a non-optimal result for the actual schedule.  (And yes, this
6078   // scheduling decision does impact later codegen.)
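       // A simple recurrence here looks like:
       //   %iv = phi i64 [ %start, %preheader ], [ %iv.next, %latch ]
       //   %iv.next = add i64 %iv, %step
       // and the goal is to move %iv.next (BO below) to IVIncInsertPos.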
6079   for (PHINode &PN : L->getHeader()->phis()) {
6080     BinaryOperator *BO = nullptr;
6081     Value *Start = nullptr, *Step = nullptr;
6082     if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6083       continue;
6084 
6085     switch (BO->getOpcode()) {
6086     case Instruction::Sub:
6087       if (BO->getOperand(0) != &PN)
6088         // sub is non-commutative - match handling elsewhere in LSR
6089         continue;
6090       break;
6091     case Instruction::Add:
6092       break;
6093     default:
6094       continue;
6095     }
6096 
6097     if (!isa<Constant>(Step))
6098       // If not a constant step, might increase register pressure
6099       // (We assume constants have been canonicalized to RHS)
6100       continue;
6101 
6102     if (BO->getParent() == IVIncInsertPos->getParent())
6103       // Only bother moving across blocks.  Isel can handle block local case.
6104       continue;
6105 
6106     // Can we legally schedule inc at the desired point?
6107     if (!llvm::all_of(BO->uses(),
6108                       [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6109       continue;
6110     BO->moveBefore(IVIncInsertPos->getIterator());
6111     Changed = true;
6112   }
6113 
6115 }
6116 
6117 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6118                          DominatorTree &DT, LoopInfo &LI,
6119                          const TargetTransformInfo &TTI, AssumptionCache &AC,
6120                          TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6121     : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6122       MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6123                             ? PreferredAddresingMode
6124                             : TTI.getPreferredAddressingMode(L, &SE)),
6125       Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6126       BaselineCost(L, SE, TTI, AMK) {
6127   // If LoopSimplify form is not available, stay out of trouble.
6128   if (!L->isLoopSimplifyForm())
6129     return;
6130 
6131   // If there's no interesting work to be done, bail early.
6132   if (IU.empty()) return;
6133 
6134   // If there's too much analysis to be done, bail early. We won't be able to
6135   // model the problem anyway.
6136   unsigned NumUsers = 0;
6137   for (const IVStrideUse &U : IU) {
6138     if (++NumUsers > MaxIVUsers) {
6139       (void)U;
6140       LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6141                         << "\n");
6142       return;
6143     }
6144     // Bail out if we have a PHI on an EHPad that gets a value from a
6145     // CatchSwitchInst.  Because the CatchSwitchInst cannot be split, there is
6146     // no good place to stick any instructions.
6147     if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6148        auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6149        if (isa<FuncletPadInst>(FirstNonPHI) ||
6150            isa<CatchSwitchInst>(FirstNonPHI))
6151          for (BasicBlock *PredBB : PN->blocks())
6152            if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6153              return;
6154     }
6155   }
6156 
6157   LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6158              L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6159              dbgs() << ":\n");
6160 
6161   // Check if we expect this loop to use a hardware loop instruction, which will
6162   // be used when calculating the costs of formulas.
6163   HardwareLoopInfo HWLoopInfo(L);
6164   HardwareLoopProfitable =
6165       TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6166 
6167   // Configure SCEVExpander now, so that the correct mode is used for
6168   // isSafeToExpand() checks.
6169 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
6170   Rewriter.setDebugType(DEBUG_TYPE);
6171 #endif
6172   Rewriter.disableCanonicalMode();
6173   Rewriter.enableLSRMode();
6174 
6175   // First, perform some low-level loop optimizations.
6176   OptimizeShadowIV();
6177   OptimizeLoopTermCond();
6178 
6179   // If loop preparation eliminates all interesting IV users, bail.
6180   if (IU.empty()) return;
6181 
6182   // Skip nested loops until we can model them better with formulae.
6183   if (!L->isInnermost()) {
6184     LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6185     return;
6186   }
6187 
6188   // Start collecting data and preparing for the solver.
6189   // If the number of registers is not the major cost, we cannot benefit from
6190   // the current profitable chain optimization, which is based on the number
6191   // of registers.
6192   // FIXME: add profitable chain optimization for other kinds of major cost,
6193   // for example the number of instructions.
6194   if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6195     CollectChains();
6196   CollectInterestingTypesAndFactors();
6197   CollectFixupsAndInitialFormulae();
6198   CollectLoopInvariantFixupsAndFormulae();
6199 
6200   if (Uses.empty())
6201     return;
6202 
6203   LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6204              print_uses(dbgs()));
6205   LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6206              BaselineCost.print(dbgs()); dbgs() << "\n");
6207 
6208   // Now use the reuse data to generate a bunch of interesting ways
6209   // to formulate the values needed for the uses.
6210   GenerateAllReuseFormulae();
6211 
6212   FilterOutUndesirableDedicatedRegisters();
6213   NarrowSearchSpaceUsingHeuristics();
6214 
6215   SmallVector<const Formula *, 8> Solution;
6216   Solve(Solution);
6217 
6218   // Release memory that is no longer needed.
6219   Factors.clear();
6220   Types.clear();
6221   RegUses.clear();
6222 
6223   if (Solution.empty())
6224     return;
6225 
6226 #ifndef NDEBUG
6227   // Formulae should be legal.
6228   for (const LSRUse &LU : Uses) {
6229     for (const Formula &F : LU.Formulae)
6230       assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6231                         F) && "Illegal formula generated!");
6232   };
6233 #endif
6234 
6235   // Now that we've decided what we want, make it so.
6236   ImplementSolution(Solution);
6237 }
6238 
6239 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6240 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6241   if (Factors.empty() && Types.empty()) return;
6242 
6243   OS << "LSR has identified the following interesting factors and types: ";
6244   bool First = true;
6245 
6246   for (int64_t Factor : Factors) {
6247     if (!First) OS << ", ";
6248     First = false;
6249     OS << '*' << Factor;
6250   }
6251 
6252   for (Type *Ty : Types) {
6253     if (!First) OS << ", ";
6254     First = false;
6255     OS << '(' << *Ty << ')';
6256   }
6257   OS << '\n';
6258 }
6259 
6260 void LSRInstance::print_fixups(raw_ostream &OS) const {
6261   OS << "LSR is examining the following fixup sites:\n";
6262   for (const LSRUse &LU : Uses)
6263     for (const LSRFixup &LF : LU.Fixups) {
6264       dbgs() << "  ";
6265       LF.print(OS);
6266       OS << '\n';
6267     }
6268 }
6269 
6270 void LSRInstance::print_uses(raw_ostream &OS) const {
6271   OS << "LSR is examining the following uses:\n";
6272   for (const LSRUse &LU : Uses) {
6273     dbgs() << "  ";
6274     LU.print(OS);
6275     OS << '\n';
6276     for (const Formula &F : LU.Formulae) {
6277       OS << "    ";
6278       F.print(OS);
6279       OS << '\n';
6280     }
6281   }
6282 }
6283 
6284 void LSRInstance::print(raw_ostream &OS) const {
6285   print_factors_and_types(OS);
6286   print_fixups(OS);
6287   print_uses(OS);
6288 }
6289 
6290 LLVM_DUMP_METHOD void LSRInstance::dump() const {
6291   print(errs()); errs() << '\n';
6292 }
6293 #endif
6294 
6295 namespace {
6296 
6297 class LoopStrengthReduce : public LoopPass {
6298 public:
6299   static char ID; // Pass ID, replacement for typeid
6300 
6301   LoopStrengthReduce();
6302 
6303 private:
6304   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6305   void getAnalysisUsage(AnalysisUsage &AU) const override;
6306 };
6307 
6308 } // end anonymous namespace
6309 
6310 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6311   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6312 }
6313 
6314 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6315   // We split critical edges, so we change the CFG.  However, we do update
6316   // many analyses if they are around.
6317   AU.addPreservedID(LoopSimplifyID);
6318 
6319   AU.addRequired<LoopInfoWrapperPass>();
6320   AU.addPreserved<LoopInfoWrapperPass>();
6321   AU.addRequiredID(LoopSimplifyID);
6322   AU.addRequired<DominatorTreeWrapperPass>();
6323   AU.addPreserved<DominatorTreeWrapperPass>();
6324   AU.addRequired<ScalarEvolutionWrapperPass>();
6325   AU.addPreserved<ScalarEvolutionWrapperPass>();
6326   AU.addRequired<AssumptionCacheTracker>();
6327   AU.addRequired<TargetLibraryInfoWrapperPass>();
6328   // Requiring LoopSimplify a second time here prevents IVUsers from running
6329   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6330   AU.addRequiredID(LoopSimplifyID);
6331   AU.addRequired<IVUsersWrapperPass>();
6332   AU.addPreserved<IVUsersWrapperPass>();
6333   AU.addRequired<TargetTransformInfoWrapperPass>();
6334   AU.addPreserved<MemorySSAWrapperPass>();
6335 }
6336 
6337 namespace {
6338 
6339 /// Enables more convenient iteration over a DWARF expression vector.
6340 static iterator_range<llvm::DIExpression::expr_op_iterator>
6341 ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6342   llvm::DIExpression::expr_op_iterator Begin =
6343       llvm::DIExpression::expr_op_iterator(Expr.begin());
6344   llvm::DIExpression::expr_op_iterator End =
6345       llvm::DIExpression::expr_op_iterator(Expr.end());
6346   return {Begin, End};
6347 }
6348 
6349 struct SCEVDbgValueBuilder {
6350   SCEVDbgValueBuilder() = default;
6351   SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6352 
6353   void clone(const SCEVDbgValueBuilder &Base) {
6354     LocationOps = Base.LocationOps;
6355     Expr = Base.Expr;
6356   }
6357 
6358   void clear() {
6359     LocationOps.clear();
6360     Expr.clear();
6361   }
6362 
6363   /// The DIExpression as we translate the SCEV.
6364   SmallVector<uint64_t, 6> Expr;
6365   /// The location ops of the DIExpression.
6366   SmallVector<Value *, 2> LocationOps;
6367 
6368   void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6369   void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6370 
6371   /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6372   /// in the set of values referenced by the expression.
6373   void pushLocation(llvm::Value *V) {
6374     Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6375     auto *It = llvm::find(LocationOps, V);
6376     unsigned ArgIndex = 0;
6377     if (It != LocationOps.end()) {
6378       ArgIndex = std::distance(LocationOps.begin(), It);
6379     } else {
6380       ArgIndex = LocationOps.size();
6381       LocationOps.push_back(V);
6382     }
6383     Expr.push_back(ArgIndex);
6384   }
6385 
6386   void pushValue(const SCEVUnknown *U) {
6387     llvm::Value *V = U->getValue();
6388     pushLocation(V);
6389   }
6390 
6391   bool pushConst(const SCEVConstant *C) {
6392     if (C->getAPInt().getSignificantBits() > 64)
6393       return false;
6394     Expr.push_back(llvm::dwarf::DW_OP_consts);
6395     Expr.push_back(C->getAPInt().getSExtValue());
6396     return true;
6397   }
6398 
6399   // Iterating the expression as DWARF ops is convenient when updating
6400   // DW_OP_LLVM_arg operands.
6401   iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6402     return ToDwarfOpIter(Expr);
6403   }
6404 
6405   /// Several SCEV types are sequences of the same arithmetic operator applied
6406   /// to constants and values that may be extended or truncated.
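       /// For example, an add expression (A + B + C) is emitted in DWARF's
       /// postfix form as: <A> <B> DW_OP_plus <C> DW_OP_plus.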
6407   bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6408                           uint64_t DwarfOp) {
6409     assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6410            "Expected arithmetic SCEV type");
6411     bool Success = true;
6412     unsigned EmitOperator = 0;
6413     for (const auto &Op : CommExpr->operands()) {
6414       Success &= pushSCEV(Op);
6415 
6416       if (EmitOperator >= 1)
6417         pushOperator(DwarfOp);
6418       ++EmitOperator;
6419     }
6420     return Success;
6421   }
6422 
6423   // TODO: Identify and omit noop casts.
6424   bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6425     const llvm::SCEV *Inner = C->getOperand(0);
6426     const llvm::Type *Type = C->getType();
6427     uint64_t ToWidth = Type->getIntegerBitWidth();
6428     bool Success = pushSCEV(Inner);
6429     uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6430                           IsSigned ? llvm::dwarf::DW_ATE_signed
6431                                    : llvm::dwarf::DW_ATE_unsigned};
6432     for (const auto &Op : CastOps)
6433       pushOperator(Op);
6434     return Success;
6435   }
6436 
6437   // TODO: MinMax - although these haven't been encountered in the test suite.
6438   bool pushSCEV(const llvm::SCEV *S) {
6439     bool Success = true;
6440     if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6441       Success &= pushConst(StartInt);
6442 
6443     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6444       if (!U->getValue())
6445         return false;
6446       pushLocation(U->getValue());
6447 
6448     } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6449       Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6450 
6451     } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6452       Success &= pushSCEV(UDiv->getLHS());
6453       Success &= pushSCEV(UDiv->getRHS());
6454       pushOperator(llvm::dwarf::DW_OP_div);
6455 
6456     } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6457       // Assert if a new and unknown SCEVCastExpr type is encountered.
6458       assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6459               isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6460              "Unexpected cast type in SCEV.");
6461       Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6462 
6463     } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6464       Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6465 
6466     } else if (isa<SCEVAddRecExpr>(S)) {
6467       // Nested SCEVAddRecExprs are generated by nested loops and are currently
6468       // unsupported.
6469       return false;
6470 
6471     } else {
6472       return false;
6473     }
6474     return Success;
6475   }
6476 
6477   /// Return true if the combination of arithmetic operator and underlying
6478   /// SCEV constant value is an identity function.
6479   bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6480     if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6481       if (C->getAPInt().getSignificantBits() > 64)
6482         return false;
6483       int64_t I = C->getAPInt().getSExtValue();
6484       switch (Op) {
6485       case llvm::dwarf::DW_OP_plus:
6486       case llvm::dwarf::DW_OP_minus:
6487         return I == 0;
6488       case llvm::dwarf::DW_OP_mul:
6489       case llvm::dwarf::DW_OP_div:
6490         return I == 1;
6491       }
6492     }
6493     return false;
6494   }
6495 
6496   /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6497   /// builder's expression stack. The stack should already contain an
6498   /// expression for the iteration count, so that it can be multiplied by
6499   /// the stride and added to the start.
6500   /// Components of the expression are omitted if they are an identity function.
6501   /// Chain (non-affine) SCEVs are not supported.
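       /// For an affine SCEV {start,+,stride}, the ops pushed here compute
       /// (on top of the iteration-count expression already on the stack):
       ///   value = iteration_count * stride + start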
6502   bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6503     assert(SAR.isAffine() && "Expected affine SCEV");
6504     const SCEV *Start = SAR.getStart();
6505     const SCEV *Stride = SAR.getStepRecurrence(SE);
6506 
6507     // Skip pushing arithmetic noops.
6508     if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6509       if (!pushSCEV(Stride))
6510         return false;
6511       pushOperator(llvm::dwarf::DW_OP_mul);
6512     }
6513     if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6514       if (!pushSCEV(Start))
6515         return false;
6516       pushOperator(llvm::dwarf::DW_OP_plus);
6517     }
6518     return true;
6519   }
6520 
6521   /// Create an expression that is an offset from a value (usually the IV).
6522   void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6523     pushLocation(OffsetValue);
6524     DIExpression::appendOffset(Expr, Offset);
6525     LLVM_DEBUG(
6526         dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6527                << std::to_string(Offset) << "\n");
6528   }
6529 
6530   /// Combine a translation of the SCEV and the IV to create an expression that
6531   /// recovers a location's value.
6532   /// Returns true if an expression was created.
6533   bool createIterCountExpr(const SCEV *S,
6534                            const SCEVDbgValueBuilder &IterationCount,
6535                            ScalarEvolution &SE) {
6536     // SCEVs for SSA values are most frequently of the form
6537     // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6538     // This is because %a is a PHI node that is not the IV. However, these
6539     // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6540     // so it's not expected that this point will be reached.
6541     if (!isa<SCEVAddRecExpr>(S))
6542       return false;
6543 
6544     LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6545                       << '\n');
6546 
6547     const auto *Rec = cast<SCEVAddRecExpr>(S);
6548     if (!Rec->isAffine())
6549       return false;
6550 
6551     if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6552       return false;
6553 
6554     // Initialise a new builder with the iteration count expression. In
6555     // combination with the value's SCEV this enables recovery.
6556     clone(IterationCount);
6557     if (!SCEVToValueExpr(*Rec, SE))
6558       return false;
6559 
6560     return true;
6561   }
6562 
6563   /// Convert the SCEV of the post-LSR induction variable into a DIExpression
6564   /// that recovers the loop's iteration count and push it onto the builder's
6565   /// expression stack. The stack should already hold an expression for the IV;
6566   /// the start is subtracted from it and the result is divided by the stride.
6567   /// Components of the expression are omitted if they are an identity function.
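       /// For an affine SCEV {start,+,stride}, the ops pushed here compute
       /// (given the induction variable already on the stack):
       ///   iteration_count = (induction_variable - start) / stride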
6568   bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6569                            ScalarEvolution &SE) {
6570     assert(SAR.isAffine() && "Expected affine SCEV");
6571     const SCEV *Start = SAR.getStart();
6572     const SCEV *Stride = SAR.getStepRecurrence(SE);
6573 
6574     // Skip pushing arithmetic noops.
6575     if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6576       if (!pushSCEV(Start))
6577         return false;
6578       pushOperator(llvm::dwarf::DW_OP_minus);
6579     }
6580     if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6581       if (!pushSCEV(Stride))
6582         return false;
6583       pushOperator(llvm::dwarf::DW_OP_div);
6584     }
6585     return true;
6586   }
6587 
6588   // Append the current expression and locations to a location list and an
6589   // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6590   // the locations already present in the destination list.
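       // For example, if this builder's ops contain "DW_OP_LLVM_arg 1" and that
       // location ends up at index 3 of DestLocations, the appended ops will
       // read "DW_OP_LLVM_arg 3".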
6591   void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6592                        SmallVectorImpl<Value *> &DestLocations) {
6593     assert(!DestLocations.empty() &&
6594            "Expected the locations vector to contain the IV");
6595     // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6596     // modified to account for the locations already in the destination vector.
6597     // All builders contain the IV as the first location op.
6598     assert(!LocationOps.empty() &&
6599            "Expected the location ops to contain the IV.");
6600     // DestIndexMap[n] contains the index in DestLocations for the nth
6601     // location in this SCEVDbgValueBuilder.
6602     SmallVector<uint64_t, 2> DestIndexMap;
6603     for (const auto &Op : LocationOps) {
6604       auto It = find(DestLocations, Op);
6605       if (It != DestLocations.end()) {
6606         // Location already exists in DestLocations, reuse existing ArgIndex.
6607         DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6608         continue;
6609       }
6610       // Location is not in DestLocations, add it.
6611       DestIndexMap.push_back(DestLocations.size());
6612       DestLocations.push_back(Op);
6613     }
6614 
6615     for (const auto &Op : expr_ops()) {
6616       if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6617         Op.appendToVector(DestExpr);
6618         continue;
6619       }
6620 
6621       DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6622       // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6623       // DestIndexMap[n] contains its new index in DestLocations.
6624       uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6625       DestExpr.push_back(NewIndex);
6626     }
6627   }
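  // Worked example (the values are hypothetical): if DestLocations already
  // contains [%iv, %a] and this builder's LocationOps are [%iv, %b], then
  // DestIndexMap becomes [0, 2] and %b is appended to DestLocations. An op
  // pair "DW_OP_LLVM_arg 1" in this builder's expression is therefore
  // rewritten to "DW_OP_LLVM_arg 2" in DestExpr; all other ops are copied
  // unchanged.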
6628 };
6629 
6630 /// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6631 /// and DIExpression.
6632 struct DVIRecoveryRec {
6633   DVIRecoveryRec(DbgValueInst *DbgValue)
6634       : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6635         HadLocationArgList(false) {}
6636   DVIRecoveryRec(DbgVariableRecord *DVR)
6637       : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6638 
6639   PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6640   DIExpression *Expr;
6641   bool HadLocationArgList;
6642   SmallVector<WeakVH, 2> LocationOps;
6643   SmallVector<const llvm::SCEV *, 2> SCEVs;
6644   SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6645 
6646   void clear() {
6647     for (auto &RE : RecoveryExprs)
6648       RE.reset();
6649     RecoveryExprs.clear();
6650   }
6651 
6652   ~DVIRecoveryRec() { clear(); }
6653 };
6654 } // namespace
6655 
6656 /// Returns the total number of DW_OP_llvm_arg operands in the expression.
6657 /// This helps in determining if a DIArglist is necessary or can be omitted from
6658 /// the dbg.value.
6659 static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6660   auto expr_ops = ToDwarfOpIter(Expr);
6661   unsigned Count = 0;
6662   for (auto Op : expr_ops)
6663     if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6664       Count++;
6665   return Count;
6666 }
6667 
6668 /// Overwrites DVI with the location and Ops as the DIExpression. This will
6669 /// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6670 /// because a DIArglist is not created for the first argument of the dbg.value.
6671 template <typename T>
6672 static void updateDVIWithLocation(T &DbgVal, Value *Location,
6673                                   SmallVectorImpl<uint64_t> &Ops) {
6674   assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6675                                     "contain any DW_OP_llvm_arg operands.");
6676   DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6677   DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6679 }
6680 
6681 /// Overwrite DVI with locations placed into a DIArglist.
6682 template <typename T>
6683 static void updateDVIWithLocations(T &DbgVal,
6684                                    SmallVectorImpl<Value *> &Locations,
6685                                    SmallVectorImpl<uint64_t> &Ops) {
6686   assert(numLLVMArgOps(Ops) != 0 &&
6687          "Expected expression that references DIArglist locations using "
6688          "DW_OP_llvm_arg operands.");
6689   SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6690   for (Value *V : Locations)
6691     MetadataLocs.push_back(ValueAsMetadata::get(V));
6692   auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6693   DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6694   DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6695 }
6696 
6697 /// Write the new expression and new location ops for the dbg.value. If
6698 /// possible, reduce the size of the dbg.value intrinsic by omitting the
6699 /// DIArglist, which can be done if:
6700 /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6701 /// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6702 static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6703                                SmallVectorImpl<Value *> &NewLocationOps,
6704                                SmallVectorImpl<uint64_t> &NewExpr) {
6705   auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6706     unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6707     if (NumLLVMArgs == 0) {
6708       // Location assumed to be on the stack.
6709       updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6710     } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6711       // There is only a single DW_OP_llvm_arg at the start of the expression,
6712       // so it can be omitted along with DIArglist.
6713       assert(NewExpr[1] == 0 &&
6714              "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6715       llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6716       updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6717     } else {
6718       // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6719       updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6720     }
6721 
6722     // If the DIExpression was previously empty then add the stack terminator.
6723     // Non-empty expressions have only had elements inserted into them and so
6724     // the terminator should already be present e.g. stack_value or fragment.
6725     DIExpression *SalvageExpr = DbgVal->getExpression();
6726     if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6727       SalvageExpr =
6728           DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6729       DbgVal->setExpression(SalvageExpr);
6730     }
6731   };
6732   if (isa<DbgValueInst *>(DVIRec.DbgRef))
6733     UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6734   else
6735     UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6736 }
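// Illustrative example (names and offsets are hypothetical): with
// NewLocationOps = [%iv] and
// NewExpr = {DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4, DW_OP_stack_value},
// the lone leading arg refers to location 0, so the dbg.value is emitted with
// %iv as a plain (non-DIArgList) location and the shortened expression
// {DW_OP_plus_uconst, 4, DW_OP_stack_value}. Expressions with multiple
// DW_OP_llvm_arg references, or a single one that is not the first operand,
// keep the DIArgList form instead.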
6737 
6738 /// Cached location ops may be erased during LSR, in which case a poison is
6739 /// required when restoring from the cache. The type of that location is no
6740 /// longer available, so just use int8. The poison will be replaced by one or
6741 /// more locations later when a SCEVDbgValueBuilder selects alternative
6742 /// locations to use for the salvage.
6743 static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6744   return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6745 }
6746 
6747 /// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6748 static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6749   auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6750     LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6751                       << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6752     assert(DVIRec.Expr && "Expected an expression");
6753     DbgVal->setExpression(DVIRec.Expr);
6754 
6755     // Even a single location-op may be inside a DIArgList and referenced with
6756     // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6757     if (!DVIRec.HadLocationArgList) {
6758       assert(DVIRec.LocationOps.size() == 1 &&
6759              "Unexpected number of location ops.");
6760       // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6761       // this case was not present before, so force the location back to a
6762       // single uncontained Value.
6763       Value *CachedValue =
6764           getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6765       DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6766     } else {
6767       SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6768       for (WeakVH VH : DVIRec.LocationOps) {
6769         Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6770         MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6771       }
6772       auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6773       DbgVal->setRawLocation(
6774           llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6775     }
6776     LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6777   };
6778   if (isa<DbgValueInst *>(DVIRec.DbgRef))
6779     RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6780   else
6781     RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6782 }
6783 
6784 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6785                        llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6786                        const SCEV *SCEVInductionVar,
6787                        SCEVDbgValueBuilder IterCountExpr) {
6788 
6789   if (isa<DbgValueInst *>(DVIRec.DbgRef)
6790           ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6791           : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6792     return false;
6793 
6794   // LSR may have caused several changes to the dbg.value in the failed salvage
6795   // attempt. So restore the DIExpression, the location ops and also the
6796   // location ops format, which is always DIArglist for multiple ops, but only
6797   // sometimes for a single op.
6798   restorePreTransformState(DVIRec);
6799 
6800   // LocationOpIndexMap[i] will store the post-LSR location index of
6801   // the non-optimised out location at pre-LSR index i.
6802   SmallVector<int64_t, 2> LocationOpIndexMap;
6803   LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6804   SmallVector<Value *, 2> NewLocationOps;
6805   NewLocationOps.push_back(LSRInductionVar);
6806 
6807   for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6808     WeakVH VH = DVIRec.LocationOps[i];
6809     // Place the locations not optimised out in the list first, avoiding
6810     // inserts later. The map is used to update the DIExpression's
6811     // DW_OP_LLVM_arg arguments as the expression is updated.
6812     if (VH && !isa<UndefValue>(VH)) {
6813       NewLocationOps.push_back(VH);
6814       LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6815       LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6816                         << " now at index " << LocationOpIndexMap[i] << "\n");
6817       continue;
6818     }
6819 
6820     // It's possible that a value referred to in the SCEV may have been
6821     // optimised out by LSR.
6822     if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6823         SE.containsUndefs(DVIRec.SCEVs[i])) {
6824       LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6825                         << " refers to a location that is now undef or erased. "
6826                            "Salvage abandoned.\n");
6827       return false;
6828     }
6829 
6830     LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6831                       << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6832 
6833     DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6834     SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6835 
6836     // Create an offset-based salvage expression if possible, as it requires
6837     // less DWARF ops than an iteration count-based expression.
6838     if (std::optional<APInt> Offset =
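    // For example (hypothetical SCEVs): if the location's SCEV is {6,+,2} and
    // the post-LSR IV's SCEV is {4,+,2}, computeConstantDifference yields 2,
    // so the location can be recovered simply as "IV + 2" rather than being
    // rebuilt from the iteration count.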
6839             SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6840       if (Offset->getSignificantBits() <= 64)
6841         SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6842       else
6843         return false;
6844     } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6845                                                  SE))
6846       return false;
6847   }
6848 
6849   // Merge the DbgValueBuilder-generated expressions and the original
6850   // DIExpression, placing the result into a new vector.
6851   SmallVector<uint64_t, 3> NewExpr;
6852   if (DVIRec.Expr->getNumElements() == 0) {
6853     assert(DVIRec.RecoveryExprs.size() == 1 &&
6854            "Expected only a single recovery expression for an empty "
6855            "DIExpression.");
6856     assert(DVIRec.RecoveryExprs[0] &&
6857            "Expected a SCEVDbgSalvageBuilder for location 0");
6858     SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6859     B->appendToVectors(NewExpr, NewLocationOps);
6860   }
6861   for (const auto &Op : DVIRec.Expr->expr_ops()) {
6862     // Most Ops needn't be updated.
6863     if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6864       Op.appendToVector(NewExpr);
6865       continue;
6866     }
6867 
6868     uint64_t LocationArgIndex = Op.getArg(0);
6869     SCEVDbgValueBuilder *DbgBuilder =
6870         DVIRec.RecoveryExprs[LocationArgIndex].get();
6871     // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6872     // optimise it away. Just translate the argument to the updated
6873     // location index.
6874     if (!DbgBuilder) {
6875       NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6876       assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6877              "Expected a positive index for the location-op position.");
6878       NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6879       continue;
6880     }
6881     // The location has a recovery expression.
6882     DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6883   }
6884 
6885   UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6886   if (isa<DbgValueInst *>(DVIRec.DbgRef))
6887     LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6888                       << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6889   else
6890     LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6891                       << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6892   return true;
6893 }
6894 
6895 /// Obtain an expression for the iteration count, then attempt to salvage the
6896 /// dbg.value intrinsics.
6897 static void DbgRewriteSalvageableDVIs(
6898     llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6899     SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6900   if (DVIToUpdate.empty())
6901     return;
6902 
6903   const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6904   assert(SCEVInductionVar &&
6905          "Anticipated a SCEV for the post-LSR induction variable");
6906 
6907   if (const SCEVAddRecExpr *IVAddRec =
6908           dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6909     if (!IVAddRec->isAffine())
6910       return;
6911 
6912     // Prevent translation using excessive resources.
6913     if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6914       return;
6915 
6916     // The iteration count is required to recover location values.
6917     SCEVDbgValueBuilder IterCountExpr;
6918     IterCountExpr.pushLocation(LSRInductionVar);
6919     if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6920       return;
6921 
6922     LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6923                       << '\n');
6924 
6925     for (auto &DVIRec : DVIToUpdate) {
6926       SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6927                  IterCountExpr);
6928     }
6929   }
6930 }
6931 
6932 /// Identify and cache salvageable DVI locations and expressions along with the
6933 /// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6934 /// caching and salvaging.
6935 static void DbgGatherSalvagableDVI(
6936     Loop *L, ScalarEvolution &SE,
6937     SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6938     SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6939   for (const auto &B : L->getBlocks()) {
6940     for (auto &I : *B) {
6941       auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6942         // Ensure that the dbg.value is not cached if any of its location
6943         // ops are undef.
6944         if (DbgVal->isKillLocation())
6945           return false;
6946 
6947         // Check that the location op SCEVs are suitable for translation to
6948         // DIExpression.
6949         const auto &HasTranslatableLocationOps =
6950             [&](const auto *DbgValToTranslate) -> bool {
6951           for (const auto LocOp : DbgValToTranslate->location_ops()) {
6952             if (!LocOp)
6953               return false;
6954 
6955             if (!SE.isSCEVable(LocOp->getType()))
6956               return false;
6957 
6958             const SCEV *S = SE.getSCEV(LocOp);
6959             if (SE.containsUndefs(S))
6960               return false;
6961           }
6962           return true;
6963         };
6964 
6965         if (!HasTranslatableLocationOps(DbgVal))
6966           return false;
6967 
6968         std::unique_ptr<DVIRecoveryRec> NewRec =
6969             std::make_unique<DVIRecoveryRec>(DbgVal);
6970         // Each location Op may need a SCEVDbgValueBuilder in order to recover
6971         // it. Pre-allocating a vector will enable quick lookups of the builder
6972         // later during the salvage.
6973         NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
6974         for (const auto LocOp : DbgVal->location_ops()) {
6975           NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6976           NewRec->LocationOps.push_back(LocOp);
6977           NewRec->HadLocationArgList = DbgVal->hasArgList();
6978         }
6979         SalvageableDVISCEVs.push_back(std::move(NewRec));
6980         return true;
6981       };
6982       for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
6983         if (DVR.isDbgValue() || DVR.isDbgAssign())
6984           ProcessDbgValue(&DVR);
6985       }
6986       auto DVI = dyn_cast<DbgValueInst>(&I);
6987       if (!DVI)
6988         continue;
6989       if (ProcessDbgValue(DVI))
6990         DVIHandles.insert(DVI);
6991     }
6992   }
6993 }
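// For instance (hypothetical IR), a "dbg.value(i32 %add, ..., !DIExpression())"
// whose operand %add has the SCEV {%start,+,4}<%loop> is cached here together
// with that SCEV and a pre-sized RecoveryExprs slot, so that SalvageDVI can
// later rebuild %add from whichever IV survives LSR.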
6994 
6995 /// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6996 /// any PHI from the loop header is usable, but may have less chance of
6997 /// surviving subsequent transforms.
6998 static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
6999                                            const LSRInstance &LSR) {
7000 
7001   auto IsSuitableIV = [&](PHINode *P) {
7002     if (!SE.isSCEVable(P->getType()))
7003       return false;
7004     if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7005       return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7006     return false;
7007   };
7008 
7009   // For now, just pick the first IV that was generated and inserted by
7010   // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7011   // by subsequent transforms.
7012   for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7013     if (!IV)
7014       continue;
7015 
7016     // There should only be PHI node IVs.
7017     PHINode *P = cast<PHINode>(&*IV);
7018 
7019     if (IsSuitableIV(P))
7020       return P;
7021   }
7022 
7023   for (PHINode &P : L.getHeader()->phis()) {
7024     if (IsSuitableIV(&P))
7025       return &P;
7026   }
7027   return nullptr;
7028 }
7029 
7030 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7031                                DominatorTree &DT, LoopInfo &LI,
7032                                const TargetTransformInfo &TTI,
7033                                AssumptionCache &AC, TargetLibraryInfo &TLI,
7034                                MemorySSA *MSSA) {
7035 
7036   // Debug preservation - before we start removing anything, identify which
7037   // DVIs meet the salvageable criteria and store their DIExpression and SCEVs.
7038   SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7039   SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7040   DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7041 
7042   bool Changed = false;
7043   std::unique_ptr<MemorySSAUpdater> MSSAU;
7044   if (MSSA)
7045     MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7046 
7047   // Run the main LSR transformation.
7048   const LSRInstance &Reducer =
7049       LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7050   Changed |= Reducer.getChanged();
7051 
7052   // Remove any extra phis created by processing inner loops.
7053   Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7054   if (EnablePhiElim && L->isLoopSimplifyForm()) {
7055     SmallVector<WeakTrackingVH, 16> DeadInsts;
7056     const DataLayout &DL = L->getHeader()->getDataLayout();
7057     SCEVExpander Rewriter(SE, DL, "lsr", false);
7058 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
7059     Rewriter.setDebugType(DEBUG_TYPE);
7060 #endif
7061     unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7062     Rewriter.clear();
7063     if (numFolded) {
7064       Changed = true;
7065       RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7066                                                            MSSAU.get());
7067       DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7068     }
7069   }
7070   // LSR may at times remove all uses of an induction variable from a loop.
7071   // The only remaining use is the PHI in the exit block.
7072   // When this is the case, if the exit value of the IV can be calculated using
7073   // SCEV, we can replace the exit block PHI with the final value of the IV and
7074   // skip the updates in each loop iteration.
7075   if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7076     SmallVector<WeakTrackingVH, 16> DeadInsts;
7077     const DataLayout &DL = L->getHeader()->getDataLayout();
7078     SCEVExpander Rewriter(SE, DL, "lsr", true);
7079     int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7080                                          UnusedIndVarInLoop, DeadInsts);
7081     Rewriter.clear();
7082     if (Rewrites) {
7083       Changed = true;
7084       RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7085                                                            MSSAU.get());
7086       DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7087     }
7088   }
7089 
7090   if (SalvageableDVIRecords.empty())
7091     return Changed;
7092 
7093   // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7094   // expressions composed using the derived iteration count.
7095   // TODO: Allow for multiple IV references for nested AddRecSCEVs
7096   for (const auto &L : LI) {
7097     if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7098       DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7099     else {
7100       LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7101                            "could not be identified.\n");
7102     }
7103   }
7104 
7105   for (auto &Rec : SalvageableDVIRecords)
7106     Rec->clear();
7107   SalvageableDVIRecords.clear();
7108   DVIHandles.clear();
7109   return Changed;
7110 }
7111 
7112 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7113   if (skipLoop(L))
7114     return false;
7115 
7116   auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7117   auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7118   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7119   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7120   const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7121       *L->getHeader()->getParent());
7122   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7123       *L->getHeader()->getParent());
7124   auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7125       *L->getHeader()->getParent());
7126   auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7127   MemorySSA *MSSA = nullptr;
7128   if (MSSAAnalysis)
7129     MSSA = &MSSAAnalysis->getMSSA();
7130   return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7131 }
7132 
7133 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7134                                               LoopStandardAnalysisResults &AR,
7135                                               LPMUpdater &) {
7136   if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7137                           AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7138     return PreservedAnalyses::all();
7139 
7140   auto PA = getLoopPassPreservedAnalyses();
7141   if (AR.MSSA)
7142     PA.preserve<MemorySSAAnalysis>();
7143   return PA;
7144 }
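// Usage note (sketch, not part of the pass itself): with the new pass manager
// this transform is registered under the name "loop-reduce", so it can be
// exercised in isolation with something like
//   opt -passes='loop(loop-reduce)' -S input.ll
// while the legacy pass below is driven through the usual -loop-reduce flag.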
7145 
7146 char LoopStrengthReduce::ID = 0;
7147 
7148 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7149                       "Loop Strength Reduction", false, false)
7150 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7151 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7152 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7153 INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7154 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7155 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7156 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7157                     "Loop Strength Reduction", false, false)
7158 
7159 Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
7160