xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp (revision 36b606ae6aa4b24061096ba18582e0a08ccd5dba)
1  //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // This transformation analyzes and transforms the induction variables (and
10  // computations derived from them) into forms suitable for efficient execution
11  // on the target.
12  //
13  // This pass performs strength reduction on array references inside loops that
14  // have the loop induction variable as one of their components; it rewrites
15  // expressions to take advantage of scaled-index addressing modes available on
16  // the target, and it performs a variety of other optimizations related to loop
17  // induction variables.
18  //
19  // Terminology note: this code has a lot of handling for "post-increment" or
20  // "post-inc" users. This is not talking about post-increment addressing modes;
21  // it is instead talking about code like this:
22  //
23  //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
24  //   ...
25  //   %i.next = add %i, 1
26  //   %c = icmp eq %i.next, %n
27  //
28  // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29  // it's useful to think about these as the same register, with some uses using
30  // the value of the register before the add and some using it after. In this
31  // example, the icmp is a post-increment user, since it uses %i.next, which is
32  // the value of the induction variable after the increment. The other common
33  // case of post-increment users is users outside the loop.
34  //
35  // TODO: More sophistication in the way Formulae are generated and filtered.
36  //
37  // TODO: Handle multiple loops at a time.
38  //
39  // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40  //       of a GlobalValue?
41  //
42  // TODO: When truncation is free, truncate ICmp users' operands to make it a
43  //       smaller encoding (on x86 at least).
44  //
45  // TODO: When a negated register is used by an add (such as in a list of
46  //       multiple base registers, or as the increment expression in an addrec),
47  //       we may not actually need both reg and (-1 * reg) in registers; the
48  //       negation can be implemented by using a sub instead of an add. The
49  //       lack of support for taking this into consideration when making
50  //       register pressure decisions is partly worked around by the "Special"
51  //       use kind.
52  //
53  //===----------------------------------------------------------------------===//
54  
55  #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
56  #include "llvm/ADT/APInt.h"
57  #include "llvm/ADT/DenseMap.h"
58  #include "llvm/ADT/DenseSet.h"
59  #include "llvm/ADT/Hashing.h"
60  #include "llvm/ADT/PointerIntPair.h"
61  #include "llvm/ADT/STLExtras.h"
62  #include "llvm/ADT/SetVector.h"
63  #include "llvm/ADT/SmallBitVector.h"
64  #include "llvm/ADT/SmallPtrSet.h"
65  #include "llvm/ADT/SmallSet.h"
66  #include "llvm/ADT/SmallVector.h"
67  #include "llvm/ADT/Statistic.h"
68  #include "llvm/ADT/iterator_range.h"
69  #include "llvm/Analysis/AssumptionCache.h"
70  #include "llvm/Analysis/DomTreeUpdater.h"
71  #include "llvm/Analysis/IVUsers.h"
72  #include "llvm/Analysis/LoopAnalysisManager.h"
73  #include "llvm/Analysis/LoopInfo.h"
74  #include "llvm/Analysis/LoopPass.h"
75  #include "llvm/Analysis/MemorySSA.h"
76  #include "llvm/Analysis/MemorySSAUpdater.h"
77  #include "llvm/Analysis/ScalarEvolution.h"
78  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
79  #include "llvm/Analysis/ScalarEvolutionNormalization.h"
80  #include "llvm/Analysis/TargetLibraryInfo.h"
81  #include "llvm/Analysis/TargetTransformInfo.h"
82  #include "llvm/Analysis/ValueTracking.h"
83  #include "llvm/BinaryFormat/Dwarf.h"
84  #include "llvm/Config/llvm-config.h"
85  #include "llvm/IR/BasicBlock.h"
86  #include "llvm/IR/Constant.h"
87  #include "llvm/IR/Constants.h"
88  #include "llvm/IR/DebugInfoMetadata.h"
89  #include "llvm/IR/DerivedTypes.h"
90  #include "llvm/IR/Dominators.h"
91  #include "llvm/IR/GlobalValue.h"
92  #include "llvm/IR/IRBuilder.h"
93  #include "llvm/IR/InstrTypes.h"
94  #include "llvm/IR/Instruction.h"
95  #include "llvm/IR/Instructions.h"
96  #include "llvm/IR/IntrinsicInst.h"
97  #include "llvm/IR/Module.h"
98  #include "llvm/IR/Operator.h"
99  #include "llvm/IR/PassManager.h"
100  #include "llvm/IR/Type.h"
101  #include "llvm/IR/Use.h"
102  #include "llvm/IR/User.h"
103  #include "llvm/IR/Value.h"
104  #include "llvm/IR/ValueHandle.h"
105  #include "llvm/InitializePasses.h"
106  #include "llvm/Pass.h"
107  #include "llvm/Support/Casting.h"
108  #include "llvm/Support/CommandLine.h"
109  #include "llvm/Support/Compiler.h"
110  #include "llvm/Support/Debug.h"
111  #include "llvm/Support/ErrorHandling.h"
112  #include "llvm/Support/MathExtras.h"
113  #include "llvm/Support/raw_ostream.h"
114  #include "llvm/Transforms/Scalar.h"
115  #include "llvm/Transforms/Utils.h"
116  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
117  #include "llvm/Transforms/Utils/Local.h"
118  #include "llvm/Transforms/Utils/LoopUtils.h"
119  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
120  #include <algorithm>
121  #include <cassert>
122  #include <cstddef>
123  #include <cstdint>
124  #include <iterator>
125  #include <limits>
126  #include <map>
127  #include <numeric>
128  #include <optional>
129  #include <utility>
130  
131  using namespace llvm;
132  
133  #define DEBUG_TYPE "loop-reduce"
134  
135  /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
136  /// bail out. This threshold is far beyond the number of users that LSR can
137  /// conceivably solve, so it should not affect generated code, but it catches the
138  /// worst cases before LSR burns too much compile time and stack space.
139  static const unsigned MaxIVUsers = 200;
140  
141  /// Limit the size of expression that SCEV-based salvaging will attempt to
142  /// translate into a DIExpression.
143  /// Choose a maximum size such that debuginfo is not excessively increased and
144  /// the salvaging is not too expensive for the compiler.
145  static const unsigned MaxSCEVSalvageExpressionSize = 64;
146  
147  // Cleanup congruent phis after LSR phi expansion.
148  static cl::opt<bool> EnablePhiElim(
149    "enable-lsr-phielim", cl::Hidden, cl::init(true),
150    cl::desc("Enable LSR phi elimination"));
151  
152  // This flag adds the instruction count to the solution cost comparison.
153  static cl::opt<bool> InsnsCost(
154    "lsr-insns-cost", cl::Hidden, cl::init(true),
155    cl::desc("Add instruction count to a LSR cost model"));
156  
157  // Flag to choose how to narrow a complex LSR solution.
158  static cl::opt<bool> LSRExpNarrow(
159    "lsr-exp-narrow", cl::Hidden, cl::init(false),
160    cl::desc("Narrow LSR complex solution using"
161             " expectation of registers number"));
162  
163  // Flag to narrow search space by filtering non-optimal formulae with
164  // the same ScaledReg and Scale.
165  static cl::opt<bool> FilterSameScaledReg(
166      "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167      cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168               " with the same ScaledReg and Scale"));
169  
170  static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
171    "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172     cl::desc("A flag that overrides the target's preferred addressing mode."),
173     cl::values(clEnumValN(TTI::AMK_None,
174                           "none",
175                           "Don't prefer any addressing mode"),
176                clEnumValN(TTI::AMK_PreIndexed,
177                           "preindexed",
178                           "Prefer pre-indexed addressing mode"),
179                clEnumValN(TTI::AMK_PostIndexed,
180                           "postindexed",
181                           "Prefer post-indexed addressing mode")));
182  
183  static cl::opt<unsigned> ComplexityLimit(
184    "lsr-complexity-limit", cl::Hidden,
185    cl::init(std::numeric_limits<uint16_t>::max()),
186    cl::desc("LSR search space complexity limit"));
187  
188  static cl::opt<unsigned> SetupCostDepthLimit(
189      "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190      cl::desc("The limit on recursion depth for LSRs setup cost"));
191  
192  static cl::opt<cl::boolOrDefault> AllowTerminatingConditionFoldingAfterLSR(
193      "lsr-term-fold", cl::Hidden,
194      cl::desc("Attempt to replace primary IV with other IV."));
195  
196  static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
197      "lsr-drop-solution", cl::Hidden,
198      cl::desc("Attempt to drop solution if it is less profitable"));
199  
200  static cl::opt<bool> EnableVScaleImmediates(
201      "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
202      cl::desc("Enable analysis of vscale-relative immediates in LSR"));
203  
204  static cl::opt<bool> DropScaledForVScale(
205      "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
206      cl::desc("Avoid using scaled registers with vscale-relative addressing"));
207  
208  STATISTIC(NumTermFold,
209            "Number of terminating condition fold recognized and performed");
210  
211  #ifndef NDEBUG
212  // Stress test IV chain generation.
213  static cl::opt<bool> StressIVChain(
214    "stress-ivchain", cl::Hidden, cl::init(false),
215    cl::desc("Stress test LSR IV chains"));
216  #else
217  static bool StressIVChain = false;
218  #endif
219  
220  namespace {
221  
222  struct MemAccessTy {
223    /// Used in situations where the accessed memory type is unknown.
224    static const unsigned UnknownAddressSpace =
225        std::numeric_limits<unsigned>::max();
226  
227    Type *MemTy = nullptr;
228    unsigned AddrSpace = UnknownAddressSpace;
229  
230    MemAccessTy() = default;
231    MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
232  
233    bool operator==(MemAccessTy Other) const {
234      return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
235    }
236  
237    bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
238  
239    static MemAccessTy getUnknown(LLVMContext &Ctx,
240                                  unsigned AS = UnknownAddressSpace) {
241      return MemAccessTy(Type::getVoidTy(Ctx), AS);
242    }
243  
244    Type *getType() { return MemTy; }
245  };
246  
247  /// This class holds data which is used to order reuse candidates.
248  class RegSortData {
249  public:
250    /// This represents the set of LSRUse indices which reference
251    /// a particular register.
252    SmallBitVector UsedByIndices;
253  
254    void print(raw_ostream &OS) const;
255    void dump() const;
256  };
257  
258  // An offset from an address that is either scalable or fixed. Used for
259  // per-target optimizations of addressing modes.
260  class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
261    constexpr Immediate(ScalarTy MinVal, bool Scalable)
262        : FixedOrScalableQuantity(MinVal, Scalable) {}
263  
264    constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
265        : FixedOrScalableQuantity(V) {}
266  
267  public:
268    constexpr Immediate() = delete;
269  
270    static constexpr Immediate getFixed(ScalarTy MinVal) {
271      return {MinVal, false};
272    }
273    static constexpr Immediate getScalable(ScalarTy MinVal) {
274      return {MinVal, true};
275    }
276    static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
277      return {MinVal, Scalable};
278    }
279    static constexpr Immediate getZero() { return {0, false}; }
280    static constexpr Immediate getFixedMin() {
281      return {std::numeric_limits<int64_t>::min(), false};
282    }
283    static constexpr Immediate getFixedMax() {
284      return {std::numeric_limits<int64_t>::max(), false};
285    }
286    static constexpr Immediate getScalableMin() {
287      return {std::numeric_limits<int64_t>::min(), true};
288    }
289    static constexpr Immediate getScalableMax() {
290      return {std::numeric_limits<int64_t>::max(), true};
291    }
292  
293    constexpr bool isLessThanZero() const { return Quantity < 0; }
294  
295    constexpr bool isGreaterThanZero() const { return Quantity > 0; }
296  
297    constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
298      return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
299    }
300  
301    constexpr bool isMin() const {
302      return Quantity == std::numeric_limits<ScalarTy>::min();
303    }
304  
305    constexpr bool isMax() const {
306      return Quantity == std::numeric_limits<ScalarTy>::max();
307    }
308  
309    // Arithmetic 'operators' that cast to unsigned types first.
310    constexpr Immediate addUnsigned(const Immediate &RHS) const {
311      assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
312      ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
313      return {Value, Scalable || RHS.isScalable()};
314    }
315  
316    constexpr Immediate subUnsigned(const Immediate &RHS) const {
317      assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
318      ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
319      return {Value, Scalable || RHS.isScalable()};
320    }
321  
322    // Scale the quantity by a constant without caring about runtime scalability.
323    constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
324      ScalarTy Value = (uint64_t)Quantity * RHS;
325      return {Value, Scalable};
326    }
327  
328    // Helpers for generating SCEVs with vscale terms where needed.
329    const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
330      const SCEV *S = SE.getConstant(Ty, Quantity);
331      if (Scalable)
332        S = SE.getMulExpr(S, SE.getVScale(S->getType()));
333      return S;
334    }
335  
336    const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
337      const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
338      if (Scalable)
339        NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
340      return NegS;
341    }
342  
343    const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
344      const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
345      if (Scalable)
346        SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
347      return SU;
348    }
349  };
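// Illustrative sketch (not part of the pass; SE and Ty stand for some
// ScalarEvolution and integer Type in scope): a plain byte offset of 16 is
// Immediate::getFixed(16), while an offset of 16 * vscale bytes is
// Immediate::getScalable(16). The two kinds only mix when one side is zero:
//
//   Immediate A = Immediate::getFixed(16);
//   Immediate B = Immediate::getScalable(16);
//   A.isCompatibleImmediate(B);                    // false
//   A.isCompatibleImmediate(Immediate::getZero()); // true
//   B.mulUnsigned(2);                              // scalable 32
//   B.getSCEV(SE, Ty);                             // (16 * vscale) as a SCEV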
350  
351  // This is needed for the Compare type of std::map when Immediate is used
352  // as a key. We don't need it to be fully correct against any value of vscale,
353  // just to make sure that vscale-related terms in the map are considered against
354  // each other rather than being mixed up and potentially missing opportunities.
355  struct KeyOrderTargetImmediate {
356    bool operator()(const Immediate &LHS, const Immediate &RHS) const {
357      if (LHS.isScalable() && !RHS.isScalable())
358        return false;
359      if (!LHS.isScalable() && RHS.isScalable())
360        return true;
361      return LHS.getKnownMinValue() < RHS.getKnownMinValue();
362    }
363  };
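// For example, under this ordering every fixed immediate sorts before every
// scalable one, and within the same kind the comparison falls back to the
// known minimum value, so getFixed(8) < getFixed(16) < getScalable(-4) <
// getScalable(4).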
364  
365  // This would be nicer if we could be generic instead of directly using size_t,
366  // but there doesn't seem to be a type trait for is_orderable or
367  // is_lessthan_comparable or similar.
368  struct KeyOrderSizeTAndImmediate {
369    bool operator()(const std::pair<size_t, Immediate> &LHS,
370                    const std::pair<size_t, Immediate> &RHS) const {
371      size_t LSize = LHS.first;
372      size_t RSize = RHS.first;
373      if (LSize != RSize)
374        return LSize < RSize;
375      return KeyOrderTargetImmediate()(LHS.second, RHS.second);
376    }
377  };
378  } // end anonymous namespace
379  
380  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
381  void RegSortData::print(raw_ostream &OS) const {
382    OS << "[NumUses=" << UsedByIndices.count() << ']';
383  }
384  
385  LLVM_DUMP_METHOD void RegSortData::dump() const {
386    print(errs()); errs() << '\n';
387  }
388  #endif
389  
390  namespace {
391  
392  /// Map register candidates to information about how they are used.
393  class RegUseTracker {
394    using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
395  
396    RegUsesTy RegUsesMap;
397    SmallVector<const SCEV *, 16> RegSequence;
398  
399  public:
400    void countRegister(const SCEV *Reg, size_t LUIdx);
401    void dropRegister(const SCEV *Reg, size_t LUIdx);
402    void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
403  
404    bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
405  
406    const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
407  
408    void clear();
409  
410    using iterator = SmallVectorImpl<const SCEV *>::iterator;
411    using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
412  
413    iterator begin() { return RegSequence.begin(); }
414    iterator end()   { return RegSequence.end(); }
415    const_iterator begin() const { return RegSequence.begin(); }
416    const_iterator end() const   { return RegSequence.end(); }
417  };
418  
419  } // end anonymous namespace
420  
421  void
422  RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
423    std::pair<RegUsesTy::iterator, bool> Pair =
424      RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
425    RegSortData &RSD = Pair.first->second;
426    if (Pair.second)
427      RegSequence.push_back(Reg);
428    RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
429    RSD.UsedByIndices.set(LUIdx);
430  }
431  
432  void
433  RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
434    RegUsesTy::iterator It = RegUsesMap.find(Reg);
435    assert(It != RegUsesMap.end());
436    RegSortData &RSD = It->second;
437    assert(RSD.UsedByIndices.size() > LUIdx);
438    RSD.UsedByIndices.reset(LUIdx);
439  }
440  
441  void
442  RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
443    assert(LUIdx <= LastLUIdx);
444  
445    // Update RegUses. The data structure is not optimized for this purpose;
446    // we must iterate through it and update each of the bit vectors.
447    for (auto &Pair : RegUsesMap) {
448      SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
449      if (LUIdx < UsedByIndices.size())
450        UsedByIndices[LUIdx] =
451          LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
452      UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
453    }
454  }
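// For example, when use #1 is dropped and the last use index is #3, the bit
// previously tracked for use #3 moves into slot #1 in every register's
// UsedByIndices vector, and the vectors are truncated so index #3 no longer
// exists; this mirrors the swap-with-last-and-pop done on the LSRUse list.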
455  
456  bool
457  RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
458    RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459    if (I == RegUsesMap.end())
460      return false;
461    const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
462    int i = UsedByIndices.find_first();
463    if (i == -1) return false;
464    if ((size_t)i != LUIdx) return true;
465    return UsedByIndices.find_next(i) != -1;
466  }
467  
468  const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
469    RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
470    assert(I != RegUsesMap.end() && "Unknown register!");
471    return I->second.UsedByIndices;
472  }
473  
474  void RegUseTracker::clear() {
475    RegUsesMap.clear();
476    RegSequence.clear();
477  }
478  
479  namespace {
480  
481  /// This class holds information that describes a formula for computing
482  /// satisfying a use. It may include broken-out immediates and scaled registers.
483  struct Formula {
484    /// Global base address used for complex addressing.
485    GlobalValue *BaseGV = nullptr;
486  
487    /// Base offset for complex addressing.
488    Immediate BaseOffset = Immediate::getZero();
489  
490    /// Whether any complex addressing has a base register.
491    bool HasBaseReg = false;
492  
493    /// The scale of any complex addressing.
494    int64_t Scale = 0;
495  
496    /// The list of "base" registers for this use. When this is non-empty, the
497    /// canonical representation of a formula is:
498    /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
499    /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
500    /// 3. The reg containing the recurrent expr related to the current loop in the
501    /// formula should be put in the ScaledReg.
502    /// #1 enforces that the scaled register is always used when at least two
503    /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
504    /// #2 enforces that 1 * reg is reg.
505    /// #3 ensures invariant regs with respect to the current loop can be combined
506    /// together in LSR codegen.
507    /// This invariant can be temporarily broken while building a formula.
508    /// However, every formula inserted into the LSRInstance must be in canonical
509    /// form.
510    SmallVector<const SCEV *, 4> BaseRegs;
511  
512    /// The 'scaled' register for this use. This should be non-null when Scale is
513    /// not zero.
514    const SCEV *ScaledReg = nullptr;
515  
516    /// An additional constant offset which is added near the use. This requires a
517    /// temporary register, but the offset itself can live in an add immediate
518    /// field rather than a register.
519    Immediate UnfoldedOffset = Immediate::getZero();
520  
521    Formula() = default;
522  
523    void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
524  
525    bool isCanonical(const Loop &L) const;
526  
527    void canonicalize(const Loop &L);
528  
529    bool unscale();
530  
531    bool hasZeroEnd() const;
532  
533    size_t getNumRegs() const;
534    Type *getType() const;
535  
536    void deleteBaseReg(const SCEV *&S);
537  
538    bool referencesReg(const SCEV *S) const;
539    bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
540                                    const RegUseTracker &RegUses) const;
541  
542    void print(raw_ostream &OS) const;
543    void dump() const;
544  };
545  
546  } // end anonymous namespace
547  
548  /// Recursion helper for initialMatch.
549  static void DoInitialMatch(const SCEV *S, Loop *L,
550                             SmallVectorImpl<const SCEV *> &Good,
551                             SmallVectorImpl<const SCEV *> &Bad,
552                             ScalarEvolution &SE) {
553    // Collect expressions which properly dominate the loop header.
554    if (SE.properlyDominates(S, L->getHeader())) {
555      Good.push_back(S);
556      return;
557    }
558  
559    // Look at add operands.
560    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
561      for (const SCEV *S : Add->operands())
562        DoInitialMatch(S, L, Good, Bad, SE);
563      return;
564    }
565  
566    // Look at addrec operands.
567    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
568      if (!AR->getStart()->isZero() && AR->isAffine()) {
569        DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
570        DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
571                                        AR->getStepRecurrence(SE),
572                                        // FIXME: AR->getNoWrapFlags()
573                                        AR->getLoop(), SCEV::FlagAnyWrap),
574                       L, Good, Bad, SE);
575        return;
576      }
577  
578    // Handle a multiplication by -1 (negation) if it didn't fold.
579    if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
580      if (Mul->getOperand(0)->isAllOnesValue()) {
581        SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
582        const SCEV *NewMul = SE.getMulExpr(Ops);
583  
584        SmallVector<const SCEV *, 4> MyGood;
585        SmallVector<const SCEV *, 4> MyBad;
586        DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
587        const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
588          SE.getEffectiveSCEVType(NewMul->getType())));
589        for (const SCEV *S : MyGood)
590          Good.push_back(SE.getMulExpr(NegOne, S));
591        for (const SCEV *S : MyBad)
592          Bad.push_back(SE.getMulExpr(NegOne, S));
593        return;
594      }
595  
596    // Ok, we can't do anything interesting. Just stuff the whole thing into a
597    // register and hope for the best.
598    Bad.push_back(S);
599  }
600  
601  /// Incorporate loop-variant parts of S into this Formula, attempting to keep
602  /// all loop-invariant and loop-computable values in a single base register.
603  void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
604    SmallVector<const SCEV *, 4> Good;
605    SmallVector<const SCEV *, 4> Bad;
606    DoInitialMatch(S, L, Good, Bad, SE);
607    if (!Good.empty()) {
608      const SCEV *Sum = SE.getAddExpr(Good);
609      if (!Sum->isZero())
610        BaseRegs.push_back(Sum);
611      HasBaseReg = true;
612    }
613    if (!Bad.empty()) {
614      const SCEV *Sum = SE.getAddExpr(Bad);
615      if (!Sum->isZero())
616        BaseRegs.push_back(Sum);
617      HasBaseReg = true;
618    }
619    canonicalize(*L);
620  }
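// Illustrative sketch (hypothetical value %base, loop %L): for
// S = {%base,+,4}<%L>, where %base is defined outside the loop,
// DoInitialMatch classifies %base as "good" (it properly dominates the loop
// header) and the remaining {0,+,4}<%L> recurrence as "bad", so the initial
// formula carries two base registers:
//
//   reg(%base) + reg({0,+,4}<%L>)
//
// canonicalize(*L) then moves the loop-dependent recurrence into ScaledReg.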
621  
622  static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
623    return SCEVExprContains(S, [&L](const SCEV *S) {
624      return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
625    });
626  }
627  
628  /// Check whether or not this formula satisfies the canonical
629  /// representation.
630  /// \see Formula::BaseRegs.
631  bool Formula::isCanonical(const Loop &L) const {
632    if (!ScaledReg)
633      return BaseRegs.size() <= 1;
634  
635    if (Scale != 1)
636      return true;
637  
638    if (Scale == 1 && BaseRegs.empty())
639      return false;
640  
641    if (containsAddRecDependentOnLoop(ScaledReg, L))
642      return true;
643  
644    // If ScaledReg is not a recurrent expr, or it is one but its loop is not the
645    // current loop, while BaseRegs contains a recurrent expr reg related to the
646    // current loop, we want to swap that reg in BaseRegs with ScaledReg.
647    return none_of(BaseRegs, [&L](const SCEV *S) {
648      return containsAddRecDependentOnLoop(S, L);
649    });
650  }
651  
652  /// Helper method to morph a formula into its canonical representation.
653  /// \see Formula::BaseRegs.
654  /// Every formula having more than one base register must use the ScaledReg
655  /// field. Otherwise, we would have to do special cases everywhere in LSR
656  /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
657  /// On the other hand, 1*reg should be canonicalized into reg.
658  void Formula::canonicalize(const Loop &L) {
659    if (isCanonical(L))
660      return;
661  
662    if (BaseRegs.empty()) {
663      // No base reg? Use scale reg with scale = 1 as such.
664      assert(ScaledReg && "Expected 1*reg => reg");
665      assert(Scale == 1 && "Expected 1*reg => reg");
666      BaseRegs.push_back(ScaledReg);
667      Scale = 0;
668      ScaledReg = nullptr;
669      return;
670    }
671  
672    // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
673    if (!ScaledReg) {
674      ScaledReg = BaseRegs.pop_back_val();
675      Scale = 1;
676    }
677  
678    // If ScaledReg is an invariant with respect to L, find the reg from
679    // BaseRegs containing the recurrent expr related with Loop L. Swap the
680    // reg with ScaledReg.
681    if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
682      auto I = find_if(BaseRegs, [&L](const SCEV *S) {
683        return containsAddRecDependentOnLoop(S, L);
684      });
685      if (I != BaseRegs.end())
686        std::swap(ScaledReg, *I);
687    }
688    assert(isCanonical(L) && "Failed to canonicalize?");
689  }
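// Illustrative sketch (hypothetical register %inv and loop %L): a formula
// built as
//
//   reg(%inv) + reg({0,+,1}<%L>)     // two base regs, no ScaledReg
//
// is canonicalized into
//
//   reg(%inv) + 1*reg({0,+,1}<%L>)   // recurrence moved into ScaledReg
//
// and if the loop-invariant register had landed in ScaledReg instead, the
// final swap above exchanges it with the recurrence found in BaseRegs.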
690  
691  /// Get rid of the scale in the formula.
692  /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
693  /// \return true if it was possible to get rid of the scale, false otherwise.
694  /// \note After this operation the formula may not be in the canonical form.
695  bool Formula::unscale() {
696    if (Scale != 1)
697      return false;
698    Scale = 0;
699    BaseRegs.push_back(ScaledReg);
700    ScaledReg = nullptr;
701    return true;
702  }
703  
704  bool Formula::hasZeroEnd() const {
705    if (UnfoldedOffset || BaseOffset)
706      return false;
707    if (BaseRegs.size() != 1 || ScaledReg)
708      return false;
709    return true;
710  }
711  
712  /// Return the total number of register operands used by this formula. This does
713  /// not include register uses implied by non-constant addrec strides.
714  size_t Formula::getNumRegs() const {
715    return !!ScaledReg + BaseRegs.size();
716  }
717  
718  /// Return the type of this formula, if it has one, or null otherwise. This type
719  /// is meaningless except for the bit size.
720  Type *Formula::getType() const {
721    return !BaseRegs.empty() ? BaseRegs.front()->getType() :
722           ScaledReg ? ScaledReg->getType() :
723           BaseGV ? BaseGV->getType() :
724           nullptr;
725  }
726  
727  /// Delete the given base reg from the BaseRegs list.
728  void Formula::deleteBaseReg(const SCEV *&S) {
729    if (&S != &BaseRegs.back())
730      std::swap(S, BaseRegs.back());
731    BaseRegs.pop_back();
732  }
733  
734  /// Test if this formula references the given register.
735  bool Formula::referencesReg(const SCEV *S) const {
736    return S == ScaledReg || is_contained(BaseRegs, S);
737  }
738  
739  /// Test whether this formula uses registers which are used by uses other than
740  /// the use with the given index.
741  bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
742                                           const RegUseTracker &RegUses) const {
743    if (ScaledReg)
744      if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
745        return true;
746    for (const SCEV *BaseReg : BaseRegs)
747      if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
748        return true;
749    return false;
750  }
751  
752  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
753  void Formula::print(raw_ostream &OS) const {
754    bool First = true;
755    if (BaseGV) {
756      if (!First) OS << " + "; else First = false;
757      BaseGV->printAsOperand(OS, /*PrintType=*/false);
758    }
759    if (BaseOffset.isNonZero()) {
760      if (!First) OS << " + "; else First = false;
761      OS << BaseOffset;
762    }
763    for (const SCEV *BaseReg : BaseRegs) {
764      if (!First) OS << " + "; else First = false;
765      OS << "reg(" << *BaseReg << ')';
766    }
767    if (HasBaseReg && BaseRegs.empty()) {
768      if (!First) OS << " + "; else First = false;
769      OS << "**error: HasBaseReg**";
770    } else if (!HasBaseReg && !BaseRegs.empty()) {
771      if (!First) OS << " + "; else First = false;
772      OS << "**error: !HasBaseReg**";
773    }
774    if (Scale != 0) {
775      if (!First) OS << " + "; else First = false;
776      OS << Scale << "*reg(";
777      if (ScaledReg)
778        OS << *ScaledReg;
779      else
780        OS << "<unknown>";
781      OS << ')';
782    }
783    if (UnfoldedOffset.isNonZero()) {
784      if (!First) OS << " + ";
785      OS << "imm(" << UnfoldedOffset << ')';
786    }
787  }
788  
789  LLVM_DUMP_METHOD void Formula::dump() const {
790    print(errs()); errs() << '\n';
791  }
792  #endif
793  
794  /// Return true if the given addrec can be sign-extended without changing its
795  /// value.
796  static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
797    Type *WideTy =
798      IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
799    return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
800  }
801  
802  /// Return true if the given add can be sign-extended without changing its
803  /// value.
804  static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
805    Type *WideTy =
806      IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
807    return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
808  }
809  
810  /// Return true if the given mul can be sign-extended without changing its
811  /// value.
812  static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
813    Type *WideTy =
814      IntegerType::get(SE.getContext(),
815                       SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
816    return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
817  }
818  
819  /// Return an expression for LHS /s RHS, if it can be determined and if the
820  /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
821  /// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
822  /// the multiplication may overflow, which is useful when the result will be
823  /// used in a context where the most significant bits are ignored.
824  static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
825                                  ScalarEvolution &SE,
826                                  bool IgnoreSignificantBits = false) {
827    // Handle the trivial case, which works for any SCEV type.
828    if (LHS == RHS)
829      return SE.getConstant(LHS->getType(), 1);
830  
831    // Handle a few RHS special cases.
832    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
833    if (RC) {
834      const APInt &RA = RC->getAPInt();
835      // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
836      // some folding.
837      if (RA.isAllOnes()) {
838        if (LHS->getType()->isPointerTy())
839          return nullptr;
840        return SE.getMulExpr(LHS, RC);
841      }
842      // Handle x /s 1 as x.
843      if (RA == 1)
844        return LHS;
845    }
846  
847    // Check for a division of a constant by a constant.
848    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
849      if (!RC)
850        return nullptr;
851      const APInt &LA = C->getAPInt();
852      const APInt &RA = RC->getAPInt();
853      if (LA.srem(RA) != 0)
854        return nullptr;
855      return SE.getConstant(LA.sdiv(RA));
856    }
857  
858    // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
859    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
860      if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
861        const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
862                                        IgnoreSignificantBits);
863        if (!Step) return nullptr;
864        const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
865                                         IgnoreSignificantBits);
866        if (!Start) return nullptr;
867        // FlagNW is independent of the start value and step direction, and is
868        // preserved with smaller magnitude steps.
869        // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
870        return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
871      }
872      return nullptr;
873    }
874  
875    // Distribute the sdiv over add operands, if the add doesn't overflow.
876    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
877      if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
878        SmallVector<const SCEV *, 8> Ops;
879        for (const SCEV *S : Add->operands()) {
880          const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
881          if (!Op) return nullptr;
882          Ops.push_back(Op);
883        }
884        return SE.getAddExpr(Ops);
885      }
886      return nullptr;
887    }
888  
889    // Check for a multiply operand that we can pull RHS out of.
890    if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
891      if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
892        // Handle special case C1*X*Y /s C2*X*Y.
893        if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
894          if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
895            const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
896            const SCEVConstant *RC =
897                dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
898            if (LC && RC) {
899              SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
900              SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
901              if (LOps == ROps)
902                return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
903            }
904          }
905        }
906  
907        SmallVector<const SCEV *, 4> Ops;
908        bool Found = false;
909        for (const SCEV *S : Mul->operands()) {
910          if (!Found)
911            if (const SCEV *Q = getExactSDiv(S, RHS, SE,
912                                             IgnoreSignificantBits)) {
913              S = Q;
914              Found = true;
915            }
916          Ops.push_back(S);
917        }
918        return Found ? SE.getMulExpr(Ops) : nullptr;
919      }
920      return nullptr;
921    }
922  
923    // Otherwise we don't know.
924    return nullptr;
925  }
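// Illustrative sketch (hypothetical loop %L): the division distributes over
// the parts of an expression when the remainder is provably zero, e.g.
//
//   getExactSDiv({8,+,4}<%L>, 4, SE)  ==>  {2,+,1}<%L>
//   getExactSDiv(6, 4, SE)            ==>  nullptr  (remainder is not zero)
//   getExactSDiv(X, X, SE)            ==>  1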
926  
927  /// If S involves the addition of a constant integer value, return that integer
928  /// value, and mutate S to point to a new SCEV with that value excluded.
929  static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
930    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
931      if (C->getAPInt().getSignificantBits() <= 64) {
932        S = SE.getConstant(C->getType(), 0);
933        return Immediate::getFixed(C->getValue()->getSExtValue());
934      }
935    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
936      SmallVector<const SCEV *, 8> NewOps(Add->operands());
937      Immediate Result = ExtractImmediate(NewOps.front(), SE);
938      if (Result.isNonZero())
939        S = SE.getAddExpr(NewOps);
940      return Result;
941    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
942      SmallVector<const SCEV *, 8> NewOps(AR->operands());
943      Immediate Result = ExtractImmediate(NewOps.front(), SE);
944      if (Result.isNonZero())
945        S = SE.getAddRecExpr(NewOps, AR->getLoop(),
946                             // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
947                             SCEV::FlagAnyWrap);
948      return Result;
949    } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
950      if (EnableVScaleImmediates && M->getNumOperands() == 2) {
951        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
952          if (isa<SCEVVScale>(M->getOperand(1))) {
953            S = SE.getConstant(M->getType(), 0);
954            return Immediate::getScalable(C->getValue()->getSExtValue());
955          }
956      }
957    }
958    return Immediate::getZero();
959  }
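// Illustrative sketch (hypothetical value %p): for S = (42 + %p) this returns
// Immediate::getFixed(42) and rewrites S to %p; for S = (8 * vscale), with
// -lsr-enable-vscale-immediates, it returns Immediate::getScalable(8) and
// rewrites S to 0. Expressions with no extractable constant return
// Immediate::getZero() and leave S untouched.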
960  
961  /// If S involves the addition of a GlobalValue address, return that symbol, and
962  /// mutate S to point to a new SCEV with that value excluded.
963  static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
964    if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
965      if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
966        S = SE.getConstant(GV->getType(), 0);
967        return GV;
968      }
969    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
970      SmallVector<const SCEV *, 8> NewOps(Add->operands());
971      GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
972      if (Result)
973        S = SE.getAddExpr(NewOps);
974      return Result;
975    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
976      SmallVector<const SCEV *, 8> NewOps(AR->operands());
977      GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
978      if (Result)
979        S = SE.getAddRecExpr(NewOps, AR->getLoop(),
980                             // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
981                             SCEV::FlagAnyWrap);
982      return Result;
983    }
984    return nullptr;
985  }
986  
987  /// Returns true if the specified instruction is using the specified value as an
988  /// address.
989  static bool isAddressUse(const TargetTransformInfo &TTI,
990                           Instruction *Inst, Value *OperandVal) {
991    bool isAddress = isa<LoadInst>(Inst);
992    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
993      if (SI->getPointerOperand() == OperandVal)
994        isAddress = true;
995    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
996      // Addressing modes can also be folded into prefetches and a variety
997      // of intrinsics.
998      switch (II->getIntrinsicID()) {
999      case Intrinsic::memset:
1000      case Intrinsic::prefetch:
1001      case Intrinsic::masked_load:
1002        if (II->getArgOperand(0) == OperandVal)
1003          isAddress = true;
1004        break;
1005      case Intrinsic::masked_store:
1006        if (II->getArgOperand(1) == OperandVal)
1007          isAddress = true;
1008        break;
1009      case Intrinsic::memmove:
1010      case Intrinsic::memcpy:
1011        if (II->getArgOperand(0) == OperandVal ||
1012            II->getArgOperand(1) == OperandVal)
1013          isAddress = true;
1014        break;
1015      default: {
1016        MemIntrinsicInfo IntrInfo;
1017        if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1018          if (IntrInfo.PtrVal == OperandVal)
1019            isAddress = true;
1020        }
1021      }
1022      }
1023    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1024      if (RMW->getPointerOperand() == OperandVal)
1025        isAddress = true;
1026    } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1027      if (CmpX->getPointerOperand() == OperandVal)
1028        isAddress = true;
1029    }
1030    return isAddress;
1031  }
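// For example, given "store i32 %v, ptr %p", %p is an address use and %v is
// not; for memcpy/memmove both pointer arguments count, and for a masked
// store only the pointer argument (operand 1) does.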
1032  
1033  /// Return the type of the memory being accessed.
1034  static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1035                                   Instruction *Inst, Value *OperandVal) {
1036    MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1037  
1038    // First get the type of memory being accessed.
1039    if (Type *Ty = Inst->getAccessType())
1040      AccessTy.MemTy = Ty;
1041  
1042    // Then get the pointer address space.
1043    if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1044      AccessTy.AddrSpace = SI->getPointerAddressSpace();
1045    } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1046      AccessTy.AddrSpace = LI->getPointerAddressSpace();
1047    } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1048      AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1049    } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1050      AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1051    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1052      switch (II->getIntrinsicID()) {
1053      case Intrinsic::prefetch:
1054      case Intrinsic::memset:
1055        AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1056        AccessTy.MemTy = OperandVal->getType();
1057        break;
1058      case Intrinsic::memmove:
1059      case Intrinsic::memcpy:
1060        AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1061        AccessTy.MemTy = OperandVal->getType();
1062        break;
1063      case Intrinsic::masked_load:
1064        AccessTy.AddrSpace =
1065            II->getArgOperand(0)->getType()->getPointerAddressSpace();
1066        break;
1067      case Intrinsic::masked_store:
1068        AccessTy.AddrSpace =
1069            II->getArgOperand(1)->getType()->getPointerAddressSpace();
1070        break;
1071      default: {
1072        MemIntrinsicInfo IntrInfo;
1073        if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1074          AccessTy.AddrSpace
1075            = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1076        }
1077  
1078        break;
1079      }
1080      }
1081    }
1082  
1083    return AccessTy;
1084  }
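// For example, "load i32, ptr addrspace(1) %p" yields MemTy == i32 and
// AddrSpace == 1, while a target intrinsic with no recognizable memory
// operand keeps the unknown (void) type and address space defaults.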
1085  
1086  /// Return true if this AddRec is already a phi in its loop.
1087  static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1088    for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1089      if (SE.isSCEVable(PN.getType()) &&
1090          (SE.getEffectiveSCEVType(PN.getType()) ==
1091           SE.getEffectiveSCEVType(AR->getType())) &&
1092          SE.getSCEV(&PN) == AR)
1093        return true;
1094    }
1095    return false;
1096  }
1097  
1098  /// Check if expanding this expression is likely to incur significant cost. This
1099  /// is tricky because SCEV doesn't track which expressions are actually computed
1100  /// by the current IR.
1101  ///
1102  /// We currently allow expansion of IV increments that involve adds,
1103  /// multiplication by constants, and AddRecs from existing phis.
1104  ///
1105  /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1106  /// obvious multiple of the UDivExpr.
1107  static bool isHighCostExpansion(const SCEV *S,
1108                                  SmallPtrSetImpl<const SCEV*> &Processed,
1109                                  ScalarEvolution &SE) {
1110    // Zero/One operand expressions
1111    switch (S->getSCEVType()) {
1112    case scUnknown:
1113    case scConstant:
1114    case scVScale:
1115      return false;
1116    case scTruncate:
1117      return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1118                                 Processed, SE);
1119    case scZeroExtend:
1120      return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1121                                 Processed, SE);
1122    case scSignExtend:
1123      return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1124                                 Processed, SE);
1125    default:
1126      break;
1127    }
1128  
1129    if (!Processed.insert(S).second)
1130      return false;
1131  
1132    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1133      for (const SCEV *S : Add->operands()) {
1134        if (isHighCostExpansion(S, Processed, SE))
1135          return true;
1136      }
1137      return false;
1138    }
1139  
1140    if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1141      if (Mul->getNumOperands() == 2) {
1142        // Multiplication by a constant is ok
1143        if (isa<SCEVConstant>(Mul->getOperand(0)))
1144          return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1145  
1146        // If we have the value of one operand, check if an existing
1147        // multiplication already generates this expression.
1148        if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1149          Value *UVal = U->getValue();
1150          for (User *UR : UVal->users()) {
1151            // If U is a constant, it may be used by a ConstantExpr.
1152            Instruction *UI = dyn_cast<Instruction>(UR);
1153            if (UI && UI->getOpcode() == Instruction::Mul &&
1154                SE.isSCEVable(UI->getType())) {
1155              return SE.getSCEV(UI) == Mul;
1156            }
1157          }
1158        }
1159      }
1160    }
1161  
1162    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1163      if (isExistingPhi(AR, SE))
1164        return false;
1165    }
1166  
1167    // For now, consider any other type of expression (div/mul/min/max) high cost.
1168    return true;
1169  }
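// For example, truncations/extensions and adds of cheap operands are cheap,
// multiplication by a constant is cheap when its other operand is, and an
// addrec is cheap when it is already an existing phi in its loop; anything
// not handled above (e.g. a udiv) is conservatively treated as high cost.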
1170  
1171  namespace {
1172  
1173  class LSRUse;
1174  
1175  } // end anonymous namespace
1176  
1177  /// Check if the addressing mode defined by \p F is completely
1178  /// folded in \p LU at isel time.
1179  /// This includes address-mode folding and special icmp tricks.
1180  /// This function returns true if \p LU can accommodate what \p F
1181  /// defines and up to 1 base + 1 scaled + offset.
1182  /// In other words, if \p F has several base registers, this function may
1183  /// still return true. Therefore, users still need to account for
1184  /// additional base registers and/or unfolded offsets to derive an
1185  /// accurate cost model.
1186  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1187                                   const LSRUse &LU, const Formula &F);
1188  
1189  // Get the cost of the scaling factor used in F for LU.
1190  static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1191                                              const LSRUse &LU, const Formula &F,
1192                                              const Loop &L);
1193  
1194  namespace {
1195  
1196  /// This class is used to measure and compare candidate formulae.
1197  class Cost {
1198    const Loop *L = nullptr;
1199    ScalarEvolution *SE = nullptr;
1200    const TargetTransformInfo *TTI = nullptr;
1201    TargetTransformInfo::LSRCost C;
1202    TTI::AddressingModeKind AMK = TTI::AMK_None;
1203  
1204  public:
1205    Cost() = delete;
1206    Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1207         TTI::AddressingModeKind AMK) :
1208      L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1209      C.Insns = 0;
1210      C.NumRegs = 0;
1211      C.AddRecCost = 0;
1212      C.NumIVMuls = 0;
1213      C.NumBaseAdds = 0;
1214      C.ImmCost = 0;
1215      C.SetupCost = 0;
1216      C.ScaleCost = 0;
1217    }
1218  
1219    bool isLess(const Cost &Other) const;
1220  
1221    void Lose();
1222  
1223  #ifndef NDEBUG
1224    // Once any of the metrics loses, they must all remain losers.
1225    bool isValid() {
1226      return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1227               | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1228        || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1229             & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1230    }
1231  #endif
1232  
1233    bool isLoser() {
1234      assert(isValid() && "invalid cost");
1235      return C.NumRegs == ~0u;
1236    }
1237  
1238    void RateFormula(const Formula &F,
1239                     SmallPtrSetImpl<const SCEV *> &Regs,
1240                     const DenseSet<const SCEV *> &VisitedRegs,
1241                     const LSRUse &LU,
1242                     SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1243  
1244    void print(raw_ostream &OS) const;
1245    void dump() const;
1246  
1247  private:
1248    void RateRegister(const Formula &F, const SCEV *Reg,
1249                      SmallPtrSetImpl<const SCEV *> &Regs);
1250    void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1251                             SmallPtrSetImpl<const SCEV *> &Regs,
1252                             SmallPtrSetImpl<const SCEV *> *LoserRegs);
1253  };
1254  
1255  /// An operand value in an instruction which is to be replaced with some
1256  /// equivalent, possibly strength-reduced, replacement.
1257  struct LSRFixup {
1258    /// The instruction which will be updated.
1259    Instruction *UserInst = nullptr;
1260  
1261    /// The operand of the instruction which will be replaced. The operand may be
1262    /// used more than once; every instance will be replaced.
1263    Value *OperandValToReplace = nullptr;
1264  
1265    /// If this user is to use the post-incremented value of an induction
1266    /// variable, this set is non-empty and holds the loops associated with the
1267    /// induction variable.
1268    PostIncLoopSet PostIncLoops;
1269  
1270    /// A constant offset to be added to the LSRUse expression.  This allows
1271    /// multiple fixups to share the same LSRUse with different offsets, for
1272    /// example in an unrolled loop.
1273    Immediate Offset = Immediate::getZero();
1274  
1275    LSRFixup() = default;
1276  
1277    bool isUseFullyOutsideLoop(const Loop *L) const;
1278  
1279    void print(raw_ostream &OS) const;
1280    void dump() const;
1281  };
1282  
1283  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1284  /// SmallVectors of const SCEV*.
1285  struct UniquifierDenseMapInfo {
1286    static SmallVector<const SCEV *, 4> getEmptyKey() {
1287      SmallVector<const SCEV *, 4>  V;
1288      V.push_back(reinterpret_cast<const SCEV *>(-1));
1289      return V;
1290    }
1291  
1292    static SmallVector<const SCEV *, 4> getTombstoneKey() {
1293      SmallVector<const SCEV *, 4> V;
1294      V.push_back(reinterpret_cast<const SCEV *>(-2));
1295      return V;
1296    }
1297  
1298    static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1299      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1300    }
1301  
1302    static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1303                        const SmallVector<const SCEV *, 4> &RHS) {
1304      return LHS == RHS;
1305    }
1306  };
1307  
1308  /// This class holds the state that LSR keeps for each use in IVUsers, as well
1309  /// as uses invented by LSR itself. It includes information about what kinds of
1310  /// things can be folded into the user, information about the user itself, and
1311  /// information about how the use may be satisfied.  TODO: Represent multiple
1312  /// users of the same expression in common?
1313  class LSRUse {
1314    DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1315  
1316  public:
1317    /// An enum for a kind of use, indicating what types of scaled and immediate
1318    /// operands it might support.
1319    enum KindType {
1320      Basic,   ///< A normal use, with no folding.
1321      Special, ///< A special case of basic, allowing -1 scales.
1322      Address, ///< An address use; folding according to TargetLowering
1323      ICmpZero ///< An equality icmp with both operands folded into one.
1324      // TODO: Add a generic icmp too?
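    // For illustration (the value names are made up): an Address use might be
    // the pointer operand of "load i32, ptr %p" inside the loop, while an
    // ICmpZero use is an exit test such as "icmp eq i64 %iv.next, 0" where
    // everything has been folded into a single expression compared to zero.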
1325    };
1326  
1327    using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1328  
1329    KindType Kind;
1330    MemAccessTy AccessTy;
1331  
1332    /// The list of operands which are to be replaced.
1333    SmallVector<LSRFixup, 8> Fixups;
1334  
1335    /// Keep track of the min and max offsets of the fixups.
1336    Immediate MinOffset = Immediate::getFixedMax();
1337    Immediate MaxOffset = Immediate::getFixedMin();
1338  
1339    /// This records whether all of the fixups using this LSRUse are outside of
1340    /// the loop, in which case some special-case heuristics may be used.
1341    bool AllFixupsOutsideLoop = true;
1342  
1343    /// RigidFormula is set to true to guarantee that this use will be associated
1344    /// with a single formula--the one that initially matched. Some SCEV
1345    /// expressions cannot be expanded. This allows LSR to consider the registers
1346    /// used by those expressions without the need to expand them later after
1347    /// changing the formula.
1348    bool RigidFormula = false;
1349  
1350    /// This records the widest use type for any fixup using this
1351    /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1352    /// fixup widths to be equivalent, because the narrower one may be relying on
1353    /// the implicit truncation to truncate away bogus bits.
1354    Type *WidestFixupType = nullptr;
1355  
1356    /// A list of ways to build a value that can satisfy this user.  After the
1357    /// list is populated, one of these is selected heuristically and used to
1358    /// formulate a replacement for OperandValToReplace in UserInst.
1359    SmallVector<Formula, 12> Formulae;
1360  
1361    /// The set of register candidates used by all formulae in this LSRUse.
1362    SmallPtrSet<const SCEV *, 4> Regs;
1363  
1364    LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1365  
1366    LSRFixup &getNewFixup() {
1367      Fixups.push_back(LSRFixup());
1368      return Fixups.back();
1369    }
1370  
1371    void pushFixup(LSRFixup &f) {
1372      Fixups.push_back(f);
1373      if (Immediate::isKnownGT(f.Offset, MaxOffset))
1374        MaxOffset = f.Offset;
1375      if (Immediate::isKnownLT(f.Offset, MinOffset))
1376        MinOffset = f.Offset;
1377    }
1378  
1379    bool HasFormulaWithSameRegs(const Formula &F) const;
1380    float getNotSelectedProbability(const SCEV *Reg) const;
1381    bool InsertFormula(const Formula &F, const Loop &L);
1382    void DeleteFormula(Formula &F);
1383    void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1384  
1385    void print(raw_ostream &OS) const;
1386    void dump() const;
1387  };
1388  
1389  } // end anonymous namespace
1390  
1391  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1392                                   LSRUse::KindType Kind, MemAccessTy AccessTy,
1393                                   GlobalValue *BaseGV, Immediate BaseOffset,
1394                                   bool HasBaseReg, int64_t Scale,
1395                                   Instruction *Fixup = nullptr);
1396  
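/// Estimate how many instructions it would take to materialize \p Reg in the
/// loop preheader, recursing into the expression's operands up to \p Depth
/// levels. As a rough illustration (the register names are made up), a
/// register such as {(%base + 16),+,%step} costs 2 when the depth limit is
/// not hit: one for the SCEVUnknown %base and one for the constant 16 in the
/// addrec's start expression.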
1397  static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1398    if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1399      return 1;
1400    if (Depth == 0)
1401      return 0;
1402    if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1403      return getSetupCost(S->getStart(), Depth - 1);
1404    if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1405      return getSetupCost(S->getOperand(), Depth - 1);
1406    if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1407      return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1408                             [&](unsigned i, const SCEV *Reg) {
1409                               return i + getSetupCost(Reg, Depth - 1);
1410                             });
1411    if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1412      return getSetupCost(S->getLHS(), Depth - 1) +
1413             getSetupCost(S->getRHS(), Depth - 1);
1414    return 0;
1415  }
1416  
1417  /// Tally up interesting quantities from the given register.
1418  void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1419                          SmallPtrSetImpl<const SCEV *> &Regs) {
1420    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1421      // If this is an addrec for another loop, it should be an invariant
1422      // with respect to L since L is the innermost loop (at least
1423      // for now LSR only handles innermost loops).
1424      if (AR->getLoop() != L) {
1425        // If the AddRec exists, consider its register free and leave it alone.
1426        if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1427          return;
1428  
1429        // It is bad to allow LSR for the current loop to add induction
1430        // variables for its sibling loops.
1431        if (!AR->getLoop()->contains(L)) {
1432          Lose();
1433          return;
1434        }
1435  
1436        // Otherwise, it will be an invariant with respect to Loop L.
1437        ++C.NumRegs;
1438        return;
1439      }
1440  
1441      unsigned LoopCost = 1;
1442      if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1443          TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1444  
1445        // If the step size matches the base offset, we could use pre-indexed
1446        // addressing.
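      // For example, a target with a pre-indexed form (such as AArch64's
      // "ldr x0, [x1, #16]!") folds the address update into the memory access,
      // so the addrec is modeled as costing nothing extra here.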
1447        if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1448          if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1449            if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1450              LoopCost = 0;
1451        } else if (AMK == TTI::AMK_PostIndexed) {
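        // Similarly, a post-indexed form (e.g., AArch64's "ldr x0, [x1], #16")
        // can fold the increment into the access when the step is a constant
        // and the start is loop-invariant but not itself a constant.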
1452          const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1453          if (isa<SCEVConstant>(LoopStep)) {
1454            const SCEV *LoopStart = AR->getStart();
1455            if (!isa<SCEVConstant>(LoopStart) &&
1456                SE->isLoopInvariant(LoopStart, L))
1457              LoopCost = 0;
1458          }
1459        }
1460      }
1461      C.AddRecCost += LoopCost;
1462  
1463      // Add the step value register, if it needs one.
1464      // TODO: The non-affine case isn't precisely modeled here.
1465      if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1466        if (!Regs.count(AR->getOperand(1))) {
1467          RateRegister(F, AR->getOperand(1), Regs);
1468          if (isLoser())
1469            return;
1470        }
1471      }
1472    }
1473    ++C.NumRegs;
1474  
1475    // Rough heuristic; favor registers which don't require extra setup
1476    // instructions in the preheader.
1477    C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1478    // Ensure we don't, even with the recursion limit, produce invalid costs.
1479    C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1480  
1481    C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1482                 SE->hasComputableLoopEvolution(Reg, L);
1483  }
1484  
1485  /// Record this register in the set. If we haven't seen it before, rate
1486  /// it. Optional LoserRegs provides a way to declare any formula that refers to
1487  /// one of those regs an instant loser.
1488  void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1489                                 SmallPtrSetImpl<const SCEV *> &Regs,
1490                                 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1491    if (LoserRegs && LoserRegs->count(Reg)) {
1492      Lose();
1493      return;
1494    }
1495    if (Regs.insert(Reg).second) {
1496      RateRegister(F, Reg, Regs);
1497      if (LoserRegs && isLoser())
1498        LoserRegs->insert(Reg);
1499    }
1500  }
1501  
1502  void Cost::RateFormula(const Formula &F,
1503                         SmallPtrSetImpl<const SCEV *> &Regs,
1504                         const DenseSet<const SCEV *> &VisitedRegs,
1505                         const LSRUse &LU,
1506                         SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1507    if (isLoser())
1508      return;
1509    assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1510    // Tally up the registers.
1511    unsigned PrevAddRecCost = C.AddRecCost;
1512    unsigned PrevNumRegs = C.NumRegs;
1513    unsigned PrevNumBaseAdds = C.NumBaseAdds;
1514    if (const SCEV *ScaledReg = F.ScaledReg) {
1515      if (VisitedRegs.count(ScaledReg)) {
1516        Lose();
1517        return;
1518      }
1519      RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1520      if (isLoser())
1521        return;
1522    }
1523    for (const SCEV *BaseReg : F.BaseRegs) {
1524      if (VisitedRegs.count(BaseReg)) {
1525        Lose();
1526        return;
1527      }
1528      RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1529      if (isLoser())
1530        return;
1531    }
1532  
1533    // Determine how many (unfolded) adds we'll need inside the loop.
1534    size_t NumBaseParts = F.getNumRegs();
1535    if (NumBaseParts > 1)
1536      // Do not count the base and a possible second register if the target
1537      // allows folding 2 registers.
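      // For instance (illustrative numbers only), a formula with three base
      // registers plus a scaled register that the addressing mode folds has
      // four parts and needs 4 - (1 + 1) = 2 unfolded adds.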
1538      C.NumBaseAdds +=
1539          NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1540    C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1541  
1542    // Accumulate non-free scaling amounts.
1543    C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1544  
1545    // Tally up the non-zero immediates.
1546    for (const LSRFixup &Fixup : LU.Fixups) {
1547      if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1548        Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1549        if (F.BaseGV)
1550          C.ImmCost += 64; // Handle symbolic values conservatively.
1551                           // TODO: This should probably be the pointer size.
1552        else if (Offset.isNonZero())
1553          C.ImmCost +=
1554              APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1555  
1556        // Check with target if this offset with this instruction is
1557        // specifically not supported.
1558        if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1559            !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1560                                  Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1561          C.NumBaseAdds++;
1562      } else {
1563        // Incompatible immediate type; increase the cost to avoid using it.
1564        C.ImmCost += 2048;
1565      }
1566    }
1567  
1568    // If we don't count instruction cost, exit here.
1569    if (!InsnsCost) {
1570      assert(isValid() && "invalid cost");
1571      return;
1572    }
1573  
1574    // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1575    // an additional instruction (at least a fill).
1576    // TODO: Do we need to distinguish between register classes?
1577    unsigned TTIRegNum = TTI->getNumberOfRegisters(
1578                         TTI->getRegisterClassForType(false, F.getType())) - 1;
1579    if (C.NumRegs > TTIRegNum) {
1580      // The cost already exceeded TTIRegNum, so only the newly added registers
1581      // can add new instructions.
1582      if (PrevNumRegs > TTIRegNum)
1583        C.Insns += (C.NumRegs - PrevNumRegs);
1584      else
1585        C.Insns += (C.NumRegs - TTIRegNum);
1586    }
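  // For example (purely illustrative numbers), with TTIRegNum == 15: if
  // PrevNumRegs was 14 and NumRegs is now 17, this charges 17 - 15 = 2 extra
  // instructions, whereas if PrevNumRegs was already 17 and NumRegs is 18, it
  // charges only 18 - 17 = 1.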
1587  
1588    // If the ICmpZero formula does not end with 0, it cannot be replaced by
1589    // just an add or sub; we'll need to compare the final result of the AddRec.
1590    // That means we'll need an additional instruction. But if the target can
1591    // macro-fuse a compare with a branch, don't count this extra instruction.
1592    // For -10 + {0, +, 1}:
1593    // i = i + 1;
1594    // cmp i, 10
1595    //
1596    // For {-10, +, 1}:
1597    // i = i + 1;
1598    if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1599        !TTI->canMacroFuseCmp())
1600      C.Insns++;
1601    // Each new AddRec adds 1 instruction to the calculation.
1602    C.Insns += (C.AddRecCost - PrevAddRecCost);
1603  
1604    // BaseAdds adds instructions for unfolded registers.
1605    if (LU.Kind != LSRUse::ICmpZero)
1606      C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1607    assert(isValid() && "invalid cost");
1608  }
1609  
1610  /// Set this cost to a losing value.
1611  void Cost::Lose() {
1612    C.Insns = std::numeric_limits<unsigned>::max();
1613    C.NumRegs = std::numeric_limits<unsigned>::max();
1614    C.AddRecCost = std::numeric_limits<unsigned>::max();
1615    C.NumIVMuls = std::numeric_limits<unsigned>::max();
1616    C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1617    C.ImmCost = std::numeric_limits<unsigned>::max();
1618    C.SetupCost = std::numeric_limits<unsigned>::max();
1619    C.ScaleCost = std::numeric_limits<unsigned>::max();
1620  }
1621  
1622  /// Choose the lower cost.
1623  bool Cost::isLess(const Cost &Other) const {
1624    if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1625        C.Insns != Other.C.Insns)
1626      return C.Insns < Other.C.Insns;
1627    return TTI->isLSRCostLess(C, Other.C);
1628  }
1629  
1630  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1631  void Cost::print(raw_ostream &OS) const {
1632    if (InsnsCost)
1633      OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1634    OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1635    if (C.AddRecCost != 0)
1636      OS << ", with addrec cost " << C.AddRecCost;
1637    if (C.NumIVMuls != 0)
1638      OS << ", plus " << C.NumIVMuls << " IV mul"
1639         << (C.NumIVMuls == 1 ? "" : "s");
1640    if (C.NumBaseAdds != 0)
1641      OS << ", plus " << C.NumBaseAdds << " base add"
1642         << (C.NumBaseAdds == 1 ? "" : "s");
1643    if (C.ScaleCost != 0)
1644      OS << ", plus " << C.ScaleCost << " scale cost";
1645    if (C.ImmCost != 0)
1646      OS << ", plus " << C.ImmCost << " imm cost";
1647    if (C.SetupCost != 0)
1648      OS << ", plus " << C.SetupCost << " setup cost";
1649  }
1650  
1651  LLVM_DUMP_METHOD void Cost::dump() const {
1652    print(errs()); errs() << '\n';
1653  }
1654  #endif
1655  
1656  /// Test whether this fixup always uses its value outside of the given loop.
1657  bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1658    // PHI nodes use their value in their incoming blocks.
1659    if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1660      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1661        if (PN->getIncomingValue(i) == OperandValToReplace &&
1662            L->contains(PN->getIncomingBlock(i)))
1663          return false;
1664      return true;
1665    }
1666  
1667    return !L->contains(UserInst);
1668  }
1669  
1670  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1671  void LSRFixup::print(raw_ostream &OS) const {
1672    OS << "UserInst=";
1673    // Store is common and interesting enough to be worth special-casing.
1674    if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1675      OS << "store ";
1676      Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1677    } else if (UserInst->getType()->isVoidTy())
1678      OS << UserInst->getOpcodeName();
1679    else
1680      UserInst->printAsOperand(OS, /*PrintType=*/false);
1681  
1682    OS << ", OperandValToReplace=";
1683    OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1684  
1685    for (const Loop *PIL : PostIncLoops) {
1686      OS << ", PostIncLoop=";
1687      PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1688    }
1689  
1690    if (Offset.isNonZero())
1691      OS << ", Offset=" << Offset;
1692  }
1693  
1694  LLVM_DUMP_METHOD void LSRFixup::dump() const {
1695    print(errs()); errs() << '\n';
1696  }
1697  #endif
1698  
1699  /// Test whether this use has a formula with the same registers as the given
1700  /// formula.
1701  bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1702    SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1703    if (F.ScaledReg) Key.push_back(F.ScaledReg);
1704    // Unstable sort by host order ok, because this is only used for uniquifying.
1705    llvm::sort(Key);
1706    return Uniquifier.count(Key);
1707  }
1708  
1709  /// Return the probability of selecting a formula that does not reference Reg.
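/// For example, if 2 of 8 formulae for this use reference Reg, the result is
/// (8 - 2) / 8 = 0.75.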
1710  float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1711    unsigned FNum = 0;
1712    for (const Formula &F : Formulae)
1713      if (F.referencesReg(Reg))
1714        FNum++;
1715    return ((float)(Formulae.size() - FNum)) / Formulae.size();
1716  }
1717  
1718  /// If the given formula has not yet been inserted, add it to the list, and
1719  /// return true. Return false otherwise.  The formula must be in canonical form.
1720  bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1721    assert(F.isCanonical(L) && "Invalid canonical representation");
1722  
1723    if (!Formulae.empty() && RigidFormula)
1724      return false;
1725  
1726    SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1727    if (F.ScaledReg) Key.push_back(F.ScaledReg);
1728    // Unstable sort by host order ok, because this is only used for uniquifying.
1729    llvm::sort(Key);
1730  
1731    if (!Uniquifier.insert(Key).second)
1732      return false;
1733  
1734    // Using a register to hold the value of 0 is not profitable.
1735    assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1736           "Zero allocated in a scaled register!");
1737  #ifndef NDEBUG
1738    for (const SCEV *BaseReg : F.BaseRegs)
1739      assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1740  #endif
1741  
1742    // Add the formula to the list.
1743    Formulae.push_back(F);
1744  
1745    // Record registers now being used by this use.
1746    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1747    if (F.ScaledReg)
1748      Regs.insert(F.ScaledReg);
1749  
1750    return true;
1751  }
1752  
1753  /// Remove the given formula from this use's list.
1754  void LSRUse::DeleteFormula(Formula &F) {
1755    if (&F != &Formulae.back())
1756      std::swap(F, Formulae.back());
1757    Formulae.pop_back();
1758  }
1759  
1760  /// Recompute the Regs field, and update RegUses.
1761  void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1762    // Now that we've filtered out some formulae, recompute the Regs set.
1763    SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1764    Regs.clear();
1765    for (const Formula &F : Formulae) {
1766      if (F.ScaledReg) Regs.insert(F.ScaledReg);
1767      Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1768    }
1769  
1770    // Update the RegTracker.
1771    for (const SCEV *S : OldRegs)
1772      if (!Regs.count(S))
1773        RegUses.dropRegister(S, LUIdx);
1774  }
1775  
1776  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777  void LSRUse::print(raw_ostream &OS) const {
1778    OS << "LSR Use: Kind=";
1779    switch (Kind) {
1780    case Basic:    OS << "Basic"; break;
1781    case Special:  OS << "Special"; break;
1782    case ICmpZero: OS << "ICmpZero"; break;
1783    case Address:
1784      OS << "Address of ";
1785      if (AccessTy.MemTy->isPointerTy())
1786        OS << "pointer"; // the full pointer type could be really verbose
1787      else {
1788        OS << *AccessTy.MemTy;
1789      }
1790  
1791      OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1792    }
1793  
1794    OS << ", Offsets={";
1795    bool NeedComma = false;
1796    for (const LSRFixup &Fixup : Fixups) {
1797      if (NeedComma) OS << ',';
1798      OS << Fixup.Offset;
1799      NeedComma = true;
1800    }
1801    OS << '}';
1802  
1803    if (AllFixupsOutsideLoop)
1804      OS << ", all-fixups-outside-loop";
1805  
1806    if (WidestFixupType)
1807      OS << ", widest fixup type: " << *WidestFixupType;
1808  }
1809  
1810  LLVM_DUMP_METHOD void LSRUse::dump() const {
1811    print(errs()); errs() << '\n';
1812  }
1813  #endif
1814  
1815  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1816                                   LSRUse::KindType Kind, MemAccessTy AccessTy,
1817                                   GlobalValue *BaseGV, Immediate BaseOffset,
1818                                   bool HasBaseReg, int64_t Scale,
1819                                   Instruction *Fixup /* = nullptr */) {
1820    switch (Kind) {
1821    case LSRUse::Address: {
1822      int64_t FixedOffset =
1823          BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1824      int64_t ScalableOffset =
1825          BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1826      return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1827                                       HasBaseReg, Scale, AccessTy.AddrSpace,
1828                                       Fixup, ScalableOffset);
1829    }
1830    case LSRUse::ICmpZero:
1831      // There's not even a target hook for querying whether it would be legal to
1832      // fold a GV into an ICmp.
1833      if (BaseGV)
1834        return false;
1835  
1836      // ICmp only has two operands; don't allow more than two non-trivial parts.
1837      if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1838        return false;
1839  
1840      // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1841      // putting the scaled register in the other operand of the icmp.
1842      if (Scale != 0 && Scale != -1)
1843        return false;
1844  
1845      // If we have low-level target information, ask the target if it can fold an
1846      // integer immediate on an icmp.
1847      if (BaseOffset.isNonZero()) {
1848        // We don't have an interface to query whether the target supports
1849        // icmpzero against scalable quantities yet.
1850        if (BaseOffset.isScalable())
1851          return false;
1852  
1853        // We have one of:
1854        // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1855        // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1856        // Offs is the ICmp immediate.
1857        if (Scale == 0)
1858          // The cast does the right thing with
1859          // std::numeric_limits<int64_t>::min().
1860          BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1861        return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1862      }
1863  
1864      // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1865      return true;
1866  
1867    case LSRUse::Basic:
1868      // Only handle single-register values.
1869      return !BaseGV && Scale == 0 && BaseOffset.isZero();
1870  
1871    case LSRUse::Special:
1872      // Special case Basic to handle -1 scales.
1873      return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1874    }
1875  
1876    llvm_unreachable("Invalid LSRUse Kind!");
1877  }
1878  
1879  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1880                                   Immediate MinOffset, Immediate MaxOffset,
1881                                   LSRUse::KindType Kind, MemAccessTy AccessTy,
1882                                   GlobalValue *BaseGV, Immediate BaseOffset,
1883                                   bool HasBaseReg, int64_t Scale) {
1884    if (BaseOffset.isNonZero() &&
1885        (BaseOffset.isScalable() != MinOffset.isScalable() ||
1886         BaseOffset.isScalable() != MaxOffset.isScalable()))
1887      return false;
1888    // Check for overflow.
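  // For example, Base == INT64_MAX with Min == 1 wraps to a negative value;
  // the sign mismatch with Min is detected and the combination is rejected.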
1889    int64_t Base = BaseOffset.getKnownMinValue();
1890    int64_t Min = MinOffset.getKnownMinValue();
1891    int64_t Max = MaxOffset.getKnownMinValue();
1892    if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1893      return false;
1894    MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1895    if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1896      return false;
1897    MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1898  
1899    return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1900                                HasBaseReg, Scale) &&
1901           isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1902                                HasBaseReg, Scale);
1903  }
1904  
1905  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1906                                   Immediate MinOffset, Immediate MaxOffset,
1907                                   LSRUse::KindType Kind, MemAccessTy AccessTy,
1908                                   const Formula &F, const Loop &L) {
1909    // For the purpose of isAMCompletelyFolded either having a canonical formula
1910    // or a scale not equal to zero is correct.
1911    // Problems may arise from non-canonical formulae having a scale == 0.
1912    // Strictly speaking, it would be best to just rely on canonical formulae.
1913    // However, when we generate the scaled formulae, we first check that the
1914    // scaling factor is profitable before computing the actual ScaledReg, for
1915    // the sake of compile time.
1916    assert((F.isCanonical(L) || F.Scale != 0));
1917    return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1918                                F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1919  }
1920  
1921  /// Test whether we know how to expand the current formula.
1922  static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1923                         Immediate MaxOffset, LSRUse::KindType Kind,
1924                         MemAccessTy AccessTy, GlobalValue *BaseGV,
1925                         Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1926    // We know how to expand completely foldable formulae.
1927    return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1928                                BaseOffset, HasBaseReg, Scale) ||
1929           // Or formulae that use a base register produced by a sum of base
1930           // registers.
1931           (Scale == 1 &&
1932            isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1933                                 BaseGV, BaseOffset, true, 0));
1934  }
1935  
1936  static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1937                         Immediate MaxOffset, LSRUse::KindType Kind,
1938                         MemAccessTy AccessTy, const Formula &F) {
1939    return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1940                      F.BaseOffset, F.HasBaseReg, F.Scale);
1941  }
1942  
1943  static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1944                                  Immediate Offset) {
1945    if (Offset.isScalable())
1946      return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1947  
1948    return TTI.isLegalAddImmediate(Offset.getFixedValue());
1949  }
1950  
1951  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1952                                   const LSRUse &LU, const Formula &F) {
1953    // Target may want to look at the user instructions.
1954    if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1955      for (const LSRFixup &Fixup : LU.Fixups)
1956        if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1957                                  (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1958                                  F.Scale, Fixup.UserInst))
1959          return false;
1960      return true;
1961    }
1962  
1963    return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1964                                LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1965                                F.Scale);
1966  }
1967  
1968  static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1969                                              const LSRUse &LU, const Formula &F,
1970                                              const Loop &L) {
1971    if (!F.Scale)
1972      return 0;
1973  
1974    // If the use is not completely folded in that instruction, we will have to
1975    // pay an extra cost only for scale != 1.
1976    if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1977                              LU.AccessTy, F, L))
1978      return F.Scale != 1;
1979  
1980    switch (LU.Kind) {
1981    case LSRUse::Address: {
1982      // Check the scaling factor cost with both the min and max offsets.
1983      int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1984      if (F.BaseOffset.isScalable()) {
1985        ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1986        ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1987      } else {
1988        FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1989        FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1990      }
1991      InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1992          LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1993          F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1994      InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1995          LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1996          F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1997  
1998      assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1999             "Legal addressing mode has an illegal cost!");
2000      return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2001    }
2002    case LSRUse::ICmpZero:
2003    case LSRUse::Basic:
2004    case LSRUse::Special:
2005      // The use is completely folded, i.e., everything is folded into the
2006      // instruction.
2007      return 0;
2008    }
2009  
2010    llvm_unreachable("Invalid LSRUse Kind!");
2011  }
2012  
2013  static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2014                               LSRUse::KindType Kind, MemAccessTy AccessTy,
2015                               GlobalValue *BaseGV, Immediate BaseOffset,
2016                               bool HasBaseReg) {
2017    // Fast-path: zero is always foldable.
2018    if (BaseOffset.isZero() && !BaseGV)
2019      return true;
2020  
2021    // Conservatively, create an address with an immediate and a
2022    // base and a scale.
2023    int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2024  
2025    // Canonicalize a scale of 1 to a base register if the formula doesn't
2026    // already have a base register.
2027    if (!HasBaseReg && Scale == 1) {
2028      Scale = 0;
2029      HasBaseReg = true;
2030    }
2031  
2032    // FIXME: Try with + without a scale? Maybe based on TTI?
2033    // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2034    // default for many architectures, not just AArch64 SVE. More investigation
2035    // needed later to determine if this should be used more widely than just
2036    // on scalable types.
2037    if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2038        AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2039      Scale = 0;
2040  
2041    return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2042                                HasBaseReg, Scale);
2043  }
2044  
2045  static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2046                               ScalarEvolution &SE, Immediate MinOffset,
2047                               Immediate MaxOffset, LSRUse::KindType Kind,
2048                               MemAccessTy AccessTy, const SCEV *S,
2049                               bool HasBaseReg) {
2050    // Fast-path: zero is always foldable.
2051    if (S->isZero()) return true;
2052  
2053    // Conservatively, create an address with an immediate and a
2054    // base and a scale.
2055    Immediate BaseOffset = ExtractImmediate(S, SE);
2056    GlobalValue *BaseGV = ExtractSymbol(S, SE);
2057  
2058    // If there's anything else involved, it's not foldable.
2059    if (!S->isZero()) return false;
2060  
2061    // Fast-path: zero is always foldable.
2062    if (BaseOffset.isZero() && !BaseGV)
2063      return true;
2064  
2065    if (BaseOffset.isScalable())
2066      return false;
2067  
2068    // Conservatively, create an address with an immediate and a
2069    // base and a scale.
2070    int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2071  
2072    return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2073                                BaseOffset, HasBaseReg, Scale);
2074  }
2075  
2076  namespace {
2077  
2078  /// An individual increment in a Chain of IV increments.  Relate an IV user to
2079  /// an expression that computes the IV it uses from the IV used by the previous
2080  /// link in the Chain.
2081  ///
2082  /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2083  /// original IVOperand. The head of the chain's IVOperand is only valid during
2084  /// chain collection, before LSR replaces IV users. During chain generation,
2085  /// IncExpr can be used to find the new IVOperand that computes the same
2086  /// expression.
2087  struct IVInc {
2088    Instruction *UserInst;
2089    Value* IVOperand;
2090    const SCEV *IncExpr;
2091  
2092    IVInc(Instruction *U, Value *O, const SCEV *E)
2093        : UserInst(U), IVOperand(O), IncExpr(E) {}
2094  };
2095  
2096  // The list of IV increments in program order.  We typically add the head of a
2097  // chain without finding subsequent links.
2098  struct IVChain {
2099    SmallVector<IVInc, 1> Incs;
2100    const SCEV *ExprBase = nullptr;
2101  
2102    IVChain() = default;
2103    IVChain(const IVInc &Head, const SCEV *Base)
2104        : Incs(1, Head), ExprBase(Base) {}
2105  
2106    using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2107  
2108    // Return the first increment in the chain.
2109    const_iterator begin() const {
2110      assert(!Incs.empty());
2111      return std::next(Incs.begin());
2112    }
2113    const_iterator end() const {
2114      return Incs.end();
2115    }
2116  
2117    // Returns true if this chain contains any increments beyond the head.
2118    bool hasIncs() const { return Incs.size() >= 2; }
2119  
2120    // Add an IVInc to the end of this chain.
2121    void add(const IVInc &X) { Incs.push_back(X); }
2122  
2123    // Returns the last UserInst in the chain.
2124    Instruction *tailUserInst() const { return Incs.back().UserInst; }
2125  
2126    // Returns true if IncExpr can be profitably added to this chain.
2127    bool isProfitableIncrement(const SCEV *OperExpr,
2128                               const SCEV *IncExpr,
2129                               ScalarEvolution&);
2130  };
2131  
2132  /// Helper for CollectChains to track multiple IV increment uses.  Distinguish
2133  /// between FarUsers that definitely cross IV increments and NearUsers that may
2134  /// be used between IV increments.
2135  struct ChainUsers {
2136    SmallPtrSet<Instruction*, 4> FarUsers;
2137    SmallPtrSet<Instruction*, 4> NearUsers;
2138  };
2139  
2140  /// This class holds state for the main loop strength reduction logic.
2141  class LSRInstance {
2142    IVUsers &IU;
2143    ScalarEvolution &SE;
2144    DominatorTree &DT;
2145    LoopInfo &LI;
2146    AssumptionCache &AC;
2147    TargetLibraryInfo &TLI;
2148    const TargetTransformInfo &TTI;
2149    Loop *const L;
2150    MemorySSAUpdater *MSSAU;
2151    TTI::AddressingModeKind AMK;
2152    mutable SCEVExpander Rewriter;
2153    bool Changed = false;
2154  
2155    /// This is the insert position at which the current loop's induction
2156    /// variable increment should be placed. In simple loops, this is the latch
2157    /// block's terminator. In more complicated cases, this is a position which will
2158    /// dominate all the in-loop post-increment users.
2159    Instruction *IVIncInsertPos = nullptr;
2160  
2161    /// Interesting factors between use strides.
2162    ///
2163    /// We explicitly use a SetVector which contains a SmallSet, instead of the
2164    /// default, a SmallDenseSet, because we need to use the full range of
2165    /// int64_ts, and there's currently no good way of doing that with
2166    /// SmallDenseSet.
2167    SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2168  
2169    /// The cost of the current SCEV; the best solution found by LSR will be
2170    /// dropped if the solution is not profitable.
2171    Cost BaselineCost;
2172  
2173    /// Interesting use types, to facilitate truncation reuse.
2174    SmallSetVector<Type *, 4> Types;
2175  
2176    /// The list of interesting uses.
2177    mutable SmallVector<LSRUse, 16> Uses;
2178  
2179    /// Track which uses use which register candidates.
2180    RegUseTracker RegUses;
2181  
2182    // Limit the number of chains to avoid quadratic behavior. We don't expect to
2183    // have more than a few IV increment chains in a loop. Missing a Chain falls
2184    // back to normal LSR behavior for those uses.
2185    static const unsigned MaxChains = 8;
2186  
2187    /// IV users can form a chain of IV increments.
2188    SmallVector<IVChain, MaxChains> IVChainVec;
2189  
2190    /// IV users that belong to profitable IVChains.
2191    SmallPtrSet<Use*, MaxChains> IVIncSet;
2192  
2193    /// Induction variables that were generated and inserted by the SCEV Expander.
2194    SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2195  
2196    void OptimizeShadowIV();
2197    bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2198    ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2199    void OptimizeLoopTermCond();
2200  
2201    void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2202                          SmallVectorImpl<ChainUsers> &ChainUsersVec);
2203    void FinalizeChain(IVChain &Chain);
2204    void CollectChains();
2205    void GenerateIVChain(const IVChain &Chain,
2206                         SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2207  
2208    void CollectInterestingTypesAndFactors();
2209    void CollectFixupsAndInitialFormulae();
2210  
2211    // Support for sharing of LSRUses between LSRFixups.
2212    using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2213    UseMapTy UseMap;
2214  
2215    bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2216                            LSRUse::KindType Kind, MemAccessTy AccessTy);
2217  
2218    std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2219                                        MemAccessTy AccessTy);
2220  
2221    void DeleteUse(LSRUse &LU, size_t LUIdx);
2222  
2223    LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2224  
2225    void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2226    void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2227    void CountRegisters(const Formula &F, size_t LUIdx);
2228    bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2229  
2230    void CollectLoopInvariantFixupsAndFormulae();
2231  
2232    void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2233                                unsigned Depth = 0);
2234  
2235    void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2236                                    const Formula &Base, unsigned Depth,
2237                                    size_t Idx, bool IsScaledReg = false);
2238    void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2239    void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2240                                     const Formula &Base, size_t Idx,
2241                                     bool IsScaledReg = false);
2242    void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2243    void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2244                                     const Formula &Base,
2245                                     const SmallVectorImpl<Immediate> &Worklist,
2246                                     size_t Idx, bool IsScaledReg = false);
2247    void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2248    void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2249    void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2250    void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2251    void GenerateCrossUseConstantOffsets();
2252    void GenerateAllReuseFormulae();
2253  
2254    void FilterOutUndesirableDedicatedRegisters();
2255  
2256    size_t EstimateSearchSpaceComplexity() const;
2257    void NarrowSearchSpaceByDetectingSupersets();
2258    void NarrowSearchSpaceByCollapsingUnrolledCode();
2259    void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2260    void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2261    void NarrowSearchSpaceByFilterPostInc();
2262    void NarrowSearchSpaceByDeletingCostlyFormulas();
2263    void NarrowSearchSpaceByPickingWinnerRegs();
2264    void NarrowSearchSpaceUsingHeuristics();
2265  
2266    void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2267                      Cost &SolutionCost,
2268                      SmallVectorImpl<const Formula *> &Workspace,
2269                      const Cost &CurCost,
2270                      const SmallPtrSet<const SCEV *, 16> &CurRegs,
2271                      DenseSet<const SCEV *> &VisitedRegs) const;
2272    void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2273  
2274    BasicBlock::iterator
2275    HoistInsertPosition(BasicBlock::iterator IP,
2276                        const SmallVectorImpl<Instruction *> &Inputs) const;
2277    BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2278                                                       const LSRFixup &LF,
2279                                                       const LSRUse &LU) const;
2280  
2281    Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2282                  BasicBlock::iterator IP,
2283                  SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2284    void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2285                       const Formula &F,
2286                       SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2287    void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2288                 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2289    void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2290  
2291  public:
2292    LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2293                LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2294                TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2295  
2296    bool getChanged() const { return Changed; }
2297    const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2298      return ScalarEvolutionIVs;
2299    }
2300  
2301    void print_factors_and_types(raw_ostream &OS) const;
2302    void print_fixups(raw_ostream &OS) const;
2303    void print_uses(raw_ostream &OS) const;
2304    void print(raw_ostream &OS) const;
2305    void dump() const;
2306  };
2307  
2308  } // end anonymous namespace
2309  
2310  /// If the IV is used in an int-to-float cast inside the loop, try to eliminate
2311  /// the cast operation.
2312  void LSRInstance::OptimizeShadowIV() {
2313    const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2314    if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2315      return;
2316  
2317    for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2318         UI != E; /* empty */) {
2319      IVUsers::const_iterator CandidateUI = UI;
2320      ++UI;
2321      Instruction *ShadowUse = CandidateUI->getUser();
2322      Type *DestTy = nullptr;
2323      bool IsSigned = false;
2324  
2325      /* If the shadow use is an int->float cast then insert a second IV
2326         to eliminate this cast.
2327  
2328           for (unsigned i = 0; i < n; ++i)
2329             foo((double)i);
2330  
2331         is transformed into
2332  
2333           double d = 0.0;
2334           for (unsigned i = 0; i < n; ++i, ++d)
2335             foo(d);
2336      */
2337      if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2338        IsSigned = false;
2339        DestTy = UCast->getDestTy();
2340      }
2341      else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2342        IsSigned = true;
2343        DestTy = SCast->getDestTy();
2344      }
2345      if (!DestTy) continue;
2346  
2347      // If target does not support DestTy natively then do not apply
2348      // this transformation.
2349      if (!TTI.isTypeLegal(DestTy)) continue;
2350  
2351      PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2352      if (!PH) continue;
2353      if (PH->getNumIncomingValues() != 2) continue;
2354  
2355      // If the calculation in integers overflows, the result in FP type will
2356      // differ. So we can only do this transformation if we are guaranteed not
2357      // to deal with overflowing values.
2358      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2359      if (!AR) continue;
2360      if (IsSigned && !AR->hasNoSignedWrap()) continue;
2361      if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2362  
2363      Type *SrcTy = PH->getType();
2364      int Mantissa = DestTy->getFPMantissaWidth();
2365      if (Mantissa == -1) continue;
2366      if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2367        continue;
2368  
2369      unsigned Entry, Latch;
2370      if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2371        Entry = 0;
2372        Latch = 1;
2373      } else {
2374        Entry = 1;
2375        Latch = 0;
2376      }
2377  
2378      ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2379      if (!Init) continue;
2380      Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2381                                          (double)Init->getSExtValue() :
2382                                          (double)Init->getZExtValue());
2383  
2384      BinaryOperator *Incr =
2385        dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2386      if (!Incr) continue;
2387      if (Incr->getOpcode() != Instruction::Add
2388          && Incr->getOpcode() != Instruction::Sub)
2389        continue;
2390  
2391      /* Initialize new IV, double d = 0.0 in above example. */
2392      ConstantInt *C = nullptr;
2393      if (Incr->getOperand(0) == PH)
2394        C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2395      else if (Incr->getOperand(1) == PH)
2396        C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2397      else
2398        continue;
2399  
2400      if (!C) continue;
2401  
2402      // Ignore negative constants, as the code below doesn't handle them
2403      // correctly. TODO: Remove this restriction.
2404      if (!C->getValue().isStrictlyPositive())
2405        continue;
2406  
2407      /* Add new PHINode. */
2408      PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2409      NewPH->setDebugLoc(PH->getDebugLoc());
2410  
2411      /* create new increment. '++d' in above example. */
2412      Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2413      BinaryOperator *NewIncr = BinaryOperator::Create(
2414          Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2415                                                : Instruction::FSub,
2416          NewPH, CFP, "IV.S.next.", Incr->getIterator());
2417      NewIncr->setDebugLoc(Incr->getDebugLoc());
2418  
2419      NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2420      NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2421  
2422      /* Remove cast operation */
2423      ShadowUse->replaceAllUsesWith(NewPH);
2424      ShadowUse->eraseFromParent();
2425      Changed = true;
2426      break;
2427    }
2428  }
2429  
2430  /// If Cond has an operand that is an expression of an IV, set the IV user and
2431  /// stride information and return true, otherwise return false.
2432  bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2433    for (IVStrideUse &U : IU)
2434      if (U.getUser() == Cond) {
2435        // NOTE: we could handle setcc instructions with multiple uses here, but
2436        // InstCombine does it as well for simple uses, and it's not clear that it
2437        // occurs enough in real life to be worth handling.
2438        CondUse = &U;
2439        return true;
2440      }
2441    return false;
2442  }
2443  
2444  /// Rewrite the loop's terminating condition if it uses a max computation.
2445  ///
2446  /// This is a narrow solution to a specific, but acute, problem. For loops
2447  /// like this:
2448  ///
2449  ///   i = 0;
2450  ///   do {
2451  ///     p[i] = 0.0;
2452  ///   } while (++i < n);
2453  ///
2454  /// the trip count isn't just 'n', because 'n' might not be positive. And
2455  /// unfortunately this can come up even for loops where the user didn't use
2456  /// a C do-while loop. For example, seemingly well-behaved top-test loops
2457  /// will commonly be lowered like this:
2458  ///
2459  ///   if (n > 0) {
2460  ///     i = 0;
2461  ///     do {
2462  ///       p[i] = 0.0;
2463  ///     } while (++i < n);
2464  ///   }
2465  ///
2466  /// and then it's possible for subsequent optimization to obscure the if
2467  /// test in such a way that indvars can't find it.
2468  ///
2469  /// When indvars can't find the if test in loops like this, it creates a
2470  /// max expression, which allows it to give the loop a canonical
2471  /// induction variable:
2472  ///
2473  ///   i = 0;
2474  ///   max = n < 1 ? 1 : n;
2475  ///   do {
2476  ///     p[i] = 0.0;
2477  ///   } while (++i != max);
2478  ///
2479  /// Canonical induction variables are necessary because the loop passes
2480  /// are designed around them. The most obvious example of this is the
2481  /// LoopInfo analysis, which doesn't remember trip count values. It
2482  /// expects to be able to rediscover the trip count each time it is
2483  /// needed, and it does this using a simple analysis that only succeeds if
2484  /// the loop has a canonical induction variable.
2485  ///
2486  /// However, when it comes time to generate code, the maximum operation
2487  /// can be quite costly, especially if it's inside of an outer loop.
2488  ///
2489  /// This function solves this problem by detecting this type of loop and
2490  /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2491  /// the instructions for the maximum computation.
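///
/// For the example above, the rewritten loop ends up testing the original
/// bound again, along the lines of:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// with the now-dead max computation deleted.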
2492  ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2493    // Check that the loop matches the pattern we're looking for.
2494    if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2495        Cond->getPredicate() != CmpInst::ICMP_NE)
2496      return Cond;
2497  
2498    SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2499    if (!Sel || !Sel->hasOneUse()) return Cond;
2500  
2501    const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2502    if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2503      return Cond;
2504    const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2505  
2506    // Add one to the backedge-taken count to get the trip count.
2507    const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2508    if (IterationCount != SE.getSCEV(Sel)) return Cond;
2509  
2510    // Check for a max calculation that matches the pattern. There's no check
2511    // for ICMP_ULE here because the comparison would be with zero, which
2512    // isn't interesting.
2513    CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2514    const SCEVNAryExpr *Max = nullptr;
2515    if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2516      Pred = ICmpInst::ICMP_SLE;
2517      Max = S;
2518    } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2519      Pred = ICmpInst::ICMP_SLT;
2520      Max = S;
2521    } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2522      Pred = ICmpInst::ICMP_ULT;
2523      Max = U;
2524    } else {
2525      // No match; bail.
2526      return Cond;
2527    }
2528  
2529    // To handle a max with more than two operands, this optimization would
2530    // require additional checking and setup.
2531    if (Max->getNumOperands() != 2)
2532      return Cond;
2533  
2534    const SCEV *MaxLHS = Max->getOperand(0);
2535    const SCEV *MaxRHS = Max->getOperand(1);
2536  
2537    // ScalarEvolution canonicalizes constants to the left. For < and >, look
2538    // for a comparison with 1. For <= and >=, a comparison with zero.
2539    if (!MaxLHS ||
2540        (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2541      return Cond;
2542  
2543    // Check the relevant induction variable for conformance to
2544    // the pattern.
2545    const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2546    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2547    if (!AR || !AR->isAffine() ||
2548        AR->getStart() != One ||
2549        AR->getStepRecurrence(SE) != One)
2550      return Cond;
2551  
2552    assert(AR->getLoop() == L &&
2553           "Loop condition operand is an addrec in a different loop!");
2554  
2555    // Check the right operand of the select, and remember it, as it will
2556    // be used in the new comparison instruction.
2557    Value *NewRHS = nullptr;
2558    if (ICmpInst::isTrueWhenEqual(Pred)) {
2559      // Look for n+1, and grab n.
2560      if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2561        if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2562           if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2563             NewRHS = BO->getOperand(0);
2564      if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2565        if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2566          if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2567            NewRHS = BO->getOperand(0);
2568      if (!NewRHS)
2569        return Cond;
2570    } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2571      NewRHS = Sel->getOperand(1);
2572    else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2573      NewRHS = Sel->getOperand(2);
2574    else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2575      NewRHS = SU->getValue();
2576    else
2577      // Max doesn't match expected pattern.
2578      return Cond;
2579  
2580    // Determine the new comparison opcode. It may be signed or unsigned,
2581    // and the original comparison may be either equality or inequality.
2582    if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2583      Pred = CmpInst::getInversePredicate(Pred);
2584  
2585    // Ok, everything looks ok to change the condition into an SLT or SGE and
2586    // delete the max calculation.
2587    ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2588                                     Cond->getOperand(0), NewRHS, "scmp");
2589  
2590    // Delete the max calculation instructions.
2591    NewCond->setDebugLoc(Cond->getDebugLoc());
2592    Cond->replaceAllUsesWith(NewCond);
2593    CondUse->setUser(NewCond);
2594    Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2595    Cond->eraseFromParent();
2596    Sel->eraseFromParent();
2597    if (Cmp->use_empty())
2598      Cmp->eraseFromParent();
2599    return NewCond;
2600  }
2601  
2602  /// Change loop terminating condition to use the postinc iv when possible.
2603  void
2604  LSRInstance::OptimizeLoopTermCond() {
2605    SmallPtrSet<Instruction *, 4> PostIncs;
2606  
2607    // We need a different set of heuristics for rotated and non-rotated loops.
2608    // If a loop is rotated then the latch is also the backedge, so inserting
2609    // post-inc expressions just before the latch is ideal. To reduce live ranges
2610    // it also makes sense to rewrite terminating conditions to use post-inc
2611    // expressions.
2612    //
2613    // If the loop is not rotated then the latch is not a backedge; the latch
2614    // check is done in the loop head. Adding post-inc expressions before the
2615    // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2616    // in the loop body. In this case we do *not* want to use post-inc expressions
2617    // in the latch check, and we want to insert post-inc expressions before
2618    // the backedge.
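  //
  // As a rough illustration: a do-while style loop exits from its latch, so
  // its latch compare can be rewritten to use the incremented IV value. A
  // while-style loop that has not been rotated tests the condition in its
  // header and its latch ends in an unconditional backedge, which is the
  // head-tested case handled just below.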
2619    BasicBlock *LatchBlock = L->getLoopLatch();
2620    SmallVector<BasicBlock*, 8> ExitingBlocks;
2621    L->getExitingBlocks(ExitingBlocks);
2622    if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2623      // The backedge doesn't exit the loop; treat this as a head-tested loop.
2624      IVIncInsertPos = LatchBlock->getTerminator();
2625      return;
2626    }
2627  
2628    // Otherwise treat this as a rotated loop.
2629    for (BasicBlock *ExitingBlock : ExitingBlocks) {
2630      // Get the terminating condition for the loop if possible.  If we
2631      // can, we want to change it to use a post-incremented version of its
2632      // induction variable, to allow coalescing the live ranges for the IV into
2633      // one register value.
2634  
2635      BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2636      if (!TermBr)
2637        continue;
2638      // FIXME: Overly conservative, termination condition could be an 'or' etc..
2639      if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2640        continue;
2641  
2642      // Search IVUsesByStride to find Cond's IVUse if there is one.
2643      IVStrideUse *CondUse = nullptr;
2644      ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2645      if (!FindIVUserForCond(Cond, CondUse))
2646        continue;
2647  
2648      // If the trip count is computed in terms of a max (due to ScalarEvolution
2649      // being unable to find a sufficient guard, for example), change the loop
2650      // comparison to use SLT or ULT instead of NE.
2651      // One consequence of doing this now is that it disrupts the count-down
2652      // optimization. That's not always a bad thing though, because in such
2653      // cases it may still be worthwhile to avoid a max.
2654      Cond = OptimizeMax(Cond, CondUse);
2655  
2656      // If this exiting block dominates the latch block, it may also use
2657      // the post-inc value if it won't be shared with other uses.
2658      // Check for dominance.
2659      if (!DT.dominates(ExitingBlock, LatchBlock))
2660        continue;
2661  
2662      // Conservatively avoid trying to use the post-inc value in non-latch
2663      // exits if there may be pre-inc users in intervening blocks.
2664      if (LatchBlock != ExitingBlock)
2665        for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2666          // Test if the use is reachable from the exiting block. This dominator
2667          // query is a conservative approximation of reachability.
2668          if (&*UI != CondUse &&
2669              !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2670            // Conservatively assume there may be reuse if the quotient of their
2671            // strides could be a legal scale.
2672            const SCEV *A = IU.getStride(*CondUse, L);
2673            const SCEV *B = IU.getStride(*UI, L);
2674            if (!A || !B) continue;
2675            if (SE.getTypeSizeInBits(A->getType()) !=
2676                SE.getTypeSizeInBits(B->getType())) {
2677              if (SE.getTypeSizeInBits(A->getType()) >
2678                  SE.getTypeSizeInBits(B->getType()))
2679                B = SE.getSignExtendExpr(B, A->getType());
2680              else
2681                A = SE.getSignExtendExpr(A, B->getType());
2682            }
2683            if (const SCEVConstant *D =
2684                  dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2685              const ConstantInt *C = D->getValue();
2686              // Stride of one or negative one can have reuse with non-addresses.
2687              if (C->isOne() || C->isMinusOne())
2688                goto decline_post_inc;
2689              // Avoid weird situations.
2690              if (C->getValue().getSignificantBits() >= 64 ||
2691                  C->getValue().isMinSignedValue())
2692                goto decline_post_inc;
2693              // Check for possible scaled-address reuse.
2694              if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2695                MemAccessTy AccessTy = getAccessType(
2696                    TTI, UI->getUser(), UI->getOperandValToReplace());
2697                int64_t Scale = C->getSExtValue();
2698                if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2699                                              /*BaseOffset=*/0,
2700                                              /*HasBaseReg=*/true, Scale,
2701                                              AccessTy.AddrSpace))
2702                  goto decline_post_inc;
2703                Scale = -Scale;
2704                if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2705                                              /*BaseOffset=*/0,
2706                                              /*HasBaseReg=*/true, Scale,
2707                                              AccessTy.AddrSpace))
2708                  goto decline_post_inc;
2709              }
2710            }
2711          }
2712  
2713      LLVM_DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
2714                        << *Cond << '\n');
2715  
2716      // It's possible for the setcc instruction to be anywhere in the loop, and
2717      // possible for it to have multiple users.  If it is not immediately before
2718      // the exiting block branch, move it.
2719      if (Cond->getNextNonDebugInstruction() != TermBr) {
2720        if (Cond->hasOneUse()) {
2721          Cond->moveBefore(TermBr);
2722        } else {
2723          // Clone the terminating condition and insert into the loopend.
2724          ICmpInst *OldCond = Cond;
2725          Cond = cast<ICmpInst>(Cond->clone());
2726          Cond->setName(L->getHeader()->getName() + ".termcond");
2727          Cond->insertInto(ExitingBlock, TermBr->getIterator());
2728  
2729          // Clone the IVUse, as the old use still exists!
2730          CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2731          TermBr->replaceUsesOfWith(OldCond, Cond);
2732        }
2733      }
2734  
2735      // If we get to here, we know that we can transform the setcc instruction to
2736      // use the post-incremented version of the IV, allowing us to coalesce the
2737      // live ranges for the IV correctly.
2738      CondUse->transformToPostInc(L);
2739      Changed = true;
2740  
2741      PostIncs.insert(Cond);
2742    decline_post_inc:;
2743    }
2744  
2745    // Determine an insertion point for the loop induction variable increment. It
2746    // must dominate all the post-inc comparisons we just set up, and it must
2747    // dominate the loop latch edge.
2748    IVIncInsertPos = L->getLoopLatch()->getTerminator();
2749    for (Instruction *Inst : PostIncs)
2750      IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2751  }
2752  
2753  /// Determine if the given use can accommodate a fixup at the given offset and
2754  /// other details. If so, update the use and return true.
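///
/// An illustrative sketch (the thresholds are target-dependent, so the numbers
/// here are hypothetical): if an existing use already spans fixup offsets
/// [-8, 16] and a new fixup needs offset 32, the use can absorb it only if an
/// addressing mode covering the widened distance (32 - (-8) = 40) is still
/// considered foldable; otherwise the caller falls back to creating a new use.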
2755  bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2756                                       bool HasBaseReg, LSRUse::KindType Kind,
2757                                       MemAccessTy AccessTy) {
2758    Immediate NewMinOffset = LU.MinOffset;
2759    Immediate NewMaxOffset = LU.MaxOffset;
2760    MemAccessTy NewAccessTy = AccessTy;
2761  
2762    // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2763    // something conservative; however, this can pessimize, for example when one
2764    // of the uses has all of its fixups outside the loop.
2765    if (LU.Kind != Kind)
2766      return false;
2767  
2768    // Check for a mismatched access type, and fall back conservatively as needed.
2769    // TODO: Be less conservative when the type is similar and can use the same
2770    // addressing modes.
2771    if (Kind == LSRUse::Address) {
2772      if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2773        NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2774                                              AccessTy.AddrSpace);
2775      }
2776    }
2777  
2778    // Conservatively assume HasBaseReg is true for now.
2779    if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2780      if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2781                            LU.MaxOffset - NewOffset, HasBaseReg))
2782        return false;
2783      NewMinOffset = NewOffset;
2784    } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2785      if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2786                            NewOffset - LU.MinOffset, HasBaseReg))
2787        return false;
2788      NewMaxOffset = NewOffset;
2789    }
2790  
2791    // FIXME: We should be able to handle some level of scalable offset support
2792    // for 'void', but in order to get basic support up and running this is
2793    // being left out.
2794    if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2795        (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2796      return false;
2797  
2798    // Update the use.
2799    LU.MinOffset = NewMinOffset;
2800    LU.MaxOffset = NewMaxOffset;
2801    LU.AccessTy = NewAccessTy;
2802    return true;
2803  }
2804  
2805  /// Return an LSRUse index and an offset value for a fixup which needs the given
2806  /// expression, with the given kind and optional access type.  Either reuse an
2807  /// existing use or create a new one, as needed.
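///
/// For example (an informal sketch): a fixup needing {(16 + %base),+,4}<%L>
/// would normally be keyed on the stripped expression {%base,+,4}<%L> with an
/// immediate offset of 16, so fixups that differ only in their constant offset
/// can share one LSRUse.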
2808  std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2809                                                   LSRUse::KindType Kind,
2810                                                   MemAccessTy AccessTy) {
2811    const SCEV *Copy = Expr;
2812    Immediate Offset = ExtractImmediate(Expr, SE);
2813  
2814    // Basic uses can't accept any offset, for example.
2815    if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2816                          Offset, /*HasBaseReg=*/ true)) {
2817      Expr = Copy;
2818      Offset = Immediate::getFixed(0);
2819    }
2820  
2821    std::pair<UseMapTy::iterator, bool> P =
2822      UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2823    if (!P.second) {
2824      // A use already existed with this base.
2825      size_t LUIdx = P.first->second;
2826      LSRUse &LU = Uses[LUIdx];
2827      if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2828        // Reuse this use.
2829        return std::make_pair(LUIdx, Offset);
2830    }
2831  
2832    // Create a new use.
2833    size_t LUIdx = Uses.size();
2834    P.first->second = LUIdx;
2835    Uses.push_back(LSRUse(Kind, AccessTy));
2836    LSRUse &LU = Uses[LUIdx];
2837  
2838    LU.MinOffset = Offset;
2839    LU.MaxOffset = Offset;
2840    return std::make_pair(LUIdx, Offset);
2841  }
2842  
2843  /// Delete the given use from the Uses list.
2844  void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2845    if (&LU != &Uses.back())
2846      std::swap(LU, Uses.back());
2847    Uses.pop_back();
2848  
2849    // Update RegUses.
2850    RegUses.swapAndDropUse(LUIdx, Uses.size());
2851  }
2852  
2853  /// Look for a use distinct from OrigLU which has a formula with the same
2854  /// registers as the given formula.
2855  LSRUse *
2856  LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2857                                         const LSRUse &OrigLU) {
2858    // Search all uses for the formula. This could be more clever.
2859    for (LSRUse &LU : Uses) {
2860      // Check whether this use is close enough to OrigLU, to see whether it's
2861      // worthwhile looking through its formulae.
2862      // Ignore ICmpZero uses because they may contain formulae generated by
2863      // GenerateICmpZeroScales, in which case adding fixup offsets may
2864      // be invalid.
2865      if (&LU != &OrigLU &&
2866          LU.Kind != LSRUse::ICmpZero &&
2867          LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2868          LU.WidestFixupType == OrigLU.WidestFixupType &&
2869          LU.HasFormulaWithSameRegs(OrigF)) {
2870        // Scan through this use's formulae.
2871        for (const Formula &F : LU.Formulae) {
2872          // Check to see if this formula has the same registers and symbols
2873          // as OrigF.
2874          if (F.BaseRegs == OrigF.BaseRegs &&
2875              F.ScaledReg == OrigF.ScaledReg &&
2876              F.BaseGV == OrigF.BaseGV &&
2877              F.Scale == OrigF.Scale &&
2878              F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2879            if (F.BaseOffset.isZero())
2880              return &LU;
2881            // This is the formula where all the registers and symbols matched;
2882            // there aren't going to be any others. Since we declined it, we
2883            // can skip the rest of the formulae and proceed to the next LSRUse.
2884            break;
2885          }
2886        }
2887      }
2888    }
2889  
2890    // Nothing looked good.
2891    return nullptr;
2892  }
2893  
2894  void LSRInstance::CollectInterestingTypesAndFactors() {
2895    SmallSetVector<const SCEV *, 4> Strides;
2896  
2897    // Collect interesting types and strides.
2898    SmallVector<const SCEV *, 4> Worklist;
2899    for (const IVStrideUse &U : IU) {
2900      const SCEV *Expr = IU.getExpr(U);
2901      if (!Expr)
2902        continue;
2903  
2904      // Collect interesting types.
2905      Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2906  
2907      // Add strides for mentioned loops.
2908      Worklist.push_back(Expr);
2909      do {
2910        const SCEV *S = Worklist.pop_back_val();
2911        if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2912          if (AR->getLoop() == L)
2913            Strides.insert(AR->getStepRecurrence(SE));
2914          Worklist.push_back(AR->getStart());
2915        } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2916          append_range(Worklist, Add->operands());
2917        }
2918      } while (!Worklist.empty());
2919    }
2920  
2921    // Compute interesting factors from the set of interesting strides.
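  // For example (illustrative): strides of 4 and 8 yield a factor of 2, and
  // strides of %n and (4 * %n) yield a factor of 4; these factors later seed
  // formulae that reuse one induction variable scaled by the factor.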
2922    for (SmallSetVector<const SCEV *, 4>::const_iterator
2923         I = Strides.begin(), E = Strides.end(); I != E; ++I)
2924      for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2925           std::next(I); NewStrideIter != E; ++NewStrideIter) {
2926        const SCEV *OldStride = *I;
2927        const SCEV *NewStride = *NewStrideIter;
2928  
2929        if (SE.getTypeSizeInBits(OldStride->getType()) !=
2930            SE.getTypeSizeInBits(NewStride->getType())) {
2931          if (SE.getTypeSizeInBits(OldStride->getType()) >
2932              SE.getTypeSizeInBits(NewStride->getType()))
2933            NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2934          else
2935            OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2936        }
2937        if (const SCEVConstant *Factor =
2938              dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2939                                                          SE, true))) {
2940          if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2941            Factors.insert(Factor->getAPInt().getSExtValue());
2942        } else if (const SCEVConstant *Factor =
2943                     dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2944                                                                 NewStride,
2945                                                                 SE, true))) {
2946          if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2947            Factors.insert(Factor->getAPInt().getSExtValue());
2948        }
2949      }
2950  
2951    // If all uses use the same type, don't bother looking for truncation-based
2952    // reuse.
2953    if (Types.size() == 1)
2954      Types.clear();
2955  
2956    LLVM_DEBUG(print_factors_and_types(dbgs()));
2957  }
2958  
2959  /// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2960  /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2961  /// IVStrideUses, we could partially skip this.
2962  static User::op_iterator
2963  findIVOperand(User::op_iterator OI, User::op_iterator OE,
2964                Loop *L, ScalarEvolution &SE) {
2965    for(; OI != OE; ++OI) {
2966      if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2967        if (!SE.isSCEVable(Oper->getType()))
2968          continue;
2969  
2970        if (const SCEVAddRecExpr *AR =
2971            dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2972          if (AR->getLoop() == L)
2973            break;
2974        }
2975      }
2976    }
2977    return OI;
2978  }
2979  
2980  /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2981  /// a convenient helper.
2982  static Value *getWideOperand(Value *Oper) {
2983    if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2984      return Trunc->getOperand(0);
2985    return Oper;
2986  }
2987  
2988  /// Return an approximation of this SCEV expression's "base", or NULL for any
2989  /// constant. Returning the expression itself is conservative. Returning a
2990  /// deeper subexpression is more precise and valid as long as it isn't less
2991  /// complex than another subexpression. For expressions involving multiple
2992  /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2993  /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2994  /// IVInc==b-a.
2995  ///
2996  /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2997  /// SCEVUnknown, we simply return the rightmost SCEV operand.
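///
/// For instance (a hedged example): for ((4 * {0,+,1}<%L>) + %base) the scaled
/// addrec operand is skipped and %base is returned, while a plain constant
/// such as 40 yields null.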
2998  static const SCEV *getExprBase(const SCEV *S) {
2999    switch (S->getSCEVType()) {
3000    default: // including scUnknown.
3001      return S;
3002    case scConstant:
3003    case scVScale:
3004      return nullptr;
3005    case scTruncate:
3006      return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3007    case scZeroExtend:
3008      return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3009    case scSignExtend:
3010      return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3011    case scAddExpr: {
3012      // Skip over scaled operands (scMulExpr) to follow add operands as long as
3013      // there's nothing more complex.
3014      // FIXME: not sure if we want to recognize negation.
3015      const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3016      for (const SCEV *SubExpr : reverse(Add->operands())) {
3017        if (SubExpr->getSCEVType() == scAddExpr)
3018          return getExprBase(SubExpr);
3019  
3020        if (SubExpr->getSCEVType() != scMulExpr)
3021          return SubExpr;
3022      }
3023      return S; // all operands are scaled, be conservative.
3024    }
3025    case scAddRecExpr:
3026      return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3027    }
3028    llvm_unreachable("Unknown SCEV kind!");
3029  }
3030  
3031  /// Return true if the chain increment is profitable to expand into a loop
3032  /// invariant value, which may require its own register. A profitable chain
3033  /// increment will be an offset relative to the same base. We allow such offsets
3034  /// to potentially be used as chain increments as long as they are not obviously
3035  /// expensive to expand using real instructions.
3036  bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3037                                      const SCEV *IncExpr,
3038                                      ScalarEvolution &SE) {
3039    // Aggressively form chains when -stress-ivchain.
3040    if (StressIVChain)
3041      return true;
3042  
3043    // Do not replace a constant offset from IV head with a nonconstant IV
3044    // increment.
3045    if (!isa<SCEVConstant>(IncExpr)) {
3046      const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3047      if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3048        return false;
3049    }
3050  
3051    SmallPtrSet<const SCEV*, 8> Processed;
3052    return !isHighCostExpansion(IncExpr, Processed, SE);
3053  }
3054  
3055  /// Return true if the number of registers needed for the chain is estimated to
3056  /// be less than the number required for the individual IV users. First prohibit
3057  /// any IV users that keep the IV live across increments (the Users set should
3058  /// be empty). Next count the number and type of increments in the chain.
3059  ///
3060  /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3061  /// effectively use postinc addressing modes. Only consider it profitable if the
3062  /// increments can be computed in fewer registers when chained.
3063  ///
3064  /// TODO: Consider IVInc free if it's already used in other chains.
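///
/// A rough illustrative example: a chain of accesses at p, p+4 and p+8 only
/// needs small constant increments, which are assumed to fold into addressing
/// modes or add immediates and so cost nothing here; a chain whose links need
/// distinct loop-invariant symbolic increments is charged a register per
/// materialized increment and is therefore unlikely to be kept.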
3065  static bool isProfitableChain(IVChain &Chain,
3066                                SmallPtrSetImpl<Instruction *> &Users,
3067                                ScalarEvolution &SE,
3068                                const TargetTransformInfo &TTI) {
3069    if (StressIVChain)
3070      return true;
3071  
3072    if (!Chain.hasIncs())
3073      return false;
3074  
3075    if (!Users.empty()) {
3076      LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3077                 for (Instruction *Inst
3078                      : Users) { dbgs() << "  " << *Inst << "\n"; });
3079      return false;
3080    }
3081    assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3082  
3083    // The chain itself may require a register, so initialize cost to 1.
3084    int cost = 1;
3085  
3086    // A complete chain likely eliminates the need for keeping the original IV in
3087    // a register. LSR does not currently know how to form a complete chain unless
3088    // the header phi already exists.
3089    if (isa<PHINode>(Chain.tailUserInst())
3090        && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3091      --cost;
3092    }
3093    const SCEV *LastIncExpr = nullptr;
3094    unsigned NumConstIncrements = 0;
3095    unsigned NumVarIncrements = 0;
3096    unsigned NumReusedIncrements = 0;
3097  
3098    if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3099      return true;
3100  
3101    for (const IVInc &Inc : Chain) {
3102      if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3103        return true;
3104      if (Inc.IncExpr->isZero())
3105        continue;
3106  
3107      // Incrementing by zero or some constant is neutral. We assume constants can
3108      // be folded into an addressing mode or an add's immediate operand.
3109      if (isa<SCEVConstant>(Inc.IncExpr)) {
3110        ++NumConstIncrements;
3111        continue;
3112      }
3113  
3114      if (Inc.IncExpr == LastIncExpr)
3115        ++NumReusedIncrements;
3116      else
3117        ++NumVarIncrements;
3118  
3119      LastIncExpr = Inc.IncExpr;
3120    }
3121    // An IV chain with a single increment is handled by LSR's postinc
3122    // uses. However, a chain with multiple increments requires keeping the IV's
3123    // value live longer than it needs to be if chained.
3124    if (NumConstIncrements > 1)
3125      --cost;
3126  
3127    // Materializing increment expressions in the preheader that didn't exist in
3128    // the original code may cost a register. For example, sign-extended array
3129    // indices can produce ridiculous increments like this:
3130    // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3131    cost += NumVarIncrements;
3132  
3133    // Reusing variable increments likely saves a register to hold the multiple of
3134    // the stride.
3135    cost -= NumReusedIncrements;
3136  
3137    LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3138                      << "\n");
3139  
3140    return cost < 0;
3141  }
3142  
3143  /// Add this IV user to an existing chain or make it the head of a new chain.
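///
/// Illustrative sketch: if one load uses %p, a later load uses %p + 8 and a
/// store uses %p + 16 (all in this loop), the three users can be chained so
/// that each IV operand is produced from the previous one by a reusable
/// increment of 8, instead of each user keeping its own IV expression live.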
3144  void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3145                                     SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3146    // When IVs are used as types of varying widths, they are generally converted
3147    // to a wider type with some uses remaining narrow under a (free) trunc.
3148    Value *const NextIV = getWideOperand(IVOper);
3149    const SCEV *const OperExpr = SE.getSCEV(NextIV);
3150    const SCEV *const OperExprBase = getExprBase(OperExpr);
3151  
3152    // Visit all existing chains. Check if its IVOper can be computed as a
3153    // profitable loop invariant increment from the last link in the Chain.
3154    unsigned ChainIdx = 0, NChains = IVChainVec.size();
3155    const SCEV *LastIncExpr = nullptr;
3156    for (; ChainIdx < NChains; ++ChainIdx) {
3157      IVChain &Chain = IVChainVec[ChainIdx];
3158  
3159      // Prune the solution space aggressively by checking that both IV operands
3160      // are expressions that operate on the same unscaled SCEVUnknown. This
3161      // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3162      // first avoids creating extra SCEV expressions.
3163      if (!StressIVChain && Chain.ExprBase != OperExprBase)
3164        continue;
3165  
3166      Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3167      if (PrevIV->getType() != NextIV->getType())
3168        continue;
3169  
3170      // A phi node terminates a chain.
3171      if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3172        continue;
3173  
3174      // The increment must be loop-invariant so it can be kept in a register.
3175      const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3176      const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3177      if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3178        continue;
3179  
3180      if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3181        LastIncExpr = IncExpr;
3182        break;
3183      }
3184    }
3185    // If we haven't found a chain, create a new one, unless we hit the max. Don't
3186    // bother for phi nodes, because they must be last in the chain.
3187    if (ChainIdx == NChains) {
3188      if (isa<PHINode>(UserInst))
3189        return;
3190      if (NChains >= MaxChains && !StressIVChain) {
3191        LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3192        return;
3193      }
3194      LastIncExpr = OperExpr;
3195      // IVUsers may have skipped over sign/zero extensions. We don't currently
3196      // attempt to form chains involving extensions unless they can be hoisted
3197      // into this loop's AddRec.
3198      if (!isa<SCEVAddRecExpr>(LastIncExpr))
3199        return;
3200      ++NChains;
3201      IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3202                                   OperExprBase));
3203      ChainUsersVec.resize(NChains);
3204      LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3205                        << ") IV=" << *LastIncExpr << "\n");
3206    } else {
3207      LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
3208                        << ") IV+" << *LastIncExpr << "\n");
3209      // Add this IV user to the end of the chain.
3210      IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3211    }
3212    IVChain &Chain = IVChainVec[ChainIdx];
3213  
3214    SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3215    // This chain's NearUsers become FarUsers.
3216    if (!LastIncExpr->isZero()) {
3217      ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3218                                              NearUsers.end());
3219      NearUsers.clear();
3220    }
3221  
3222    // All other uses of IVOperand become near uses of the chain.
3223    // We currently ignore intermediate values within SCEV expressions, assuming
3224    // they will eventually be used by the current chain, or can be computed
3225    // from one of the chain increments. To be more precise we could
3226    // transitively follow its user and only add leaf IV users to the set.
3227    for (User *U : IVOper->users()) {
3228      Instruction *OtherUse = dyn_cast<Instruction>(U);
3229      if (!OtherUse)
3230        continue;
3231      // Uses in the chain will no longer be uses if the chain is formed.
3232      // Include the head of the chain in this iteration (not Chain.begin()).
3233      IVChain::const_iterator IncIter = Chain.Incs.begin();
3234      IVChain::const_iterator IncEnd = Chain.Incs.end();
3235      for( ; IncIter != IncEnd; ++IncIter) {
3236        if (IncIter->UserInst == OtherUse)
3237          break;
3238      }
3239      if (IncIter != IncEnd)
3240        continue;
3241  
3242      if (SE.isSCEVable(OtherUse->getType())
3243          && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3244          && IU.isIVUserOrOperand(OtherUse)) {
3245        continue;
3246      }
3247      NearUsers.insert(OtherUse);
3248    }
3249  
3250    // Since this user is part of the chain, it's no longer considered a use
3251    // of the chain.
3252    ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3253  }
3254  
3255  /// Populate the vector of Chains.
3256  ///
3257  /// This decreases ILP at the architecture level. Targets with ample registers,
3258  /// multiple memory ports, and no register renaming probably don't want
3259  /// this. However, such targets should probably disable LSR altogether.
3260  ///
3261  /// The job of LSR is to make a reasonable choice of induction variables across
3262  /// the loop. Subsequent passes can easily "unchain" computation exposing more
3263  /// ILP *within the loop* if the target wants it.
3264  ///
3265  /// Finding the best IV chain is potentially a scheduling problem. Since LSR
3266  /// will not reorder memory operations, it will recognize this as a chain, but
3267  /// will generate redundant IV increments. Ideally this would be corrected later
3268  /// by a smart scheduler:
3269  ///        = A[i]
3270  ///        = A[i+x]
3271  /// A[i]   =
3272  /// A[i+x] =
3273  ///
3274  /// TODO: Walk the entire domtree within this loop, not just the path to the
3275  /// loop latch. This will discover chains on side paths, but requires
3276  /// maintaining multiple copies of the Chains state.
3277  void LSRInstance::CollectChains() {
3278    LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3279    SmallVector<ChainUsers, 8> ChainUsersVec;
3280  
3281    SmallVector<BasicBlock *,8> LatchPath;
3282    BasicBlock *LoopHeader = L->getHeader();
3283    for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3284         Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3285      LatchPath.push_back(Rung->getBlock());
3286    }
3287    LatchPath.push_back(LoopHeader);
3288  
3289    // Walk the instruction stream from the loop header to the loop latch.
3290    for (BasicBlock *BB : reverse(LatchPath)) {
3291      for (Instruction &I : *BB) {
3292        // Skip instructions that weren't seen by IVUsers analysis.
3293        if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3294          continue;
3295  
3296        // Ignore users that are part of a SCEV expression. This way we only
3297        // consider leaf IV Users. This effectively rediscovers a portion of
3298        // IVUsers analysis but in program order this time.
3299        if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3300            continue;
3301  
3302        // Remove this instruction from any NearUsers set it may be in.
3303        for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3304             ChainIdx < NChains; ++ChainIdx) {
3305          ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3306        }
3307        // Search for operands that can be chained.
3308        SmallPtrSet<Instruction*, 4> UniqueOperands;
3309        User::op_iterator IVOpEnd = I.op_end();
3310        User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3311        while (IVOpIter != IVOpEnd) {
3312          Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3313          if (UniqueOperands.insert(IVOpInst).second)
3314            ChainInstruction(&I, IVOpInst, ChainUsersVec);
3315          IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3316        }
3317      } // Continue walking down the instructions.
3318    } // Continue walking down the domtree.
3319    // Visit phi backedges to determine if the chain can generate the IV postinc.
3320    for (PHINode &PN : L->getHeader()->phis()) {
3321      if (!SE.isSCEVable(PN.getType()))
3322        continue;
3323  
3324      Instruction *IncV =
3325          dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3326      if (IncV)
3327        ChainInstruction(&PN, IncV, ChainUsersVec);
3328    }
3329    // Remove any unprofitable chains.
3330    unsigned ChainIdx = 0;
3331    for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3332         UsersIdx < NChains; ++UsersIdx) {
3333      if (!isProfitableChain(IVChainVec[UsersIdx],
3334                             ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3335        continue;
3336      // Preserve the chain at UsersIdx.
3337      if (ChainIdx != UsersIdx)
3338        IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3339      FinalizeChain(IVChainVec[ChainIdx]);
3340      ++ChainIdx;
3341    }
3342    IVChainVec.resize(ChainIdx);
3343  }
3344  
3345  void LSRInstance::FinalizeChain(IVChain &Chain) {
3346    assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3347    LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3348  
3349    for (const IVInc &Inc : Chain) {
3350      LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
3351      auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3352      assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3353      IVIncSet.insert(UseI);
3354    }
3355  }
3356  
3357  /// Return true if the IVInc can be folded into an addressing mode.
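///
/// For example (target-dependent, so only a sketch): a constant increment of 8
/// on an address operand counts as foldable when the target accepts an
/// immediate offset of 8 in an addressing mode for that access; a scalable
/// mul(vscale, C) increment is handled analogously as a scalable immediate.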
3358  static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3359                               Value *Operand, const TargetTransformInfo &TTI) {
3360    const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3361    Immediate IncOffset = Immediate::getZero();
3362    if (IncConst) {
3363      if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3364        return false;
3365      IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3366    } else {
3367      // Look for mul(vscale, constant), to detect a scalable offset.
3368      auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3369      if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3370          !isa<SCEVVScale>(IncVScale->getOperand(1)))
3371        return false;
3372      auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3373      if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3374        return false;
3375      IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3376    }
3377  
3378    if (!isAddressUse(TTI, UserInst, Operand))
3379      return false;
3380  
3381    MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3382    if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3383                          IncOffset, /*HasBaseReg=*/false))
3384      return false;
3385  
3386    return true;
3387  }
3388  
3389  /// Generate an add or subtract for each IVInc in a chain to materialize the IV
3390  /// user's operand from the previous IV user's operand.
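///
/// As an informal sketch, for a chain stepping by 8 each link either reuses a
/// previously materialized chain value (when the accumulated offset is still
/// foldable into that user's addressing mode) or expands something like
///
///   %iv.next = add i64 %iv, 8
///
/// through SCEVExpander and makes that the new current chain value.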
3391  void LSRInstance::GenerateIVChain(const IVChain &Chain,
3392                                    SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3393    // Find the new IVOperand for the head of the chain. It may have been replaced
3394    // by LSR.
3395    const IVInc &Head = Chain.Incs[0];
3396    User::op_iterator IVOpEnd = Head.UserInst->op_end();
3397    // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3398    User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3399                                               IVOpEnd, L, SE);
3400    Value *IVSrc = nullptr;
3401    while (IVOpIter != IVOpEnd) {
3402      IVSrc = getWideOperand(*IVOpIter);
3403  
3404      // If this operand computes the expression that the chain needs, we may use
3405      // it. (Check this after setting IVSrc which is used below.)
3406      //
3407      // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3408      // narrow for the chain, so we can no longer use it. We do allow using a
3409      // wider phi, assuming the LSR checked for free truncation. In that case we
3410      // should already have a truncate on this operand such that
3411      // getSCEV(IVSrc) == IncExpr.
3412      if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3413          || SE.getSCEV(IVSrc) == Head.IncExpr) {
3414        break;
3415      }
3416      IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3417    }
3418    if (IVOpIter == IVOpEnd) {
3419      // Gracefully give up on this chain.
3420      LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3421      return;
3422    }
3423    assert(IVSrc && "Failed to find IV chain source");
3424  
3425    LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3426    Type *IVTy = IVSrc->getType();
3427    Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3428    const SCEV *LeftOverExpr = nullptr;
3429    const SCEV *Accum = SE.getZero(IntTy);
3430    SmallVector<std::pair<const SCEV *, Value *>> Bases;
3431    Bases.emplace_back(Accum, IVSrc);
3432  
3433    for (const IVInc &Inc : Chain) {
3434      Instruction *InsertPt = Inc.UserInst;
3435      if (isa<PHINode>(InsertPt))
3436        InsertPt = L->getLoopLatch()->getTerminator();
3437  
3438      // IVOper will replace the current IV User's operand. IVSrc is the IV
3439      // value currently held in a register.
3440      Value *IVOper = IVSrc;
3441      if (!Inc.IncExpr->isZero()) {
3442        // IncExpr was the result of subtraction of two narrow values, so must
3443        // be signed.
3444        const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3445        Accum = SE.getAddExpr(Accum, IncExpr);
3446        LeftOverExpr = LeftOverExpr ?
3447          SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3448      }
3449  
3450      // Look through each base to see if any can produce a nice addressing mode.
3451      bool FoundBase = false;
3452      for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3453        const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3454        if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3455          if (!Remainder->isZero()) {
3456            Rewriter.clearPostInc();
3457            Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3458            const SCEV *IVOperExpr =
3459                SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3460            IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3461          } else {
3462            IVOper = MapIVOper;
3463          }
3464  
3465          FoundBase = true;
3466          break;
3467        }
3468      }
3469      if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3470        // Expand the IV increment.
3471        Rewriter.clearPostInc();
3472        Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3473        const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3474                                               SE.getUnknown(IncV));
3475        IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3476  
3477        // If an IV increment can't be folded, use it as the next IV value.
3478        if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3479          assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3480          Bases.emplace_back(Accum, IVOper);
3481          IVSrc = IVOper;
3482          LeftOverExpr = nullptr;
3483        }
3484      }
3485      Type *OperTy = Inc.IVOperand->getType();
3486      if (IVTy != OperTy) {
3487        assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3488               "cannot extend a chained IV");
3489        IRBuilder<> Builder(InsertPt);
3490        IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3491      }
3492      Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3493      if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3494        DeadInsts.emplace_back(OperandIsInstr);
3495    }
3496    // If LSR created a new, wider phi, we may also replace its postinc. We only
3497    // do this if we also found a wide value for the head of the chain.
3498    if (isa<PHINode>(Chain.tailUserInst())) {
3499      for (PHINode &Phi : L->getHeader()->phis()) {
3500        if (Phi.getType() != IVSrc->getType())
3501          continue;
3502        Instruction *PostIncV = dyn_cast<Instruction>(
3503            Phi.getIncomingValueForBlock(L->getLoopLatch()));
3504        if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3505          continue;
3506        Value *IVOper = IVSrc;
3507        Type *PostIncTy = PostIncV->getType();
3508        if (IVTy != PostIncTy) {
3509          assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3510          IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3511          Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3512          IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3513        }
3514        Phi.replaceUsesOfWith(PostIncV, IVOper);
3515        DeadInsts.emplace_back(PostIncV);
3516      }
3517    }
3518  }
3519  
3520  void LSRInstance::CollectFixupsAndInitialFormulae() {
3521    BranchInst *ExitBranch = nullptr;
3522    bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3523  
3524    // For calculating baseline cost
3525    SmallPtrSet<const SCEV *, 16> Regs;
3526    DenseSet<const SCEV *> VisitedRegs;
3527    DenseSet<size_t> VisitedLSRUse;
3528  
3529    for (const IVStrideUse &U : IU) {
3530      Instruction *UserInst = U.getUser();
3531      // Skip IV users that are part of profitable IV Chains.
3532      User::op_iterator UseI =
3533          find(UserInst->operands(), U.getOperandValToReplace());
3534      assert(UseI != UserInst->op_end() && "cannot find IV operand");
3535      if (IVIncSet.count(UseI)) {
3536        LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3537        continue;
3538      }
3539  
3540      LSRUse::KindType Kind = LSRUse::Basic;
3541      MemAccessTy AccessTy;
3542      if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3543        Kind = LSRUse::Address;
3544        AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3545      }
3546  
3547      const SCEV *S = IU.getExpr(U);
3548      if (!S)
3549        continue;
3550      PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3551  
3552      // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3553      // (N - i == 0), and this allows (N - i) to be the expression that we work
3554      // with rather than just N or i, so we can consider the register
3555      // requirements for both N and i at the same time. Limiting this code to
3556      // equality icmps is not a problem because all interesting loops use
3557      // equality icmps, thanks to IndVarSimplify.
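    // For instance (an informal sketch): for "i != %n" with i == {0,+,1}<%L>,
    // the use is recorded as an ICmpZero use of (%n - i), i.e. {%n,+,-1}<%L>,
    // letting LSR weigh the register requirements of %n and i together.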
3558      if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3559        // If CI can be saved in some target, like replaced inside hardware loop
3560        // in PowerPC, no need to generate initial formulae for it.
3561        if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3562          continue;
3563        if (CI->isEquality()) {
3564          // Swap the operands if needed to put the OperandValToReplace on the
3565          // left, for consistency.
3566          Value *NV = CI->getOperand(1);
3567          if (NV == U.getOperandValToReplace()) {
3568            CI->setOperand(1, CI->getOperand(0));
3569            CI->setOperand(0, NV);
3570            NV = CI->getOperand(1);
3571            Changed = true;
3572          }
3573  
3574          // x == y  -->  x - y == 0
3575          const SCEV *N = SE.getSCEV(NV);
3576          if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3577              (!NV->getType()->isPointerTy() ||
3578               SE.getPointerBase(N) == SE.getPointerBase(S))) {
3579            // S is normalized, so normalize N before folding it into S
3580            // to keep the result normalized.
3581            N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3582            if (!N)
3583              continue;
3584            Kind = LSRUse::ICmpZero;
3585            S = SE.getMinusSCEV(N, S);
3586          } else if (L->isLoopInvariant(NV) &&
3587                     (!isa<Instruction>(NV) ||
3588                      DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3589                     !NV->getType()->isPointerTy()) {
3590            // If we can't generally expand the expression (e.g. it contains
3591            // a divide), but it is already at a loop invariant point before the
3592            // loop, wrap it in an unknown (to prevent the expander from trying
3593            // to re-expand in a potentially unsafe way.)  The restriction to
3594            // integer types is required because the unknown hides the base, and
3595            // SCEV can't compute the difference of two unknown pointers.
3596            N = SE.getUnknown(NV);
3597            N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3598            if (!N)
3599              continue;
3600            Kind = LSRUse::ICmpZero;
3601            S = SE.getMinusSCEV(N, S);
3602            assert(!isa<SCEVCouldNotCompute>(S));
3603          }
3604  
3605          // -1 and the negations of all interesting strides (except the negation
3606          // of -1) are now also interesting.
3607          for (size_t i = 0, e = Factors.size(); i != e; ++i)
3608            if (Factors[i] != -1)
3609              Factors.insert(-(uint64_t)Factors[i]);
3610          Factors.insert(-1);
3611        }
3612      }
3613  
3614      // Get or create an LSRUse.
3615      std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3616      size_t LUIdx = P.first;
3617      Immediate Offset = P.second;
3618      LSRUse &LU = Uses[LUIdx];
3619  
3620      // Record the fixup.
3621      LSRFixup &LF = LU.getNewFixup();
3622      LF.UserInst = UserInst;
3623      LF.OperandValToReplace = U.getOperandValToReplace();
3624      LF.PostIncLoops = TmpPostIncLoops;
3625      LF.Offset = Offset;
3626      LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3627  
3628      // Create SCEV as Formula for calculating baseline cost
3629      if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3630        Formula F;
3631        F.initialMatch(S, L, SE);
3632        BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3633        VisitedLSRUse.insert(LUIdx);
3634      }
3635  
3636      if (!LU.WidestFixupType ||
3637          SE.getTypeSizeInBits(LU.WidestFixupType) <
3638          SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3639        LU.WidestFixupType = LF.OperandValToReplace->getType();
3640  
3641      // If this is the first use of this LSRUse, give it a formula.
3642      if (LU.Formulae.empty()) {
3643        InsertInitialFormula(S, LU, LUIdx);
3644        CountRegisters(LU.Formulae.back(), LUIdx);
3645      }
3646    }
3647  
3648    LLVM_DEBUG(print_fixups(dbgs()));
3649  }
3650  
3651  /// Insert a formula for the given expression into the given use, separating out
3652  /// loop-variant portions from loop-invariant and loop-computable portions.
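///
/// A hedged example: an expression such as {(%a + %b),+,4}<%L> might be split
/// by Formula::initialMatch into the base registers %a, %b and {0,+,4}<%L>, so
/// the invariant and recurrent parts can be costed and rewritten separately.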
3653  void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3654                                         size_t LUIdx) {
3655    // Mark uses whose expressions cannot be expanded.
3656    if (!Rewriter.isSafeToExpand(S))
3657      LU.RigidFormula = true;
3658  
3659    Formula F;
3660    F.initialMatch(S, L, SE);
3661    bool Inserted = InsertFormula(LU, LUIdx, F);
3662    assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3663  }
3664  
3665  /// Insert a simple single-register formula for the given expression into the
3666  /// given use.
3667  void
3668  LSRInstance::InsertSupplementalFormula(const SCEV *S,
3669                                         LSRUse &LU, size_t LUIdx) {
3670    Formula F;
3671    F.BaseRegs.push_back(S);
3672    F.HasBaseReg = true;
3673    bool Inserted = InsertFormula(LU, LUIdx, F);
3674    assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3675  }
3676  
3677  /// Note which registers are used by the given formula, updating RegUses.
3678  void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3679    if (F.ScaledReg)
3680      RegUses.countRegister(F.ScaledReg, LUIdx);
3681    for (const SCEV *BaseReg : F.BaseRegs)
3682      RegUses.countRegister(BaseReg, LUIdx);
3683  }
3684  
3685  /// If the given formula has not yet been inserted, add it to the list, and
3686  /// return true. Return false otherwise.
3687  bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3688    // Do not insert formula that we will not be able to expand.
3689    assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3690           "Formula is illegal");
3691  
3692    if (!LU.InsertFormula(F, *L))
3693      return false;
3694  
3695    CountRegisters(F, LUIdx);
3696    return true;
3697  }
3698  
3699  /// Check for other uses of loop-invariant values which we're tracking. These
3700  /// other uses will pin these values in registers, making them less profitable
3701  /// for elimination.
3702  /// TODO: This currently misses non-constant addrec step registers.
3703  /// TODO: Should this give more weight to users inside the loop?
3704  void
3705  LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3706    SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3707    SmallPtrSet<const SCEV *, 32> Visited;
3708  
3709    // Don't collect outside uses if we are favoring postinc - the instructions in
3710    // the loop are more important than the ones outside of it.
3711    if (AMK == TTI::AMK_PostIndexed)
3712      return;
3713  
3714    while (!Worklist.empty()) {
3715      const SCEV *S = Worklist.pop_back_val();
3716  
3717      // Don't process the same SCEV twice
3718      if (!Visited.insert(S).second)
3719        continue;
3720  
3721      if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3722        append_range(Worklist, N->operands());
3723      else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3724        Worklist.push_back(C->getOperand());
3725      else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3726        Worklist.push_back(D->getLHS());
3727        Worklist.push_back(D->getRHS());
3728      } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3729        const Value *V = US->getValue();
3730        if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3731          // Look for instructions defined outside the loop.
3732          if (L->contains(Inst)) continue;
3733        } else if (isa<Constant>(V))
3734          // Constants can be re-materialized.
3735          continue;
3736        for (const Use &U : V->uses()) {
3737          const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3738          // Ignore non-instructions.
3739          if (!UserInst)
3740            continue;
3741          // Don't bother if the instruction is an EHPad.
3742          if (UserInst->isEHPad())
3743            continue;
3744          // Ignore instructions in other functions (as can happen with
3745          // Constants).
3746          if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3747            continue;
3748          // Ignore instructions not dominated by the loop.
3749          const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3750            UserInst->getParent() :
3751            cast<PHINode>(UserInst)->getIncomingBlock(
3752              PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3753          if (!DT.dominates(L->getHeader(), UseBB))
3754            continue;
3755          // Don't bother if the instruction is in a BB which ends in an EHPad.
3756          if (UseBB->getTerminator()->isEHPad())
3757            continue;
3758  
3759          // Ignore cases in which the currently-examined value could come from
3760          // a basic block terminated with an EHPad. This checks all incoming
3761          // blocks of the phi node since it is possible that the same incoming
3762          // value comes from multiple basic blocks, only some of which may end
3763          // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3764          // pass would try to insert instructions into an EHPad, hitting an
3765          // assertion.
3766          if (isa<PHINode>(UserInst)) {
3767            const auto *PhiNode = cast<PHINode>(UserInst);
3768            bool HasIncompatibleEHPTerminatedBlock = false;
3769            llvm::Value *ExpectedValue = U;
3770            for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3771              if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3772                if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3773                  HasIncompatibleEHPTerminatedBlock = true;
3774                  break;
3775                }
3776              }
3777            }
3778            if (HasIncompatibleEHPTerminatedBlock) {
3779              continue;
3780            }
3781          }
3782  
3783          // Don't bother rewriting PHIs in catchswitch blocks.
3784          if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3785            continue;
3786          // Ignore uses which are part of other SCEV expressions, to avoid
3787          // analyzing them multiple times.
3788          if (SE.isSCEVable(UserInst->getType())) {
3789            const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3790            // If the user is a no-op, look through to its uses.
3791            if (!isa<SCEVUnknown>(UserS))
3792              continue;
3793            if (UserS == US) {
3794              Worklist.push_back(
3795                SE.getUnknown(const_cast<Instruction *>(UserInst)));
3796              continue;
3797            }
3798          }
3799          // Ignore icmp instructions which are already being analyzed.
3800          if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3801            unsigned OtherIdx = !U.getOperandNo();
3802            Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3803            if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3804              continue;
3805          }
3806  
3807          std::pair<size_t, Immediate> P =
3808              getUse(S, LSRUse::Basic, MemAccessTy());
3809          size_t LUIdx = P.first;
3810          Immediate Offset = P.second;
3811          LSRUse &LU = Uses[LUIdx];
3812          LSRFixup &LF = LU.getNewFixup();
3813          LF.UserInst = const_cast<Instruction *>(UserInst);
3814          LF.OperandValToReplace = U;
3815          LF.Offset = Offset;
3816          LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3817          if (!LU.WidestFixupType ||
3818              SE.getTypeSizeInBits(LU.WidestFixupType) <
3819              SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3820            LU.WidestFixupType = LF.OperandValToReplace->getType();
3821          InsertSupplementalFormula(US, LU, LUIdx);
3822          CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3823          break;
3824        }
3825      }
3826    }
3827  }
3828  
3829  /// Split S into subexpressions which can be pulled out into separate
3830  /// registers. If C is non-null, multiply each subexpression by C.
3831  ///
3832  /// Return remainder expression after factoring the subexpressions captured by
3833  /// Ops. If Ops is complete, return NULL.
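///
/// For example (an illustrative SCEV with invented names), given
/// {(4 + %a),+,8}<%L> this would push 4 and %a onto Ops and return the
/// stripped recurrence {0,+,8}<%L> as the remainder.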
3834  static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3835                                     SmallVectorImpl<const SCEV *> &Ops,
3836                                     const Loop *L,
3837                                     ScalarEvolution &SE,
3838                                     unsigned Depth = 0) {
3839    // Arbitrarily cap recursion to protect compile time.
3840    if (Depth >= 3)
3841      return S;
3842  
3843    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3844      // Break out add operands.
3845      for (const SCEV *S : Add->operands()) {
3846        const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3847        if (Remainder)
3848          Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3849      }
3850      return nullptr;
3851    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3852      // Split a non-zero base out of an addrec.
3853      if (AR->getStart()->isZero() || !AR->isAffine())
3854        return S;
3855  
3856      const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3857                                              C, Ops, L, SE, Depth+1);
3858      // Split the non-zero AddRec unless it is part of a nested recurrence that
3859      // does not pertain to this loop.
3860      if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3861        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3862        Remainder = nullptr;
3863      }
3864      if (Remainder != AR->getStart()) {
3865        if (!Remainder)
3866          Remainder = SE.getConstant(AR->getType(), 0);
3867        return SE.getAddRecExpr(Remainder,
3868                                AR->getStepRecurrence(SE),
3869                                AR->getLoop(),
3870                                //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3871                                SCEV::FlagAnyWrap);
3872      }
3873    } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3874      // Break (C * (a + b + c)) into C*a + C*b + C*c.
3875      if (Mul->getNumOperands() != 2)
3876        return S;
3877      if (const SCEVConstant *Op0 =
3878          dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3879        C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3880        const SCEV *Remainder =
3881          CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3882        if (Remainder)
3883          Ops.push_back(SE.getMulExpr(C, Remainder));
3884        return nullptr;
3885      }
3886    }
3887    return S;
3888  }
3889  
3890  /// Return true if the SCEV represents a value that may end up as a
3891  /// post-increment operation.
3892  static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3893                                LSRUse &LU, const SCEV *S, const Loop *L,
3894                                ScalarEvolution &SE) {
3895    if (LU.Kind != LSRUse::Address ||
3896        !LU.AccessTy.getType()->isIntOrIntVectorTy())
3897      return false;
3898    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3899    if (!AR)
3900      return false;
3901    const SCEV *LoopStep = AR->getStepRecurrence(SE);
3902    if (!isa<SCEVConstant>(LoopStep))
3903      return false;
3904    // Check if a post-indexed load/store can be used.
3905    if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3906        TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3907      const SCEV *LoopStart = AR->getStart();
3908      if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3909        return true;
3910    }
3911    return false;
3912  }
3913  
3914  /// Helper function for LSRInstance::GenerateReassociations.
3915  void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3916                                               const Formula &Base,
3917                                               unsigned Depth, size_t Idx,
3918                                               bool IsScaledReg) {
3919    const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3920    // Don't generate reassociations for the base register of a value that
3921    // may generate a post-increment operator. The reason is that the
3922    // reassociations cause extra base+register formulae to be created,
3923    // and possibly chosen, but the post-increment is more efficient.
3924    if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3925      return;
3926    SmallVector<const SCEV *, 8> AddOps;
3927    const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3928    if (Remainder)
3929      AddOps.push_back(Remainder);
3930  
3931    if (AddOps.size() == 1)
3932      return;
3933  
3934    for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3935                                                       JE = AddOps.end();
3936         J != JE; ++J) {
3937      // Loop-variant "unknown" values are uninteresting; we won't be able to
3938      // do anything meaningful with them.
3939      if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3940        continue;
3941  
3942      // Don't pull a constant into a register if the constant could be folded
3943      // into an immediate field.
3944      if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3945                           LU.AccessTy, *J, Base.getNumRegs() > 1))
3946        continue;
3947  
3948      // Collect all operands except *J.
3949      SmallVector<const SCEV *, 8> InnerAddOps(
3950          ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3951      InnerAddOps.append(std::next(J),
3952                         ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3953  
3954      // Don't leave just a constant behind in a register if the constant could
3955      // be folded into an immediate field.
3956      if (InnerAddOps.size() == 1 &&
3957          isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3958                           LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3959        continue;
3960  
3961      const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3962      if (InnerSum->isZero())
3963        continue;
3964      Formula F = Base;
3965  
3966      if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3967        continue;
3968  
3969      // Add the remaining pieces of the add back into the new formula.
3970      const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3971      if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3972          TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3973                                  InnerSumSC->getValue()->getZExtValue())) {
3974        F.UnfoldedOffset =
3975            Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3976                                InnerSumSC->getValue()->getZExtValue());
3977        if (IsScaledReg)
3978          F.ScaledReg = nullptr;
3979        else
3980          F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3981      } else if (IsScaledReg)
3982        F.ScaledReg = InnerSum;
3983      else
3984        F.BaseRegs[Idx] = InnerSum;
3985  
3986      // Add J as its own register, or an unfolded immediate.
3987      const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3988      if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3989          TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3990                                  SC->getValue()->getZExtValue()))
3991        F.UnfoldedOffset =
3992            Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3993                                SC->getValue()->getZExtValue());
3994      else
3995        F.BaseRegs.push_back(*J);
3996      // We may have changed the number of registers in the base regs; adjust the
3997      // formula accordingly.
3998      F.canonicalize(*L);
3999  
4000      if (InsertFormula(LU, LUIdx, F))
4001        // If that formula hadn't been seen before, recurse to find more like
4002        // it.
4003      // Add Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2) to the depth,
4004      // because Depth alone is not enough to bound compile time. This means that
4005      // every time AddOps.size() exceeds 16^x, x is added to Depth; e.g. 20
4006      // operands (Log2_32(20) == 4) add one extra level of depth.
4007        GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4008                               Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4009    }
4010  }
4011  
4012  /// Split out subexpressions from adds and the bases of addrecs.
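///
/// For example (illustrative, invented names), a base register
/// {(4 + %a),+,1}<%L> may be split so that %a becomes its own base register
/// (or 4 becomes an unfolded immediate when the target can fold it) while the
/// remaining {4,+,1}<%L> stays in the original position.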
4013  void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4014                                           Formula Base, unsigned Depth) {
4015    assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4016    // Arbitrarily cap recursion to protect compile time.
4017    if (Depth >= 3)
4018      return;
4019  
4020    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4021      GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4022  
4023    if (Base.Scale == 1)
4024      GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4025                                 /* Idx */ -1, /* IsScaledReg */ true);
4026  }
4027  
4028  /// Generate a formula consisting of all of the loop-dominating registers added
4029  /// into a single register.
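///
/// For example (illustrative, invented names), a formula
/// reg(%a) + reg(%b) + reg({0,+,4}<%L>) with loop-invariant %a and %b may be
/// rewritten as reg(%a + %b) + reg({0,+,4}<%L>), folding the two invariant
/// registers into one.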
4030  void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4031                                         Formula Base) {
4032    // This method is only interesting on a plurality of registers.
4033    if (Base.BaseRegs.size() + (Base.Scale == 1) +
4034            (Base.UnfoldedOffset.isNonZero()) <=
4035        1)
4036      return;
4037  
4038    // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4039    // processing the formula.
4040    Base.unscale();
4041    SmallVector<const SCEV *, 4> Ops;
4042    Formula NewBase = Base;
4043    NewBase.BaseRegs.clear();
4044    Type *CombinedIntegerType = nullptr;
4045    for (const SCEV *BaseReg : Base.BaseRegs) {
4046      if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4047          !SE.hasComputableLoopEvolution(BaseReg, L)) {
4048        if (!CombinedIntegerType)
4049          CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4050        Ops.push_back(BaseReg);
4051      }
4052      else
4053        NewBase.BaseRegs.push_back(BaseReg);
4054    }
4055  
4056    // If no register is relevant, we're done.
4057    if (Ops.size() == 0)
4058      return;
4059  
4060    // Utility function for generating the required variants of the combined
4061    // registers.
4062    auto GenerateFormula = [&](const SCEV *Sum) {
4063      Formula F = NewBase;
4064  
4065      // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4066      // opportunity to fold something. For now, just ignore such cases
4067      // rather than proceed with zero in a register.
4068      if (Sum->isZero())
4069        return;
4070  
4071      F.BaseRegs.push_back(Sum);
4072      F.canonicalize(*L);
4073      (void)InsertFormula(LU, LUIdx, F);
4074    };
4075  
4076    // If we collected at least two registers, generate a formula combining them.
4077    if (Ops.size() > 1) {
4078      SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4079      GenerateFormula(SE.getAddExpr(OpsCopy));
4080    }
4081  
4082    // If we have an unfolded offset, generate a formula combining it with the
4083    // registers collected.
4084    if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4085      assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4086      Ops.push_back(SE.getConstant(CombinedIntegerType,
4087                                   NewBase.UnfoldedOffset.getFixedValue(), true));
4088      NewBase.UnfoldedOffset = Immediate::getFixed(0);
4089      GenerateFormula(SE.getAddExpr(Ops));
4090    }
4091  }
4092  
4093  /// Helper function for LSRInstance::GenerateSymbolicOffsets.
4094  void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4095                                                const Formula &Base, size_t Idx,
4096                                                bool IsScaledReg) {
4097    const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4098    GlobalValue *GV = ExtractSymbol(G, SE);
4099    if (G->isZero() || !GV)
4100      return;
4101    Formula F = Base;
4102    F.BaseGV = GV;
4103    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4104      return;
4105    if (IsScaledReg)
4106      F.ScaledReg = G;
4107    else
4108      F.BaseRegs[Idx] = G;
4109    (void)InsertFormula(LU, LUIdx, F);
4110  }
4111  
4112  /// Generate reuse formulae using symbolic offsets.
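///
/// For example (illustrative, invented names), a base register (@g + %x) may
/// be rewritten as the register %x with BaseGV set to @g, provided the target
/// accepts a global symbol in the addressing mode.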
4113  void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4114                                            Formula Base) {
4115    // We can't add a symbolic offset if the address already contains one.
4116    if (Base.BaseGV) return;
4117  
4118    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4119      GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4120    if (Base.Scale == 1)
4121      GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4122                                  /* IsScaledReg */ true);
4123  }
4124  
4125  /// Helper function for LSRInstance::GenerateConstantOffsets.
4126  void LSRInstance::GenerateConstantOffsetsImpl(
4127      LSRUse &LU, unsigned LUIdx, const Formula &Base,
4128      const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4129  
4130    auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4131      Formula F = Base;
4132      if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4133        return;
4134      F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4135  
4136      if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4137        // Add the offset to the base register.
4138        const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4139        const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4140        // If it cancelled out, drop the base register, otherwise update it.
4141        if (NewG->isZero()) {
4142          if (IsScaledReg) {
4143            F.Scale = 0;
4144            F.ScaledReg = nullptr;
4145          } else
4146            F.deleteBaseReg(F.BaseRegs[Idx]);
4147          F.canonicalize(*L);
4148        } else if (IsScaledReg)
4149          F.ScaledReg = NewG;
4150        else
4151          F.BaseRegs[Idx] = NewG;
4152  
4153        (void)InsertFormula(LU, LUIdx, F);
4154      }
4155    };
4156  
4157    const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4158  
4159    // With constant offsets and constant steps, we can generate pre-inc
4160    // accesses by having the offset equal the step. So, for access #0 with a
4161    // step of 8, we generate a G - 8 base which would require the first access
4162    // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4163    // for itself and hopefully becomes the base for other accesses. This means
4164    // that a single pre-indexed access can be generated to become the new
4165    // base pointer for each iteration of the loop, resulting in no extra add/sub
4166    // instructions for pointer updating.
4167    if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4168      if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4169        if (auto *StepRec =
4170            dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4171          const APInt &StepInt = StepRec->getAPInt();
4172          int64_t Step = StepInt.isNegative() ?
4173            StepInt.getSExtValue() : StepInt.getZExtValue();
4174  
4175          for (Immediate Offset : Worklist) {
4176            if (Offset.isFixed()) {
4177              Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4178              GenerateOffset(G, Offset);
4179            }
4180          }
4181        }
4182      }
4183    }
4184    for (Immediate Offset : Worklist)
4185      GenerateOffset(G, Offset);
4186  
4187    Immediate Imm = ExtractImmediate(G, SE);
4188    if (G->isZero() || Imm.isZero() ||
4189        !Base.BaseOffset.isCompatibleImmediate(Imm))
4190      return;
4191    Formula F = Base;
4192    F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4193    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4194      return;
4195    if (IsScaledReg) {
4196      F.ScaledReg = G;
4197    } else {
4198      F.BaseRegs[Idx] = G;
4199      // We may generate a non-canonical Formula if G is a recurrent expression
4200      // register related to the current loop while F.ScaledReg is not.
4201      F.canonicalize(*L);
4202    }
4203    (void)InsertFormula(LU, LUIdx, F);
4204  }
4205  
4206  /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
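///
/// For example (illustrative), with a base register {0,+,4}<%L> and a use
/// offset of 4, a new formula may be produced whose register is {4,+,4}<%L>
/// and whose BaseOffset is reduced by 4, leaving the formula's value unchanged.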
4207  void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4208                                            Formula Base) {
4209    // TODO: For now, just add the min and max offset, because it usually isn't
4210    // worthwhile looking at everything in between.
4211    SmallVector<Immediate, 2> Worklist;
4212    Worklist.push_back(LU.MinOffset);
4213    if (LU.MaxOffset != LU.MinOffset)
4214      Worklist.push_back(LU.MaxOffset);
4215  
4216    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4217      GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4218    if (Base.Scale == 1)
4219      GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4220                                  /* IsScaledReg */ true);
4221  }
4222  
4223  /// For ICmpZero, check to see if we can scale up the comparison. For example, x
4224  /// == y -> x*c == y*c.
4225  void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4226                                           Formula Base) {
4227    if (LU.Kind != LSRUse::ICmpZero) return;
4228  
4229    // Determine the integer type for the base formula.
4230    Type *IntTy = Base.getType();
4231    if (!IntTy) return;
4232    if (SE.getTypeSizeInBits(IntTy) > 64) return;
4233  
4234    // Don't do this if there is more than one offset.
4235    if (LU.MinOffset != LU.MaxOffset) return;
4236  
4237    // Check if the transformation is valid. It is illegal to multiply a pointer.
4238    if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4239      return;
4240    for (const SCEV *BaseReg : Base.BaseRegs)
4241      if (BaseReg->getType()->isPointerTy())
4242        return;
4243    assert(!Base.BaseGV && "ICmpZero use is not legal!");
4244  
4245    // Check each interesting stride.
4246    for (int64_t Factor : Factors) {
4247      // Check that Factor can be represented by IntTy
4248      if (!ConstantInt::isValueValidForType(IntTy, Factor))
4249        continue;
4250      // Check that the multiplication doesn't overflow.
4251      if (Base.BaseOffset.isMin() && Factor == -1)
4252        continue;
4253      // Not supporting scalable immediates.
4254      if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4255        continue;
4256      Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4257      assert(Factor != 0 && "Zero factor not expected!");
4258      if (NewBaseOffset.getFixedValue() / Factor !=
4259          Base.BaseOffset.getFixedValue())
4260        continue;
4261      // If the offset will be truncated at this use, check that it is in bounds.
4262      if (!IntTy->isPointerTy() &&
4263          !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4264        continue;
4265  
4266      // Check that multiplying with the use offset doesn't overflow.
4267      Immediate Offset = LU.MinOffset;
4268      if (Offset.isMin() && Factor == -1)
4269        continue;
4270      Offset = Offset.mulUnsigned(Factor);
4271      if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4272        continue;
4273      // If the offset will be truncated at this use, check that it is in bounds.
4274      if (!IntTy->isPointerTy() &&
4275          !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4276        continue;
4277  
4278      Formula F = Base;
4279      F.BaseOffset = NewBaseOffset;
4280  
4281      // Check that this scale is legal.
4282      if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4283        continue;
4284  
4285      // Compensate for the use having MinOffset built into it.
4286      F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4287  
4288      const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4289  
4290      // Check that multiplying with each base register doesn't overflow.
4291      for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4292        F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4293        if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4294          goto next;
4295      }
4296  
4297      // Check that multiplying with the scaled register doesn't overflow.
4298      if (F.ScaledReg) {
4299        F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4300        if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4301          continue;
4302      }
4303  
4304      // Check that multiplying with the unfolded offset doesn't overflow.
4305      if (F.UnfoldedOffset.isNonZero()) {
4306        if (F.UnfoldedOffset.isMin() && Factor == -1)
4307          continue;
4308        F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4309        if (F.UnfoldedOffset.getFixedValue() / Factor !=
4310            Base.UnfoldedOffset.getFixedValue())
4311          continue;
4312        // If the offset will be truncated, check that it is in bounds.
4313        if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4314                                         IntTy, F.UnfoldedOffset.getFixedValue()))
4315          continue;
4316      }
4317  
4318      // If we make it here and it's legal, add it.
4319      (void)InsertFormula(LU, LUIdx, F);
4320    next:;
4321    }
4322  }
4323  
4324  /// Generate stride factor reuse formulae by making use of scaled-offset address
4325  /// modes, for example.
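///
/// For example (illustrative), with Factor 4 a base register {0,+,4}<%L> may
/// be divided through to become the scaled register 4 * {0,+,1}<%L>, matching
/// a scaled-index addressing mode when the target provides one.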
4326  void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4327    // Determine the integer type for the base formula.
4328    Type *IntTy = Base.getType();
4329    if (!IntTy) return;
4330  
4331    // If this Formula already has a scaled register, we can't add another one.
4332    // Try to unscale the formula to generate a better scale.
4333    if (Base.Scale != 0 && !Base.unscale())
4334      return;
4335  
4336    assert(Base.Scale == 0 && "unscale did not do its job!");
4337  
4338    // Check each interesting stride.
4339    for (int64_t Factor : Factors) {
4340      Base.Scale = Factor;
4341      Base.HasBaseReg = Base.BaseRegs.size() > 1;
4342      // Check whether this scale is going to be legal.
4343      if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4344                      Base)) {
4345      // As a special case, handle out-of-loop Basic users specially.
4346        // TODO: Reconsider this special case.
4347        if (LU.Kind == LSRUse::Basic &&
4348            isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4349                       LU.AccessTy, Base) &&
4350            LU.AllFixupsOutsideLoop)
4351          LU.Kind = LSRUse::Special;
4352        else
4353          continue;
4354      }
4355      // For an ICmpZero, negating a solitary base register won't lead to
4356      // new solutions.
4357      if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4358          Base.BaseOffset.isZero() && !Base.BaseGV)
4359        continue;
4360      // For each addrec base reg, if its loop is the current loop, apply the scale.
4361      for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4362        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4363        if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4364          const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4365          if (FactorS->isZero())
4366            continue;
4367          // Divide out the factor, ignoring high bits, since we'll be
4368          // scaling the value back up in the end.
4369          if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4370            if (!Quotient->isZero()) {
4371              // TODO: This could be optimized to avoid all the copying.
4372              Formula F = Base;
4373              F.ScaledReg = Quotient;
4374              F.deleteBaseReg(F.BaseRegs[i]);
4375              // The canonical representation of 1*reg is reg, which is already in
4376              // Base. In that case, do not try to insert the formula, it will be
4377              // rejected anyway.
4378              if (F.Scale == 1 && (F.BaseRegs.empty() ||
4379                                   (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4380                continue;
4381              // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4382              // a non-canonical Formula whose ScaledReg's loop is not L.
4383              if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4384                F.canonicalize(*L);
4385              (void)InsertFormula(LU, LUIdx, F);
4386            }
4387        }
4388      }
4389    }
4390  }
4391  
4392  /// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4393  /// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4394  /// perform the extension/truncate and normalize again, as the normalized form
4395  /// can result in folds that are not valid in the post-inc use contexts. The
4396  /// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4397  static const SCEV *
4398  getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4399                                     const SCEV *Expr, Type *ToTy,
4400                                     ScalarEvolution &SE) {
4401    const SCEV *Result = nullptr;
4402    for (auto &L : Loops) {
4403      auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4404      const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4405      const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4406      if (!New || (Result && New != Result))
4407        return nullptr;
4408      Result = New;
4409    }
4410  
4411    assert(Result && "failed to create expression");
4412    return Result;
4413  }
4414  
4415  /// Generate reuse formulae from different IV types.
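///
/// For example (a rough sketch), when the use type is i32 and i64 is among the
/// interesting types, the i32 registers may be any-extended to i64 so the use
/// can share an existing i64 induction variable, relying on the target's free
/// i64-to-i32 truncate.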
4416  void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4417    // Don't bother truncating symbolic values.
4418    if (Base.BaseGV) return;
4419  
4420    // Determine the integer type for the base formula.
4421    Type *DstTy = Base.getType();
4422    if (!DstTy) return;
4423    if (DstTy->isPointerTy())
4424      return;
4425  
4426    // It is invalid to extend a pointer type so exit early if ScaledReg or
4427    // any of the BaseRegs are pointers.
4428    if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4429      return;
4430    if (any_of(Base.BaseRegs,
4431               [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4432      return;
4433  
4434    SmallVector<PostIncLoopSet> Loops;
4435    for (auto &LF : LU.Fixups)
4436      Loops.push_back(LF.PostIncLoops);
4437  
4438    for (Type *SrcTy : Types) {
4439      if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4440        Formula F = Base;
4441  
4442        // Sometimes SCEV is able to prove zero during ext transform. It may
4443        // happen if SCEV did not do all possible transforms while creating the
4444        // initial node (maybe due to depth limitations), but it can do them while
4445        // taking the extension.
4446        if (F.ScaledReg) {
4447          const SCEV *NewScaledReg =
4448              getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4449          if (!NewScaledReg || NewScaledReg->isZero())
4450            continue;
4451          F.ScaledReg = NewScaledReg;
4452        }
4453        bool HasZeroBaseReg = false;
4454        for (const SCEV *&BaseReg : F.BaseRegs) {
4455          const SCEV *NewBaseReg =
4456              getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4457          if (!NewBaseReg || NewBaseReg->isZero()) {
4458            HasZeroBaseReg = true;
4459            break;
4460          }
4461          BaseReg = NewBaseReg;
4462        }
4463        if (HasZeroBaseReg)
4464          continue;
4465  
4466        // TODO: This assumes we've done basic processing on all uses and
4467        // have an idea what the register usage is.
4468        if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4469          continue;
4470  
4471        F.canonicalize(*L);
4472        (void)InsertFormula(LU, LUIdx, F);
4473      }
4474    }
4475  }
4476  
4477  namespace {
4478  
4479  /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4480  /// modifications so that the search phase doesn't have to worry about the data
4481  /// structures moving underneath it.
4482  struct WorkItem {
4483    size_t LUIdx;
4484    Immediate Imm;
4485    const SCEV *OrigReg;
4486  
4487    WorkItem(size_t LI, Immediate I, const SCEV *R)
4488        : LUIdx(LI), Imm(I), OrigReg(R) {}
4489  
4490    void print(raw_ostream &OS) const;
4491    void dump() const;
4492  };
4493  
4494  } // end anonymous namespace
4495  
4496  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4497  void WorkItem::print(raw_ostream &OS) const {
4498    OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4499       << " , add offset " << Imm;
4500  }
4501  
4502  LLVM_DUMP_METHOD void WorkItem::dump() const {
4503    print(errs()); errs() << '\n';
4504  }
4505  #endif
4506  
4507  /// Look for registers which are a constant distance apart and try to form reuse
4508  /// opportunities between them.
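///
/// For example (illustrative), if one use references {0,+,4}<%L> and another
/// references {8,+,4}<%L>, the second may instead be expressed as the first
/// register plus an immediate offset of 8, letting both uses share one
/// register.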
4509  void LSRInstance::GenerateCrossUseConstantOffsets() {
4510    // Group the registers by their value without any added constant offset.
4511    using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4512  
4513    DenseMap<const SCEV *, ImmMapTy> Map;
4514    DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4515    SmallVector<const SCEV *, 8> Sequence;
4516    for (const SCEV *Use : RegUses) {
4517      const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4518      Immediate Imm = ExtractImmediate(Reg, SE);
4519      auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4520      if (Pair.second)
4521        Sequence.push_back(Reg);
4522      Pair.first->second.insert(std::make_pair(Imm, Use));
4523      UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4524    }
4525  
4526    // Now examine each set of registers with the same base value. Build up
4527    // a list of work to do and do the work in a separate step so that we're
4528    // not adding formulae and register counts while we're searching.
4529    SmallVector<WorkItem, 32> WorkItems;
4530    SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4531        UniqueItems;
4532    for (const SCEV *Reg : Sequence) {
4533      const ImmMapTy &Imms = Map.find(Reg)->second;
4534  
4535      // It's not worthwhile looking for reuse if there's only one offset.
4536      if (Imms.size() == 1)
4537        continue;
4538  
4539      LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4540                 for (const auto &Entry
4541                      : Imms) dbgs()
4542                 << ' ' << Entry.first;
4543                 dbgs() << '\n');
4544  
4545      // Examine each offset.
4546      for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4547           J != JE; ++J) {
4548        const SCEV *OrigReg = J->second;
4549  
4550        Immediate JImm = J->first;
4551        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4552  
4553        if (!isa<SCEVConstant>(OrigReg) &&
4554            UsedByIndicesMap[Reg].count() == 1) {
4555          LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4556                            << '\n');
4557          continue;
4558        }
4559  
4560        // Conservatively examine offsets between this orig reg and a few selected
4561        // other orig regs.
4562        Immediate First = Imms.begin()->first;
4563        Immediate Last = std::prev(Imms.end())->first;
4564        if (!First.isCompatibleImmediate(Last)) {
4565          LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4566                            << "\n");
4567          continue;
4568        }
4569        // Only scalable if both terms are scalable, or if one is scalable and
4570        // the other is 0.
4571        bool Scalable = First.isScalable() || Last.isScalable();
4572        int64_t FI = First.getKnownMinValue();
4573        int64_t LI = Last.getKnownMinValue();
4574        // Compute (First + Last) / 2 without overflow using the fact that
4575        // First + Last = 2 * (First & Last) + (First ^ Last).
4576        int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4577        // If the result is negative and FI is odd and LI even (or vice versa),
4578        // we rounded towards -inf. Add 1 in that case, to round towards 0.
4579        Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
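        // For example, FI = 7 and LI = 10 give (FI & LI) = 2 and
        // (FI ^ LI) = 13, so Avg = 2 + 6 = 8 = (7 + 10) / 2; with FI = -3 and
        // LI = -4 the first step yields -4 and the correction above adds 1,
        // producing -3, i.e. rounding toward zero.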
4580        ImmMapTy::const_iterator OtherImms[] = {
4581            Imms.begin(), std::prev(Imms.end()),
4582            Imms.lower_bound(Immediate::get(Avg, Scalable))};
4583        for (const auto &M : OtherImms) {
4584          if (M == J || M == JE) continue;
4585          if (!JImm.isCompatibleImmediate(M->first))
4586            continue;
4587  
4588          // Compute the difference between the two.
4589          Immediate Imm = JImm.subUnsigned(M->first);
4590          for (unsigned LUIdx : UsedByIndices.set_bits())
4591            // Make a memo of this use, offset, and register tuple.
4592            if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4593              WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4594        }
4595      }
4596    }
4597  
4598    Map.clear();
4599    Sequence.clear();
4600    UsedByIndicesMap.clear();
4601    UniqueItems.clear();
4602  
4603    // Now iterate through the worklist and add new formulae.
4604    for (const WorkItem &WI : WorkItems) {
4605      size_t LUIdx = WI.LUIdx;
4606      LSRUse &LU = Uses[LUIdx];
4607      Immediate Imm = WI.Imm;
4608      const SCEV *OrigReg = WI.OrigReg;
4609  
4610      Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4611      const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4612      unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4613  
4614      // TODO: Use a more targeted data structure.
4615      for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4616        Formula F = LU.Formulae[L];
4617        // FIXME: The code for the scaled and unscaled registers looks
4618        // very similar but slightly different. Investigate if they
4619        // could be merged. That way, we would not have to unscale the
4620        // Formula.
4621        F.unscale();
4622        // Use the immediate in the scaled register.
4623        if (F.ScaledReg == OrigReg) {
4624          if (!F.BaseOffset.isCompatibleImmediate(Imm))
4625            continue;
4626          Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4627          // Don't create 50 + reg(-50).
4628          const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4629          if (F.referencesReg(S))
4630            continue;
4631          Formula NewF = F;
4632          NewF.BaseOffset = Offset;
4633          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4634                          NewF))
4635            continue;
4636          NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4637  
4638          // If the new scale is a constant in a register, and adding the constant
4639          // value to the immediate would produce a value closer to zero than the
4640          // immediate itself, then the formula isn't worthwhile.
4641          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4642            // FIXME: Do we need to do something for scalable immediates here?
4643            //        A scalable SCEV won't be constant, but we might still have
4644            //        something in the offset? Bail out for now to be safe.
4645            if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4646              continue;
4647            if (C->getValue()->isNegative() !=
4648                    (NewF.BaseOffset.isLessThanZero()) &&
4649                (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4650                    .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4651              continue;
4652          }
4653  
4654          // OK, looks good.
4655          NewF.canonicalize(*this->L);
4656          (void)InsertFormula(LU, LUIdx, NewF);
4657        } else {
4658          // Use the immediate in a base register.
4659          for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4660            const SCEV *BaseReg = F.BaseRegs[N];
4661            if (BaseReg != OrigReg)
4662              continue;
4663            Formula NewF = F;
4664            if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4665                !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4666                !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4667              continue;
4668            NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4669            if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4670                            LU.Kind, LU.AccessTy, NewF)) {
4671              if (AMK == TTI::AMK_PostIndexed &&
4672                  mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4673                continue;
4674              Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4675              if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4676                continue;
4677              NewF = F;
4678              NewF.UnfoldedOffset = NewUnfoldedOffset;
4679            }
4680            NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4681  
4682            // If the new formula has a constant in a register, and adding the
4683            // constant value to the immediate would produce a value closer to
4684            // zero than the immediate itself, then the formula isn't worthwhile.
4685            for (const SCEV *NewReg : NewF.BaseRegs)
4686              if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4687                if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4688                  goto skip_formula;
4689                if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4690                        .abs()
4691                        .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4692                    (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4693                            .countr_zero() >=
4694                        (unsigned)llvm::countr_zero<uint64_t>(
4695                            NewF.BaseOffset.getFixedValue()))
4696                  goto skip_formula;
4697              }
4698  
4699            // Ok, looks good.
4700            NewF.canonicalize(*this->L);
4701            (void)InsertFormula(LU, LUIdx, NewF);
4702            break;
4703          skip_formula:;
4704          }
4705        }
4706      }
4707    }
4708  }
4709  
4710  /// Generate formulae for each use.
4711  void
4712  LSRInstance::GenerateAllReuseFormulae() {
4713    // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4714    // queries are more precise.
4715    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4716      LSRUse &LU = Uses[LUIdx];
4717      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4718        GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4719      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720        GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4721    }
4722    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4723      LSRUse &LU = Uses[LUIdx];
4724      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4725        GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4726      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4727        GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4728      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729        GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4730      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731        GenerateScales(LU, LUIdx, LU.Formulae[i]);
4732    }
4733    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4734      LSRUse &LU = Uses[LUIdx];
4735      for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4736        GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4737    }
4738  
4739    GenerateCrossUseConstantOffsets();
4740  
4741    LLVM_DEBUG(dbgs() << "\n"
4742                         "After generating reuse formulae:\n";
4743               print_uses(dbgs()));
4744  }
4745  
4746  /// If there are multiple formulae with the same set of registers used
4747  /// by other uses, pick the best one and delete the others.
4748  void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4749    DenseSet<const SCEV *> VisitedRegs;
4750    SmallPtrSet<const SCEV *, 16> Regs;
4751    SmallPtrSet<const SCEV *, 16> LoserRegs;
4752  #ifndef NDEBUG
4753    bool ChangedFormulae = false;
4754  #endif
4755  
4756    // Collect the best formula for each unique set of shared registers. This
4757    // is reset for each use.
4758    using BestFormulaeTy =
4759        DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4760  
4761    BestFormulaeTy BestFormulae;
4762  
4763    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4764      LSRUse &LU = Uses[LUIdx];
4765      LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4766                 dbgs() << '\n');
4767  
4768      bool Any = false;
4769      for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4770           FIdx != NumForms; ++FIdx) {
4771        Formula &F = LU.Formulae[FIdx];
4772  
4773        // Some formulas are instant losers. For example, they may depend on
4774        // nonexistent AddRecs from other loops. These need to be filtered
4775        // immediately, otherwise heuristics could choose them over others leading
4776        // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4777        // avoids the need to recompute this information across formulae using the
4778        // same bad AddRec. Passing LoserRegs is also essential unless we remove
4779        // the corresponding bad register from the Regs set.
4780        Cost CostF(L, SE, TTI, AMK);
4781        Regs.clear();
4782        CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4783        if (CostF.isLoser()) {
4784          // During initial formula generation, undesirable formulae are generated
4785          // by uses within other loops that have some non-trivial address mode or
4786          // use the postinc form of the IV. LSR needs to provide these formulae
4787          // as the basis of rediscovering the desired formula that uses an AddRec
4788          // corresponding to the existing phi. Once all formulae have been
4789          // generated, these initial losers may be pruned.
4790          LLVM_DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
4791                     dbgs() << "\n");
4792        }
4793        else {
4794          SmallVector<const SCEV *, 4> Key;
4795          for (const SCEV *Reg : F.BaseRegs) {
4796            if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4797              Key.push_back(Reg);
4798          }
4799          if (F.ScaledReg &&
4800              RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4801            Key.push_back(F.ScaledReg);
4802          // Unstable sort by host order ok, because this is only used for
4803          // uniquifying.
4804          llvm::sort(Key);
4805  
4806          std::pair<BestFormulaeTy::const_iterator, bool> P =
4807            BestFormulae.insert(std::make_pair(Key, FIdx));
4808          if (P.second)
4809            continue;
4810  
4811          Formula &Best = LU.Formulae[P.first->second];
4812  
4813          Cost CostBest(L, SE, TTI, AMK);
4814          Regs.clear();
4815          CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4816          if (CostF.isLess(CostBest))
4817            std::swap(F, Best);
4818          LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
4819                     dbgs() << "\n"
4820                               "    in favor of formula ";
4821                     Best.print(dbgs()); dbgs() << '\n');
4822        }
4823  #ifndef NDEBUG
4824        ChangedFormulae = true;
4825  #endif
4826        LU.DeleteFormula(F);
4827        --FIdx;
4828        --NumForms;
4829        Any = true;
4830      }
4831  
4832      // Now that we've filtered out some formulae, recompute the Regs set.
4833      if (Any)
4834        LU.RecomputeRegs(LUIdx, RegUses);
4835  
4836      // Reset this to prepare for the next use.
4837      BestFormulae.clear();
4838    }
4839  
4840    LLVM_DEBUG(if (ChangedFormulae) {
4841      dbgs() << "\n"
4842                "After filtering out undesirable candidates:\n";
4843      print_uses(dbgs());
4844    });
4845  }
4846  
4847  /// Estimate the worst-case number of solutions the solver might have to
4848  /// consider. It almost never considers this many solutions because it prunes the
4849  /// search space, but the pruning isn't always sufficient.
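///
/// For example, three uses with 4, 5, and 6 formulae give a worst case of
/// 4 * 5 * 6 = 120 candidate solutions; the running product stops being grown
/// once it reaches ComplexityLimit.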
4850  size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4851    size_t Power = 1;
4852    for (const LSRUse &LU : Uses) {
4853      size_t FSize = LU.Formulae.size();
4854      if (FSize >= ComplexityLimit) {
4855        Power = ComplexityLimit;
4856        break;
4857      }
4858      Power *= FSize;
4859      if (Power >= ComplexityLimit)
4860        break;
4861    }
4862    return Power;
4863  }
4864  
4865  /// When one formula uses a superset of the registers of another formula, it
4866  /// won't help reduce register pressure (though it may not necessarily hurt
4867  /// register pressure); remove it to simplify the system.
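///
/// For example (illustrative, invented names), a formula reg(%a) + reg(7) is
/// deleted when the same use already has a formula whose only register is %a
/// with the 7 folded into an immediate, since the former needs a superset of
/// those registers.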
4868  void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4869    if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4870      LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4871  
4872      LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4873                           "which use a superset of registers used by other "
4874                           "formulae.\n");
4875  
4876      for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4877        LSRUse &LU = Uses[LUIdx];
4878        bool Any = false;
4879        for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4880          Formula &F = LU.Formulae[i];
4881          if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4882            continue;
4883          // Look for a formula with a constant or GV in a register. If the use
4884          // also has a formula with that same value in an immediate field,
4885          // delete the one that uses a register.
4886          for (SmallVectorImpl<const SCEV *>::const_iterator
4887               I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4888            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4889              Formula NewF = F;
4890              //FIXME: Formulas should store bitwidth to do wrapping properly.
4891              //       See PR41034.
4892              NewF.BaseOffset =
4893                  Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4894                                      (uint64_t)C->getValue()->getSExtValue());
4895              NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4896                                  (I - F.BaseRegs.begin()));
4897              if (LU.HasFormulaWithSameRegs(NewF)) {
4898                LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4899                           dbgs() << '\n');
4900                LU.DeleteFormula(F);
4901                --i;
4902                --e;
4903                Any = true;
4904                break;
4905              }
4906            } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4907              if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4908                if (!F.BaseGV) {
4909                  Formula NewF = F;
4910                  NewF.BaseGV = GV;
4911                  NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4912                                      (I - F.BaseRegs.begin()));
4913                  if (LU.HasFormulaWithSameRegs(NewF)) {
4914                    LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4915                               dbgs() << '\n');
4916                    LU.DeleteFormula(F);
4917                    --i;
4918                    --e;
4919                    Any = true;
4920                    break;
4921                  }
4922                }
4923            }
4924          }
4925        }
4926        if (Any)
4927          LU.RecomputeRegs(LUIdx, RegUses);
4928      }
4929  
4930      LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4931    }
4932  }
4933  
4934  /// When there are many registers for expressions like A, A+1, A+2, etc.,
4935  /// allocate a single register for them.
4936  void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4937    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4938      return;
4939  
4940    LLVM_DEBUG(
4941        dbgs() << "The search space is too complex.\n"
4942                  "Narrowing the search space by assuming that uses separated "
4943                  "by a constant offset will use the same registers.\n");
4944  
4945    // This is especially useful for unrolled loops.
4946  
4947    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4948      LSRUse &LU = Uses[LUIdx];
4949      for (const Formula &F : LU.Formulae) {
4950        if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4951          continue;
4952  
4953        LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4954        if (!LUThatHas)
4955          continue;
4956  
4957        if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4958                                LU.Kind, LU.AccessTy))
4959          continue;
4960  
4961        LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4962  
4963        LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4964  
4965        // Transfer the fixups of LU to LUThatHas.
4966        for (LSRFixup &Fixup : LU.Fixups) {
4967          Fixup.Offset += F.BaseOffset;
4968          LUThatHas->pushFixup(Fixup);
4969          LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4970        }
4971  
4972        // Delete formulae from the new use which are no longer legal.
4973        bool Any = false;
4974        for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4975          Formula &F = LUThatHas->Formulae[i];
4976          if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4977                          LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4978            LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4979            LUThatHas->DeleteFormula(F);
4980            --i;
4981            --e;
4982            Any = true;
4983          }
4984        }
4985  
4986        if (Any)
4987          LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4988  
4989        // Delete the old use.
4990        DeleteUse(LU, LUIdx);
4991        --LUIdx;
4992        --NumUses;
4993        break;
4994      }
4995    }
4996  
4997    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4998  }
4999  
5000  /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5001  /// we've done more filtering, as it may be able to find more formulae to
5002  /// eliminate.
5003  void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5004    if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5005      LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5006  
5007      LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5008                           "undesirable dedicated registers.\n");
5009  
5010      FilterOutUndesirableDedicatedRegisters();
5011  
5012      LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5013    }
5014  }
5015  
5016  /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5017  /// pick the best one and delete the others.
5018  /// This narrowing heuristic keeps as many formulae with different
5019  /// Scale and ScaledReg pairs as possible while narrowing the search space.
5020  /// The benefit is that it is more likely to find a better solution
5021  /// from a formulae set with more Scale and ScaledReg variations than
5022  /// from a set where they are all the same. The winner-reg-picking
5023  /// heuristic will often keep the formulae with the same Scale and
5024  /// ScaledReg and filter out the others, and we want to avoid that if possible.
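      /// For example, if a use has both reg(a) + 2*reg({0,+,1}) and
      /// reg(b) + 2*reg({0,+,1}), the two formulae share the pair (Scale = 2,
      /// ScaledReg = {0,+,1}), so only the better of the two is kept.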
5025  void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5026    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5027      return;
5028  
5029    LLVM_DEBUG(
5030        dbgs() << "The search space is too complex.\n"
5031                  "Narrowing the search space by choosing the best Formula "
5032                  "from the Formulae with the same Scale and ScaledReg.\n");
5033  
5034    // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5035    using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5036  
5037    BestFormulaeTy BestFormulae;
5038  #ifndef NDEBUG
5039    bool ChangedFormulae = false;
5040  #endif
5041    DenseSet<const SCEV *> VisitedRegs;
5042    SmallPtrSet<const SCEV *, 16> Regs;
5043  
5044    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5045      LSRUse &LU = Uses[LUIdx];
5046      LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5047                 dbgs() << '\n');
5048  
5049      // Return true if Formula FA is better than Formula FB.
5050      auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5051        // First we will try to choose the Formula with fewer new registers.
5052        // For a register used by the current Formula, the more the register is
5053        // shared among LSRUses, the less we increase the register number
5054        // counter of the formula.
5055        size_t FARegNum = 0;
5056        for (const SCEV *Reg : FA.BaseRegs) {
5057          const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5058          FARegNum += (NumUses - UsedByIndices.count() + 1);
5059        }
5060        size_t FBRegNum = 0;
5061        for (const SCEV *Reg : FB.BaseRegs) {
5062          const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5063          FBRegNum += (NumUses - UsedByIndices.count() + 1);
5064        }
5065        if (FARegNum != FBRegNum)
5066          return FARegNum < FBRegNum;
5067  
5068        // If the new register numbers are the same, choose the Formula with
5069        // the lower Cost.
5070        Cost CostFA(L, SE, TTI, AMK);
5071        Cost CostFB(L, SE, TTI, AMK);
5072        Regs.clear();
5073        CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5074        Regs.clear();
5075        CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5076        return CostFA.isLess(CostFB);
5077      };
5078  
5079      bool Any = false;
5080      for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5081           ++FIdx) {
5082        Formula &F = LU.Formulae[FIdx];
5083        if (!F.ScaledReg)
5084          continue;
5085        auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5086        if (P.second)
5087          continue;
5088  
5089        Formula &Best = LU.Formulae[P.first->second];
5090        if (IsBetterThan(F, Best))
5091          std::swap(F, Best);
5092        LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
5093                   dbgs() << "\n"
5094                             "    in favor of formula ";
5095                   Best.print(dbgs()); dbgs() << '\n');
5096  #ifndef NDEBUG
5097        ChangedFormulae = true;
5098  #endif
5099        LU.DeleteFormula(F);
5100        --FIdx;
5101        --NumForms;
5102        Any = true;
5103      }
5104      if (Any)
5105        LU.RecomputeRegs(LUIdx, RegUses);
5106  
5107      // Reset this to prepare for the next use.
5108      BestFormulae.clear();
5109    }
5110  
5111    LLVM_DEBUG(if (ChangedFormulae) {
5112      dbgs() << "\n"
5113                "After filtering out undesirable candidates:\n";
5114      print_uses(dbgs());
5115    });
5116  }
5117  
5118  /// If we are over the complexity limit, filter any post-inc preferring
5119  /// address uses down to their lowest-register formulae.
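      /// For example, for an address use where post-indexed accesses are
      /// legal, a one-register formula such as reg({p,+,4}) is kept, while a
      /// two-register alternative such as reg(p) + reg({0,+,4}) is filtered
      /// out.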
5120  void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5121    if (AMK != TTI::AMK_PostIndexed)
5122      return;
5123    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5124      return;
5125  
5126    LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5127                         "Narrowing the search space by choosing the lowest "
5128                         "register Formula for PostInc Uses.\n");
5129  
5130    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5131      LSRUse &LU = Uses[LUIdx];
5132  
5133      if (LU.Kind != LSRUse::Address)
5134        continue;
5135      if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5136          !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5137        continue;
5138  
5139      size_t MinRegs = std::numeric_limits<size_t>::max();
5140      for (const Formula &F : LU.Formulae)
5141        MinRegs = std::min(F.getNumRegs(), MinRegs);
5142  
5143      bool Any = false;
5144      for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5145           ++FIdx) {
5146        Formula &F = LU.Formulae[FIdx];
5147        if (F.getNumRegs() > MinRegs) {
5148          LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
5149                     dbgs() << "\n");
5150          LU.DeleteFormula(F);
5151          --FIdx;
5152          --NumForms;
5153          Any = true;
5154        }
5155      }
5156      if (Any)
5157        LU.RecomputeRegs(LUIdx, RegUses);
5158  
5159      if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5160        break;
5161    }
5162  
5163    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5164  }
5165  
5166  /// This function deletes formulas with a high expected register count.
5167  /// Assuming we don't know which formula will be selected (the clearly
5168  /// inefficient ones have already been deleted), compute for each register
5169  /// the probability of it not being selected.
5170  /// For example,
5171  /// Use1:
5172  ///  reg(a) + reg({0,+,1})
5173  ///  reg(a) + reg({-1,+,1}) + 1
5174  ///  reg({a,+,1})
5175  /// Use2:
5176  ///  reg(b) + reg({0,+,1})
5177  ///  reg(b) + reg({-1,+,1}) + 1
5178  ///  reg({b,+,1})
5179  /// Use3:
5180  ///  reg(c) + reg(b) + reg({0,+,1})
5181  ///  reg(c) + reg({b,+,1})
5182  ///
5183  /// Probability of not selecting
5184  ///                 Use1   Use2    Use3
5185  /// reg(a)         (1/3) *   1   *   1
5186  /// reg(b)           1   * (1/3) * (1/2)
5187  /// reg({0,+,1})   (2/3) * (2/3) * (1/2)
5188  /// reg({-1,+,1})  (2/3) * (2/3) *   1
5189  /// reg({a,+,1})   (2/3) *   1   *   1
5190  /// reg({b,+,1})     1   * (2/3) * (2/3)
5191  /// reg(c)           1   *   1   *   0
5192  ///
5193  /// Now compute the expected register count for each formula.
5194  /// Note that for each use we exclude its own not-selected probability.
5195  /// For example, for Use1 the probability for reg(a) would be just 1 * 1
5196  /// (excluding the probability 1/3 of not selecting it for Use1).
5197  /// Use1:
5198  ///  reg(a) + reg({0,+,1})          1 + 1/3       -- to be deleted
5199  ///  reg(a) + reg({-1,+,1}) + 1     1 + 2/3       -- to be deleted
5200  ///  reg({a,+,1})                   1
5201  /// Use2:
5202  ///  reg(b) + reg({0,+,1})          1/2 + 1/3     -- to be deleted
5203  ///  reg(b) + reg({-1,+,1}) + 1     1/2 + 2/3     -- to be deleted
5204  ///  reg({b,+,1})                   2/3
5205  /// Use3:
5206  ///  reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5207  ///  reg(c) + reg({b,+,1})          1 + 2/3
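      /// In the code below, RegNumMap[Reg] is the product of the per-use
      /// not-selected probabilities (the rows of the first table); dividing by
      /// the current use's own probability excludes that use, giving the
      /// contribution of Reg to a formula's expected register count.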
5208  void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5209    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5210      return;
5211    // Ok, we have too many formulae on our hands to conveniently handle.
5212    // Use a rough heuristic to thin out the list.
5213  
5214    // Set of Regs which will be 100% used in the final solution, i.e.
5215    // used in each formula of a solution (in the example above this is reg(c)).
5216    // We can skip them in the calculations.
5217    SmallPtrSet<const SCEV *, 4> UniqRegs;
5218    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5219  
5220    // Map each register to its probability of not being selected.
5221    DenseMap <const SCEV *, float> RegNumMap;
5222    for (const SCEV *Reg : RegUses) {
5223      if (UniqRegs.count(Reg))
5224        continue;
5225      float PNotSel = 1;
5226      for (const LSRUse &LU : Uses) {
5227        if (!LU.Regs.count(Reg))
5228          continue;
5229        float P = LU.getNotSelectedProbability(Reg);
5230        if (P != 0.0)
5231          PNotSel *= P;
5232        else
5233          UniqRegs.insert(Reg);
5234      }
5235      RegNumMap.insert(std::make_pair(Reg, PNotSel));
5236    }
5237  
5238    LLVM_DEBUG(
5239        dbgs() << "Narrowing the search space by deleting costly formulas\n");
5240  
5241    // Delete formulas whose expected register count is high.
5242    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5243      LSRUse &LU = Uses[LUIdx];
5244      // If nothing to delete - continue.
5245      if (LU.Formulae.size() < 2)
5246        continue;
5247      // This is a temporary solution to test performance. Float should be
5248      // replaced with a rounding-independent type (based on integers) to
5249      // avoid different results for different builds of the compiler.
5250      float FMinRegNum = LU.Formulae[0].getNumRegs();
5251      float FMinARegNum = LU.Formulae[0].getNumRegs();
5252      size_t MinIdx = 0;
5253      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5254        Formula &F = LU.Formulae[i];
5255        float FRegNum = 0;
5256        float FARegNum = 0;
5257        for (const SCEV *BaseReg : F.BaseRegs) {
5258          if (UniqRegs.count(BaseReg))
5259            continue;
5260          FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5261          if (isa<SCEVAddRecExpr>(BaseReg))
5262            FARegNum +=
5263                RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5264        }
5265        if (const SCEV *ScaledReg = F.ScaledReg) {
5266          if (!UniqRegs.count(ScaledReg)) {
5267            FRegNum +=
5268                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5269            if (isa<SCEVAddRecExpr>(ScaledReg))
5270              FARegNum +=
5271                  RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5272          }
5273        }
5274        if (FMinRegNum > FRegNum ||
5275            (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5276          FMinRegNum = FRegNum;
5277          FMinARegNum = FARegNum;
5278          MinIdx = i;
5279        }
5280      }
5281      LLVM_DEBUG(dbgs() << "  The formula "; LU.Formulae[MinIdx].print(dbgs());
5282                 dbgs() << " with min reg num " << FMinRegNum << '\n');
5283      if (MinIdx != 0)
5284        std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5285      while (LU.Formulae.size() != 1) {
5286        LLVM_DEBUG(dbgs() << "  Deleting "; LU.Formulae.back().print(dbgs());
5287                   dbgs() << '\n');
5288        LU.Formulae.pop_back();
5289      }
5290      LU.RecomputeRegs(LUIdx, RegUses);
5291      assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5292      Formula &F = LU.Formulae[0];
5293      LLVM_DEBUG(dbgs() << "  Leaving only "; F.print(dbgs()); dbgs() << '\n');
5294      // When we choose the formula, the regs become unique.
5295      UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5296      if (F.ScaledReg)
5297        UniqRegs.insert(F.ScaledReg);
5298    }
5299    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5300  }
5301  
5302  // Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5303  // whether the addressing offset +C would be legal where the negative offset -C
5304  // is not.
5305  static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5306                                         ScalarEvolution &SE, const SCEV *Best,
5307                                         const SCEV *Reg,
5308                                         MemAccessTy AccessType) {
5309    if (Best->getType() != Reg->getType() ||
5310        (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5311         cast<SCEVAddRecExpr>(Best)->getLoop() !=
5312             cast<SCEVAddRecExpr>(Reg)->getLoop()))
5313      return false;
5314    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5315    if (!Diff)
5316      return false;
5317  
5318    return TTI.isLegalAddressingMode(
5319               AccessType.MemTy, /*BaseGV=*/nullptr,
5320               /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5321               /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5322           !TTI.isLegalAddressingMode(
5323               AccessType.MemTy, /*BaseGV=*/nullptr,
5324               /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5325               /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5326  }
5327  
5328  /// Pick a register which seems likely to be profitable, and then in any use
5329  /// which has any reference to that register, delete all formulae which do not
5330  /// reference that register.
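      /// For example, if reg({0,+,1}) is referenced by more uses than any
      /// other register, it is assumed to give profitable reuse: every use
      /// that can reference it keeps only the formulae that do, and the
      /// process repeats until the search space is small enough.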
5331  void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5332    // With all other options exhausted, loop until the system is simple
5333    // enough to handle.
5334    SmallPtrSet<const SCEV *, 4> Taken;
5335    while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5336      // Ok, we have too many formulae on our hands to conveniently handle.
5337      // Use a rough heuristic to thin out the list.
5338      LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5339  
5340      // Pick the register which is used by the most LSRUses, which is likely
5341      // to be a good reuse register candidate.
5342      const SCEV *Best = nullptr;
5343      unsigned BestNum = 0;
5344      for (const SCEV *Reg : RegUses) {
5345        if (Taken.count(Reg))
5346          continue;
5347        if (!Best) {
5348          Best = Reg;
5349          BestNum = RegUses.getUsedByIndices(Reg).count();
5350        } else {
5351          unsigned Count = RegUses.getUsedByIndices(Reg).count();
5352          if (Count > BestNum) {
5353            Best = Reg;
5354            BestNum = Count;
5355          }
5356  
5357          // If the scores are the same, but the Reg is simpler for the target
5358          // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5359          // handle +C but not -C), opt for the simpler formula.
5360          if (Count == BestNum) {
5361            int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5362            if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5363                IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5364                                           Uses[LUIdx].AccessTy)) {
5365              Best = Reg;
5366              BestNum = Count;
5367            }
5368          }
5369        }
5370      }
5371      assert(Best && "Failed to find best LSRUse candidate");
5372  
5373      LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5374                        << " will yield profitable reuse.\n");
5375      Taken.insert(Best);
5376  
5377      // In any use with formulae which references this register, delete formulae
5378      // which don't reference it.
5379      for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5380        LSRUse &LU = Uses[LUIdx];
5381        if (!LU.Regs.count(Best)) continue;
5382  
5383        bool Any = false;
5384        for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5385          Formula &F = LU.Formulae[i];
5386          if (!F.referencesReg(Best)) {
5387            LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
5388            LU.DeleteFormula(F);
5389            --e;
5390            --i;
5391            Any = true;
5392            assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5393            continue;
5394          }
5395        }
5396  
5397        if (Any)
5398          LU.RecomputeRegs(LUIdx, RegUses);
5399      }
5400  
5401      LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5402    }
5403  }
5404  
5405  /// If there are an extraordinary number of formulae to choose from, use some
5406  /// rough heuristics to prune down the number of formulae. This keeps the main
5407  /// solver from taking an extraordinary amount of time in some worst-case
5408  /// scenarios.
5409  void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5410    NarrowSearchSpaceByDetectingSupersets();
5411    NarrowSearchSpaceByCollapsingUnrolledCode();
5412    NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5413    if (FilterSameScaledReg)
5414      NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5415    NarrowSearchSpaceByFilterPostInc();
5416    if (LSRExpNarrow)
5417      NarrowSearchSpaceByDeletingCostlyFormulas();
5418    else
5419      NarrowSearchSpaceByPickingWinnerRegs();
5420  }
5421  
5422  /// This is the recursive solver.
5423  void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5424                                 Cost &SolutionCost,
5425                                 SmallVectorImpl<const Formula *> &Workspace,
5426                                 const Cost &CurCost,
5427                                 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5428                                 DenseSet<const SCEV *> &VisitedRegs) const {
5429    // Some ideas:
5430    //  - prune more:
5431    //    - use more aggressive filtering
5432    //    - sort the formula so that the most profitable solutions are found first
5433    //    - sort the uses too
5434    //  - search faster:
5435    //    - don't compute a cost, and then compare. compare while computing a cost
5436    //      and bail early.
5437    //    - track register sets with SmallBitVector
5438  
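        // Each level of the recursion fixes a formula for one use; the use
        // handled at this depth is the one at index Workspace.size().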
5439    const LSRUse &LU = Uses[Workspace.size()];
5440  
5441    // If this use references any register that's already a part of the
5442    // in-progress solution, consider it a requirement that a formula must
5443    // reference that register in order to be considered. This prunes out
5444    // unprofitable searching.
5445    SmallSetVector<const SCEV *, 4> ReqRegs;
5446    for (const SCEV *S : CurRegs)
5447      if (LU.Regs.count(S))
5448        ReqRegs.insert(S);
5449  
5450    SmallPtrSet<const SCEV *, 16> NewRegs;
5451    Cost NewCost(L, SE, TTI, AMK);
5452    for (const Formula &F : LU.Formulae) {
5453      // Ignore formulae which may not be ideal in terms of register reuse of
5454      // ReqRegs.  The formula should use all required registers before
5455      // introducing new ones.
5456      // This can sometimes (notably when trying to favour postinc) lead to
5457      // sub-optimal decisions, so in that case it is best left to the cost
5458      // modelling to get right.
5459      if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5460        int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5461        for (const SCEV *Reg : ReqRegs) {
5462          if ((F.ScaledReg && F.ScaledReg == Reg) ||
5463              is_contained(F.BaseRegs, Reg)) {
5464            --NumReqRegsToFind;
5465            if (NumReqRegsToFind == 0)
5466              break;
5467          }
5468        }
5469        if (NumReqRegsToFind != 0) {
5470          // If none of the formulae satisfied the required registers, then we could
5471          // clear ReqRegs and try again. Currently, we simply give up in this case.
5472          continue;
5473        }
5474      }
5475  
5476      // Evaluate the cost of the current formula. If it's already worse than
5477      // the current best, prune the search at that point.
5478      NewCost = CurCost;
5479      NewRegs = CurRegs;
5480      NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5481      if (NewCost.isLess(SolutionCost)) {
5482        Workspace.push_back(&F);
5483        if (Workspace.size() != Uses.size()) {
5484          SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5485                       NewRegs, VisitedRegs);
5486          if (F.getNumRegs() == 1 && Workspace.size() == 1)
5487            VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5488        } else {
5489          LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5490                     dbgs() << ".\nRegs:\n";
5491                     for (const SCEV *S : NewRegs) dbgs()
5492                        << "- " << *S << "\n";
5493                     dbgs() << '\n');
5494  
5495          SolutionCost = NewCost;
5496          Solution = Workspace;
5497        }
5498        Workspace.pop_back();
5499      }
5500    }
5501  }
5502  
5503  /// Choose one formula from each use. Return the results in the given Solution
5504  /// vector.
5505  void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5506    SmallVector<const Formula *, 8> Workspace;
5507    Cost SolutionCost(L, SE, TTI, AMK);
5508    SolutionCost.Lose();
5509    Cost CurCost(L, SE, TTI, AMK);
5510    SmallPtrSet<const SCEV *, 16> CurRegs;
5511    DenseSet<const SCEV *> VisitedRegs;
5512    Workspace.reserve(Uses.size());
5513  
5514    // SolveRecurse does all the work.
5515    SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5516                 CurRegs, VisitedRegs);
5517    if (Solution.empty()) {
5518      LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5519      return;
5520    }
5521  
5522    // Ok, we've now made all our decisions.
5523    LLVM_DEBUG(dbgs() << "\n"
5524                         "The chosen solution requires ";
5525               SolutionCost.print(dbgs()); dbgs() << ":\n";
5526               for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5527                 dbgs() << "  ";
5528                 Uses[i].print(dbgs());
5529                 dbgs() << "\n"
5530                           "    ";
5531                 Solution[i]->print(dbgs());
5532                 dbgs() << '\n';
5533               });
5534  
5535    assert(Solution.size() == Uses.size() && "Malformed solution!");
5536  
5537    const bool EnableDropUnprofitableSolution = [&] {
5538      switch (AllowDropSolutionIfLessProfitable) {
5539      case cl::BOU_TRUE:
5540        return true;
5541      case cl::BOU_FALSE:
5542        return false;
5543      case cl::BOU_UNSET:
5544        return TTI.shouldDropLSRSolutionIfLessProfitable();
5545      }
5546      llvm_unreachable("Unhandled cl::boolOrDefault enum");
5547    }();
5548  
5549    if (BaselineCost.isLess(SolutionCost)) {
5550      if (!EnableDropUnprofitableSolution)
5551        LLVM_DEBUG(
5552            dbgs() << "Baseline is more profitable than chosen solution, "
5553                      "add option 'lsr-drop-solution' to drop LSR solution.\n");
5554      else {
5555        LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5556                             "solution, dropping LSR solution.\n";);
5557        Solution.clear();
5558      }
5559    }
5560  }
5561  
5562  /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5563  /// as we can go while still being dominated by the input positions. This helps
5564  /// canonicalize the insert position, which encourages sharing.
5565  BasicBlock::iterator
5566  LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5567                                   const SmallVectorImpl<Instruction *> &Inputs)
5568                                                                           const {
5569    Instruction *Tentative = &*IP;
5570    while (true) {
5571      bool AllDominate = true;
5572      Instruction *BetterPos = nullptr;
5573      // Don't bother attempting to insert before a catchswitch; its basic block
5574      // cannot have other non-PHI instructions.
5575      if (isa<CatchSwitchInst>(Tentative))
5576        return IP;
5577  
5578      for (Instruction *Inst : Inputs) {
5579        if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5580          AllDominate = false;
5581          break;
5582        }
5583        // Attempt to find an insert position in the middle of the block,
5584        // instead of at the end, so that it can be used for other expansions.
5585        if (Tentative->getParent() == Inst->getParent() &&
5586            (!BetterPos || !DT.dominates(Inst, BetterPos)))
5587          BetterPos = &*std::next(BasicBlock::iterator(Inst));
5588      }
5589      if (!AllDominate)
5590        break;
5591      if (BetterPos)
5592        IP = BetterPos->getIterator();
5593      else
5594        IP = Tentative->getIterator();
5595  
5596      const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5597      unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5598  
5599      BasicBlock *IDom;
5600      for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5601        if (!Rung) return IP;
5602        Rung = Rung->getIDom();
5603        if (!Rung) return IP;
5604        IDom = Rung->getBlock();
5605  
5606        // Don't climb into a loop though.
5607        const Loop *IDomLoop = LI.getLoopFor(IDom);
5608        unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5609        if (IDomDepth <= IPLoopDepth &&
5610            (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5611          break;
5612      }
5613  
5614      Tentative = IDom->getTerminator();
5615    }
5616  
5617    return IP;
5618  }
5619  
5620  /// Determine an input position which will be dominated by the operands and
5621  /// which will dominate the result.
5622  BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5623      BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5624    // Collect some instructions which must be dominated by the
5625    // expanding replacement. These must be dominated by any operands that
5626    // will be required in the expansion.
5627    SmallVector<Instruction *, 4> Inputs;
5628    if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5629      Inputs.push_back(I);
5630    if (LU.Kind == LSRUse::ICmpZero)
5631      if (Instruction *I =
5632            dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5633        Inputs.push_back(I);
5634    if (LF.PostIncLoops.count(L)) {
5635      if (LF.isUseFullyOutsideLoop(L))
5636        Inputs.push_back(L->getLoopLatch()->getTerminator());
5637      else
5638        Inputs.push_back(IVIncInsertPos);
5639    }
5640    // The expansion must also be dominated by the increment positions of any
5641    // loops for which it is using post-inc mode.
5642    for (const Loop *PIL : LF.PostIncLoops) {
5643      if (PIL == L) continue;
5644  
5645      // Be dominated by the loop exit.
5646      SmallVector<BasicBlock *, 4> ExitingBlocks;
5647      PIL->getExitingBlocks(ExitingBlocks);
5648      if (!ExitingBlocks.empty()) {
5649        BasicBlock *BB = ExitingBlocks[0];
5650        for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5651          BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5652        Inputs.push_back(BB->getTerminator());
5653      }
5654    }
5655  
5656    assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5657           && !isa<DbgInfoIntrinsic>(LowestIP) &&
5658           "Insertion point must be a normal instruction");
5659  
5660    // Then, climb up the immediate dominator tree as far as we can go while
5661    // still being dominated by the input positions.
5662    BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5663  
5664    // Don't insert instructions before PHI nodes.
5665    while (isa<PHINode>(IP)) ++IP;
5666  
5667    // Ignore landingpad instructions.
5668    while (IP->isEHPad()) ++IP;
5669  
5670    // Ignore debug intrinsics.
5671    while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5672  
5673    // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5674    // IP consistent across expansions and allows the previously inserted
5675    // instructions to be reused by subsequent expansion.
5676    while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5677      ++IP;
5678  
5679    return IP;
5680  }
5681  
5682  /// Emit instructions for the leading candidate expression for this LSRUse (this
5683  /// is called "expanding").
5684  Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5685                             const Formula &F, BasicBlock::iterator IP,
5686                             SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5687    if (LU.RigidFormula)
5688      return LF.OperandValToReplace;
5689  
5690    // Determine an input position which will be dominated by the operands and
5691    // which will dominate the result.
5692    IP = AdjustInsertPositionForExpand(IP, LF, LU);
5693    Rewriter.setInsertPoint(&*IP);
5694  
5695    // Inform the Rewriter if we have a post-increment use, so that it can
5696    // perform an advantageous expansion.
5697    Rewriter.setPostInc(LF.PostIncLoops);
5698  
5699    // This is the type that the user actually needs.
5700    Type *OpTy = LF.OperandValToReplace->getType();
5701    // This will be the type that we'll initially expand to.
5702    Type *Ty = F.getType();
5703    if (!Ty)
5704      // No type known; just expand directly to the ultimate type.
5705      Ty = OpTy;
5706    else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5707      // Expand directly to the ultimate type if it's the right size.
5708      Ty = OpTy;
5709    // This is the type to do integer arithmetic in.
5710    Type *IntTy = SE.getEffectiveSCEVType(Ty);
5711  
5712    // Build up a list of operands to add together to form the full base.
5713    SmallVector<const SCEV *, 8> Ops;
5714  
5715    // Expand the BaseRegs portion.
5716    for (const SCEV *Reg : F.BaseRegs) {
5717      assert(!Reg->isZero() && "Zero allocated in a base register!");
5718  
5719      // If we're expanding for a post-inc user, make the post-inc adjustment.
5720      Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5721      Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5722    }
5723  
5724    // Expand the ScaledReg portion.
5725    Value *ICmpScaledV = nullptr;
5726    if (F.Scale != 0) {
5727      const SCEV *ScaledS = F.ScaledReg;
5728  
5729      // If we're expanding for a post-inc user, make the post-inc adjustment.
5730      PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5731      ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5732  
5733      if (LU.Kind == LSRUse::ICmpZero) {
5734        // Expand ScaleReg as if it was part of the base regs.
5735        if (F.Scale == 1)
5736          Ops.push_back(
5737              SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5738        else {
5739          // An interesting way of "folding" with an icmp is to use a negated
5740          // scale, which we'll implement by inserting it into the other operand
5741          // of the icmp.
5742          assert(F.Scale == -1 &&
5743                 "The only scale supported by ICmpZero uses is -1!");
5744          ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5745        }
5746      } else {
5747        // Otherwise just expand the scaled register and an explicit scale,
5748        // which is expected to be matched as part of the address.
5749  
5750        // Flush the operand list to suppress SCEVExpander hoisting address modes.
5751        // Unless the addressing mode will not be folded.
5752        if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5753            isAMCompletelyFolded(TTI, LU, F)) {
5754          Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5755          Ops.clear();
5756          Ops.push_back(SE.getUnknown(FullV));
5757        }
5758        ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5759        if (F.Scale != 1)
5760          ScaledS =
5761              SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5762        Ops.push_back(ScaledS);
5763      }
5764    }
5765  
5766    // Expand the GV portion.
5767    if (F.BaseGV) {
5768      // Flush the operand list to suppress SCEVExpander hoisting.
5769      if (!Ops.empty()) {
5770        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5771        Ops.clear();
5772        Ops.push_back(SE.getUnknown(FullV));
5773      }
5774      Ops.push_back(SE.getUnknown(F.BaseGV));
5775    }
5776  
5777    // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5778    // unfolded offsets. LSR assumes they both live next to their uses.
5779    if (!Ops.empty()) {
5780      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5781      Ops.clear();
5782      Ops.push_back(SE.getUnknown(FullV));
5783    }
5784  
5785    // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5786    // out at this point, or should we generate a SCEV adding together mixed
5787    // offsets?
5788    assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5789           "Expanding mismatched offsets\n");
5790    // Expand the immediate portion.
5791    Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5792    if (Offset.isNonZero()) {
5793      if (LU.Kind == LSRUse::ICmpZero) {
5794        // The other interesting way of "folding" with an ICmpZero is to use a
5795        // negated immediate.
5796        if (!ICmpScaledV)
5797          ICmpScaledV =
5798              ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5799        else {
5800          Ops.push_back(SE.getUnknown(ICmpScaledV));
5801          ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5802        }
5803      } else {
5804        // Just add the immediate values. These again are expected to be matched
5805        // as part of the address.
5806        Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5807      }
5808    }
5809  
5810    // Expand the unfolded offset portion.
5811    Immediate UnfoldedOffset = F.UnfoldedOffset;
5812    if (UnfoldedOffset.isNonZero()) {
5813      // Just add the immediate values.
5814      Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5815    }
5816  
5817    // Emit instructions summing all the operands.
5818    const SCEV *FullS = Ops.empty() ?
5819                        SE.getConstant(IntTy, 0) :
5820                        SE.getAddExpr(Ops);
5821    Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5822  
5823    // We're done expanding now, so reset the rewriter.
5824    Rewriter.clearPostInc();
5825  
5826    // An ICmpZero Formula represents an ICmp which we're handling as a
5827    // comparison against zero. Now that we've expanded an expression for that
5828    // form, update the ICmp's other operand.
5829    if (LU.Kind == LSRUse::ICmpZero) {
5830      ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5831      if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5832        DeadInsts.emplace_back(OperandIsInstr);
5833      assert(!F.BaseGV && "ICmp does not support folding a global value and "
5834                             "a scale at the same time!");
5835      if (F.Scale == -1) {
5836        if (ICmpScaledV->getType() != OpTy) {
5837          Instruction *Cast = CastInst::Create(
5838              CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5839              ICmpScaledV, OpTy, "tmp", CI->getIterator());
5840          ICmpScaledV = Cast;
5841        }
5842        CI->setOperand(1, ICmpScaledV);
5843      } else {
5844        // A scale of 1 means that the scale has been expanded as part of the
5845        // base regs.
5846        assert((F.Scale == 0 || F.Scale == 1) &&
5847               "ICmp does not support folding a global value and "
5848               "a scale at the same time!");
5849        Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5850                                             -(uint64_t)Offset.getFixedValue());
5851        if (C->getType() != OpTy) {
5852          C = ConstantFoldCastOperand(
5853              CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5854              CI->getDataLayout());
5855          assert(C && "Cast of ConstantInt should have folded");
5856        }
5857  
5858        CI->setOperand(1, C);
5859      }
5860    }
5861  
5862    return FullV;
5863  }
5864  
5865  /// Helper for Rewrite. PHI nodes are special because the use of their operands
5866  /// effectively happens in their predecessor blocks, so the expression may need
5867  /// to be expanded in multiple places.
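      /// For example, if the PHI receives the replaced value from two
      /// different predecessor blocks, the expression is expanded once per
      /// incoming block (possibly after splitting a critical edge) rather
      /// than once next to the PHI itself.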
5868  void LSRInstance::RewriteForPHI(
5869      PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5870      SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5871    DenseMap<BasicBlock *, Value *> Inserted;
5872  
5873    // Inserting instructions in the loop and using them as a PHI's input could
5874    // break LCSSA when the PHI's parent block is not a loop exit (i.e. the
5875    // corresponding incoming block is not loop exiting). So collect all such
5876    // instructions to form LCSSA for them later.
5877    SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5878  
5879    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5880      if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5881        bool needUpdateFixups = false;
5882        BasicBlock *BB = PN->getIncomingBlock(i);
5883  
5884        // If this is a critical edge, split the edge so that we do not insert
5885        // the code on all predecessor/successor paths.  We do this unless this
5886        // is the canonical backedge for this loop, which complicates post-inc
5887        // users.
5888        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5889            !isa<IndirectBrInst>(BB->getTerminator()) &&
5890            !isa<CatchSwitchInst>(BB->getTerminator())) {
5891          BasicBlock *Parent = PN->getParent();
5892          Loop *PNLoop = LI.getLoopFor(Parent);
5893          if (!PNLoop || Parent != PNLoop->getHeader()) {
5894            // Split the critical edge.
5895            BasicBlock *NewBB = nullptr;
5896            if (!Parent->isLandingPad()) {
5897              NewBB =
5898                  SplitCriticalEdge(BB, Parent,
5899                                    CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5900                                        .setMergeIdenticalEdges()
5901                                        .setKeepOneInputPHIs());
5902            } else {
5903              SmallVector<BasicBlock*, 2> NewBBs;
5904              DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5905              SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5906              NewBB = NewBBs[0];
5907            }
5908            // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5909            // phi predecessors are identical. The simple thing to do is skip
5910            // splitting in this case rather than complicate the API.
5911            if (NewBB) {
5912              // If PN is outside of the loop and BB is in the loop, we want to
5913              // move the block to be immediately before the PHI block, not
5914              // immediately after BB.
5915              if (L->contains(BB) && !L->contains(PN))
5916                NewBB->moveBefore(PN->getParent());
5917  
5918              // Splitting the edge can reduce the number of PHI entries we have.
5919              e = PN->getNumIncomingValues();
5920              BB = NewBB;
5921              i = PN->getBasicBlockIndex(BB);
5922  
5923              needUpdateFixups = true;
5924            }
5925          }
5926        }
5927  
5928        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5929          Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5930        if (!Pair.second)
5931          PN->setIncomingValue(i, Pair.first->second);
5932        else {
5933          Value *FullV =
5934              Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5935  
5936          // If this is reuse-by-noop-cast, insert the noop cast.
5937          Type *OpTy = LF.OperandValToReplace->getType();
5938          if (FullV->getType() != OpTy)
5939            FullV = CastInst::Create(
5940                CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5941                LF.OperandValToReplace->getType(), "tmp",
5942                BB->getTerminator()->getIterator());
5943  
5944          // If the incoming block for this value is not in the loop, it means the
5945          // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5946          // the inserted value.
5947          if (auto *I = dyn_cast<Instruction>(FullV))
5948            if (L->contains(I) && !L->contains(BB))
5949              InsertedNonLCSSAInsts.push_back(I);
5950  
5951          PN->setIncomingValue(i, FullV);
5952          Pair.first->second = FullV;
5953        }
5954  
5955        // If LSR splits a critical edge and the phi node has other pending
5956        // fixup operands, we need to update those pending fixups. Otherwise
5957        // formulae will not be implemented completely and some instructions
5958        // will not be eliminated.
5959        if (needUpdateFixups) {
5960          for (LSRUse &LU : Uses)
5961            for (LSRFixup &Fixup : LU.Fixups)
5962              // If fixup is supposed to rewrite some operand in the phi
5963            // that was just updated, it may already have been moved to
5964            // another phi node. Such a fixup requires an update.
5965              if (Fixup.UserInst == PN) {
5966                // Check if the operand we try to replace still exists in the
5967                // original phi.
5968                bool foundInOriginalPHI = false;
5969                for (const auto &val : PN->incoming_values())
5970                  if (val == Fixup.OperandValToReplace) {
5971                    foundInOriginalPHI = true;
5972                    break;
5973                  }
5974  
5975                // If fixup operand found in original PHI - nothing to do.
5976                if (foundInOriginalPHI)
5977                  continue;
5978  
5979                // Otherwise it might be moved to another PHI and requires update.
5980                // If fixup operand not found in any of the incoming blocks that
5981                // means we have already rewritten it - nothing to do.
5982                for (const auto &Block : PN->blocks())
5983                  for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5984                       ++I) {
5985                    PHINode *NewPN = cast<PHINode>(I);
5986                    for (const auto &val : NewPN->incoming_values())
5987                      if (val == Fixup.OperandValToReplace)
5988                        Fixup.UserInst = NewPN;
5989                  }
5990              }
5991        }
5992      }
5993  
5994    formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5995  }
5996  
5997  /// Emit instructions for the leading candidate expression for this LSRUse (this
5998  /// is called "expanding"), and update the UserInst to reference the newly
5999  /// expanded value.
6000  void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6001                            const Formula &F,
6002                            SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
6003    // First, find an insertion point that dominates UserInst. For PHI nodes,
6004    // find the nearest block which dominates all the relevant uses.
6005    if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6006      RewriteForPHI(PN, LU, LF, F, DeadInsts);
6007    } else {
6008      Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6009  
6010      // If this is reuse-by-noop-cast, insert the noop cast.
6011      Type *OpTy = LF.OperandValToReplace->getType();
6012      if (FullV->getType() != OpTy) {
6013        Instruction *Cast =
6014            CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6015                             FullV, OpTy, "tmp", LF.UserInst->getIterator());
6016        FullV = Cast;
6017      }
6018  
6019      // Update the user. ICmpZero is handled specially here (for now) because
6020      // Expand may have updated one of the operands of the icmp already, and
6021      // its new value may happen to be equal to LF.OperandValToReplace, in
6022      // which case doing replaceUsesOfWith leads to replacing both operands
6023      // with the same value. TODO: Reorganize this.
6024      if (LU.Kind == LSRUse::ICmpZero)
6025        LF.UserInst->setOperand(0, FullV);
6026      else
6027        LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6028    }
6029  
6030    if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6031      DeadInsts.emplace_back(OperandIsInstr);
6032  }
6033  
6034  // Try to hoist the IVInc to the loop header if all IVInc users are in
6035  // the loop header. This helps the backend generate post-indexed load/store
6036  // instructions when the latch block is different from the loop header block.
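      // For example, for a load in the header whose address increment would
      // otherwise sit in a separate latch block, hoisting the increment next
      // to the load lets a target with post-indexed addressing fold the add
      // into the memory access.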
6037  static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6038                            const LSRUse &LU, Instruction *IVIncInsertPos,
6039                            Loop *L) {
6040    if (LU.Kind != LSRUse::Address)
6041      return false;
6042  
6043    // For now this code performs the conservative optimization and only works
6044    // for the header block. Later we can hoist the IVInc to the block that
6045    // post-dominates all users.
6046    BasicBlock *LHeader = L->getHeader();
6047    if (IVIncInsertPos->getParent() == LHeader)
6048      return false;
6049  
6050    if (!Fixup.OperandValToReplace ||
6051        any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6052          Instruction *UI = cast<Instruction>(U);
6053          return UI->getParent() != LHeader;
6054        }))
6055      return false;
6056  
6057    Instruction *I = Fixup.UserInst;
6058    Type *Ty = I->getType();
6059    return Ty->isIntegerTy() &&
6060           ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6061            (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
6062  }
6063  
6064  /// Rewrite all the fixup locations with new values, following the chosen
6065  /// solution.
6066  void LSRInstance::ImplementSolution(
6067      const SmallVectorImpl<const Formula *> &Solution) {
6068    // Keep track of instructions we may have made dead, so that
6069    // we can remove them after we are done working.
6070    SmallVector<WeakTrackingVH, 16> DeadInsts;
6071  
6072    // Mark phi nodes that terminate chains so the expander tries to reuse them.
6073    for (const IVChain &Chain : IVChainVec) {
6074      if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6075        Rewriter.setChainedPhi(PN);
6076    }
6077  
6078    // Expand the new value definitions and update the users.
6079    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6080      for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6081        Instruction *InsertPos =
6082            canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6083                ? L->getHeader()->getTerminator()
6084                : IVIncInsertPos;
6085        Rewriter.setIVIncInsertPos(L, InsertPos);
6086        Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6087        Changed = true;
6088      }
6089  
6090    for (const IVChain &Chain : IVChainVec) {
6091      GenerateIVChain(Chain, DeadInsts);
6092      Changed = true;
6093    }
6094  
6095    for (const WeakVH &IV : Rewriter.getInsertedIVs())
6096      if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6097        ScalarEvolutionIVs.push_back(IV);
6098  
6099    // Clean up after ourselves. This must be done before deleting any
6100    // instructions.
6101    Rewriter.clear();
6102  
6103    Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6104                                                                    &TLI, MSSAU);
6105  
6106    // In our cost analysis above, we assume that each addrec consumes exactly
6107    // one register, and arrange to have increments inserted just before the
6108    // latch to maximize the chance this is true.  However, if we reused
6109    // existing IVs, we now need to move the increments to match our
6110    // expectations.  Otherwise, our cost modeling results in us having
6111    // chosen a non-optimal result for the actual schedule.  (And yes, this
6112    // scheduling decision does impact later codegen.)
6113    for (PHINode &PN : L->getHeader()->phis()) {
6114      BinaryOperator *BO = nullptr;
6115      Value *Start = nullptr, *Step = nullptr;
6116      if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6117        continue;
6118  
6119      switch (BO->getOpcode()) {
6120      case Instruction::Sub:
6121        if (BO->getOperand(0) != &PN)
6122          // sub is non-commutative - match handling elsewhere in LSR
6123          continue;
6124        break;
6125      case Instruction::Add:
6126        break;
6127      default:
6128        continue;
6129      };
6130  
6131      if (!isa<Constant>(Step))
6132        // If not a constant step, might increase register pressure
6133        // (We assume constants have been canonicalized to RHS)
6134        continue;
6135  
6136      if (BO->getParent() == IVIncInsertPos->getParent())
6137        // Only bother moving across blocks.  Isel can handle block local case.
6138        continue;
6139  
6140      // Can we legally schedule inc at the desired point?
6141      if (!llvm::all_of(BO->uses(),
6142                        [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6143        continue;
6144      BO->moveBefore(IVIncInsertPos);
6145      Changed = true;
6146    }
6147  
6148  
6149  }
6150  
6151  LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6152                           DominatorTree &DT, LoopInfo &LI,
6153                           const TargetTransformInfo &TTI, AssumptionCache &AC,
6154                           TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6155      : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6156        MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6157                              ? PreferredAddresingMode
6158                              : TTI.getPreferredAddressingMode(L, &SE)),
6159        Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6160        BaselineCost(L, SE, TTI, AMK) {
6161    // If LoopSimplify form is not available, stay out of trouble.
6162    if (!L->isLoopSimplifyForm())
6163      return;
6164  
6165    // If there's no interesting work to be done, bail early.
6166    if (IU.empty()) return;
6167  
6168    // If there's too much analysis to be done, bail early. We won't be able to
6169    // model the problem anyway.
6170    unsigned NumUsers = 0;
6171    for (const IVStrideUse &U : IU) {
6172      if (++NumUsers > MaxIVUsers) {
6173        (void)U;
6174        LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6175                          << "\n");
6176        return;
6177      }
6178      // Bail out if we have a PHI on an EHPad that gets a value from a
6179      // CatchSwitchInst.  Because the CatchSwitchInst cannot be split, there is
6180      // no good place to stick any instructions.
6181      if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6182         auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
6183         if (isa<FuncletPadInst>(FirstNonPHI) ||
6184             isa<CatchSwitchInst>(FirstNonPHI))
6185           for (BasicBlock *PredBB : PN->blocks())
6186             if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
6187               return;
6188      }
6189    }
6190  
6191    LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6192               L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6193               dbgs() << ":\n");
6194  
6195    // Configure SCEVExpander now, so that the correct mode is used for
6196    // isSafeToExpand() checks.
6197  #ifndef NDEBUG
6198    Rewriter.setDebugType(DEBUG_TYPE);
6199  #endif
6200    Rewriter.disableCanonicalMode();
6201    Rewriter.enableLSRMode();
6202  
6203    // First, perform some low-level loop optimizations.
6204    OptimizeShadowIV();
6205    OptimizeLoopTermCond();
6206  
6207    // If loop preparation eliminates all interesting IV users, bail.
6208    if (IU.empty()) return;
6209  
6210    // Skip nested loops until we can model them better with formulae.
6211    if (!L->isInnermost()) {
6212      LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6213      return;
6214    }
6215  
6216    // Start collecting data and preparing for the solver.
6217    // If the number of registers is not the major cost, we cannot benefit
6218    // from the current profitable chain optimization, which is based on the
6219    // number of registers.
6220    // FIXME: add profitable chain optimization for other kinds of major cost,
6221    // for example the number of instructions.
6222    if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6223      CollectChains();
6224    CollectInterestingTypesAndFactors();
6225    CollectFixupsAndInitialFormulae();
6226    CollectLoopInvariantFixupsAndFormulae();
6227  
6228    if (Uses.empty())
6229      return;
6230  
6231    LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6232               print_uses(dbgs()));
6233    LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6234               BaselineCost.print(dbgs()); dbgs() << "\n");
6235  
6236    // Now use the reuse data to generate a bunch of interesting ways
6237    // to formulate the values needed for the uses.
6238    GenerateAllReuseFormulae();
6239  
6240    FilterOutUndesirableDedicatedRegisters();
6241    NarrowSearchSpaceUsingHeuristics();
6242  
6243    SmallVector<const Formula *, 8> Solution;
6244    Solve(Solution);
6245  
6246    // Release memory that is no longer needed.
6247    Factors.clear();
6248    Types.clear();
6249    RegUses.clear();
6250  
6251    if (Solution.empty())
6252      return;
6253  
6254  #ifndef NDEBUG
6255    // Formulae should be legal.
6256    for (const LSRUse &LU : Uses) {
6257      for (const Formula &F : LU.Formulae)
6258        assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6259                          F) && "Illegal formula generated!");
6260    }
6261  #endif
6262  
6263    // Now that we've decided what we want, make it so.
6264    ImplementSolution(Solution);
6265  }
6266  
6267  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6268  void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6269    if (Factors.empty() && Types.empty()) return;
6270  
6271    OS << "LSR has identified the following interesting factors and types: ";
6272    bool First = true;
6273  
6274    for (int64_t Factor : Factors) {
6275      if (!First) OS << ", ";
6276      First = false;
6277      OS << '*' << Factor;
6278    }
6279  
6280    for (Type *Ty : Types) {
6281      if (!First) OS << ", ";
6282      First = false;
6283      OS << '(' << *Ty << ')';
6284    }
6285    OS << '\n';
6286  }
6287  
6288  void LSRInstance::print_fixups(raw_ostream &OS) const {
6289    OS << "LSR is examining the following fixup sites:\n";
6290    for (const LSRUse &LU : Uses)
6291      for (const LSRFixup &LF : LU.Fixups) {
6292        dbgs() << "  ";
6293        LF.print(OS);
6294        OS << '\n';
6295      }
6296  }
6297  
6298  void LSRInstance::print_uses(raw_ostream &OS) const {
6299    OS << "LSR is examining the following uses:\n";
6300    for (const LSRUse &LU : Uses) {
6301      dbgs() << "  ";
6302      LU.print(OS);
6303      OS << '\n';
6304      for (const Formula &F : LU.Formulae) {
6305        OS << "    ";
6306        F.print(OS);
6307        OS << '\n';
6308      }
6309    }
6310  }
6311  
6312  void LSRInstance::print(raw_ostream &OS) const {
6313    print_factors_and_types(OS);
6314    print_fixups(OS);
6315    print_uses(OS);
6316  }
6317  
6318  LLVM_DUMP_METHOD void LSRInstance::dump() const {
6319    print(errs()); errs() << '\n';
6320  }
6321  #endif
6322  
6323  namespace {
6324  
6325  class LoopStrengthReduce : public LoopPass {
6326  public:
6327    static char ID; // Pass ID, replacement for typeid
6328  
6329    LoopStrengthReduce();
6330  
6331  private:
6332    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6333    void getAnalysisUsage(AnalysisUsage &AU) const override;
6334  };
6335  
6336  } // end anonymous namespace
6337  
6338  LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6339    initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6340  }
6341  
6342  void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6343    // We split critical edges, so we change the CFG.  However, we do update
6344    // many analyses if they are around.
6345    AU.addPreservedID(LoopSimplifyID);
6346  
6347    AU.addRequired<LoopInfoWrapperPass>();
6348    AU.addPreserved<LoopInfoWrapperPass>();
6349    AU.addRequiredID(LoopSimplifyID);
6350    AU.addRequired<DominatorTreeWrapperPass>();
6351    AU.addPreserved<DominatorTreeWrapperPass>();
6352    AU.addRequired<ScalarEvolutionWrapperPass>();
6353    AU.addPreserved<ScalarEvolutionWrapperPass>();
6354    AU.addRequired<AssumptionCacheTracker>();
6355    AU.addRequired<TargetLibraryInfoWrapperPass>();
6356    // Requiring LoopSimplify a second time here prevents IVUsers from running
6357    // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6358    AU.addRequiredID(LoopSimplifyID);
6359    AU.addRequired<IVUsersWrapperPass>();
6360    AU.addPreserved<IVUsersWrapperPass>();
6361    AU.addRequired<TargetTransformInfoWrapperPass>();
6362    AU.addPreserved<MemorySSAWrapperPass>();
6363  }
6364  
6365  namespace {
6366  
6367  /// Enables more convenient iteration over a DWARF expression vector.
6368  static iterator_range<llvm::DIExpression::expr_op_iterator>
6369  ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6370    llvm::DIExpression::expr_op_iterator Begin =
6371        llvm::DIExpression::expr_op_iterator(Expr.begin());
6372    llvm::DIExpression::expr_op_iterator End =
6373        llvm::DIExpression::expr_op_iterator(Expr.end());
6374    return {Begin, End};
6375  }
6376  
6377  struct SCEVDbgValueBuilder {
6378    SCEVDbgValueBuilder() = default;
6379    SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6380  
6381    void clone(const SCEVDbgValueBuilder &Base) {
6382      LocationOps = Base.LocationOps;
6383      Expr = Base.Expr;
6384    }
6385  
6386    void clear() {
6387      LocationOps.clear();
6388      Expr.clear();
6389    }
6390  
6391    /// The DIExpression as we translate the SCEV.
6392    SmallVector<uint64_t, 6> Expr;
6393    /// The location ops of the DIExpression.
6394    SmallVector<Value *, 2> LocationOps;
6395  
6396    void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6397    void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6398  
6399    /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6400    /// in the set of values referenced by the expression.
6401    void pushLocation(llvm::Value *V) {
6402      Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6403      auto *It = llvm::find(LocationOps, V);
6404      unsigned ArgIndex = 0;
6405      if (It != LocationOps.end()) {
6406        ArgIndex = std::distance(LocationOps.begin(), It);
6407      } else {
6408        ArgIndex = LocationOps.size();
6409        LocationOps.push_back(V);
6410      }
6411      Expr.push_back(ArgIndex);
6412    }
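
          // For illustration (hypothetical values): pushing %a, then %b, then
          // %a again leaves LocationOps == [%a, %b] and appends
          //   DW_OP_LLVM_arg 0, DW_OP_LLVM_arg 1, DW_OP_LLVM_arg 0
          // to Expr; a value that is already tracked reuses its existing
          // argument index.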
6413  
6414    void pushValue(const SCEVUnknown *U) {
6415      llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6416      pushLocation(V);
6417    }
6418  
6419    bool pushConst(const SCEVConstant *C) {
6420      if (C->getAPInt().getSignificantBits() > 64)
6421        return false;
6422      Expr.push_back(llvm::dwarf::DW_OP_consts);
6423      Expr.push_back(C->getAPInt().getSExtValue());
6424      return true;
6425    }
6426  
6427    // Iterating the expression as DWARF ops is convenient when updating
6428    // DWARF_OP_LLVM_args.
6429    iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6430      return ToDwarfOpIter(Expr);
6431    }
6432  
6433    /// Several SCEV types are sequences of the same arithmetic operator applied
6434    /// to constants and values that may be extended or truncated.
6435    bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6436                            uint64_t DwarfOp) {
6437      assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6438             "Expected arithmetic SCEV type");
6439      bool Success = true;
6440      unsigned EmitOperator = 0;
6441      for (const auto &Op : CommExpr->operands()) {
6442        Success &= pushSCEV(Op);
6443  
6444        if (EmitOperator >= 1)
6445          pushOperator(DwarfOp);
6446        ++EmitOperator;
6447      }
6448      return Success;
6449    }
6450  
6451    // TODO: Identify and omit noop casts.
6452    bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6453      const llvm::SCEV *Inner = C->getOperand(0);
6454      const llvm::Type *Type = C->getType();
6455      uint64_t ToWidth = Type->getIntegerBitWidth();
6456      bool Success = pushSCEV(Inner);
6457      uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6458                            IsSigned ? llvm::dwarf::DW_ATE_signed
6459                                     : llvm::dwarf::DW_ATE_unsigned};
6460      for (const auto &Op : CastOps)
6461        pushOperator(Op);
6462      return Success;
6463    }
6464  
6465    // TODO: MinMax - although these haven't been encountered in the test suite.
6466    bool pushSCEV(const llvm::SCEV *S) {
6467      bool Success = true;
6468      if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6469        Success &= pushConst(StartInt);
6470  
6471      } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6472        if (!U->getValue())
6473          return false;
6474        pushLocation(U->getValue());
6475  
6476      } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6477        Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6478  
6479      } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6480        Success &= pushSCEV(UDiv->getLHS());
6481        Success &= pushSCEV(UDiv->getRHS());
6482        pushOperator(llvm::dwarf::DW_OP_div);
6483  
6484      } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6485        // Assert if a new and unknown SCEVCastExpr type is encountered.
6486        assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6487                isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6488               "Unexpected cast type in SCEV.");
6489        Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6490  
6491      } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6492        Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6493  
6494      } else if (isa<SCEVAddRecExpr>(S)) {
6495        // Nested SCEVAddRecExpr are generated by nested loops and are currently
6496        // unsupported.
6497        return false;
6498  
6499      } else {
6500        return false;
6501      }
6502      return Success;
6503    }
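
          // For illustration (hypothetical SCEV): pushSCEV of (1 + 4 * %n)
          // emits a postfix sequence roughly like
          //   DW_OP_consts 1, DW_OP_consts 4, DW_OP_LLVM_arg 0, DW_OP_mul,
          //   DW_OP_plus
          // with %n recorded as location op 0.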
6504  
6505    /// Return true if the combination of arithmetic operator and underlying
6506    /// SCEV constant value is an identity function.
6507    bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6508      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6509        if (C->getAPInt().getSignificantBits() > 64)
6510          return false;
6511        int64_t I = C->getAPInt().getSExtValue();
6512        switch (Op) {
6513        case llvm::dwarf::DW_OP_plus:
6514        case llvm::dwarf::DW_OP_minus:
6515          return I == 0;
6516        case llvm::dwarf::DW_OP_mul:
6517        case llvm::dwarf::DW_OP_div:
6518          return I == 1;
6519        }
6520      }
6521      return false;
6522    }
6523  
6524    /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6525    /// builder's expression stack. The stack should already contain an
6526    /// expression for the iteration count, so that it can be multiplied by
6527    /// the stride and added to the start.
6528    /// Components of the expression are omitted if they are an identity function.
6529    /// Chain (non-affine) SCEVs are not supported.
6530    bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6531      assert(SAR.isAffine() && "Expected affine SCEV");
6532      // TODO: Is this check needed?
6533      if (isa<SCEVAddRecExpr>(SAR.getStart()))
6534        return false;
6535  
6536      const SCEV *Start = SAR.getStart();
6537      const SCEV *Stride = SAR.getStepRecurrence(SE);
6538  
6539      // Skip pushing arithmetic noops.
6540      if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6541        if (!pushSCEV(Stride))
6542          return false;
6543        pushOperator(llvm::dwarf::DW_OP_mul);
6544      }
6545      if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6546        if (!pushSCEV(Start))
6547          return false;
6548        pushOperator(llvm::dwarf::DW_OP_plus);
6549      }
6550      return true;
6551    }
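
          // For illustration (hypothetical SCEV {%start,+,4}, iteration count
          // already on the stack): this appends roughly
          //   DW_OP_consts 4, DW_OP_mul, DW_OP_LLVM_arg <start>, DW_OP_plus
          // so the expression evaluates to itercount * 4 + %start.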
6552  
6553    /// Create an expression that is an offset from a value (usually the IV).
6554    void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6555      pushLocation(OffsetValue);
6556      DIExpression::appendOffset(Expr, Offset);
6557      LLVM_DEBUG(
6558          dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6559                 << std::to_string(Offset) << "\n");
6560    }
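
          // For illustration (hypothetical offset 8 from %iv as the first
          // location op): the resulting expression is roughly
          //   DW_OP_LLVM_arg 0, DW_OP_plus_uconst 8
          // since appendOffset folds a positive constant into
          // DW_OP_plus_uconst.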
6561  
6562    /// Combine a translation of the SCEV and the IV to create an expression that
6563    /// recovers a location's value.
6564    /// returns true if an expression was created.
6565    bool createIterCountExpr(const SCEV *S,
6566                             const SCEVDbgValueBuilder &IterationCount,
6567                             ScalarEvolution &SE) {
6568      // SCEVs for SSA values are most frequently of the form
6569      // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6570      // This is because %a is a PHI node that is not the IV. However, these
6571      // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6572      // so it's not expected that this point will be reached.
6573      if (!isa<SCEVAddRecExpr>(S))
6574        return false;
6575  
6576      LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6577                        << '\n');
6578  
6579      const auto *Rec = cast<SCEVAddRecExpr>(S);
6580      if (!Rec->isAffine())
6581        return false;
6582  
6583      if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6584        return false;
6585  
6586      // Initialise a new builder with the iteration count expression. In
6587      // combination with the value's SCEV this enables recovery.
6588      clone(IterationCount);
6589      if (!SCEVToValueExpr(*Rec, SE))
6590        return false;
6591  
6592      return true;
6593    }
6594  
6595    /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6596    /// builder's expression stack. The stack should already contain an
6597    /// expression for the iteration count, so that it can be multiplied by
6598    /// the stride and added to the start.
6599    /// Components of the expression are omitted if they are an identity function.
6600    bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6601                             ScalarEvolution &SE) {
6602      assert(SAR.isAffine() && "Expected affine SCEV");
6603      if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6604        LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6605                          << SAR << '\n');
6606        return false;
6607      }
6608      const SCEV *Start = SAR.getStart();
6609      const SCEV *Stride = SAR.getStepRecurrence(SE);
6610  
6611      // Skip pushing arithmetic noops.
6612      if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6613        if (!pushSCEV(Start))
6614          return false;
6615        pushOperator(llvm::dwarf::DW_OP_minus);
6616      }
6617      if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6618        if (!pushSCEV(Stride))
6619          return false;
6620        pushOperator(llvm::dwarf::DW_OP_div);
6621      }
6622      return true;
6623    }
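
          // For illustration (hypothetical IV SCEV {%start,+,4}, with the IV
          // itself already on the stack): this appends roughly
          //   DW_OP_LLVM_arg <start>, DW_OP_minus, DW_OP_consts 4, DW_OP_div
          // recovering the iteration count as (%iv - %start) / 4.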
6624  
6625    // Append the current expression and locations to a location list and an
6626    // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6627    // the locations already present in the destination list.
6628    void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6629                         SmallVectorImpl<Value *> &DestLocations) {
6630      assert(!DestLocations.empty() &&
6631             "Expected the locations vector to contain the IV");
6632      // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6633      // modified to account for the locations already in the destination vector.
6634      // All builders contain the IV as the first location op.
6635      assert(!LocationOps.empty() &&
6636             "Expected the location ops to contain the IV.");
6637      // DestIndexMap[n] contains the index in DestLocations for the nth
6638      // location in this SCEVDbgValueBuilder.
6639      SmallVector<uint64_t, 2> DestIndexMap;
6640      for (const auto &Op : LocationOps) {
6641        auto It = find(DestLocations, Op);
6642        if (It != DestLocations.end()) {
6643          // Location already exists in DestLocations, reuse existing ArgIndex.
6644          DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6645          continue;
6646        }
6647        // Location is not in DestLocations, add it.
6648        DestIndexMap.push_back(DestLocations.size());
6649        DestLocations.push_back(Op);
6650      }
6651  
6652      for (const auto &Op : expr_ops()) {
6653        if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6654          Op.appendToVector(DestExpr);
6655          continue;
6656        }
6657  
6658        DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6659        // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6660        // DestIndexMap[n] contains its new index in DestLocations.
6661        uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6662        DestExpr.push_back(NewIndex);
6663      }
6664    }
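
          // For illustration (hypothetical values): if LocationOps here is
          // [%iv, %a] and DestLocations already holds [%iv, %b], then %iv
          // reuses index 0 and %a is appended at index 2, so a local
          // DW_OP_LLVM_arg 1 is rewritten to DW_OP_LLVM_arg 2 in DestExpr.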
6665  };
6666  
6667  /// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6668  /// and DIExpression.
6669  struct DVIRecoveryRec {
6670    DVIRecoveryRec(DbgValueInst *DbgValue)
6671        : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6672          HadLocationArgList(false) {}
6673    DVIRecoveryRec(DbgVariableRecord *DVR)
6674        : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6675  
6676    PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6677    DIExpression *Expr;
6678    bool HadLocationArgList;
6679    SmallVector<WeakVH, 2> LocationOps;
6680    SmallVector<const llvm::SCEV *, 2> SCEVs;
6681    SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6682  
6683    void clear() {
6684      for (auto &RE : RecoveryExprs)
6685        RE.reset();
6686      RecoveryExprs.clear();
6687    }
6688  
6689    ~DVIRecoveryRec() { clear(); }
6690  };
6691  } // namespace
6692  
6693  /// Returns the total number of DW_OP_llvm_arg operands in the expression.
6694  /// This helps in determining if a DIArglist is necessary or can be omitted from
6695  /// the dbg.value.
6696  static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6697    auto expr_ops = ToDwarfOpIter(Expr);
6698    unsigned Count = 0;
6699    for (auto Op : expr_ops)
6700      if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6701        Count++;
6702    return Count;
6703  }
6704  
6705  /// Overwrites DVI with the location and Ops as the DIExpression. This will
6706  /// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6707  /// because a DIArglist is not created for the first argument of the dbg.value.
6708  template <typename T>
6709  static void updateDVIWithLocation(T &DbgVal, Value *Location,
6710                                    SmallVectorImpl<uint64_t> &Ops) {
6711    assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6712                                      "contain any DW_OP_llvm_arg operands.");
6713    DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6714    DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6716  }
6717  
6718  /// Overwrite DVI with locations placed into a DIArglist.
6719  template <typename T>
6720  static void updateDVIWithLocations(T &DbgVal,
6721                                     SmallVectorImpl<Value *> &Locations,
6722                                     SmallVectorImpl<uint64_t> &Ops) {
6723    assert(numLLVMArgOps(Ops) != 0 &&
6724           "Expected expression that references DIArglist locations using "
6725           "DW_OP_llvm_arg operands.");
6726    SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6727    for (Value *V : Locations)
6728      MetadataLocs.push_back(ValueAsMetadata::get(V));
6729    auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6730    DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6731    DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6732  }
6733  
6734  /// Write the new expression and new location ops for the dbg.value. If possible
6735  /// reduce the size of the dbg.value intrinsic by omitting DIArglist. This
6736  /// can be omitted if:
6737  /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6738  /// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6739  static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6740                                 SmallVectorImpl<Value *> &NewLocationOps,
6741                                 SmallVectorImpl<uint64_t> &NewExpr) {
6742    auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6743      unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6744      if (NumLLVMArgs == 0) {
6745        // Location assumed to be on the stack.
6746        updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6747      } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6748        // There is only a single DW_OP_llvm_arg at the start of the expression,
6749        // so it can be omitted along with DIArglist.
6750        assert(NewExpr[1] == 0 &&
6751               "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6752        llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6753        updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6754      } else {
6755        // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6756        updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6757      }
6758  
6759      // If the DIExpression was previously empty then add the stack terminator.
6760      // Non-empty expressions have only had elements inserted into them and so
6761      // the terminator should already be present e.g. stack_value or fragment.
6762      DIExpression *SalvageExpr = DbgVal->getExpression();
6763      if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6764        SalvageExpr =
6765            DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6766        DbgVal->setExpression(SalvageExpr);
6767      }
6768    };
6769    if (isa<DbgValueInst *>(DVIRec.DbgRef))
6770      UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6771    else
6772      UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6773  }
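
      // For illustration (hypothetical expression): a new expression of
      //   DW_OP_LLVM_arg 0, DW_OP_plus_uconst 8
      // with a single location op is shortened to DW_OP_plus_uconst 8 applied
      // directly to that location, avoiding a DIArgList; expressions with
      // several DW_OP_llvm_arg operands keep the DIArgList form.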
6774  
6775  /// Cached location ops may be erased during LSR, in which case a poison is
6776  /// required when restoring from the cache. The type of that location is no
6777  /// longer available, so just use int8. The poison will be replaced by one or
6778  /// more locations later when a SCEVDbgValueBuilder selects alternative
6779  /// locations to use for the salvage.
6780  static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6781    return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6782  }
6783  
6784  /// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6785  static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6786    auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6787      LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6788                        << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6789      assert(DVIRec.Expr && "Expected an expression");
6790      DbgVal->setExpression(DVIRec.Expr);
6791  
6792      // Even a single location-op may be inside a DIArgList and referenced with
6793      // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6794      if (!DVIRec.HadLocationArgList) {
6795        assert(DVIRec.LocationOps.size() == 1 &&
6796               "Unexpected number of location ops.");
6797        // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6798        // this case was not present before, so force the location back to a
6799        // single uncontained Value.
6800        Value *CachedValue =
6801            getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6802        DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6803      } else {
6804        SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6805        for (WeakVH VH : DVIRec.LocationOps) {
6806          Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6807          MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6808        }
6809        auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6810        DbgVal->setRawLocation(
6811            llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6812      }
6813      LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6814    };
6815    if (isa<DbgValueInst *>(DVIRec.DbgRef))
6816      RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6817    else
6818      RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6819  }
6820  
6821  static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6822                         llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6823                         const SCEV *SCEVInductionVar,
6824                         SCEVDbgValueBuilder IterCountExpr) {
6825  
6826    if (isa<DbgValueInst *>(DVIRec.DbgRef)
6827            ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6828            : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6829      return false;
6830  
6831    // LSR may have caused several changes to the dbg.value in the failed salvage
6832    // attempt. So restore the DIExpression, the location ops and also the
6833    // location ops format, which is always DIArglist for multiple ops, but only
6834    // sometimes for a single op.
6835    restorePreTransformState(DVIRec);
6836  
6837    // LocationOpIndexMap[i] will store the post-LSR location index of
6838    // the non-optimised out location at pre-LSR index i.
6839    SmallVector<int64_t, 2> LocationOpIndexMap;
6840    LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6841    SmallVector<Value *, 2> NewLocationOps;
6842    NewLocationOps.push_back(LSRInductionVar);
6843  
6844    for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6845      WeakVH VH = DVIRec.LocationOps[i];
6846      // Place the locations not optimised out in the list first, avoiding
6847      // inserts later. The map is used to update the DIExpression's
6848      // DW_OP_LLVM_arg arguments as the expression is updated.
6849      if (VH && !isa<UndefValue>(VH)) {
6850        NewLocationOps.push_back(VH);
6851        LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6852        LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6853                          << " now at index " << LocationOpIndexMap[i] << "\n");
6854        continue;
6855      }
6856  
6857      // It's possible that a value referred to in the SCEV may have been
6858      // optimised out by LSR.
6859      if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6860          SE.containsUndefs(DVIRec.SCEVs[i])) {
6861        LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6862                          << " refers to a location that is now undef or erased. "
6863                             "Salvage abandoned.\n");
6864        return false;
6865      }
6866  
6867      LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6868                        << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6869  
6870      DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6871      SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6872  
6873      // Create an offset-based salvage expression if possible, as it requires
6874      // less DWARF ops than an iteration count-based expression.
6875      if (std::optional<APInt> Offset =
6876              SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6877        if (Offset->getSignificantBits() <= 64)
6878          SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6879      } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6880                                                   SE))
6881        return false;
6882    }
6883  
6884    // Merge the DbgValueBuilder generated expressions and the original
6885    // DIExpression, and place the result into a new vector.
6886    SmallVector<uint64_t, 3> NewExpr;
6887    if (DVIRec.Expr->getNumElements() == 0) {
6888      assert(DVIRec.RecoveryExprs.size() == 1 &&
6889             "Expected only a single recovery expression for an empty "
6890             "DIExpression.");
6891      assert(DVIRec.RecoveryExprs[0] &&
6892             "Expected a SCEVDbgSalvageBuilder for location 0");
6893      SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6894      B->appendToVectors(NewExpr, NewLocationOps);
6895    }
6896    for (const auto &Op : DVIRec.Expr->expr_ops()) {
6897      // Most Ops needn't be updated.
6898      if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6899        Op.appendToVector(NewExpr);
6900        continue;
6901      }
6902  
6903      uint64_t LocationArgIndex = Op.getArg(0);
6904      SCEVDbgValueBuilder *DbgBuilder =
6905          DVIRec.RecoveryExprs[LocationArgIndex].get();
6906      // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6907      // optimise it away. So just translate the argument to the updated
6908      // location index.
6909      if (!DbgBuilder) {
6910        NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6911        assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6912               "Expected a positive index for the location-op position.");
6913        NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6914        continue;
6915      }
6916      // The location has a recovery expression.
6917      DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6918    }
6919  
6920    UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6921    if (isa<DbgValueInst *>(DVIRec.DbgRef))
6922      LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6923                        << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6924    else
6925      LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6926                        << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6927    return true;
6928  }
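
      // For illustration (schematic, hypothetical names): a dbg.value whose
      // operand %dead.iv was eliminated by LSR, e.g.
      //   dbg.value(metadata i64 %dead.iv, metadata !12, metadata !DIExpression())
      // may be rewritten in terms of the surviving LSR IV, e.g.
      //   dbg.value(metadata i64 %lsr.iv, metadata !12,
      //             metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value))
      // when the two SCEVs differ by a constant offset (here 4).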
6929  
6930  /// Obtain an expression for the iteration count, then attempt to salvage the
6931  /// dbg.value intrinsics.
6932  static void DbgRewriteSalvageableDVIs(
6933      llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6934      SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6935    if (DVIToUpdate.empty())
6936      return;
6937  
6938    const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6939    assert(SCEVInductionVar &&
6940           "Anticipated a SCEV for the post-LSR induction variable");
6941  
6942    if (const SCEVAddRecExpr *IVAddRec =
6943            dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6944      if (!IVAddRec->isAffine())
6945        return;
6946  
6947      // Prevent translation using excessive resources.
6948      if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6949        return;
6950  
6951      // The iteration count is required to recover location values.
6952      SCEVDbgValueBuilder IterCountExpr;
6953      IterCountExpr.pushLocation(LSRInductionVar);
6954      if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6955        return;
6956  
6957      LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6958                        << '\n');
6959  
6960      for (auto &DVIRec : DVIToUpdate) {
6961        SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6962                   IterCountExpr);
6963      }
6964    }
6965  }
6966  
6967  /// Identify and cache salvageable DVI locations and expressions along with the
6968  /// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6969  /// caching and salvaging.
6970  static void DbgGatherSalvagableDVI(
6971      Loop *L, ScalarEvolution &SE,
6972      SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6973      SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6974    for (const auto &B : L->getBlocks()) {
6975      for (auto &I : *B) {
6976        auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6977          // Ensure that the dbg.value is not cached if any of its location
6978          // ops is undef.
6979          if (DbgVal->isKillLocation())
6980            return false;
6981  
6982          // Check that the location op SCEVs are suitable for translation to
6983          // DIExpression.
6984          const auto &HasTranslatableLocationOps =
6985              [&](const auto *DbgValToTranslate) -> bool {
6986            for (const auto LocOp : DbgValToTranslate->location_ops()) {
6987              if (!LocOp)
6988                return false;
6989  
6990              if (!SE.isSCEVable(LocOp->getType()))
6991                return false;
6992  
6993              const SCEV *S = SE.getSCEV(LocOp);
6994              if (SE.containsUndefs(S))
6995                return false;
6996            }
6997            return true;
6998          };
6999  
7000          if (!HasTranslatableLocationOps(DbgVal))
7001            return false;
7002  
7003          std::unique_ptr<DVIRecoveryRec> NewRec =
7004              std::make_unique<DVIRecoveryRec>(DbgVal);
7005          // Each location Op may need a SCEVDbgValueBuilder in order to recover
7006          // it. Pre-allocating a vector will enable quick lookups of the builder
7007          // later during the salvage.
7008          NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
7009          for (const auto LocOp : DbgVal->location_ops()) {
7010            NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7011            NewRec->LocationOps.push_back(LocOp);
7012            NewRec->HadLocationArgList = DbgVal->hasArgList();
7013          }
7014          SalvageableDVISCEVs.push_back(std::move(NewRec));
7015          return true;
7016        };
7017        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
7018          if (DVR.isDbgValue() || DVR.isDbgAssign())
7019            ProcessDbgValue(&DVR);
7020        }
7021        auto DVI = dyn_cast<DbgValueInst>(&I);
7022        if (!DVI)
7023          continue;
7024        if (ProcessDbgValue(DVI))
7025          DVIHandles.insert(DVI);
7026      }
7027    }
7028  }
7029  
7030  /// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7031  /// any PHI from the loop header is usable, but may have less chance of
7032  /// surviving subsequent transforms.
7033  static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7034                                             const LSRInstance &LSR) {
7035  
7036    auto IsSuitableIV = [&](PHINode *P) {
7037      if (!SE.isSCEVable(P->getType()))
7038        return false;
7039      if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7040        return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7041      return false;
7042    };
7043  
7044    // For now, just pick the first IV that was generated and inserted by
7045    // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7046    // by subsequent transforms.
7047    for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7048      if (!IV)
7049        continue;
7050  
7051      // There should only be PHI node IVs.
7052      PHINode *P = cast<PHINode>(&*IV);
7053  
7054      if (IsSuitableIV(P))
7055        return P;
7056    }
7057  
7058    for (PHINode &P : L.getHeader()->phis()) {
7059      if (IsSuitableIV(&P))
7060        return &P;
7061    }
7062    return nullptr;
7063  }
7064  
7065  static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>>
7066  canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
7067                        const LoopInfo &LI, const TargetTransformInfo &TTI) {
7068    if (!L->isInnermost()) {
7069      LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n");
7070      return std::nullopt;
7071    }
7072    // Only inspect on simple loop structure
7073    if (!L->isLoopSimplifyForm()) {
7074      LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n");
7075      return std::nullopt;
7076    }
7077  
7078    if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
7079      LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n");
7080      return std::nullopt;
7081    }
7082  
7083    BasicBlock *LoopLatch = L->getLoopLatch();
7084    BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
7085    if (!BI || BI->isUnconditional())
7086      return std::nullopt;
7087    auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
7088    if (!TermCond) {
7089      LLVM_DEBUG(
7090          dbgs() << "Cannot fold on branching condition that is not an ICmpInst");
7091      return std::nullopt;
7092    }
7093    if (!TermCond->hasOneUse()) {
7094      LLVM_DEBUG(
7095          dbgs()
7096          << "Cannot replace terminating condition with more than one use\n");
7097      return std::nullopt;
7098    }
7099  
7100    BinaryOperator *LHS = dyn_cast<BinaryOperator>(TermCond->getOperand(0));
7101    Value *RHS = TermCond->getOperand(1);
7102    if (!LHS || !L->isLoopInvariant(RHS))
7103      // We could pattern match the inverse form of the icmp, but that is
7104      // non-canonical, and this pass is running *very* late in the pipeline.
7105      return std::nullopt;
7106  
7107    // Find the IV used by the current exit condition.
7108    PHINode *ToFold;
7109    Value *ToFoldStart, *ToFoldStep;
7110    if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep))
7111      return std::nullopt;
7112  
7113    // Ensure the simple recurrence is a part of the current loop.
7114    if (ToFold->getParent() != L->getHeader())
7115      return std::nullopt;
7116  
7117    // If that IV isn't dead after we rewrite the exit condition in terms of
7118    // another IV, there's no point in doing the transform.
7119    if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond))
7120      return std::nullopt;
7121  
7122    // Inserting instructions in the preheader has a runtime cost, so scale
7123    // the allowed cost with the loop's trip count as best we can.
7124    const unsigned ExpansionBudget = [&]() {
7125      unsigned Budget = 2 * SCEVCheapExpansionBudget;
7126      if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L))
7127        return std::min(Budget, SmallTC);
7128      if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L))
7129        return std::min(Budget, *SmallTC);
7130      // Unknown trip count, assume long running by default.
7131      return Budget;
7132    }();
7133  
7134    const SCEV *BECount = SE.getBackedgeTakenCount(L);
7135    const DataLayout &DL = L->getHeader()->getDataLayout();
7136    SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7137  
7138    PHINode *ToHelpFold = nullptr;
7139    const SCEV *TermValueS = nullptr;
7140    bool MustDropPoison = false;
7141    auto InsertPt = L->getLoopPreheader()->getTerminator();
7142    for (PHINode &PN : L->getHeader()->phis()) {
7143      if (ToFold == &PN)
7144        continue;
7145  
7146      if (!SE.isSCEVable(PN.getType())) {
7147        LLVM_DEBUG(dbgs() << "IV of phi '" << PN
7148                          << "' is not SCEV-able, not qualified for the "
7149                             "terminating condition folding.\n");
7150        continue;
7151      }
7152      const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
7153      // Only speculate on affine AddRec
7154      if (!AddRec || !AddRec->isAffine()) {
7155        LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN
7156                          << "' is not an affine add recursion, not qualified "
7157                             "for the terminating condition folding.\n");
7158        continue;
7159      }
7160  
7161      // Check that we can compute the value of AddRec on the exiting iteration
7162      // without soundness problems.  evaluateAtIteration internally needs
7163      // to multiply the stride by the iteration number - which may wrap around.
7164      // The issue here is subtle because computing the result accounting for
7165      // wrap is insufficient. In order to use the result in an exit test, we
7166      // must also know that AddRec doesn't take the same value on any previous
7167      // iteration. The simplest case to consider is a candidate IV which is
7168      // narrower than the trip count (and thus original IV), but this can
7169      // also happen due to non-unit strides on the candidate IVs.
7170      if (!AddRec->hasNoSelfWrap() ||
7171          !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
7172        continue;
7173  
7174      const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
7175      const SCEV *TermValueSLocal = PostInc->evaluateAtIteration(BECount, SE);
7176      if (!Expander.isSafeToExpand(TermValueSLocal)) {
7177        LLVM_DEBUG(
7178            dbgs() << "Is not safe to expand terminating value for phi node" << PN
7179                   << "\n");
7180        continue;
7181      }
7182  
7183      if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget,
7184                                       &TTI, InsertPt)) {
7185        LLVM_DEBUG(
7186            dbgs() << "Is too expensive to expand terminating value for phi node"
7187                   << PN << "\n");
7188        continue;
7189      }
7190  
7191      // The candidate IV may have been otherwise dead and poison from the
7192      // very first iteration.  If we can't disprove that, we can't use the IV.
7193      if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) {
7194        LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV "
7195                          << PN << "\n");
7196        continue;
7197      }
7198  
7199      // The candidate IV may become poison on the last iteration.  If this
7200      // value is not branched on, this is a well defined program.  We're
7201      // about to add a new use to this IV, and we have to ensure we don't
7202      // insert UB which didn't previously exist.
7203      bool MustDropPoisonLocal = false;
7204      Instruction *PostIncV =
7205        cast<Instruction>(PN.getIncomingValueForBlock(LoopLatch));
7206      if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(),
7207                                         &DT)) {
7208        LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use"
7209                          << PN << "\n");
7210  
7211        // If this is a complex recurrence with multiple instructions computing
7212        // the backedge value, we might need to strip poison flags from all of
7213        // them.
7214        if (PostIncV->getOperand(0) != &PN)
7215          continue;
7216  
7217        // In order to perform the transform, we need to drop the poison generating
7218        // flags on this instruction (if any).
7219        MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags();
7220      }
7221  
7222      // We pick the last legal alternate IV.  We could explore choosing an optimal
7223      // alternate IV if we had a decent heuristic to do so.
7224      ToHelpFold = &PN;
7225      TermValueS = TermValueSLocal;
7226      MustDropPoison = MustDropPoisonLocal;
7227    }
7228  
7229    LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs()
7230                   << "Cannot find other AddRec IV to help folding\n";);
7231  
7232    LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs()
7233               << "\nFound loop that can fold terminating condition\n"
7234               << "  BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n"
7235               << "  TermCond: " << *TermCond << "\n"
7236               << "  BranchInst: " << *BI << "\n"
7237               << "  ToFold: " << *ToFold << "\n"
7238               << "  ToHelpFold: " << *ToHelpFold << "\n");
7239  
7240    if (!ToFold || !ToHelpFold)
7241      return std::nullopt;
7242    return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison);
7243  }
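
      // For illustration (schematic IR, hypothetical names): given an exit
      // test on an otherwise-dead counter,
      //   %i = phi i64 [ 0, %preheader ], [ %i.next, %latch ]
      //   %p = phi ptr [ %base, %preheader ], [ %p.next, %latch ]
      //   ...
      //   %i.next = add i64 %i, 1
      //   %cmp = icmp eq i64 %i.next, %n
      // the caller expands the value %p takes after the final iteration (say
      // %p.end) into the preheader and rewrites the exit test as
      //   %cmp = icmp eq ptr %p.next, %p.end
      // so that %i and %i.next become dead and can be removed.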
7244  
7245  static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7246                                 DominatorTree &DT, LoopInfo &LI,
7247                                 const TargetTransformInfo &TTI,
7248                                 AssumptionCache &AC, TargetLibraryInfo &TLI,
7249                                 MemorySSA *MSSA) {
7250  
7251    // Debug preservation - before we start removing anything, identify which
7252    // DVIs meet the salvageable criteria and store their DIExpression and SCEVs.
7253    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7254    SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7255    DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7256  
7257    bool Changed = false;
7258    std::unique_ptr<MemorySSAUpdater> MSSAU;
7259    if (MSSA)
7260      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7261  
7262    // Run the main LSR transformation.
7263    const LSRInstance &Reducer =
7264        LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7265    Changed |= Reducer.getChanged();
7266  
7267    // Remove any extra phis created by processing inner loops.
7268    Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7269    if (EnablePhiElim && L->isLoopSimplifyForm()) {
7270      SmallVector<WeakTrackingVH, 16> DeadInsts;
7271      const DataLayout &DL = L->getHeader()->getDataLayout();
7272      SCEVExpander Rewriter(SE, DL, "lsr", false);
7273  #ifndef NDEBUG
7274      Rewriter.setDebugType(DEBUG_TYPE);
7275  #endif
7276      unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7277      Rewriter.clear();
7278      if (numFolded) {
7279        Changed = true;
7280        RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7281                                                             MSSAU.get());
7282        DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7283      }
7284    }
7285    // LSR may at times remove all uses of an induction variable from a loop.
7286    // The only remaining use is the PHI in the exit block.
7287    // When this is the case, if the exit value of the IV can be calculated using
7288    // SCEV, we can replace the exit block PHI with the final value of the IV and
7289    // skip the updates in each loop iteration.
7290    if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7291      SmallVector<WeakTrackingVH, 16> DeadInsts;
7292      const DataLayout &DL = L->getHeader()->getDataLayout();
7293      SCEVExpander Rewriter(SE, DL, "lsr", true);
7294      int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7295                                           UnusedIndVarInLoop, DeadInsts);
7296      Rewriter.clear();
7297      if (Rewrites) {
7298        Changed = true;
7299        RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7300                                                             MSSAU.get());
7301        DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7302      }
7303    }
7304  
7305    const bool EnableFormTerm = [&] {
7306      switch (AllowTerminatingConditionFoldingAfterLSR) {
7307      case cl::BOU_TRUE:
7308        return true;
7309      case cl::BOU_FALSE:
7310        return false;
7311      case cl::BOU_UNSET:
7312        return TTI.shouldFoldTerminatingConditionAfterLSR();
7313      }
7314      llvm_unreachable("Unhandled cl::boolOrDefault enum");
7315    }();
7316  
7317    if (EnableFormTerm) {
7318      if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) {
7319        auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
7320  
7321        Changed = true;
7322        NumTermFold++;
7323  
7324        BasicBlock *LoopPreheader = L->getLoopPreheader();
7325        BasicBlock *LoopLatch = L->getLoopLatch();
7326  
7327        (void)ToFold;
7328        LLVM_DEBUG(dbgs() << "To fold phi-node:\n"
7329                          << *ToFold << "\n"
7330                          << "New term-cond phi-node:\n"
7331                          << *ToHelpFold << "\n");
7332  
7333        Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader);
7334        (void)StartValue;
7335        Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch);
7336  
7337        // See comment in canFoldTermCondOfLoop on why this is sufficient.
7338        if (MustDrop)
7339          cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags();
7340  
7341        // SCEVExpander for use in both the preheader and the latch
7342        const DataLayout &DL = L->getHeader()->getDataLayout();
7343        SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7344  
7345        assert(Expander.isSafeToExpand(TermValueS) &&
7346               "Terminating value was checked safe in canFoldTermCondOfLoop");
7347  
7348        // Create new terminating value at loop preheader
7349        Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
7350                                                  LoopPreheader->getTerminator());
7351  
7352        LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n"
7353                          << *StartValue << "\n"
7354                          << "Terminating value of new term-cond phi-node:\n"
7355                          << *TermValue << "\n");
7356  
7357        // Create new terminating condition at loop latch
7358        BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
7359        ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
7360        IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
7361        Value *NewTermCond =
7362            LatchBuilder.CreateICmp(CmpInst::ICMP_EQ, LoopValue, TermValue,
7363                                    "lsr_fold_term_cond.replaced_term_cond");
7364        // Swap successors to exit the loop body if the IV equals the new TermValue
7365        if (BI->getSuccessor(0) == L->getHeader())
7366          BI->swapSuccessors();
7367  
7368        LLVM_DEBUG(dbgs() << "Old term-cond:\n"
7369                          << *OldTermCond << "\n"
7370                          << "New term-cond:\n" << *NewTermCond << "\n");
7371  
7372        BI->setCondition(NewTermCond);
7373  
7374        Expander.clear();
7375        OldTermCond->eraseFromParent();
7376        DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7377      }
7378    }
7379  
7380    if (SalvageableDVIRecords.empty())
7381      return Changed;
7382  
7383    // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7384    // expressions composed using the derived iteration count.
7385    // TODO: Allow for multiple IV references for nested AddRecSCEVs
7386    for (const auto &L : LI) {
7387      if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7388        DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7389      else {
7390        LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7391                             "could not be identified.\n");
7392      }
7393    }
7394  
7395    for (auto &Rec : SalvageableDVIRecords)
7396      Rec->clear();
7397    SalvageableDVIRecords.clear();
7398    DVIHandles.clear();
7399    return Changed;
7400  }
7401  
7402  bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7403    if (skipLoop(L))
7404      return false;
7405  
7406    auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7407    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7408    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7409    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7410    const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7411        *L->getHeader()->getParent());
7412    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7413        *L->getHeader()->getParent());
7414    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7415        *L->getHeader()->getParent());
7416    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7417    MemorySSA *MSSA = nullptr;
7418    if (MSSAAnalysis)
7419      MSSA = &MSSAAnalysis->getMSSA();
7420    return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7421  }
7422  
7423  PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7424                                                LoopStandardAnalysisResults &AR,
7425                                                LPMUpdater &) {
7426    if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7427                            AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7428      return PreservedAnalyses::all();
7429  
7430    auto PA = getLoopPassPreservedAnalyses();
7431    if (AR.MSSA)
7432      PA.preserve<MemorySSAAnalysis>();
7433    return PA;
7434  }
7435  
7436  char LoopStrengthReduce::ID = 0;
7437  
7438  INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7439                        "Loop Strength Reduction", false, false)
7440  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7441  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7442  INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7443  INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7444  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7445  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7446  INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7447                      "Loop Strength Reduction", false, false)
7448  
7449  Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
7450