xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp (revision a7beca6fb113986839de73b7cf73d933464898c6)
1  //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // This pass converts selects to conditional jumps when profitable.
10  //
11  //===----------------------------------------------------------------------===//
12  
13  #include "llvm/CodeGen/SelectOptimize.h"
14  #include "llvm/ADT/SmallVector.h"
15  #include "llvm/ADT/Statistic.h"
16  #include "llvm/Analysis/BlockFrequencyInfo.h"
17  #include "llvm/Analysis/BranchProbabilityInfo.h"
18  #include "llvm/Analysis/LoopInfo.h"
19  #include "llvm/Analysis/OptimizationRemarkEmitter.h"
20  #include "llvm/Analysis/ProfileSummaryInfo.h"
21  #include "llvm/Analysis/TargetTransformInfo.h"
22  #include "llvm/CodeGen/Passes.h"
23  #include "llvm/CodeGen/TargetLowering.h"
24  #include "llvm/CodeGen/TargetPassConfig.h"
25  #include "llvm/CodeGen/TargetSchedule.h"
26  #include "llvm/CodeGen/TargetSubtargetInfo.h"
27  #include "llvm/IR/BasicBlock.h"
28  #include "llvm/IR/Dominators.h"
29  #include "llvm/IR/Function.h"
30  #include "llvm/IR/IRBuilder.h"
31  #include "llvm/IR/Instruction.h"
32  #include "llvm/IR/PatternMatch.h"
33  #include "llvm/IR/ProfDataUtils.h"
34  #include "llvm/InitializePasses.h"
35  #include "llvm/Pass.h"
36  #include "llvm/Support/ScaledNumber.h"
37  #include "llvm/Target/TargetMachine.h"
38  #include "llvm/Transforms/Utils/SizeOpts.h"
39  #include <algorithm>
40  #include <memory>
41  #include <queue>
42  #include <stack>
43  
44  using namespace llvm;
45  using namespace llvm::PatternMatch;
46  
47  #define DEBUG_TYPE "select-optimize"
48  
49  STATISTIC(NumSelectOptAnalyzed,
50            "Number of select groups considered for conversion to branch");
51  STATISTIC(NumSelectConvertedExpColdOperand,
52            "Number of select groups converted due to expensive cold operand");
53  STATISTIC(NumSelectConvertedHighPred,
54            "Number of select groups converted due to high-predictability");
55  STATISTIC(NumSelectUnPred,
56            "Number of select groups not converted due to unpredictability");
57  STATISTIC(NumSelectColdBB,
58            "Number of select groups not converted due to cold basic block");
59  STATISTIC(NumSelectConvertedLoop,
60            "Number of select groups converted due to loop-level analysis");
61  STATISTIC(NumSelectsConverted, "Number of selects converted");
62  
63  static cl::opt<unsigned> ColdOperandThreshold(
64      "cold-operand-threshold",
65      cl::desc("Maximum frequency of path for an operand to be considered cold."),
66      cl::init(20), cl::Hidden);
67  
68  static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
69      "cold-operand-max-cost-multiplier",
70      cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71               "slice of a cold operand to be considered inexpensive."),
72      cl::init(1), cl::Hidden);
73  
74  static cl::opt<unsigned>
75      GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76                            cl::desc("Gradient gain threshold (%)."),
77                            cl::init(25), cl::Hidden);
78  
79  static cl::opt<unsigned>
80      GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81                         cl::desc("Minimum gain per loop (in cycles) threshold."),
82                         cl::init(4), cl::Hidden);
83  
84  static cl::opt<unsigned> GainRelativeThreshold(
85      "select-opti-loop-relative-gain-threshold",
86      cl::desc(
87          "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
88      cl::init(8), cl::Hidden);
89  
90  static cl::opt<unsigned> MispredictDefaultRate(
91      "mispredict-default-rate", cl::Hidden, cl::init(25),
92      cl::desc("Default mispredict rate (initialized to 25%)."));
93  
94  static cl::opt<bool>
95      DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
96                                 cl::init(false),
97                                 cl::desc("Disable loop-level heuristics."));
98  
99  namespace {
100  
101  class SelectOptimizeImpl {
102    const TargetMachine *TM = nullptr;
103    const TargetSubtargetInfo *TSI = nullptr;
104    const TargetLowering *TLI = nullptr;
105    const TargetTransformInfo *TTI = nullptr;
106    const LoopInfo *LI = nullptr;
107    BlockFrequencyInfo *BFI;
108    ProfileSummaryInfo *PSI = nullptr;
109    OptimizationRemarkEmitter *ORE = nullptr;
110    TargetSchedModel TSchedModel;
111  
112  public:
113    SelectOptimizeImpl() = default;
114    SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
115    PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
116    bool runOnFunction(Function &F, Pass &P);
117  
118    using Scaled64 = ScaledNumber<uint64_t>;
119  
120    struct CostInfo {
121      /// Predicated cost (with selects as conditional moves).
122      Scaled64 PredCost;
123      /// Non-predicated cost (with selects converted to branches).
124      Scaled64 NonPredCost;
125    };
126  
127    /// SelectLike is an abstraction over SelectInst and other operations that can
128    /// act like selects. For example Or(Zext(icmp), X) can be treated like
129    /// select(icmp, X|1, X).
130    class SelectLike {
131      SelectLike(Instruction *I) : I(I) {}
132  
133      Instruction *I;
134  
135    public:
136      /// Match a select or select-like instruction, returning a SelectLike.
137      static SelectLike match(Instruction *I) {
138        // Select instruction are what we are usually looking for.
139        if (isa<SelectInst>(I))
140          return SelectLike(I);
141  
142        // An Or(zext(i1 X), Y) can also be treated like a select, with condition
143        // C and values Y|1 and Y.
144        Value *X;
145        if (PatternMatch::match(
146                I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
147            X->getType()->isIntegerTy(1))
148          return SelectLike(I);
149  
150        return SelectLike(nullptr);
151      }
152  
153      bool isValid() { return I; }
154      operator bool() { return isValid(); }
155  
156      Instruction *getI() { return I; }
157      const Instruction *getI() const { return I; }
158  
159      Type *getType() const { return I->getType(); }
160  
161      /// Return the condition for the SelectLike instruction. For example the
162      /// condition of a select or c in `or(zext(c), x)`
163      Value *getCondition() const {
164        if (auto *Sel = dyn_cast<SelectInst>(I))
165          return Sel->getCondition();
166        // Or(zext) case
167        if (auto *BO = dyn_cast<BinaryOperator>(I)) {
168          Value *X;
169          if (PatternMatch::match(BO->getOperand(0),
170                                  m_OneUse(m_ZExt(m_Value(X)))))
171            return X;
172          if (PatternMatch::match(BO->getOperand(1),
173                                  m_OneUse(m_ZExt(m_Value(X)))))
174            return X;
175        }
176  
177        llvm_unreachable("Unhandled case in getCondition");
178      }
179  
180      /// Return the true value for the SelectLike instruction. Note this may not
181      /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
182      /// the true value would be `or(x,1)`. As this value does not exist, nullptr
183      /// is returned.
184      Value *getTrueValue() const {
185        if (auto *Sel = dyn_cast<SelectInst>(I))
186          return Sel->getTrueValue();
187        // Or(zext) case - The true value is Or(X), so return nullptr as the value
188        // does not yet exist.
189        if (isa<BinaryOperator>(I))
190          return nullptr;
191  
192        llvm_unreachable("Unhandled case in getTrueValue");
193      }
194  
195      /// Return the false value for the SelectLike instruction. For example the
196      /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
197      /// `select(c, x|1, x)`)
198      Value *getFalseValue() const {
199        if (auto *Sel = dyn_cast<SelectInst>(I))
200          return Sel->getFalseValue();
201        // Or(zext) case - return the operand which is not the zext.
202        if (auto *BO = dyn_cast<BinaryOperator>(I)) {
203          Value *X;
204          if (PatternMatch::match(BO->getOperand(0),
205                                  m_OneUse(m_ZExt(m_Value(X)))))
206            return BO->getOperand(1);
207          if (PatternMatch::match(BO->getOperand(1),
208                                  m_OneUse(m_ZExt(m_Value(X)))))
209            return BO->getOperand(0);
210        }
211  
212        llvm_unreachable("Unhandled case in getFalseValue");
213      }
214  
215      /// Return the NonPredCost cost of the true op, given the costs in
216      /// InstCostMap. This may need to be generated for select-like instructions.
217      Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
218                             const TargetTransformInfo *TTI) {
219        if (auto *Sel = dyn_cast<SelectInst>(I))
220          if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
221            return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
222                                           : Scaled64::getZero();
223  
224        // Or case - add the cost of an extra Or to the cost of the False case.
225        if (isa<BinaryOperator>(I))
226          if (auto I = dyn_cast<Instruction>(getFalseValue()))
227            if (InstCostMap.contains(I)) {
228              InstructionCost OrCost = TTI->getArithmeticInstrCost(
229                  Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
230                  {TargetTransformInfo::OK_AnyValue,
231                   TargetTransformInfo::OP_None},
232                  {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
233              return InstCostMap[I].NonPredCost +
234                     Scaled64::get(*OrCost.getValue());
235            }
236  
237        return Scaled64::getZero();
238      }
239  
240      /// Return the NonPredCost cost of the false op, given the costs in
241      /// InstCostMap. This may need to be generated for select-like instructions.
242      Scaled64
243      getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
244                     const TargetTransformInfo *TTI) {
245        if (auto *Sel = dyn_cast<SelectInst>(I))
246          if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
247            return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
248                                           : Scaled64::getZero();
249  
250        // Or case - return the cost of the false case
251        if (isa<BinaryOperator>(I))
252          if (auto I = dyn_cast<Instruction>(getFalseValue()))
253            if (InstCostMap.contains(I))
254              return InstCostMap[I].NonPredCost;
255  
256        return Scaled64::getZero();
257      }
258    };
259  
260  private:
261    // Select groups consist of consecutive select instructions with the same
262    // condition.
263    using SelectGroup = SmallVector<SelectLike, 2>;
264    using SelectGroups = SmallVector<SelectGroup, 2>;
265  
266    // Converts select instructions of a function to conditional jumps when deemed
267    // profitable. Returns true if at least one select was converted.
268    bool optimizeSelects(Function &F);
269  
270    // Heuristics for determining which select instructions can be profitably
271    // conveted to branches. Separate heuristics for selects in inner-most loops
272    // and the rest of code regions (base heuristics for non-inner-most loop
273    // regions).
274    void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
275    void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
276  
277    // Converts to branches the select groups that were deemed
278    // profitable-to-convert.
279    void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
280  
281    // Splits selects of a given basic block into select groups.
282    void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
283  
284    // Determines for which select groups it is profitable converting to branches
285    // (base and inner-most-loop heuristics).
286    void findProfitableSIGroupsBase(SelectGroups &SIGroups,
287                                    SelectGroups &ProfSIGroups);
288    void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
289                                          SelectGroups &ProfSIGroups);
290  
291    // Determines if a select group should be converted to a branch (base
292    // heuristics).
293    bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
294  
295    // Returns true if there are expensive instructions in the cold value
296    // operand's (if any) dependence slice of any of the selects of the given
297    // group.
298    bool hasExpensiveColdOperand(const SelectGroup &ASI);
299  
300    // For a given source instruction, collect its backwards dependence slice
301    // consisting of instructions exclusively computed for producing the operands
302    // of the source instruction.
303    void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
304                               Instruction *SI, bool ForSinking = false);
305  
306    // Returns true if the condition of the select is highly predictable.
307    bool isSelectHighlyPredictable(const SelectLike SI);
308  
309    // Loop-level checks to determine if a non-predicated version (with branches)
310    // of the given loop is more profitable than its predicated version.
311    bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
312  
313    // Computes instruction and loop-critical-path costs for both the predicated
314    // and non-predicated version of the given loop.
315    bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
316                          DenseMap<const Instruction *, CostInfo> &InstCostMap,
317                          CostInfo *LoopCost);
318  
319    // Returns a set of all the select instructions in the given select groups.
320    SmallDenseMap<const Instruction *, SelectLike, 2>
321    getSImap(const SelectGroups &SIGroups);
322  
323    // Returns the latency cost of a given instruction.
324    std::optional<uint64_t> computeInstCost(const Instruction *I);
325  
326    // Returns the misprediction cost of a given select when converted to branch.
327    Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
328  
329    // Returns the cost of a branch when the prediction is correct.
330    Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
331                                  const SelectLike SI);
332  
333    // Returns true if the target architecture supports lowering a given select.
334    bool isSelectKindSupported(const SelectLike SI);
335  };
336  
337  class SelectOptimize : public FunctionPass {
338    SelectOptimizeImpl Impl;
339  
340  public:
341    static char ID;
342  
343    SelectOptimize() : FunctionPass(ID) {
344      initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
345    }
346  
347    bool runOnFunction(Function &F) override {
348      return Impl.runOnFunction(F, *this);
349    }
350  
351    void getAnalysisUsage(AnalysisUsage &AU) const override {
352      AU.addRequired<ProfileSummaryInfoWrapperPass>();
353      AU.addRequired<TargetPassConfig>();
354      AU.addRequired<TargetTransformInfoWrapperPass>();
355      AU.addRequired<LoopInfoWrapperPass>();
356      AU.addRequired<BlockFrequencyInfoWrapperPass>();
357      AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
358    }
359  };
360  
361  } // namespace
362  
363  PreservedAnalyses SelectOptimizePass::run(Function &F,
364                                            FunctionAnalysisManager &FAM) {
365    SelectOptimizeImpl Impl(TM);
366    return Impl.run(F, FAM);
367  }
368  
369  char SelectOptimize::ID = 0;
370  
371  INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
372                        false)
373  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
374  INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
375  INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
376  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
377  INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
378  INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
379  INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
380                      false)
381  
382  FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
383  
384  PreservedAnalyses SelectOptimizeImpl::run(Function &F,
385                                            FunctionAnalysisManager &FAM) {
386    TSI = TM->getSubtargetImpl(F);
387    TLI = TSI->getTargetLowering();
388  
389    // If none of the select types are supported then skip this pass.
390    // This is an optimization pass. Legality issues will be handled by
391    // instruction selection.
392    if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
393        !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
394        !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
395      return PreservedAnalyses::all();
396  
397    TTI = &FAM.getResult<TargetIRAnalysis>(F);
398    if (!TTI->enableSelectOptimize())
399      return PreservedAnalyses::all();
400  
401    PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
402              .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
403    assert(PSI && "This pass requires module analysis pass `profile-summary`!");
404    BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);
405  
406    // When optimizing for size, selects are preferable over branches.
407    if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
408      return PreservedAnalyses::all();
409  
410    LI = &FAM.getResult<LoopAnalysis>(F);
411    ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
412    TSchedModel.init(TSI);
413  
414    bool Changed = optimizeSelects(F);
415    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
416  }
417  
418  bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
419    TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
420    TSI = TM->getSubtargetImpl(F);
421    TLI = TSI->getTargetLowering();
422  
423    // If none of the select types are supported then skip this pass.
424    // This is an optimization pass. Legality issues will be handled by
425    // instruction selection.
426    if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
427        !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
428        !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
429      return false;
430  
431    TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
432  
433    if (!TTI->enableSelectOptimize())
434      return false;
435  
436    LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
437    BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
438    PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
439    ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
440    TSchedModel.init(TSI);
441  
442    // When optimizing for size, selects are preferable over branches.
443    if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
444      return false;
445  
446    return optimizeSelects(F);
447  }
448  
449  bool SelectOptimizeImpl::optimizeSelects(Function &F) {
450    // Determine for which select groups it is profitable converting to branches.
451    SelectGroups ProfSIGroups;
452    // Base heuristics apply only to non-loops and outer loops.
453    optimizeSelectsBase(F, ProfSIGroups);
454    // Separate heuristics for inner-most loops.
455    optimizeSelectsInnerLoops(F, ProfSIGroups);
456  
457    // Convert to branches the select groups that were deemed
458    // profitable-to-convert.
459    convertProfitableSIGroups(ProfSIGroups);
460  
461    // Code modified if at least one select group was converted.
462    return !ProfSIGroups.empty();
463  }
464  
465  void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
466                                               SelectGroups &ProfSIGroups) {
467    // Collect all the select groups.
468    SelectGroups SIGroups;
469    for (BasicBlock &BB : F) {
470      // Base heuristics apply only to non-loops and outer loops.
471      Loop *L = LI->getLoopFor(&BB);
472      if (L && L->isInnermost())
473        continue;
474      collectSelectGroups(BB, SIGroups);
475    }
476  
477    // Determine for which select groups it is profitable converting to branches.
478    findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
479  }
480  
481  void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
482                                                     SelectGroups &ProfSIGroups) {
483    SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
484    // Need to check size on each iteration as we accumulate child loops.
485    for (unsigned long i = 0; i < Loops.size(); ++i)
486      for (Loop *ChildL : Loops[i]->getSubLoops())
487        Loops.push_back(ChildL);
488  
489    for (Loop *L : Loops) {
490      if (!L->isInnermost())
491        continue;
492  
493      SelectGroups SIGroups;
494      for (BasicBlock *BB : L->getBlocks())
495        collectSelectGroups(*BB, SIGroups);
496  
497      findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
498    }
499  }
500  
501  /// If \p isTrue is true, return the true value of \p SI, otherwise return
502  /// false value of \p SI. If the true/false value of \p SI is defined by any
503  /// select instructions in \p Selects, look through the defining select
504  /// instruction until the true/false value is not defined in \p Selects.
505  static Value *
506  getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
507                      const SmallPtrSet<const Instruction *, 2> &Selects,
508                      IRBuilder<> &IB) {
509    Value *V = nullptr;
510    for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
511         DefSI != nullptr && Selects.count(DefSI);
512         DefSI = dyn_cast<SelectInst>(V)) {
513      assert(DefSI->getCondition() == SI.getCondition() &&
514             "The condition of DefSI does not match with SI");
515      V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
516    }
517  
518    if (isa<BinaryOperator>(SI.getI())) {
519      assert(SI.getI()->getOpcode() == Instruction::Or &&
520             "Only currently handling Or instructions.");
521      V = SI.getFalseValue();
522      if (isTrue)
523        V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
524    }
525  
526    assert(V && "Failed to get select true/false value");
527    return V;
528  }
529  
530  void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
531    for (SelectGroup &ASI : ProfSIGroups) {
532      // The code transformation here is a modified version of the sinking
533      // transformation in CodeGenPrepare::optimizeSelectInst with a more
534      // aggressive strategy of which instructions to sink.
535      //
536      // TODO: eliminate the redundancy of logic transforming selects to branches
537      // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
538      // selects for all cases (with and without profile information).
539  
540      // Transform a sequence like this:
541      //    start:
542      //       %cmp = cmp uge i32 %a, %b
543      //       %sel = select i1 %cmp, i32 %c, i32 %d
544      //
545      // Into:
546      //    start:
547      //       %cmp = cmp uge i32 %a, %b
548      //       %cmp.frozen = freeze %cmp
549      //       br i1 %cmp.frozen, label %select.true, label %select.false
550      //    select.true:
551      //       br label %select.end
552      //    select.false:
553      //       br label %select.end
554      //    select.end:
555      //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
556      //
557      // %cmp should be frozen, otherwise it may introduce undefined behavior.
558      // In addition, we may sink instructions that produce %c or %d into the
559      // destination(s) of the new branch.
560      // If the true or false blocks do not contain a sunken instruction, that
561      // block and its branch may be optimized away. In that case, one side of the
562      // first branch will point directly to select.end, and the corresponding PHI
563      // predecessor block will be the start block.
564  
565      // Find all the instructions that can be soundly sunk to the true/false
566      // blocks. These are instructions that are computed solely for producing the
567      // operands of the select instructions in the group and can be sunk without
568      // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
569      // with side effects).
570      SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
571      typedef std::stack<Instruction *>::size_type StackSizeType;
572      StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
573      for (SelectLike SI : ASI) {
574        // For each select, compute the sinkable dependence chains of the true and
575        // false operands.
576        if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
577          std::stack<Instruction *> TrueSlice;
578          getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
579          maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
580          TrueSlices.push_back(TrueSlice);
581        }
582        if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
583          if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
584            std::stack<Instruction *> FalseSlice;
585            getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
586            maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
587            FalseSlices.push_back(FalseSlice);
588          }
589        }
590      }
591      // In the case of multiple select instructions in the same group, the order
592      // of non-dependent instructions (instructions of different dependence
593      // slices) in the true/false blocks appears to affect performance.
594      // Interleaving the slices seems to experimentally be the optimal approach.
595      // This interleaving scheduling allows for more ILP (with a natural downside
596      // of increasing a bit register pressure) compared to a simple ordering of
597      // one whole chain after another. One would expect that this ordering would
598      // not matter since the scheduling in the backend of the compiler  would
599      // take care of it, but apparently the scheduler fails to deliver optimal
600      // ILP with a naive ordering here.
601      SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
602      for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
603        for (auto &S : TrueSlices) {
604          if (!S.empty()) {
605            TrueSlicesInterleaved.push_back(S.top());
606            S.pop();
607          }
608        }
609      }
610      for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
611        for (auto &S : FalseSlices) {
612          if (!S.empty()) {
613            FalseSlicesInterleaved.push_back(S.top());
614            S.pop();
615          }
616        }
617      }
618  
619      // We split the block containing the select(s) into two blocks.
620      SelectLike SI = ASI.front();
621      SelectLike LastSI = ASI.back();
622      BasicBlock *StartBlock = SI.getI()->getParent();
623      BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
624      BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
625      BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
626      // Delete the unconditional branch that was just created by the split.
627      StartBlock->getTerminator()->eraseFromParent();
628  
629      // Move any debug/pseudo instructions that were in-between the select
630      // group to the newly-created end block.
631      SmallVector<Instruction *, 2> DebugPseudoINS;
632      auto DIt = SI.getI()->getIterator();
633      while (&*DIt != LastSI.getI()) {
634        if (DIt->isDebugOrPseudoInst())
635          DebugPseudoINS.push_back(&*DIt);
636        DIt++;
637      }
638      for (auto *DI : DebugPseudoINS) {
639        DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
640      }
641  
642      // Duplicate implementation for DPValues, the non-instruction debug-info
643      // record. Helper lambda for moving DPValues to the end block.
644      auto TransferDPValues = [&](Instruction &I) {
645        for (auto &DPValue : llvm::make_early_inc_range(I.getDbgValueRange())) {
646          DPValue.removeFromParent();
647          EndBlock->insertDPValueBefore(&DPValue,
648                                        EndBlock->getFirstInsertionPt());
649        }
650      };
651  
652      // Iterate over all instructions in between SI and LastSI, not including
653      // SI itself. These are all the variable assignments that happen "in the
654      // middle" of the select group.
655      auto R = make_range(std::next(SI.getI()->getIterator()),
656                          std::next(LastSI.getI()->getIterator()));
657      llvm::for_each(R, TransferDPValues);
658  
659      // These are the new basic blocks for the conditional branch.
660      // At least one will become an actual new basic block.
661      BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
662      BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
663      if (!TrueSlicesInterleaved.empty()) {
664        TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
665                                       EndBlock->getParent(), EndBlock);
666        TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
667        TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
668        for (Instruction *TrueInst : TrueSlicesInterleaved)
669          TrueInst->moveBefore(TrueBranch);
670      }
671      if (!FalseSlicesInterleaved.empty()) {
672        FalseBlock =
673            BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
674                               EndBlock->getParent(), EndBlock);
675        FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
676        FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
677        for (Instruction *FalseInst : FalseSlicesInterleaved)
678          FalseInst->moveBefore(FalseBranch);
679      }
680      // If there was nothing to sink, then arbitrarily choose the 'false' side
681      // for a new input value to the PHI.
682      if (TrueBlock == FalseBlock) {
683        assert(TrueBlock == nullptr &&
684               "Unexpected basic block transform while optimizing select");
685  
686        FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
687                                        EndBlock->getParent(), EndBlock);
688        auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
689        FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
690      }
691  
692      // Insert the real conditional branch based on the original condition.
693      // If we did not create a new block for one of the 'true' or 'false' paths
694      // of the condition, it means that side of the branch goes to the end block
695      // directly and the path originates from the start block from the point of
696      // view of the new PHI.
697      BasicBlock *TT, *FT;
698      if (TrueBlock == nullptr) {
699        TT = EndBlock;
700        FT = FalseBlock;
701        TrueBlock = StartBlock;
702      } else if (FalseBlock == nullptr) {
703        TT = TrueBlock;
704        FT = EndBlock;
705        FalseBlock = StartBlock;
706      } else {
707        TT = TrueBlock;
708        FT = FalseBlock;
709      }
710      IRBuilder<> IB(SI.getI());
711      auto *CondFr = IB.CreateFreeze(SI.getCondition(),
712                                     SI.getCondition()->getName() + ".frozen");
713  
714      SmallPtrSet<const Instruction *, 2> INS;
715      for (auto SI : ASI)
716        INS.insert(SI.getI());
717  
718      // Use reverse iterator because later select may use the value of the
719      // earlier select, and we need to propagate value through earlier select
720      // to get the PHI operand.
721      for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
722        SelectLike SI = *It;
723        // The select itself is replaced with a PHI Node.
724        PHINode *PN = PHINode::Create(SI.getType(), 2, "");
725        PN->insertBefore(EndBlock->begin());
726        PN->takeName(SI.getI());
727        PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
728        PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
729        PN->setDebugLoc(SI.getI()->getDebugLoc());
730        SI.getI()->replaceAllUsesWith(PN);
731        INS.erase(SI.getI());
732        ++NumSelectsConverted;
733      }
734      IB.CreateCondBr(CondFr, TT, FT, SI.getI());
735  
736      // Remove the old select instructions, now that they are not longer used.
737      for (auto SI : ASI)
738        SI.getI()->eraseFromParent();
739    }
740  }
741  
742  void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
743                                               SelectGroups &SIGroups) {
744    BasicBlock::iterator BBIt = BB.begin();
745    while (BBIt != BB.end()) {
746      Instruction *I = &*BBIt++;
747      if (SelectLike SI = SelectLike::match(I)) {
748        if (!TTI->shouldTreatInstructionLikeSelect(I))
749          continue;
750  
751        SelectGroup SIGroup;
752        SIGroup.push_back(SI);
753        while (BBIt != BB.end()) {
754          Instruction *NI = &*BBIt;
755          // Debug/pseudo instructions should be skipped and not prevent the
756          // formation of a select group.
757          if (NI->isDebugOrPseudoInst()) {
758            ++BBIt;
759            continue;
760          }
761          // We only allow selects in the same group, not other select-like
762          // instructions.
763          if (!isa<SelectInst>(NI))
764            break;
765  
766          SelectLike NSI = SelectLike::match(NI);
767          if (NSI && SI.getCondition() == NSI.getCondition()) {
768            SIGroup.push_back(NSI);
769          } else
770            break;
771          ++BBIt;
772        }
773  
774        // If the select type is not supported, no point optimizing it.
775        // Instruction selection will take care of it.
776        if (!isSelectKindSupported(SI))
777          continue;
778  
779        SIGroups.push_back(SIGroup);
780      }
781    }
782  }
783  
784  void SelectOptimizeImpl::findProfitableSIGroupsBase(
785      SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
786    for (SelectGroup &ASI : SIGroups) {
787      ++NumSelectOptAnalyzed;
788      if (isConvertToBranchProfitableBase(ASI))
789        ProfSIGroups.push_back(ASI);
790    }
791  }
792  
793  static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
794                                 DiagnosticInfoOptimizationBase &Rem) {
795    LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
796    ORE->emit(Rem);
797  }
798  
799  void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
800      const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
801    NumSelectOptAnalyzed += SIGroups.size();
802    // For each select group in an inner-most loop,
803    // a branch is more preferable than a select/conditional-move if:
804    // i) conversion to branches for all the select groups of the loop satisfies
805    //    loop-level heuristics including reducing the loop's critical path by
806    //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
807    // ii) the total cost of the select group is cheaper with a branch compared
808    //     to its predicated version. The cost is in terms of latency and the cost
809    //     of a select group is the cost of its most expensive select instruction
810    //     (assuming infinite resources and thus fully leveraging available ILP).
811  
812    DenseMap<const Instruction *, CostInfo> InstCostMap;
813    CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
814                            {Scaled64::getZero(), Scaled64::getZero()}};
815    if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
816        !checkLoopHeuristics(L, LoopCost)) {
817      return;
818    }
819  
820    for (SelectGroup &ASI : SIGroups) {
821      // Assuming infinite resources, the cost of a group of instructions is the
822      // cost of the most expensive instruction of the group.
823      Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
824      for (SelectLike SI : ASI) {
825        SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
826        BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
827      }
828      if (BranchCost < SelectCost) {
829        OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
830        OR << "Profitable to convert to branch (loop analysis). BranchCost="
831           << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
832           << ". ";
833        EmitAndPrintRemark(ORE, OR);
834        ++NumSelectConvertedLoop;
835        ProfSIGroups.push_back(ASI);
836      } else {
837        OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
838                                        ASI.front().getI());
839        ORmiss << "Select is more profitable (loop analysis). BranchCost="
840               << BranchCost.toString()
841               << ", SelectCost=" << SelectCost.toString() << ". ";
842        EmitAndPrintRemark(ORE, ORmiss);
843      }
844    }
845  }
846  
847  bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
848      const SelectGroup &ASI) {
849    SelectLike SI = ASI.front();
850    LLVM_DEBUG(dbgs() << "Analyzing select group containing " << SI.getI()
851                      << "\n");
852    OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
853    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());
854  
855    // Skip cold basic blocks. Better to optimize for size for cold blocks.
856    if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
857      ++NumSelectColdBB;
858      ORmiss << "Not converted to branch because of cold basic block. ";
859      EmitAndPrintRemark(ORE, ORmiss);
860      return false;
861    }
862  
863    // If unpredictable, branch form is less profitable.
864    if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
865      ++NumSelectUnPred;
866      ORmiss << "Not converted to branch because of unpredictable branch. ";
867      EmitAndPrintRemark(ORE, ORmiss);
868      return false;
869    }
870  
871    // If highly predictable, branch form is more profitable, unless a
872    // predictable select is inexpensive in the target architecture.
873    if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
874      ++NumSelectConvertedHighPred;
875      OR << "Converted to branch because of highly predictable branch. ";
876      EmitAndPrintRemark(ORE, OR);
877      return true;
878    }
879  
880    // Look for expensive instructions in the cold operand's (if any) dependence
881    // slice of any of the selects in the group.
882    if (hasExpensiveColdOperand(ASI)) {
883      ++NumSelectConvertedExpColdOperand;
884      OR << "Converted to branch because of expensive cold operand.";
885      EmitAndPrintRemark(ORE, OR);
886      return true;
887    }
888  
889    ORmiss << "Not profitable to convert to branch (base heuristic).";
890    EmitAndPrintRemark(ORE, ORmiss);
891    return false;
892  }
893  
894  static InstructionCost divideNearest(InstructionCost Numerator,
895                                       uint64_t Denominator) {
896    return (Numerator + (Denominator / 2)) / Denominator;
897  }
898  
899  static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
900                                   uint64_t &TrueVal, uint64_t &FalseVal) {
901    if (isa<SelectInst>(SI.getI()))
902      return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
903    return false;
904  }
905  
906  bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
907    bool ColdOperand = false;
908    uint64_t TrueWeight, FalseWeight, TotalWeight;
909    if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
910      uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
911      TotalWeight = TrueWeight + FalseWeight;
912      // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
913      ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
914    } else if (PSI->hasProfileSummary()) {
915      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
916                                      ASI.front().getI());
917      ORmiss << "Profile data available but missing branch-weights metadata for "
918                "select instruction. ";
919      EmitAndPrintRemark(ORE, ORmiss);
920    }
921    if (!ColdOperand)
922      return false;
923    // Check if the cold path's dependence slice is expensive for any of the
924    // selects of the group.
925    for (SelectLike SI : ASI) {
926      Instruction *ColdI = nullptr;
927      uint64_t HotWeight;
928      if (TrueWeight < FalseWeight) {
929        ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
930        HotWeight = FalseWeight;
931      } else {
932        ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
933        HotWeight = TrueWeight;
934      }
935      if (ColdI) {
936        std::stack<Instruction *> ColdSlice;
937        getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
938        InstructionCost SliceCost = 0;
939        while (!ColdSlice.empty()) {
940          SliceCost += TTI->getInstructionCost(ColdSlice.top(),
941                                               TargetTransformInfo::TCK_Latency);
942          ColdSlice.pop();
943        }
944        // The colder the cold value operand of the select is the more expensive
945        // the cmov becomes for computing the cold value operand every time. Thus,
946        // the colder the cold operand is the more its cost counts.
947        // Get nearest integer cost adjusted for coldness.
948        InstructionCost AdjSliceCost =
949            divideNearest(SliceCost * HotWeight, TotalWeight);
950        if (AdjSliceCost >=
951            ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
952          return true;
953      }
954    }
955    return false;
956  }
957  
958  // Check if it is safe to move LoadI next to the SI.
959  // Conservatively assume it is safe only if there is no instruction
960  // modifying memory in-between the load and the select instruction.
961  static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
962    // Assume loads from different basic blocks are unsafe to move.
963    if (LoadI->getParent() != SI->getParent())
964      return false;
965    auto It = LoadI->getIterator();
966    while (&*It != SI) {
967      if (It->mayWriteToMemory())
968        return false;
969      It++;
970    }
971    return true;
972  }
973  
974  // For a given source instruction, collect its backwards dependence slice
975  // consisting of instructions exclusively computed for the purpose of producing
976  // the operands of the source instruction. As an approximation
977  // (sufficiently-accurate in practice), we populate this set with the
978  // instructions of the backwards dependence slice that only have one-use and
979  // form an one-use chain that leads to the source instruction.
980  void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
981                                                 std::stack<Instruction *> &Slice,
982                                                 Instruction *SI,
983                                                 bool ForSinking) {
984    SmallPtrSet<Instruction *, 2> Visited;
985    std::queue<Instruction *> Worklist;
986    Worklist.push(I);
987    while (!Worklist.empty()) {
988      Instruction *II = Worklist.front();
989      Worklist.pop();
990  
991      // Avoid cycles.
992      if (!Visited.insert(II).second)
993        continue;
994  
995      if (!II->hasOneUse())
996        continue;
997  
998      // Cannot soundly sink instructions with side-effects.
999      // Terminator or phi instructions cannot be sunk.
1000      // Avoid sinking other select instructions (should be handled separetely).
1001      if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
1002                         isa<SelectInst>(II) || isa<PHINode>(II)))
1003        continue;
1004  
1005      // Avoid sinking loads in order not to skip state-modifying instructions,
1006      // that may alias with the loaded address.
1007      // Only allow sinking of loads within the same basic block that are
1008      // conservatively proven to be safe.
1009      if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
1010        continue;
1011  
1012      // Avoid considering instructions with less frequency than the source
1013      // instruction (i.e., avoid colder code regions of the dependence slice).
1014      if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
1015        continue;
1016  
1017      // Eligible one-use instruction added to the dependence slice.
1018      Slice.push(II);
1019  
1020      // Explore all the operands of the current instruction to expand the slice.
1021      for (unsigned k = 0; k < II->getNumOperands(); ++k)
1022        if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
1023          Worklist.push(OpI);
1024    }
1025  }
1026  
1027  bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1028    uint64_t TrueWeight, FalseWeight;
1029    if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1030      uint64_t Max = std::max(TrueWeight, FalseWeight);
1031      uint64_t Sum = TrueWeight + FalseWeight;
1032      if (Sum != 0) {
1033        auto Probability = BranchProbability::getBranchProbability(Max, Sum);
1034        if (Probability > TTI->getPredictableBranchThreshold())
1035          return true;
1036      }
1037    }
1038    return false;
1039  }
1040  
1041  bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
1042                                               const CostInfo LoopCost[2]) {
1043    // Loop-level checks to determine if a non-predicated version (with branches)
1044    // of the loop is more profitable than its predicated version.
1045  
1046    if (DisableLoopLevelHeuristics)
1047      return true;
1048  
1049    OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
1050                                     L->getHeader()->getFirstNonPHI());
1051  
1052    if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
1053        LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
1054      ORmissL << "No select conversion in the loop due to no reduction of loop's "
1055                 "critical path. ";
1056      EmitAndPrintRemark(ORE, ORmissL);
1057      return false;
1058    }
1059  
1060    Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
1061                        LoopCost[1].PredCost - LoopCost[1].NonPredCost};
1062  
1063    // Profitably converting to branches need to reduce the loop's critical path
1064    // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1065    // relative gain of 12.5%).
1066    if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
1067        Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
1068      Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
1069      ORmissL << "No select conversion in the loop due to small reduction of "
1070                 "loop's critical path. Gain="
1071              << Gain[1].toString()
1072              << ", RelativeGain=" << RelativeGain.toString() << "%. ";
1073      EmitAndPrintRemark(ORE, ORmissL);
1074      return false;
1075    }
1076  
1077    // If the loop's critical path involves loop-carried dependences, the gradient
1078    // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1079    // This check ensures that the latency reduction for the loop's critical path
1080    // keeps decreasing with sufficient rate beyond the two analyzed loop
1081    // iterations.
1082    if (Gain[1] > Gain[0]) {
1083      Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
1084                              (LoopCost[1].PredCost - LoopCost[0].PredCost);
1085      if (GradientGain < Scaled64::get(GainGradientThreshold)) {
1086        ORmissL << "No select conversion in the loop due to small gradient gain. "
1087                   "GradientGain="
1088                << GradientGain.toString() << "%. ";
1089        EmitAndPrintRemark(ORE, ORmissL);
1090        return false;
1091      }
1092    }
1093    // If the gain decreases it is not profitable to convert.
1094    else if (Gain[1] < Gain[0]) {
1095      ORmissL
1096          << "No select conversion in the loop due to negative gradient gain. ";
1097      EmitAndPrintRemark(ORE, ORmissL);
1098      return false;
1099    }
1100  
1101    // Non-predicated version of the loop is more profitable than its
1102    // predicated version.
1103    return true;
1104  }
1105  
1106  // Computes instruction and loop-critical-path costs for both the predicated
1107  // and non-predicated version of the given loop.
1108  // Returns false if unable to compute these costs due to invalid cost of loop
1109  // instruction(s).
1110  bool SelectOptimizeImpl::computeLoopCosts(
1111      const Loop *L, const SelectGroups &SIGroups,
1112      DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
1113    LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1114                      << L->getHeader()->getName() << "\n");
1115    const auto &SImap = getSImap(SIGroups);
1116    // Compute instruction and loop-critical-path costs across two iterations for
1117    // both predicated and non-predicated version.
1118    const unsigned Iterations = 2;
1119    for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
1120      // Cost of the loop's critical path.
1121      CostInfo &MaxCost = LoopCost[Iter];
1122      for (BasicBlock *BB : L->getBlocks()) {
1123        for (const Instruction &I : *BB) {
1124          if (I.isDebugOrPseudoInst())
1125            continue;
1126          // Compute the predicated and non-predicated cost of the instruction.
1127          Scaled64 IPredCost = Scaled64::getZero(),
1128                   INonPredCost = Scaled64::getZero();
1129  
1130          // Assume infinite resources that allow to fully exploit the available
1131          // instruction-level parallelism.
1132          // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1133          for (const Use &U : I.operands()) {
1134            auto UI = dyn_cast<Instruction>(U.get());
1135            if (!UI)
1136              continue;
1137            if (InstCostMap.count(UI)) {
1138              IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
1139              INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
1140            }
1141          }
1142          auto ILatency = computeInstCost(&I);
1143          if (!ILatency) {
1144            OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
1145            ORmissL << "Invalid instruction cost preventing analysis and "
1146                       "optimization of the inner-most loop containing this "
1147                       "instruction. ";
1148            EmitAndPrintRemark(ORE, ORmissL);
1149            return false;
1150          }
1151          IPredCost += Scaled64::get(*ILatency);
1152          INonPredCost += Scaled64::get(*ILatency);
1153  
1154          // For a select that can be converted to branch,
1155          // compute its cost as a branch (non-predicated cost).
1156          //
1157          // BranchCost = PredictedPathCost + MispredictCost
1158          // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1159          // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
1160          if (SImap.contains(&I)) {
1161            auto SI = SImap.at(&I);
1162            Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
1163            Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
1164            Scaled64 PredictedPathCost =
1165                getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
1166  
1167            Scaled64 CondCost = Scaled64::getZero();
1168            if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
1169              if (InstCostMap.count(CI))
1170                CondCost = InstCostMap[CI].NonPredCost;
1171            Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
1172  
1173            INonPredCost = PredictedPathCost + MispredictCost;
1174          }
1175          LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
1176                            << INonPredCost << " for " << I << "\n");
1177  
1178          InstCostMap[&I] = {IPredCost, INonPredCost};
1179          MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
1180          MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
1181        }
1182      }
1183      LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
1184                        << " MaxCost = " << MaxCost.PredCost << " "
1185                        << MaxCost.NonPredCost << "\n");
1186    }
1187    return true;
1188  }
1189  
1190  SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
1191  SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1192    SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
1193    for (const SelectGroup &ASI : SIGroups)
1194      for (SelectLike SI : ASI)
1195        SImap.try_emplace(SI.getI(), SI);
1196    return SImap;
1197  }
1198  
1199  std::optional<uint64_t>
1200  SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1201    InstructionCost ICost =
1202        TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1203    if (auto OC = ICost.getValue())
1204      return std::optional<uint64_t>(*OC);
1205    return std::nullopt;
1206  }
1207  
1208  ScaledNumber<uint64_t>
1209  SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1210                                           const Scaled64 CondCost) {
1211    uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1212  
1213    // Account for the default misprediction rate when using a branch
1214    // (conservatively set to 25% by default).
1215    uint64_t MispredictRate = MispredictDefaultRate;
1216    // If the select condition is obviously predictable, then the misprediction
1217    // rate is zero.
1218    if (isSelectHighlyPredictable(SI))
1219      MispredictRate = 0;
1220  
1221    // CondCost is included to account for cases where the computation of the
1222    // condition is part of a long dependence chain (potentially loop-carried)
1223    // that would delay detection of a misprediction and increase its cost.
1224    Scaled64 MispredictCost =
1225        std::max(Scaled64::get(MispredictPenalty), CondCost) *
1226        Scaled64::get(MispredictRate);
1227    MispredictCost /= Scaled64::get(100);
1228  
1229    return MispredictCost;
1230  }
1231  
1232  // Returns the cost of a branch when the prediction is correct.
1233  // TrueCost * TrueProbability + FalseCost * FalseProbability.
1234  ScaledNumber<uint64_t>
1235  SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1236                                           const SelectLike SI) {
1237    Scaled64 PredPathCost;
1238    uint64_t TrueWeight, FalseWeight;
1239    if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1240      uint64_t SumWeight = TrueWeight + FalseWeight;
1241      if (SumWeight != 0) {
1242        PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1243                       FalseCost * Scaled64::get(FalseWeight);
1244        PredPathCost /= Scaled64::get(SumWeight);
1245        return PredPathCost;
1246      }
1247    }
1248    // Without branch weight metadata, we assume 75% for the one path and 25% for
1249    // the other, and pick the result with the biggest cost.
1250    PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1251                            FalseCost * Scaled64::get(3) + TrueCost);
1252    PredPathCost /= Scaled64::get(4);
1253    return PredPathCost;
1254  }
1255  
1256  bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1257    bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
1258    if (VectorCond)
1259      return false;
1260    TargetLowering::SelectSupportKind SelectKind;
1261    if (SI.getType()->isVectorTy())
1262      SelectKind = TargetLowering::ScalarCondVectorVal;
1263    else
1264      SelectKind = TargetLowering::ScalarValSelect;
1265    return TLI->isSelectSupported(SelectKind);
1266  }
1267