xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp (revision 953efa5b200f060564a090ab71b3d7f614a35e3f)
1  //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  // This pass converts selects to conditional jumps when profitable.
10  //
11  //===----------------------------------------------------------------------===//
12  
13  #include "llvm/ADT/Optional.h"
14  #include "llvm/ADT/SmallVector.h"
15  #include "llvm/ADT/Statistic.h"
16  #include "llvm/Analysis/BlockFrequencyInfo.h"
17  #include "llvm/Analysis/BranchProbabilityInfo.h"
18  #include "llvm/Analysis/LoopInfo.h"
19  #include "llvm/Analysis/OptimizationRemarkEmitter.h"
20  #include "llvm/Analysis/ProfileSummaryInfo.h"
21  #include "llvm/Analysis/TargetTransformInfo.h"
22  #include "llvm/CodeGen/Passes.h"
23  #include "llvm/CodeGen/TargetLowering.h"
24  #include "llvm/CodeGen/TargetPassConfig.h"
25  #include "llvm/CodeGen/TargetSchedule.h"
26  #include "llvm/CodeGen/TargetSubtargetInfo.h"
27  #include "llvm/IR/BasicBlock.h"
28  #include "llvm/IR/Dominators.h"
29  #include "llvm/IR/Function.h"
30  #include "llvm/IR/IRBuilder.h"
31  #include "llvm/IR/Instruction.h"
32  #include "llvm/InitializePasses.h"
33  #include "llvm/Pass.h"
34  #include "llvm/Support/ScaledNumber.h"
35  #include "llvm/Target/TargetMachine.h"
36  #include "llvm/Transforms/Utils/SizeOpts.h"
37  #include <algorithm>
38  #include <memory>
39  #include <queue>
40  #include <stack>
41  #include <string>
42  
43  using namespace llvm;
44  
45  #define DEBUG_TYPE "select-optimize"
46  
47  STATISTIC(NumSelectOptAnalyzed,
48            "Number of select groups considered for conversion to branch");
49  STATISTIC(NumSelectConvertedExpColdOperand,
50            "Number of select groups converted due to expensive cold operand");
51  STATISTIC(NumSelectConvertedHighPred,
52            "Number of select groups converted due to high-predictability");
53  STATISTIC(NumSelectUnPred,
54            "Number of select groups not converted due to unpredictability");
55  STATISTIC(NumSelectColdBB,
56            "Number of select groups not converted due to cold basic block");
57  STATISTIC(NumSelectConvertedLoop,
58            "Number of select groups converted due to loop-level analysis");
59  STATISTIC(NumSelectsConverted, "Number of selects converted");
60  
61  static cl::opt<unsigned> ColdOperandThreshold(
62      "cold-operand-threshold",
63      cl::desc("Maximum frequency of path for an operand to be considered cold."),
64      cl::init(20), cl::Hidden);
65  
66  static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
67      "cold-operand-max-cost-multiplier",
68      cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
69               "slice of a cold operand to be considered inexpensive."),
70      cl::init(1), cl::Hidden);
71  
72  static cl::opt<unsigned>
73      GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
74                            cl::desc("Gradient gain threshold (%)."),
75                            cl::init(25), cl::Hidden);
76  
77  static cl::opt<unsigned>
78      GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
79                         cl::desc("Minimum gain per loop (in cycles) threshold."),
80                         cl::init(4), cl::Hidden);
81  
82  static cl::opt<unsigned> GainRelativeThreshold(
83      "select-opti-loop-relative-gain-threshold",
84      cl::desc(
85          "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
86      cl::init(8), cl::Hidden);
87  
88  static cl::opt<unsigned> MispredictDefaultRate(
89      "mispredict-default-rate", cl::Hidden, cl::init(25),
90      cl::desc("Default mispredict rate (initialized to 25%)."));
91  
92  static cl::opt<bool>
93      DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
94                                 cl::init(false),
95                                 cl::desc("Disable loop-level heuristics."));
96  
97  namespace {
98  
99  class SelectOptimize : public FunctionPass {
100    const TargetMachine *TM = nullptr;
101    const TargetSubtargetInfo *TSI;
102    const TargetLowering *TLI = nullptr;
103    const TargetTransformInfo *TTI = nullptr;
104    const LoopInfo *LI;
105    DominatorTree *DT;
106    std::unique_ptr<BlockFrequencyInfo> BFI;
107    std::unique_ptr<BranchProbabilityInfo> BPI;
108    ProfileSummaryInfo *PSI;
109    OptimizationRemarkEmitter *ORE;
110    TargetSchedModel TSchedModel;
111  
112  public:
113    static char ID;
114  
115    SelectOptimize() : FunctionPass(ID) {
116      initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
117    }
118  
119    bool runOnFunction(Function &F) override;
120  
121    void getAnalysisUsage(AnalysisUsage &AU) const override {
122      AU.addRequired<ProfileSummaryInfoWrapperPass>();
123      AU.addRequired<TargetPassConfig>();
124      AU.addRequired<TargetTransformInfoWrapperPass>();
125      AU.addRequired<DominatorTreeWrapperPass>();
126      AU.addRequired<LoopInfoWrapperPass>();
127      AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
128    }
129  
130  private:
131    // Select groups consist of consecutive select instructions with the same
132    // condition.
133    using SelectGroup = SmallVector<SelectInst *, 2>;
134    using SelectGroups = SmallVector<SelectGroup, 2>;
135  
136    using Scaled64 = ScaledNumber<uint64_t>;
137  
138    struct CostInfo {
139      /// Predicated cost (with selects as conditional moves).
140      Scaled64 PredCost;
141      /// Non-predicated cost (with selects converted to branches).
142      Scaled64 NonPredCost;
143    };
144  
145    // Converts select instructions of a function to conditional jumps when deemed
146    // profitable. Returns true if at least one select was converted.
147    bool optimizeSelects(Function &F);
148  
149    // Heuristics for determining which select instructions can be profitably
150    // conveted to branches. Separate heuristics for selects in inner-most loops
151    // and the rest of code regions (base heuristics for non-inner-most loop
152    // regions).
153    void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
154    void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
155  
156    // Converts to branches the select groups that were deemed
157    // profitable-to-convert.
158    void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
159  
160    // Splits selects of a given basic block into select groups.
161    void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
162  
163    // Determines for which select groups it is profitable converting to branches
164    // (base and inner-most-loop heuristics).
165    void findProfitableSIGroupsBase(SelectGroups &SIGroups,
166                                    SelectGroups &ProfSIGroups);
167    void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
168                                          SelectGroups &ProfSIGroups);
169  
170    // Determines if a select group should be converted to a branch (base
171    // heuristics).
172    bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
173  
174    // Returns true if there are expensive instructions in the cold value
175    // operand's (if any) dependence slice of any of the selects of the given
176    // group.
177    bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
178  
179    // For a given source instruction, collect its backwards dependence slice
180    // consisting of instructions exclusively computed for producing the operands
181    // of the source instruction.
182    void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
183                               bool ForSinking = false);
184  
185    // Returns true if the condition of the select is highly predictable.
186    bool isSelectHighlyPredictable(const SelectInst *SI);
187  
188    // Loop-level checks to determine if a non-predicated version (with branches)
189    // of the given loop is more profitable than its predicated version.
190    bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
191  
192    // Computes instruction and loop-critical-path costs for both the predicated
193    // and non-predicated version of the given loop.
194    bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
195                          DenseMap<const Instruction *, CostInfo> &InstCostMap,
196                          CostInfo *LoopCost);
197  
198    // Returns a set of all the select instructions in the given select groups.
199    SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
200  
201    // Returns the latency cost of a given instruction.
202    Optional<uint64_t> computeInstCost(const Instruction *I);
203  
204    // Returns the misprediction cost of a given select when converted to branch.
205    Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
206  
207    // Returns the cost of a branch when the prediction is correct.
208    Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
209                                  const SelectInst *SI);
210  
211    // Returns true if the target architecture supports lowering a given select.
212    bool isSelectKindSupported(SelectInst *SI);
213  };
214  } // namespace
215  
216  char SelectOptimize::ID = 0;
217  
218  INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
219                        false)
220  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
221  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
222  INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
223  INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
224  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
225  INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
226  INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
227                      false)
228  
229  FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
230  
231  bool SelectOptimize::runOnFunction(Function &F) {
232    TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
233    TSI = TM->getSubtargetImpl(F);
234    TLI = TSI->getTargetLowering();
235  
236    // If none of the select types is supported then skip this pass.
237    // This is an optimization pass. Legality issues will be handled by
238    // instruction selection.
239    if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
240        !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
241        !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
242      return false;
243  
244    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
245    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
246    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
247    BPI.reset(new BranchProbabilityInfo(F, *LI));
248    BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
249    PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
250    ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
251    TSchedModel.init(TSI);
252  
253    // When optimizing for size, selects are preferable over branches.
254    if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
255      return false;
256  
257    return optimizeSelects(F);
258  }
259  
260  bool SelectOptimize::optimizeSelects(Function &F) {
261    // Determine for which select groups it is profitable converting to branches.
262    SelectGroups ProfSIGroups;
263    // Base heuristics apply only to non-loops and outer loops.
264    optimizeSelectsBase(F, ProfSIGroups);
265    // Separate heuristics for inner-most loops.
266    optimizeSelectsInnerLoops(F, ProfSIGroups);
267  
268    // Convert to branches the select groups that were deemed
269    // profitable-to-convert.
270    convertProfitableSIGroups(ProfSIGroups);
271  
272    // Code modified if at least one select group was converted.
273    return !ProfSIGroups.empty();
274  }
275  
276  void SelectOptimize::optimizeSelectsBase(Function &F,
277                                           SelectGroups &ProfSIGroups) {
278    // Collect all the select groups.
279    SelectGroups SIGroups;
280    for (BasicBlock &BB : F) {
281      // Base heuristics apply only to non-loops and outer loops.
282      Loop *L = LI->getLoopFor(&BB);
283      if (L && L->isInnermost())
284        continue;
285      collectSelectGroups(BB, SIGroups);
286    }
287  
288    // Determine for which select groups it is profitable converting to branches.
289    findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
290  }
291  
292  void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
293                                                 SelectGroups &ProfSIGroups) {
294    SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
295    // Need to check size on each iteration as we accumulate child loops.
296    for (unsigned long i = 0; i < Loops.size(); ++i)
297      for (Loop *ChildL : Loops[i]->getSubLoops())
298        Loops.push_back(ChildL);
299  
300    for (Loop *L : Loops) {
301      if (!L->isInnermost())
302        continue;
303  
304      SelectGroups SIGroups;
305      for (BasicBlock *BB : L->getBlocks())
306        collectSelectGroups(*BB, SIGroups);
307  
308      findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
309    }
310  }
311  
312  /// If \p isTrue is true, return the true value of \p SI, otherwise return
313  /// false value of \p SI. If the true/false value of \p SI is defined by any
314  /// select instructions in \p Selects, look through the defining select
315  /// instruction until the true/false value is not defined in \p Selects.
316  static Value *
317  getTrueOrFalseValue(SelectInst *SI, bool isTrue,
318                      const SmallPtrSet<const Instruction *, 2> &Selects) {
319    Value *V = nullptr;
320    for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
321         DefSI = dyn_cast<SelectInst>(V)) {
322      assert(DefSI->getCondition() == SI->getCondition() &&
323             "The condition of DefSI does not match with SI");
324      V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
325    }
326    assert(V && "Failed to get select true/false value");
327    return V;
328  }
329  
330  void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
331    for (SelectGroup &ASI : ProfSIGroups) {
332      // The code transformation here is a modified version of the sinking
333      // transformation in CodeGenPrepare::optimizeSelectInst with a more
334      // aggressive strategy of which instructions to sink.
335      //
336      // TODO: eliminate the redundancy of logic transforming selects to branches
337      // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
338      // selects for all cases (with and without profile information).
339  
340      // Transform a sequence like this:
341      //    start:
342      //       %cmp = cmp uge i32 %a, %b
343      //       %sel = select i1 %cmp, i32 %c, i32 %d
344      //
345      // Into:
346      //    start:
347      //       %cmp = cmp uge i32 %a, %b
348      //       %cmp.frozen = freeze %cmp
349      //       br i1 %cmp.frozen, label %select.true, label %select.false
350      //    select.true:
351      //       br label %select.end
352      //    select.false:
353      //       br label %select.end
354      //    select.end:
355      //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
356      //
357      // %cmp should be frozen, otherwise it may introduce undefined behavior.
358      // In addition, we may sink instructions that produce %c or %d into the
359      // destination(s) of the new branch.
360      // If the true or false blocks do not contain a sunken instruction, that
361      // block and its branch may be optimized away. In that case, one side of the
362      // first branch will point directly to select.end, and the corresponding PHI
363      // predecessor block will be the start block.
364  
365      // Find all the instructions that can be soundly sunk to the true/false
366      // blocks. These are instructions that are computed solely for producing the
367      // operands of the select instructions in the group and can be sunk without
368      // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
369      // with side effects).
370      SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
371      typedef std::stack<Instruction *>::size_type StackSizeType;
372      StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
373      for (SelectInst *SI : ASI) {
374        // For each select, compute the sinkable dependence chains of the true and
375        // false operands.
376        if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
377          std::stack<Instruction *> TrueSlice;
378          getExclBackwardsSlice(TI, TrueSlice, true);
379          maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
380          TrueSlices.push_back(TrueSlice);
381        }
382        if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
383          std::stack<Instruction *> FalseSlice;
384          getExclBackwardsSlice(FI, FalseSlice, true);
385          maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
386          FalseSlices.push_back(FalseSlice);
387        }
388      }
389      // In the case of multiple select instructions in the same group, the order
390      // of non-dependent instructions (instructions of different dependence
391      // slices) in the true/false blocks appears to affect performance.
392      // Interleaving the slices seems to experimentally be the optimal approach.
393      // This interleaving scheduling allows for more ILP (with a natural downside
394      // of increasing a bit register pressure) compared to a simple ordering of
395      // one whole chain after another. One would expect that this ordering would
396      // not matter since the scheduling in the backend of the compiler  would
397      // take care of it, but apparently the scheduler fails to deliver optimal
398      // ILP with a naive ordering here.
399      SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
400      for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
401        for (auto &S : TrueSlices) {
402          if (!S.empty()) {
403            TrueSlicesInterleaved.push_back(S.top());
404            S.pop();
405          }
406        }
407      }
408      for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
409        for (auto &S : FalseSlices) {
410          if (!S.empty()) {
411            FalseSlicesInterleaved.push_back(S.top());
412            S.pop();
413          }
414        }
415      }
416  
417      // We split the block containing the select(s) into two blocks.
418      SelectInst *SI = ASI.front();
419      SelectInst *LastSI = ASI.back();
420      BasicBlock *StartBlock = SI->getParent();
421      BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
422      BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
423      BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
424      // Delete the unconditional branch that was just created by the split.
425      StartBlock->getTerminator()->eraseFromParent();
426  
427      // Move any debug/pseudo instructions that were in-between the select
428      // group to the newly-created end block.
429      SmallVector<Instruction *, 2> DebugPseudoINS;
430      auto DIt = SI->getIterator();
431      while (&*DIt != LastSI) {
432        if (DIt->isDebugOrPseudoInst())
433          DebugPseudoINS.push_back(&*DIt);
434        DIt++;
435      }
436      for (auto *DI : DebugPseudoINS) {
437        DI->moveBefore(&*EndBlock->getFirstInsertionPt());
438      }
439  
440      // These are the new basic blocks for the conditional branch.
441      // At least one will become an actual new basic block.
442      BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
443      BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
444      if (!TrueSlicesInterleaved.empty()) {
445        TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
446                                       EndBlock->getParent(), EndBlock);
447        TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
448        TrueBranch->setDebugLoc(LastSI->getDebugLoc());
449        for (Instruction *TrueInst : TrueSlicesInterleaved)
450          TrueInst->moveBefore(TrueBranch);
451      }
452      if (!FalseSlicesInterleaved.empty()) {
453        FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
454                                        EndBlock->getParent(), EndBlock);
455        FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
456        FalseBranch->setDebugLoc(LastSI->getDebugLoc());
457        for (Instruction *FalseInst : FalseSlicesInterleaved)
458          FalseInst->moveBefore(FalseBranch);
459      }
460      // If there was nothing to sink, then arbitrarily choose the 'false' side
461      // for a new input value to the PHI.
462      if (TrueBlock == FalseBlock) {
463        assert(TrueBlock == nullptr &&
464               "Unexpected basic block transform while optimizing select");
465  
466        FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
467                                        EndBlock->getParent(), EndBlock);
468        auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
469        FalseBranch->setDebugLoc(SI->getDebugLoc());
470      }
471  
472      // Insert the real conditional branch based on the original condition.
473      // If we did not create a new block for one of the 'true' or 'false' paths
474      // of the condition, it means that side of the branch goes to the end block
475      // directly and the path originates from the start block from the point of
476      // view of the new PHI.
477      BasicBlock *TT, *FT;
478      if (TrueBlock == nullptr) {
479        TT = EndBlock;
480        FT = FalseBlock;
481        TrueBlock = StartBlock;
482      } else if (FalseBlock == nullptr) {
483        TT = TrueBlock;
484        FT = EndBlock;
485        FalseBlock = StartBlock;
486      } else {
487        TT = TrueBlock;
488        FT = FalseBlock;
489      }
490      IRBuilder<> IB(SI);
491      auto *CondFr =
492          IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
493      IB.CreateCondBr(CondFr, TT, FT, SI);
494  
495      SmallPtrSet<const Instruction *, 2> INS;
496      INS.insert(ASI.begin(), ASI.end());
497      // Use reverse iterator because later select may use the value of the
498      // earlier select, and we need to propagate value through earlier select
499      // to get the PHI operand.
500      for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
501        SelectInst *SI = *It;
502        // The select itself is replaced with a PHI Node.
503        PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
504        PN->takeName(SI);
505        PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
506        PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
507        PN->setDebugLoc(SI->getDebugLoc());
508  
509        SI->replaceAllUsesWith(PN);
510        SI->eraseFromParent();
511        INS.erase(SI);
512        ++NumSelectsConverted;
513      }
514    }
515  }
516  
517  void SelectOptimize::collectSelectGroups(BasicBlock &BB,
518                                           SelectGroups &SIGroups) {
519    BasicBlock::iterator BBIt = BB.begin();
520    while (BBIt != BB.end()) {
521      Instruction *I = &*BBIt++;
522      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
523        SelectGroup SIGroup;
524        SIGroup.push_back(SI);
525        while (BBIt != BB.end()) {
526          Instruction *NI = &*BBIt;
527          SelectInst *NSI = dyn_cast<SelectInst>(NI);
528          if (NSI && SI->getCondition() == NSI->getCondition()) {
529            SIGroup.push_back(NSI);
530          } else if (!NI->isDebugOrPseudoInst()) {
531            // Debug/pseudo instructions should be skipped and not prevent the
532            // formation of a select group.
533            break;
534          }
535          ++BBIt;
536        }
537  
538        // If the select type is not supported, no point optimizing it.
539        // Instruction selection will take care of it.
540        if (!isSelectKindSupported(SI))
541          continue;
542  
543        SIGroups.push_back(SIGroup);
544      }
545    }
546  }
547  
548  void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
549                                                  SelectGroups &ProfSIGroups) {
550    for (SelectGroup &ASI : SIGroups) {
551      ++NumSelectOptAnalyzed;
552      if (isConvertToBranchProfitableBase(ASI))
553        ProfSIGroups.push_back(ASI);
554    }
555  }
556  
557  void SelectOptimize::findProfitableSIGroupsInnerLoops(
558      const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
559    NumSelectOptAnalyzed += SIGroups.size();
560    // For each select group in an inner-most loop,
561    // a branch is more preferable than a select/conditional-move if:
562    // i) conversion to branches for all the select groups of the loop satisfies
563    //    loop-level heuristics including reducing the loop's critical path by
564    //    some threshold (see SelectOptimize::checkLoopHeuristics); and
565    // ii) the total cost of the select group is cheaper with a branch compared
566    //     to its predicated version. The cost is in terms of latency and the cost
567    //     of a select group is the cost of its most expensive select instruction
568    //     (assuming infinite resources and thus fully leveraging available ILP).
569  
570    DenseMap<const Instruction *, CostInfo> InstCostMap;
571    CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
572                            {Scaled64::getZero(), Scaled64::getZero()}};
573    if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
574        !checkLoopHeuristics(L, LoopCost)) {
575      return;
576    }
577  
578    for (SelectGroup &ASI : SIGroups) {
579      // Assuming infinite resources, the cost of a group of instructions is the
580      // cost of the most expensive instruction of the group.
581      Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
582      for (SelectInst *SI : ASI) {
583        SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
584        BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
585      }
586      if (BranchCost < SelectCost) {
587        OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
588        OR << "Profitable to convert to branch (loop analysis). BranchCost="
589           << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
590           << ". ";
591        ORE->emit(OR);
592        ++NumSelectConvertedLoop;
593        ProfSIGroups.push_back(ASI);
594      } else {
595        OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
596        ORmiss << "Select is more profitable (loop analysis). BranchCost="
597               << BranchCost.toString()
598               << ", SelectCost=" << SelectCost.toString() << ". ";
599        ORE->emit(ORmiss);
600      }
601    }
602  }
603  
604  bool SelectOptimize::isConvertToBranchProfitableBase(
605      const SmallVector<SelectInst *, 2> &ASI) {
606    SelectInst *SI = ASI.front();
607    OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
608    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
609  
610    // Skip cold basic blocks. Better to optimize for size for cold blocks.
611    if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
612      ++NumSelectColdBB;
613      ORmiss << "Not converted to branch because of cold basic block. ";
614      ORE->emit(ORmiss);
615      return false;
616    }
617  
618    // If unpredictable, branch form is less profitable.
619    if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
620      ++NumSelectUnPred;
621      ORmiss << "Not converted to branch because of unpredictable branch. ";
622      ORE->emit(ORmiss);
623      return false;
624    }
625  
626    // If highly predictable, branch form is more profitable, unless a
627    // predictable select is inexpensive in the target architecture.
628    if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
629      ++NumSelectConvertedHighPred;
630      OR << "Converted to branch because of highly predictable branch. ";
631      ORE->emit(OR);
632      return true;
633    }
634  
635    // Look for expensive instructions in the cold operand's (if any) dependence
636    // slice of any of the selects in the group.
637    if (hasExpensiveColdOperand(ASI)) {
638      ++NumSelectConvertedExpColdOperand;
639      OR << "Converted to branch because of expensive cold operand.";
640      ORE->emit(OR);
641      return true;
642    }
643  
644    ORmiss << "Not profitable to convert to branch (base heuristic).";
645    ORE->emit(ORmiss);
646    return false;
647  }
648  
649  static InstructionCost divideNearest(InstructionCost Numerator,
650                                       uint64_t Denominator) {
651    return (Numerator + (Denominator / 2)) / Denominator;
652  }
653  
654  bool SelectOptimize::hasExpensiveColdOperand(
655      const SmallVector<SelectInst *, 2> &ASI) {
656    bool ColdOperand = false;
657    uint64_t TrueWeight, FalseWeight, TotalWeight;
658    if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) {
659      uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
660      TotalWeight = TrueWeight + FalseWeight;
661      // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
662      ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
663    } else if (PSI->hasProfileSummary()) {
664      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
665      ORmiss << "Profile data available but missing branch-weights metadata for "
666                "select instruction. ";
667      ORE->emit(ORmiss);
668    }
669    if (!ColdOperand)
670      return false;
671    // Check if the cold path's dependence slice is expensive for any of the
672    // selects of the group.
673    for (SelectInst *SI : ASI) {
674      Instruction *ColdI = nullptr;
675      uint64_t HotWeight;
676      if (TrueWeight < FalseWeight) {
677        ColdI = dyn_cast<Instruction>(SI->getTrueValue());
678        HotWeight = FalseWeight;
679      } else {
680        ColdI = dyn_cast<Instruction>(SI->getFalseValue());
681        HotWeight = TrueWeight;
682      }
683      if (ColdI) {
684        std::stack<Instruction *> ColdSlice;
685        getExclBackwardsSlice(ColdI, ColdSlice);
686        InstructionCost SliceCost = 0;
687        while (!ColdSlice.empty()) {
688          SliceCost += TTI->getInstructionCost(ColdSlice.top(),
689                                               TargetTransformInfo::TCK_Latency);
690          ColdSlice.pop();
691        }
692        // The colder the cold value operand of the select is the more expensive
693        // the cmov becomes for computing the cold value operand every time. Thus,
694        // the colder the cold operand is the more its cost counts.
695        // Get nearest integer cost adjusted for coldness.
696        InstructionCost AdjSliceCost =
697            divideNearest(SliceCost * HotWeight, TotalWeight);
698        if (AdjSliceCost >=
699            ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
700          return true;
701      }
702    }
703    return false;
704  }
705  
706  // For a given source instruction, collect its backwards dependence slice
707  // consisting of instructions exclusively computed for the purpose of producing
708  // the operands of the source instruction. As an approximation
709  // (sufficiently-accurate in practice), we populate this set with the
710  // instructions of the backwards dependence slice that only have one-use and
711  // form an one-use chain that leads to the source instruction.
712  void SelectOptimize::getExclBackwardsSlice(Instruction *I,
713                                             std::stack<Instruction *> &Slice,
714                                             bool ForSinking) {
715    SmallPtrSet<Instruction *, 2> Visited;
716    std::queue<Instruction *> Worklist;
717    Worklist.push(I);
718    while (!Worklist.empty()) {
719      Instruction *II = Worklist.front();
720      Worklist.pop();
721  
722      // Avoid cycles.
723      if (!Visited.insert(II).second)
724        continue;
725  
726      if (!II->hasOneUse())
727        continue;
728  
729      // Cannot soundly sink instructions with side-effects.
730      // Terminator or phi instructions cannot be sunk.
731      // Avoid sinking other select instructions (should be handled separetely).
732      if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
733                         isa<SelectInst>(II) || isa<PHINode>(II)))
734        continue;
735  
736      // Avoid considering instructions with less frequency than the source
737      // instruction (i.e., avoid colder code regions of the dependence slice).
738      if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
739        continue;
740  
741      // Eligible one-use instruction added to the dependence slice.
742      Slice.push(II);
743  
744      // Explore all the operands of the current instruction to expand the slice.
745      for (unsigned k = 0; k < II->getNumOperands(); ++k)
746        if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
747          Worklist.push(OpI);
748    }
749  }
750  
751  bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
752    uint64_t TrueWeight, FalseWeight;
753    if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
754      uint64_t Max = std::max(TrueWeight, FalseWeight);
755      uint64_t Sum = TrueWeight + FalseWeight;
756      if (Sum != 0) {
757        auto Probability = BranchProbability::getBranchProbability(Max, Sum);
758        if (Probability > TTI->getPredictableBranchThreshold())
759          return true;
760      }
761    }
762    return false;
763  }
764  
765  bool SelectOptimize::checkLoopHeuristics(const Loop *L,
766                                           const CostInfo LoopCost[2]) {
767    // Loop-level checks to determine if a non-predicated version (with branches)
768    // of the loop is more profitable than its predicated version.
769  
770    if (DisableLoopLevelHeuristics)
771      return true;
772  
773    OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
774                                     L->getHeader()->getFirstNonPHI());
775  
776    if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
777        LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
778      ORmissL << "No select conversion in the loop due to no reduction of loop's "
779                 "critical path. ";
780      ORE->emit(ORmissL);
781      return false;
782    }
783  
784    Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
785                        LoopCost[1].PredCost - LoopCost[1].NonPredCost};
786  
787    // Profitably converting to branches need to reduce the loop's critical path
788    // by at least some threshold (absolute gain of GainCycleThreshold cycles and
789    // relative gain of 12.5%).
790    if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
791        Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
792      Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
793      ORmissL << "No select conversion in the loop due to small reduction of "
794                 "loop's critical path. Gain="
795              << Gain[1].toString()
796              << ", RelativeGain=" << RelativeGain.toString() << "%. ";
797      ORE->emit(ORmissL);
798      return false;
799    }
800  
801    // If the loop's critical path involves loop-carried dependences, the gradient
802    // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
803    // This check ensures that the latency reduction for the loop's critical path
804    // keeps decreasing with sufficient rate beyond the two analyzed loop
805    // iterations.
806    if (Gain[1] > Gain[0]) {
807      Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
808                              (LoopCost[1].PredCost - LoopCost[0].PredCost);
809      if (GradientGain < Scaled64::get(GainGradientThreshold)) {
810        ORmissL << "No select conversion in the loop due to small gradient gain. "
811                   "GradientGain="
812                << GradientGain.toString() << "%. ";
813        ORE->emit(ORmissL);
814        return false;
815      }
816    }
817    // If the gain decreases it is not profitable to convert.
818    else if (Gain[1] < Gain[0]) {
819      ORmissL
820          << "No select conversion in the loop due to negative gradient gain. ";
821      ORE->emit(ORmissL);
822      return false;
823    }
824  
825    // Non-predicated version of the loop is more profitable than its
826    // predicated version.
827    return true;
828  }
829  
830  // Computes instruction and loop-critical-path costs for both the predicated
831  // and non-predicated version of the given loop.
832  // Returns false if unable to compute these costs due to invalid cost of loop
833  // instruction(s).
834  bool SelectOptimize::computeLoopCosts(
835      const Loop *L, const SelectGroups &SIGroups,
836      DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
837    const auto &SIset = getSIset(SIGroups);
838    // Compute instruction and loop-critical-path costs across two iterations for
839    // both predicated and non-predicated version.
840    const unsigned Iterations = 2;
841    for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
842      // Cost of the loop's critical path.
843      CostInfo &MaxCost = LoopCost[Iter];
844      for (BasicBlock *BB : L->getBlocks()) {
845        for (const Instruction &I : *BB) {
846          if (I.isDebugOrPseudoInst())
847            continue;
848          // Compute the predicated and non-predicated cost of the instruction.
849          Scaled64 IPredCost = Scaled64::getZero(),
850                   INonPredCost = Scaled64::getZero();
851  
852          // Assume infinite resources that allow to fully exploit the available
853          // instruction-level parallelism.
854          // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
855          for (const Use &U : I.operands()) {
856            auto UI = dyn_cast<Instruction>(U.get());
857            if (!UI)
858              continue;
859            if (InstCostMap.count(UI)) {
860              IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
861              INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
862            }
863          }
864          auto ILatency = computeInstCost(&I);
865          if (!ILatency) {
866            OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
867            ORmissL << "Invalid instruction cost preventing analysis and "
868                       "optimization of the inner-most loop containing this "
869                       "instruction. ";
870            ORE->emit(ORmissL);
871            return false;
872          }
873          IPredCost += Scaled64::get(ILatency.value());
874          INonPredCost += Scaled64::get(ILatency.value());
875  
876          // For a select that can be converted to branch,
877          // compute its cost as a branch (non-predicated cost).
878          //
879          // BranchCost = PredictedPathCost + MispredictCost
880          // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
881          // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
882          if (SIset.contains(&I)) {
883            auto SI = dyn_cast<SelectInst>(&I);
884  
885            Scaled64 TrueOpCost = Scaled64::getZero(),
886                     FalseOpCost = Scaled64::getZero();
887            if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
888              if (InstCostMap.count(TI))
889                TrueOpCost = InstCostMap[TI].NonPredCost;
890            if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
891              if (InstCostMap.count(FI))
892                FalseOpCost = InstCostMap[FI].NonPredCost;
893            Scaled64 PredictedPathCost =
894                getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
895  
896            Scaled64 CondCost = Scaled64::getZero();
897            if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
898              if (InstCostMap.count(CI))
899                CondCost = InstCostMap[CI].NonPredCost;
900            Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
901  
902            INonPredCost = PredictedPathCost + MispredictCost;
903          }
904  
905          InstCostMap[&I] = {IPredCost, INonPredCost};
906          MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
907          MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
908        }
909      }
910    }
911    return true;
912  }
913  
914  SmallPtrSet<const Instruction *, 2>
915  SelectOptimize::getSIset(const SelectGroups &SIGroups) {
916    SmallPtrSet<const Instruction *, 2> SIset;
917    for (const SelectGroup &ASI : SIGroups)
918      for (const SelectInst *SI : ASI)
919        SIset.insert(SI);
920    return SIset;
921  }
922  
923  Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
924    InstructionCost ICost =
925        TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
926    if (auto OC = ICost.getValue())
927      return Optional<uint64_t>(*OC);
928    return Optional<uint64_t>(None);
929  }
930  
931  ScaledNumber<uint64_t>
932  SelectOptimize::getMispredictionCost(const SelectInst *SI,
933                                       const Scaled64 CondCost) {
934    uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
935  
936    // Account for the default misprediction rate when using a branch
937    // (conservatively set to 25% by default).
938    uint64_t MispredictRate = MispredictDefaultRate;
939    // If the select condition is obviously predictable, then the misprediction
940    // rate is zero.
941    if (isSelectHighlyPredictable(SI))
942      MispredictRate = 0;
943  
944    // CondCost is included to account for cases where the computation of the
945    // condition is part of a long dependence chain (potentially loop-carried)
946    // that would delay detection of a misprediction and increase its cost.
947    Scaled64 MispredictCost =
948        std::max(Scaled64::get(MispredictPenalty), CondCost) *
949        Scaled64::get(MispredictRate);
950    MispredictCost /= Scaled64::get(100);
951  
952    return MispredictCost;
953  }
954  
955  // Returns the cost of a branch when the prediction is correct.
956  // TrueCost * TrueProbability + FalseCost * FalseProbability.
957  ScaledNumber<uint64_t>
958  SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
959                                       const SelectInst *SI) {
960    Scaled64 PredPathCost;
961    uint64_t TrueWeight, FalseWeight;
962    if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
963      uint64_t SumWeight = TrueWeight + FalseWeight;
964      if (SumWeight != 0) {
965        PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
966                       FalseCost * Scaled64::get(FalseWeight);
967        PredPathCost /= Scaled64::get(SumWeight);
968        return PredPathCost;
969      }
970    }
971    // Without branch weight metadata, we assume 75% for the one path and 25% for
972    // the other, and pick the result with the biggest cost.
973    PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
974                            FalseCost * Scaled64::get(3) + TrueCost);
975    PredPathCost /= Scaled64::get(4);
976    return PredPathCost;
977  }
978  
979  bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
980    bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
981    if (VectorCond)
982      return false;
983    TargetLowering::SelectSupportKind SelectKind;
984    if (SI->getType()->isVectorTy())
985      SelectKind = TargetLowering::ScalarCondVectorVal;
986    else
987      SelectKind = TargetLowering::ScalarValSelect;
988    return TLI->isSelectSupported(SelectKind);
989  }
990