//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize = false;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                               MachineTraceMetrics::Trace BlockTrace,
                               SmallVectorImpl<MachineInstr *> &InsInstrs,
                               SmallVectorImpl<MachineInstr *> &DelInstrs,
                               DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                               unsigned Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              unsigned Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<unsigned, 16> &Patterns);
  CombinerObjective getCombinerObjective(unsigned Pattern);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addPreserved<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

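/// Return the unique machine instruction that defines \p MO, or nullptr if
/// \p MO is not a virtual register or has no unique definition.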
MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
        }
      }
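      // The depth of this instruction is the maximum over its operands of
      // the defining instruction's depth plus the latency across that edge.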
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
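    // Skip the first entry on the use-list, which is assumed to be the
    // defining operand in NewRoot itself.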
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

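/// Map \p Pattern to the objective the combiner uses to evaluate it. The
/// generic reassociation patterns must strictly reduce dependency depth;
/// all other patterns defer to the target's classification.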
CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (Pattern) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return TII->getCombinerObjective(Pattern);
  }
}

/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
  // FIXME: For now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we could do better if we made the
  // RegPressureTracker class aware of the TIE attribute; then we could compare
  // the register pressure of DelInstrs and InsInstrs accurately.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up the latencies of each instruction sequence.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

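  // Slack is the number of cycles Root may be delayed before it lengthens
  // the critical path; it is only counted when the trace data is accurate.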
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
/// otherwise invalidate the trace
static void
insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                         SmallVectorImpl<MachineInstr *> &InsInstrs,
                         SmallVectorImpl<MachineInstr *> &DelInstrs,
                         MachineTraceMetrics::Ensemble *TraceEnsemble,
                         SparseSet<LiveRegUnit> &RegUnits,
                         const TargetInstrInfo *TII, unsigned Pattern,
                         bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because at this point in genAlternativeCodeSequence we have
  // not yet decided whether InsInstrs is the better pattern, so we do not want
  // to produce side effects on the function prematurely. For example, the
  // creation of a constant pool entry must be delayed until InsInstrs has been
  // selected as the better pattern; otherwise the entry created for InsInstrs
  // would not be deleted even if InsInstrs turned out not to be the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB,
                                         MachineInstr &Root,
                                         SmallVector<unsigned, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

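  // Track live register units so that instruction depths can be updated
  // incrementally after a combine instead of recomputing the whole trace.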
  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<unsigned, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

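/// Per-function entry point: cache subtarget hooks and analyses, then try to
/// combine instructions in every basic block of \p MF.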
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  TraceEnsemble = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}