//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Hexagon specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "HexagonSubtarget.h"
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <map>

using namespace llvm;

#define DEBUG_TYPE "hexagon-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"

static cl::opt<bool> EnableBSBSched("enable-bsb-sched", cl::Hidden,
                                    cl::init(true));

static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched", cl::Hidden,
                                          cl::init(false));

static cl::opt<bool>
    EnableDotCurSched("enable-cur-sched", cl::Hidden, cl::init(true),
                      cl::desc("Enable the scheduler to generate .cur"));

static cl::opt<bool>
    DisableHexagonMISched("disable-hexagon-misched", cl::Hidden,
                          cl::desc("Disable Hexagon MI Scheduling"));

static cl::opt<bool> EnableSubregLiveness(
    "hexagon-subreg-liveness", cl::Hidden, cl::init(true),
    cl::desc("Enable subregister liveness tracking for Hexagon"));

static cl::opt<bool> OverrideLongCalls(
    "hexagon-long-calls", cl::Hidden,
    cl::desc("If present, forces/disables the use of long calls"));

static cl::opt<bool>
    EnablePredicatedCalls("hexagon-pred-calls", cl::Hidden,
                          cl::desc("Consider calls to be predicable"));

static cl::opt<bool> SchedPredsCloser("sched-preds-closer", cl::Hidden,
                                      cl::init(true));

static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization",
                                             cl::Hidden, cl::init(true));

static cl::opt<bool> EnableCheckBankConflict(
    "hexagon-check-bank-conflict", cl::Hidden, cl::init(true),
    cl::desc("Enable checking for cache bank conflicts"));

static cl::opt<bool> EnableV68FloatCodeGen(
    "force-hvx-float", cl::Hidden,
    cl::desc(
        "Enable the code-generation for vector float instructions on v68."));

HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
                                   StringRef FS, const TargetMachine &TM)
    : HexagonGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      OptLevel(TM.getOptLevel()),
      CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))),
      TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
      RegInfo(getHwMode()), TLInfo(TM, *this),
      InstrItins(getInstrItineraryForCPU(CPUString)) {
  Hexagon_MC::addArchSubtarget(this, FS);
  // Beware of the default constructor of InstrItineraryData: it will
  // reset all members to 0.
  assert(InstrItins.Itineraries != nullptr && "InstrItins not initialized");
}

HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
  Optional<Hexagon::ArchEnum> ArchVer = Hexagon::getCpu(CPUString);
  if (ArchVer)
    HexagonArchVersion = *ArchVer;
  else
    llvm_unreachable("Unrecognized Hexagon processor version");

  UseHVX128BOps = false;
  UseHVX64BOps = false;
  UseAudioOps = false;
  UseLongCalls = false;

  SubtargetFeatures Features(FS);

  // Turn on QFloat if the HVX version is v68+.
  // The function ParseSubtargetFeatures will set feature bits and initialize
  // subtarget's variables all in one, so there isn't a good way to preprocess
  // the feature string, other than by tinkering with it directly.
  auto IsQFloatFS = [](StringRef F) {
    return F == "+hvx-qfloat" || F == "-hvx-qfloat";
  };
  if (!llvm::count_if(Features.getFeatures(), IsQFloatFS)) {
    auto getHvxVersion = [&Features](StringRef FS) -> StringRef {
      for (StringRef F : llvm::reverse(Features.getFeatures())) {
        if (F.startswith("+hvxv"))
          return F;
      }
      for (StringRef F : llvm::reverse(Features.getFeatures())) {
        if (F == "-hvx")
          return StringRef();
        if (F.startswith("+hvx") || F == "-hvx")
          return F.take_front(4);  // Return "+hvx" or "-hvx".
      }
      return StringRef();
    };

    bool AddQFloat = false;
    StringRef HvxVer = getHvxVersion(FS);
    if (HvxVer.startswith("+hvxv")) {
      int Ver = 0;
      if (!HvxVer.drop_front(5).consumeInteger(10, Ver) && Ver >= 68)
        AddQFloat = true;
    } else if (HvxVer == "+hvx") {
      if (hasV68Ops())
        AddQFloat = true;
    }

    if (AddQFloat)
      Features.AddFeature("+hvx-qfloat");
  }

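  // For example, a feature string such as "+hvxv68,+hvx-length128b" with no
  // explicit "+hvx-qfloat"/"-hvx-qfloat" entry gets "+hvx-qfloat" appended
  // above, since the detected HVX version (68) is at least 68.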
  std::string FeatureString = Features.getString();
  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FeatureString);

  // Enable float code generation only if the flag(s) are set and
  // the feature is enabled. v68 is guarded by additional flags.
  bool GreaterThanV68 = false;
  if (useHVXV69Ops())
    GreaterThanV68 = true;

  // Support for deprecated qfloat/ieee codegen flags
  if (!GreaterThanV68) {
    if (EnableV68FloatCodeGen)
      UseHVXFloatingPoint = true;
  } else {
    UseHVXFloatingPoint = true;
  }

  if (UseHVXQFloatOps && UseHVXIEEEFPOps && UseHVXFloatingPoint)
    LLVM_DEBUG(
        dbgs() << "Behavior is undefined for simultaneous qfloat and ieee hvx codegen...");

  if (OverrideLongCalls.getPosition())
    UseLongCalls = OverrideLongCalls;

  UseBSBScheduling = hasV60Ops() && EnableBSBSched;

  if (isTinyCore()) {
    // Tiny core has a single thread, so back-to-back scheduling is enabled by
    // default.
    if (!EnableBSBSched.getPosition())
      UseBSBScheduling = false;
  }

  FeatureBitset FeatureBits = getFeatureBits();
  if (HexagonDisableDuplex)
    setFeatureBits(FeatureBits.reset(Hexagon::FeatureDuplex));
  setFeatureBits(Hexagon_MC::completeHVXFeatures(FeatureBits));

  return *this;
}

bool HexagonSubtarget::isHVXElementType(MVT Ty, bool IncludeBool) const {
  if (!useHVXOps())
    return false;
  if (Ty.isVector())
    Ty = Ty.getVectorElementType();
  if (IncludeBool && Ty == MVT::i1)
    return true;
  ArrayRef<MVT> ElemTypes = getHVXElementTypes();
  return llvm::is_contained(ElemTypes, Ty);
}

bool HexagonSubtarget::isHVXVectorType(MVT VecTy, bool IncludeBool) const {
  if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector())
    return false;
  MVT ElemTy = VecTy.getVectorElementType();
  if (!IncludeBool && ElemTy == MVT::i1)
    return false;

  unsigned HwLen = getVectorLength();
  unsigned NumElems = VecTy.getVectorNumElements();
  ArrayRef<MVT> ElemTypes = getHVXElementTypes();

  if (IncludeBool && ElemTy == MVT::i1) {
    // Boolean HVX vector types are formed from regular HVX vector types
    // by replacing the element type with i1.
    for (MVT T : ElemTypes)
      if (NumElems * T.getSizeInBits() == 8 * HwLen)
        return true;
    return false;
  }

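  // Non-boolean HVX vectors must occupy exactly one vector register
  // (8 * HwLen bits) or one vector register pair (16 * HwLen bits).
  // For example, in the 128-byte HVX mode, v32i32 (1024 bits) and
  // v64i32 (2048 bits) both pass the width check below.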
  unsigned VecWidth = VecTy.getSizeInBits();
  if (VecWidth != 8 * HwLen && VecWidth != 16 * HwLen)
    return false;
  return llvm::is_contained(ElemTypes, ElemTy);
}

bool HexagonSubtarget::isTypeForHVX(Type *VecTy, bool IncludeBool) const {
  if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy))
    return false;
  // Avoid types like <2 x i32*>.
  Type *ScalTy = VecTy->getScalarType();
  if (!ScalTy->isIntegerTy() &&
      !(ScalTy->isFloatingPointTy() && useHVXFloatingPoint()))
    return false;
  // The given type may be something like <17 x i32>, which is not an MVT,
  // but can be represented as a (non-simple) EVT.
  EVT Ty = EVT::getEVT(VecTy, /*HandleUnknown*/false);
  if (Ty.getSizeInBits() <= 64 || !Ty.getVectorElementType().isSimple())
    return false;

  auto isHvxTy = [this, IncludeBool](MVT SimpleTy) {
    if (isHVXVectorType(SimpleTy, IncludeBool))
      return true;
    auto Action = getTargetLowering()->getPreferredVectorAction(SimpleTy);
    return Action == TargetLoweringBase::TypeWidenVector;
  };

  // Round up EVT to have power-of-2 elements, and keep checking if it
  // qualifies for HVX, dividing it in half after each step.
  MVT ElemTy = Ty.getVectorElementType().getSimpleVT();
  unsigned VecLen = PowerOf2Ceil(Ty.getVectorNumElements());
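  // For example, <17 x i32> is rounded up to 32 elements and then checked as
  // v32i32, v16i32, v8i32, and v4i32 (stopping early if one qualifies) before
  // the loop terminates at the 64-bit boundary.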
  while (ElemTy.getSizeInBits() * VecLen > 64) {
    MVT SimpleTy = MVT::getVectorVT(ElemTy, VecLen);
    if (SimpleTy.isValid() && isHvxTy(SimpleTy))
      return true;
    VecLen /= 2;
  }

  return false;
}

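// Strip output dependences on Hexagon::USR_OVF, the sticky overflow bit of
// the user status register, so that write-after-write edges on this flag do
// not constrain the scheduler.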
void HexagonSubtarget::UsrOverflowMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    if (!SU.isInstr())
      continue;
    SmallVector<SDep, 4> Erase;
    for (auto &D : SU.Preds)
      if (D.getKind() == SDep::Output && D.getReg() == Hexagon::USR_OVF)
        Erase.push_back(D);
    for (auto &E : Erase)
      SU.removePred(E);
  }
}

void HexagonSubtarget::HVXMemLatencyMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Update the latency of chain edges between v60 vector load or store
    // instructions to be 1. These instructions cannot be scheduled in the
    // same packet.
    MachineInstr &MI1 = *SU.getInstr();
    auto *QII = static_cast<const HexagonInstrInfo*>(DAG->TII);
    bool IsStoreMI1 = MI1.mayStore();
    bool IsLoadMI1 = MI1.mayLoad();
    if (!QII->isHVXVec(MI1) || !(IsStoreMI1 || IsLoadMI1))
      continue;
    for (SDep &SI : SU.Succs) {
      if (SI.getKind() != SDep::Order || SI.getLatency() != 0)
        continue;
      MachineInstr &MI2 = *SI.getSUnit()->getInstr();
      if (!QII->isHVXVec(MI2))
        continue;
      if ((IsStoreMI1 && MI2.mayStore()) || (IsLoadMI1 && MI2.mayLoad())) {
        SI.setLatency(1);
        SU.setHeightDirty();
        // Change the dependence in the opposite direction too.
        for (SDep &PI : SI.getSUnit()->Preds) {
          if (PI.getSUnit() != &SU || PI.getKind() != SDep::Order)
            continue;
          PI.setLatency(1);
          SI.getSUnit()->setDepthDirty();
        }
      }
    }
  }
}

// Check if a call and subsequent A2_tfrpi instructions should maintain
// scheduling affinity. We are looking for the TFRI to be consumed in
// the next instruction. This should help reduce the instances of
// double register pairs being allocated and scheduled before a call
// when not used until after the call. This situation is exacerbated
// by the fact that we allocate the pair from the callee-saved list,
// leading to excess spills and restores.
bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
      const HexagonInstrInfo &HII, const SUnit &Inst1,
      const SUnit &Inst2) const {
  if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi)
    return false;

  // TypeXTYPE are 64 bit operations.
  unsigned Type = HII.getType(*Inst2.getInstr());
  return Type == HexagonII::TypeS_2op || Type == HexagonII::TypeS_3op ||
         Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM;
}

void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
  SUnit* LastSequentialCall = nullptr;
  // Map from virtual register to physical register from the copy.
  DenseMap<unsigned, unsigned> VRegHoldingReg;
  // Map from the physical register to the instruction that uses virtual
  // register. This is used to create the barrier edge.
  DenseMap<unsigned, SUnit *> LastVRegUse;
  auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
  auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();

  // Currently we only catch the situation when a compare gets scheduled
  // before the preceding call.
  for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) {
    // Remember the call.
    if (DAG->SUnits[su].getInstr()->isCall())
      LastSequentialCall = &DAG->SUnits[su];
    // Look for a compare that defines a predicate.
    else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
      DAG->addEdge(&DAG->SUnits[su], SDep(LastSequentialCall, SDep::Barrier));
    // Look for call and tfri* instructions.
    else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
             shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
      DAG->addEdge(&DAG->SUnits[su], SDep(&DAG->SUnits[su-1], SDep::Barrier));
    // Prevent redundant register copies due to reads and writes of physical
    // registers. The original motivation for this was the code generated
    // between two calls, which is caused by both the return value and the
    // argument for the next call being in %r0.
    // Example:
    //   1: <call1>
    //   2: %vreg = COPY %r0
    //   3: <use of %vreg>
    //   4: %r0 = ...
    //   5: <call2>
    // The scheduler would often swap 3 and 4, so an additional register is
    // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
    // this.
    // The code below checks for all the physical registers, not just R0/D0/V0.
    else if (SchedRetvalOptimization) {
      const MachineInstr *MI = DAG->SUnits[su].getInstr();
      if (MI->isCopy() &&
          Register::isPhysicalRegister(MI->getOperand(1).getReg())) {
        // %vregX = COPY %r0
        VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg();
        LastVRegUse.erase(MI->getOperand(1).getReg());
      } else {
        for (const MachineOperand &MO : MI->operands()) {
          if (!MO.isReg())
            continue;
          if (MO.isUse() && !MI->isCopy() &&
              VRegHoldingReg.count(MO.getReg())) {
            // <use of %vregX>
            LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su];
          } else if (MO.isDef() && Register::isPhysicalRegister(MO.getReg())) {
            for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid();
                 ++AI) {
              if (LastVRegUse.count(*AI) &&
                  LastVRegUse[*AI] != &DAG->SUnits[su])
                // %r0 = ...
                DAG->addEdge(&DAG->SUnits[su], SDep(LastVRegUse[*AI], SDep::Barrier));
              LastVRegUse.erase(*AI);
            }
          }
        }
      }
    }
  }
}

void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
  if (!EnableCheckBankConflict)
    return;

  const auto &HII = static_cast<const HexagonInstrInfo&>(*DAG->TII);

  // Create artificial edges between loads that could likely cause a bank
  // conflict. Since such loads would normally not have any dependency
  // between them, we cannot rely on existing edges.
  for (unsigned i = 0, e = DAG->SUnits.size(); i != e; ++i) {
    SUnit &S0 = DAG->SUnits[i];
    MachineInstr &L0 = *S0.getInstr();
    if (!L0.mayLoad() || L0.mayStore() ||
        HII.getAddrMode(L0) != HexagonII::BaseImmOffset)
      continue;
    int64_t Offset0;
    unsigned Size0;
    MachineOperand *BaseOp0 = HII.getBaseAndOffset(L0, Offset0, Size0);
    // If the access size is longer than the L1 cache line, skip the check.
    if (BaseOp0 == nullptr || !BaseOp0->isReg() || Size0 >= 32)
      continue;
    // Scan only up to 32 instructions ahead (to avoid n^2 complexity).
    for (unsigned j = i+1, m = std::min(i+32, e); j != m; ++j) {
      SUnit &S1 = DAG->SUnits[j];
      MachineInstr &L1 = *S1.getInstr();
      if (!L1.mayLoad() || L1.mayStore() ||
          HII.getAddrMode(L1) != HexagonII::BaseImmOffset)
        continue;
      int64_t Offset1;
      unsigned Size1;
      MachineOperand *BaseOp1 = HII.getBaseAndOffset(L1, Offset1, Size1);
      if (BaseOp1 == nullptr || !BaseOp1->isReg() || Size1 >= 32 ||
          BaseOp0->getReg() != BaseOp1->getReg())
        continue;
      // Check bits 3 and 4 of the offset: if they differ, a bank conflict
      // is unlikely.
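      // For example, offsets 0 and 8 differ in bit 3 and are skipped here,
      // while offsets 0 and 32 agree in bits 3 and 4 and get the artificial
      // edge added below.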
      if (((Offset0 ^ Offset1) & 0x18) != 0)
        continue;
      // Bits 3 and 4 are the same, add an artificial edge and set extra
      // latency.
      SDep A(&S0, SDep::Artificial);
      A.setLatency(1);
      S1.addPred(A, true);
    }
  }
}

/// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool HexagonSubtarget::useAA() const {
  if (OptLevel != CodeGenOpt::None)
    return true;
  return false;
}

/// Perform target specific adjustments to the latency of a schedule
/// dependency.
void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
                                             SUnit *Dst, int DstOpIdx,
                                             SDep &Dep) const {
  if (!Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcInst = Src->getInstr();
  MachineInstr *DstInst = Dst->getInstr();
  const HexagonInstrInfo *QII = getInstrInfo();

  // Instructions with .new operands have zero latency.
  SmallSet<SUnit *, 4> ExclSrc;
  SmallSet<SUnit *, 4> ExclDst;
  if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
      isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
    Dep.setLatency(0);
    return;
  }

  // Set the latency for a copy to zero since we hope that it will get
  // removed.
  if (DstInst->isCopy())
    Dep.setLatency(0);

  // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
  // the correct latency.
  // If there are multiple uses of the def of COPY/REG_SEQUENCE, set the latency
  // only if the latencies on all the uses are equal, otherwise set it to
  // default.
  if ((DstInst->isRegSequence() || DstInst->isCopy())) {
    Register DReg = DstInst->getOperand(0).getReg();
    int DLatency = -1;
    for (const auto &DDep : Dst->Succs) {
      MachineInstr *DDst = DDep.getSUnit()->getInstr();
      int UseIdx = -1;
      for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
        const MachineOperand &MO = DDst->getOperand(OpNum);
        if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
          UseIdx = OpNum;
          break;
        }
      }

      if (UseIdx == -1)
        continue;

      int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0,
                                                 *DDst, UseIdx));
      // Set DLatency for the first time.
      DLatency = (DLatency == -1) ? Latency : DLatency;

      // For multiple uses, if the Latency is different across uses, reset
      // DLatency.
      if (DLatency != Latency) {
        DLatency = -1;
        break;
      }
    }

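    // DLatency may still be negative here (no matching use found, the uses
    // disagreed, or the operand latency was unknown); clamp it to zero before
    // applying it to the edge.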
    DLatency = std::max(DLatency, 0);
    Dep.setLatency((unsigned)DLatency);
  }

  // Try to schedule uses near definitions to generate .cur.
  ExclSrc.clear();
  ExclDst.clear();
  if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
      isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
    Dep.setLatency(0);
    return;
  }
  int Latency = Dep.getLatency();
  bool IsArtificial = Dep.isArtificial();
  Latency = updateLatency(*SrcInst, *DstInst, IsArtificial, Latency);
  Dep.setLatency(Latency);
}

void HexagonSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<UsrOverflowMutation>());
  Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
  Mutations.push_back(std::make_unique<BankConflictMutation>());
}

void HexagonSubtarget::getSMSMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<UsrOverflowMutation>());
  Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
}

// Pin the vtable to this file.
void HexagonSubtarget::anchor() {}

bool HexagonSubtarget::enableMachineScheduler() const {
  if (DisableHexagonMISched.getNumOccurrences())
    return !DisableHexagonMISched;
  return true;
}

bool HexagonSubtarget::usePredicatedCalls() const {
  return EnablePredicatedCalls;
}

int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
                                    MachineInstr &DstInst, bool IsArtificial,
                                    int Latency) const {
  if (IsArtificial)
    return 1;
  if (!hasV60Ops())
    return Latency;

  auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
  // BSB scheduling.
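  // When BSB (back-to-back) scheduling is in use, or when the producer is an
  // HVX vector instruction, the modeled latency is halved, rounding up:
  // e.g. 3 becomes 2, and 1 stays 1.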
  if (QII.isHVXVec(SrcInst) || useBSBScheduling())
    Latency = (Latency + 1) >> 1;
  return Latency;
}

void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
  MachineInstr *SrcI = Src->getInstr();
  for (auto &I : Src->Succs) {
    if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
      continue;
    Register DepR = I.getReg();
    int DefIdx = -1;
    for (unsigned OpNum = 0; OpNum < SrcI->getNumOperands(); OpNum++) {
      const MachineOperand &MO = SrcI->getOperand(OpNum);
      bool IsSameOrSubReg = false;
      if (MO.isReg()) {
        Register MOReg = MO.getReg();
        if (DepR.isVirtual()) {
          IsSameOrSubReg = (MOReg == DepR);
        } else {
          IsSameOrSubReg = getRegisterInfo()->isSubRegisterEq(DepR, MOReg);
        }
        if (MO.isDef() && IsSameOrSubReg)
          DefIdx = OpNum;
      }
    }
    assert(DefIdx >= 0 && "Def Reg not found in Src MI");
    MachineInstr *DstI = Dst->getInstr();
    SDep T = I;
    for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
      const MachineOperand &MO = DstI->getOperand(OpNum);
      if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
        int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcI,
                                                   DefIdx, *DstI, OpNum));

        // For some instructions (ex: COPY), we might end up with < 0 latency
        // as they don't have any Itinerary class associated with them.
        Latency = std::max(Latency, 0);
        bool IsArtificial = I.isArtificial();
        Latency = updateLatency(*SrcI, *DstI, IsArtificial, Latency);
        I.setLatency(Latency);
      }
    }

    // Update the latency of opposite edge too.
    T.setSUnit(Src);
    auto F = find(Dst->Preds, T);
    assert(F != Dst->Preds.end());
    F->setLatency(I.getLatency());
  }
}

/// Change the latency between the two SUnits.
void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
      const {
  for (auto &I : Src->Succs) {
    if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
      continue;
    SDep T = I;
    I.setLatency(Lat);

    // Update the latency of opposite edge too.
    T.setSUnit(Src);
    auto F = find(Dst->Preds, T);
    assert(F != Dst->Preds.end());
    F->setLatency(Lat);
  }
}

/// If the SUnit has a zero latency edge, return the other SUnit.
static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
  for (auto &I : Deps)
    if (I.isAssignedRegDep() && I.getLatency() == 0 &&
        !I.getSUnit()->getInstr()->isPseudo())
      return I.getSUnit();
  return nullptr;
}

// Return true if these are the best two instructions to schedule
// together with a zero latency. Only one dependence should have a zero
// latency. If there are multiple choices, choose the best, and change
// the others, if needed.
bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
      const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc,
      SmallSet<SUnit*, 4> &ExclDst) const {
  MachineInstr &SrcInst = *Src->getInstr();
  MachineInstr &DstInst = *Dst->getInstr();

  // Ignore Boundary SU nodes as these have null instructions.
  if (Dst->isBoundaryNode())
    return false;

  if (SrcInst.isPHI() || DstInst.isPHI())
    return false;

  if (!TII->isToBeScheduledASAP(SrcInst, DstInst) &&
      !TII->canExecuteInBundle(SrcInst, DstInst))
    return false;

  // The architecture doesn't allow three dependent instructions in the same
  // packet. So, if the destination has a zero latency successor, then it's
  // not a candidate for a zero latency predecessor.
  if (getZeroLatency(Dst, Dst->Succs) != nullptr)
    return false;

  // Check if the Dst instruction is the best candidate first.
  SUnit *Best = nullptr;
  SUnit *DstBest = nullptr;
  SUnit *SrcBest = getZeroLatency(Dst, Dst->Preds);
  if (SrcBest == nullptr || Src->NodeNum >= SrcBest->NodeNum) {
    // Check that Src doesn't have a better candidate.
    DstBest = getZeroLatency(Src, Src->Succs);
    if (DstBest == nullptr || Dst->NodeNum <= DstBest->NodeNum)
      Best = Dst;
  }
  if (Best != Dst)
    return false;

  // The caller frequently adds the same dependence twice. If so, then
  // return true for this case too.
  if ((Src == SrcBest && Dst == DstBest) ||
      (SrcBest == nullptr && Dst == DstBest) ||
      (Src == SrcBest && Dst == nullptr))
    return true;

  // Reassign the latency for the previous bests, which requires setting
  // the dependence edge in both directions.
  if (SrcBest != nullptr) {
    if (!hasV60Ops())
      changeLatency(SrcBest, Dst, 1);
    else
      restoreLatency(SrcBest, Dst);
  }
  if (DstBest != nullptr) {
    if (!hasV60Ops())
      changeLatency(Src, DstBest, 1);
    else
      restoreLatency(Src, DstBest);
  }

  // Attempt to find another opportunity for zero latency in a different
  // dependence.
  if (SrcBest && DstBest)
    // If there is an edge from SrcBest to DstBest, then try to change that
    // to 0 now.
    changeLatency(SrcBest, DstBest, 0);
  else if (DstBest) {
    // Check if the previous best destination instruction has a new zero
    // latency dependence opportunity.
    ExclSrc.insert(Src);
    for (auto &I : DstBest->Preds)
      if (ExclSrc.count(I.getSUnit()) == 0 &&
          isBestZeroLatency(I.getSUnit(), DstBest, TII, ExclSrc, ExclDst))
        changeLatency(I.getSUnit(), DstBest, 0);
  } else if (SrcBest) {
    // Check if the previous best source instruction has a new zero latency
    // dependence opportunity.
    ExclDst.insert(Dst);
    for (auto &I : SrcBest->Succs)
      if (ExclDst.count(I.getSUnit()) == 0 &&
          isBestZeroLatency(SrcBest, I.getSUnit(), TII, ExclSrc, ExclDst))
        changeLatency(SrcBest, I.getSUnit(), 0);
  }

  return true;
}

unsigned HexagonSubtarget::getL1CacheLineSize() const {
  return 32;
}

unsigned HexagonSubtarget::getL1PrefetchDistance() const {
  return 32;
}

bool HexagonSubtarget::enableSubRegLiveness() const {
  return EnableSubregLiveness;
}