1 //===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file This implements the ScheduleDAGInstrs class, which implements 10 /// re-scheduling of MachineInstrs. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/CodeGen/ScheduleDAGInstrs.h" 15 16 #include "llvm/ADT/IntEqClasses.h" 17 #include "llvm/ADT/MapVector.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/SparseSet.h" 20 #include "llvm/ADT/iterator_range.h" 21 #include "llvm/Analysis/AliasAnalysis.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/CodeGen/LiveIntervals.h" 24 #include "llvm/CodeGen/LivePhysRegs.h" 25 #include "llvm/CodeGen/MachineBasicBlock.h" 26 #include "llvm/CodeGen/MachineFrameInfo.h" 27 #include "llvm/CodeGen/MachineFunction.h" 28 #include "llvm/CodeGen/MachineInstr.h" 29 #include "llvm/CodeGen/MachineInstrBundle.h" 30 #include "llvm/CodeGen/MachineMemOperand.h" 31 #include "llvm/CodeGen/MachineOperand.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/CodeGen/PseudoSourceValue.h" 34 #include "llvm/CodeGen/RegisterPressure.h" 35 #include "llvm/CodeGen/ScheduleDAG.h" 36 #include "llvm/CodeGen/ScheduleDFS.h" 37 #include "llvm/CodeGen/SlotIndexes.h" 38 #include "llvm/CodeGen/TargetInstrInfo.h" 39 #include "llvm/CodeGen/TargetRegisterInfo.h" 40 #include "llvm/CodeGen/TargetSubtargetInfo.h" 41 #include "llvm/Config/llvm-config.h" 42 #include "llvm/IR/Constants.h" 43 #include "llvm/IR/Function.h" 44 #include "llvm/IR/Type.h" 45 #include "llvm/IR/Value.h" 46 #include "llvm/MC/LaneBitmask.h" 47 #include "llvm/MC/MCRegisterInfo.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/Compiler.h" 51 #include "llvm/Support/Debug.h" 52 #include "llvm/Support/ErrorHandling.h" 53 #include "llvm/Support/Format.h" 54 #include "llvm/Support/raw_ostream.h" 55 #include <algorithm> 56 #include <cassert> 57 #include <iterator> 58 #include <utility> 59 #include <vector> 60 61 using namespace llvm; 62 63 #define DEBUG_TYPE "machine-scheduler" 64 65 static cl::opt<bool> 66 EnableAASchedMI("enable-aa-sched-mi", cl::Hidden, 67 cl::desc("Enable use of AA during MI DAG construction")); 68 69 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, 70 cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction")); 71 72 static cl::opt<bool> 73 EnableSchedModel("schedmodel", cl::Hidden, cl::init(true), 74 cl::desc("Use TargetSchedModel for latency lookup")); 75 76 static cl::opt<bool> 77 EnableSchedItins("scheditins", cl::Hidden, cl::init(true), 78 cl::desc("Use InstrItineraryData for latency lookup")); 79 80 // Note: the two options below might be used in tuning compile time vs 81 // output quality. Setting HugeRegion so large that it will never be 82 // reached means best-effort, but may be slow. 83 84 // When Stores and Loads maps (or NonAliasStores and NonAliasLoads) 85 // together hold this many SUs, a reduction of maps will be done. 
86 static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
87 cl::init(1000), cl::desc("The limit to use while constructing the DAG "
88 "prior to scheduling, at which point a trade-off "
89 "is made to avoid excessive compile time."));
90
91 static cl::opt<unsigned> ReductionSize(
92 "dag-maps-reduction-size", cl::Hidden,
93 cl::desc("A huge scheduling region will have maps reduced by this many "
94 "nodes at a time. Defaults to HugeRegion / 2."));
95
96 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
97 static cl::opt<bool> SchedPrintCycles(
98 "sched-print-cycles", cl::Hidden, cl::init(false),
99 cl::desc("Report top/bottom cycles when dumping SUnit instances"));
100 #endif
101
102 static unsigned getReductionSize() {
103 // Always reduce a huge region with half of the elements, except
104 // when the user sets this number explicitly.
105 if (ReductionSize.getNumOccurrences() == 0)
106 return HugeRegion / 2;
107 return ReductionSize;
108 }
109
110 static void dumpSUList(const ScheduleDAGInstrs::SUList &L) {
111 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
112 dbgs() << "{ ";
113 for (const SUnit *SU : L) {
114 dbgs() << "SU(" << SU->NodeNum << ")";
115 if (SU != L.back())
116 dbgs() << ", ";
117 }
118 dbgs() << "}\n";
119 #endif
120 }
121
122 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
123 const MachineLoopInfo *mli,
124 bool RemoveKillFlags)
125 : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
126 RemoveKillFlags(RemoveKillFlags),
127 UnknownValue(UndefValue::get(
128 Type::getVoidTy(mf.getFunction().getContext()))), Topo(SUnits, &ExitSU) {
129 DbgValues.clear();
130
131 const TargetSubtargetInfo &ST = mf.getSubtarget();
132 SchedModel.init(&ST, EnableSchedModel, EnableSchedItins);
133 }
134
135 /// If this machine instr has memory reference information and it can be
136 /// tracked to a normal reference to a known object, return the Value
137 /// for that object. This function returns false if the memory location is
138 /// unknown or may alias anything.
139 static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
140 const MachineFrameInfo &MFI,
141 UnderlyingObjectsVector &Objects,
142 const DataLayout &DL) {
143 auto AllMMOsOkay = [&]() {
144 for (const MachineMemOperand *MMO : MI->memoperands()) {
145 // TODO: Figure out whether isAtomic is really necessary (see D57601).
146 if (MMO->isVolatile() || MMO->isAtomic())
147 return false;
148
149 if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
150 // Functions that contain tail calls don't have unique PseudoSourceValue
151 // objects. Two PseudoSourceValues might refer to the same or
152 // overlapping locations. The client code calling this function assumes
153 // this is not the case. So return a conservative answer of no known
154 // object.
155 if (MFI.hasTailCall())
156 return false;
157
158 // For now, ignore PseudoSourceValues which may alias LLVM IR values
159 // because the code that uses this function has no way to cope with
160 // such aliases.
161 if (PSV->isAliased(&MFI)) 162 return false; 163 164 bool MayAlias = PSV->mayAlias(&MFI); 165 Objects.emplace_back(PSV, MayAlias); 166 } else if (const Value *V = MMO->getValue()) { 167 SmallVector<Value *, 4> Objs; 168 if (!getUnderlyingObjectsForCodeGen(V, Objs)) 169 return false; 170 171 for (Value *V : Objs) { 172 assert(isIdentifiedObject(V)); 173 Objects.emplace_back(V, true); 174 } 175 } else 176 return false; 177 } 178 return true; 179 }; 180 181 if (!AllMMOsOkay()) { 182 Objects.clear(); 183 return false; 184 } 185 186 return true; 187 } 188 189 void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) { 190 BB = bb; 191 } 192 193 void ScheduleDAGInstrs::finishBlock() { 194 // Subclasses should no longer refer to the old block. 195 BB = nullptr; 196 } 197 198 void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, 199 MachineBasicBlock::iterator begin, 200 MachineBasicBlock::iterator end, 201 unsigned regioninstrs) { 202 assert(bb == BB && "startBlock should set BB"); 203 RegionBegin = begin; 204 RegionEnd = end; 205 NumRegionInstrs = regioninstrs; 206 } 207 208 void ScheduleDAGInstrs::exitRegion() { 209 // Nothing to do. 210 } 211 212 void ScheduleDAGInstrs::addSchedBarrierDeps() { 213 MachineInstr *ExitMI = 214 RegionEnd != BB->end() 215 ? &*skipDebugInstructionsBackward(RegionEnd, RegionBegin) 216 : nullptr; 217 ExitSU.setInstr(ExitMI); 218 // Add dependencies on the defs and uses of the instruction. 219 if (ExitMI) { 220 const MCInstrDesc &MIDesc = ExitMI->getDesc(); 221 for (const MachineOperand &MO : ExitMI->all_uses()) { 222 unsigned OpIdx = MO.getOperandNo(); 223 Register Reg = MO.getReg(); 224 if (Reg.isPhysical()) { 225 // addPhysRegDataDeps uses the provided operand index to retrieve 226 // the operand use cycle from the scheduling model. If the operand 227 // is "fake" (e.g., an operand of a call instruction used to pass 228 // an argument to the called function.), the scheduling model may not 229 // have an entry for it. If this is the case, pass -1 as operand index, 230 // which will cause addPhysRegDataDeps to add an artificial dependency. 231 // FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses 232 // aliases. When fixing, make sure to update addPhysRegDataDeps, too. 233 bool IsRealUse = OpIdx < MIDesc.getNumOperands() || 234 MIDesc.hasImplicitUseOfPhysReg(Reg); 235 for (MCRegUnit Unit : TRI->regunits(Reg)) 236 Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ? OpIdx : -1, Unit)); 237 } else if (Reg.isVirtual() && MO.readsReg()) { 238 addVRegUseDeps(&ExitSU, OpIdx); 239 } 240 } 241 } 242 if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) { 243 // For others, e.g. fallthrough, conditional branch, assume the exit 244 // uses all the registers that are livein to the successor blocks. 245 for (const MachineBasicBlock *Succ : BB->successors()) { 246 for (const auto &LI : Succ->liveins()) { 247 for (MCRegUnitMaskIterator U(LI.PhysReg, TRI); U.isValid(); ++U) { 248 auto [Unit, Mask] = *U; 249 if ((Mask & LI.LaneMask).any() && !Uses.contains(Unit)) 250 Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit)); 251 } 252 } 253 } 254 } 255 } 256 257 /// MO is an operand of SU's instruction that defines a physical register. Adds 258 /// data dependencies from SU to any uses of the physical register. 
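/// Uses are looked up per register unit in the Uses map; the dependence
/// latency is taken from the scheduling model unless either end of the edge
/// is an implicit pseudo operand, in which case it is zero.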
259 void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
260 const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
261 assert(MO.isDef() && "expect physreg def");
262 Register Reg = MO.getReg();
263
264 // Ask the target if address-backscheduling is desirable, and if so how much.
265 const TargetSubtargetInfo &ST = MF.getSubtarget();
266
267 // Only use any non-zero latency for real defs/uses, in contrast to
268 // "fake" operands added by regalloc.
269 const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
270 bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
271 !DefMIDesc.hasImplicitDefOfPhysReg(Reg));
272 for (MCRegUnit Unit : TRI->regunits(Reg)) {
273 for (RegUnit2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end();
274 ++I) {
275 SUnit *UseSU = I->SU;
276 if (UseSU == SU)
277 continue;
278
279 // Adjust the dependence latency using operand def/use information,
280 // then allow the target to perform its own adjustments.
281 MachineInstr *UseInstr = nullptr;
282 int UseOpIdx = I->OpIdx;
283 bool ImplicitPseudoUse = false;
284 SDep Dep;
285 if (UseOpIdx < 0) {
286 Dep = SDep(SU, SDep::Artificial);
287 } else {
288 // Set the hasPhysRegDefs only for physreg defs that have a use within
289 // the scheduling region.
290 SU->hasPhysRegDefs = true;
291
292 UseInstr = UseSU->getInstr();
293 Register UseReg = UseInstr->getOperand(UseOpIdx).getReg();
294 const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
295 ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
296 !UseMIDesc.hasImplicitUseOfPhysReg(UseReg);
297
298 Dep = SDep(SU, SDep::Data, UseReg);
299 }
300 if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
301 Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
302 UseInstr, UseOpIdx));
303 } else {
304 Dep.setLatency(0);
305 }
306 ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep, &SchedModel);
307 UseSU->addPred(Dep);
308 }
309 }
310 }
311
312 /// Adds register dependencies (data, anti, and output) from this SUnit
313 /// to following instructions in the same scheduling region that depend on the
314 /// physical register referenced at OperIdx.
315 void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
316 MachineInstr *MI = SU->getInstr();
317 MachineOperand &MO = MI->getOperand(OperIdx);
318 Register Reg = MO.getReg();
319 // We do not need to track any dependencies for constant registers.
320 if (MRI.isConstantPhysReg(Reg))
321 return;
322
323 const TargetSubtargetInfo &ST = MF.getSubtarget();
324
325 // Optionally add output and anti dependencies. For anti
326 // dependencies we use a latency of 0 because for a multi-issue
327 // target we want to allow the defining instruction to issue
328 // in the same cycle as the using instruction.
329 // TODO: Using a latency of 1 here for output dependencies assumes
330 // there's no cost for reusing registers.
331 SDep::Kind Kind = MO.isUse() ?
SDep::Anti : SDep::Output; 332 for (MCRegUnit Unit : TRI->regunits(Reg)) { 333 for (RegUnit2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end(); 334 ++I) { 335 SUnit *DefSU = I->SU; 336 if (DefSU == &ExitSU) 337 continue; 338 MachineInstr *DefInstr = DefSU->getInstr(); 339 MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx); 340 if (DefSU != SU && 341 (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) { 342 SDep Dep(SU, Kind, DefMO.getReg()); 343 if (Kind != SDep::Anti) { 344 Dep.setLatency( 345 SchedModel.computeOutputLatency(MI, OperIdx, DefInstr)); 346 } 347 ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep, 348 &SchedModel); 349 DefSU->addPred(Dep); 350 } 351 } 352 } 353 354 if (MO.isUse()) { 355 SU->hasPhysRegUses = true; 356 // Either insert a new Reg2SUnits entry with an empty SUnits list, or 357 // retrieve the existing SUnits list for this register's uses. 358 // Push this SUnit on the use list. 359 for (MCRegUnit Unit : TRI->regunits(Reg)) 360 Uses.insert(PhysRegSUOper(SU, OperIdx, Unit)); 361 if (RemoveKillFlags) 362 MO.setIsKill(false); 363 } else { 364 addPhysRegDataDeps(SU, OperIdx); 365 366 // Clear previous uses and defs of this register and its subregisters. 367 for (MCRegUnit Unit : TRI->regunits(Reg)) { 368 Uses.eraseAll(Unit); 369 if (!MO.isDead()) 370 Defs.eraseAll(Unit); 371 } 372 373 if (MO.isDead() && SU->isCall) { 374 // Calls will not be reordered because of chain dependencies (see 375 // below). Since call operands are dead, calls may continue to be added 376 // to the DefList making dependence checking quadratic in the size of 377 // the block. Instead, we leave only one call at the back of the 378 // DefList. 379 for (MCRegUnit Unit : TRI->regunits(Reg)) { 380 RegUnit2SUnitsMap::RangePair P = Defs.equal_range(Unit); 381 RegUnit2SUnitsMap::iterator B = P.first; 382 RegUnit2SUnitsMap::iterator I = P.second; 383 for (bool isBegin = I == B; !isBegin; /* empty */) { 384 isBegin = (--I) == B; 385 if (!I->SU->isCall) 386 break; 387 I = Defs.erase(I); 388 } 389 } 390 } 391 392 // Defs are pushed in the order they are visited and never reordered. 393 for (MCRegUnit Unit : TRI->regunits(Reg)) 394 Defs.insert(PhysRegSUOper(SU, OperIdx, Unit)); 395 } 396 } 397 398 LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const 399 { 400 Register Reg = MO.getReg(); 401 // No point in tracking lanemasks if we don't have interesting subregisters. 402 const TargetRegisterClass &RC = *MRI.getRegClass(Reg); 403 if (!RC.HasDisjunctSubRegs) 404 return LaneBitmask::getAll(); 405 406 unsigned SubReg = MO.getSubReg(); 407 if (SubReg == 0) 408 return RC.getLaneMask(); 409 return TRI->getSubRegIndexLaneMask(SubReg); 410 } 411 412 bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) { 413 auto RegUse = CurrentVRegUses.find(MO.getReg()); 414 if (RegUse == CurrentVRegUses.end()) 415 return true; 416 return (RegUse->LaneMask & getLaneMaskForMO(MO)).none(); 417 } 418 419 /// Adds register output and data dependencies from this SUnit to instructions 420 /// that occur later in the same scheduling region if they read from or write to 421 /// the virtual register defined at OperIdx. 422 /// 423 /// TODO: Hoist loop induction variable increments. This has to be 424 /// reevaluated. Generally, IV scheduling should be done before coalescing. 
425 void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { 426 MachineInstr *MI = SU->getInstr(); 427 MachineOperand &MO = MI->getOperand(OperIdx); 428 Register Reg = MO.getReg(); 429 430 LaneBitmask DefLaneMask; 431 LaneBitmask KillLaneMask; 432 if (TrackLaneMasks) { 433 bool IsKill = MO.getSubReg() == 0 || MO.isUndef(); 434 DefLaneMask = getLaneMaskForMO(MO); 435 // If we have a <read-undef> flag, none of the lane values comes from an 436 // earlier instruction. 437 KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask; 438 439 if (MO.getSubReg() != 0 && MO.isUndef()) { 440 // There may be other subregister defs on the same instruction of the same 441 // register in later operands. The lanes of other defs will now be live 442 // after this instruction, so these should not be treated as killed by the 443 // instruction even though they appear to be killed in this one operand. 444 for (const MachineOperand &OtherMO : 445 llvm::drop_begin(MI->operands(), OperIdx + 1)) 446 if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg) 447 KillLaneMask &= ~getLaneMaskForMO(OtherMO); 448 } 449 450 // Clear undef flag, we'll re-add it later once we know which subregister 451 // Def is first. 452 MO.setIsUndef(false); 453 } else { 454 DefLaneMask = LaneBitmask::getAll(); 455 KillLaneMask = LaneBitmask::getAll(); 456 } 457 458 if (MO.isDead()) { 459 assert(deadDefHasNoUse(MO) && "Dead defs should have no uses"); 460 } else { 461 // Add data dependence to all uses we found so far. 462 const TargetSubtargetInfo &ST = MF.getSubtarget(); 463 for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg), 464 E = CurrentVRegUses.end(); I != E; /*empty*/) { 465 LaneBitmask LaneMask = I->LaneMask; 466 // Ignore uses of other lanes. 467 if ((LaneMask & KillLaneMask).none()) { 468 ++I; 469 continue; 470 } 471 472 if ((LaneMask & DefLaneMask).any()) { 473 SUnit *UseSU = I->SU; 474 MachineInstr *Use = UseSU->getInstr(); 475 SDep Dep(SU, SDep::Data, Reg); 476 Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, 477 I->OperandIndex)); 478 ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep, 479 &SchedModel); 480 UseSU->addPred(Dep); 481 } 482 483 LaneMask &= ~KillLaneMask; 484 // If we found a Def for all lanes of this use, remove it from the list. 485 if (LaneMask.any()) { 486 I->LaneMask = LaneMask; 487 ++I; 488 } else 489 I = CurrentVRegUses.erase(I); 490 } 491 } 492 493 // Shortcut: Singly defined vregs do not have output/anti dependencies. 494 if (MRI.hasOneDef(Reg)) 495 return; 496 497 // Add output dependence to the next nearest defs of this vreg. 498 // 499 // Unless this definition is dead, the output dependence should be 500 // transitively redundant with antidependencies from this definition's 501 // uses. We're conservative for now until we have a way to guarantee the uses 502 // are not eliminated sometime during scheduling. The output dependence edge 503 // is also useful if output latency exceeds def-use latency. 504 LaneBitmask LaneMask = DefLaneMask; 505 for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), 506 CurrentVRegDefs.end())) { 507 // Ignore defs for other lanes. 508 if ((V2SU.LaneMask & LaneMask).none()) 509 continue; 510 // Add an output dependence. 511 SUnit *DefSU = V2SU.SU; 512 // Ignore additional defs of the same lanes in one instruction. This can 513 // happen because lanemasks are shared for targets with too many 514 // subregisters. 
We also use some representation tricks/hacks where we
515 // add super-register defs/uses, to imply that although we only access parts
516 // of the reg we care about the full one.
517 if (DefSU == SU)
518 continue;
519 SDep Dep(SU, SDep::Output, Reg);
520 Dep.setLatency(
521 SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
522 DefSU->addPred(Dep);
523
524 // Update current definition. This can get tricky if the def was about a
525 // bigger lanemask before. We then have to shrink it and create a new
526 // VReg2SUnit for the non-overlapping part.
527 LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
528 LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
529 V2SU.SU = SU;
530 V2SU.LaneMask = OverlapMask;
531 if (NonOverlapMask.any())
532 CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
533 }
534 // If there was no CurrentVRegDefs entry for some lanes yet, create one.
535 if (LaneMask.any())
536 CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
537 }
538
539 /// Adds a register data dependency if the instruction that defines the
540 /// virtual register used at OperIdx is mapped to an SUnit. Add a register
541 /// antidependency from this SUnit to instructions that occur later in the same
542 /// scheduling region if they write the virtual register.
543 ///
544 /// TODO: Handle ExitSU "uses" properly.
545 void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
546 const MachineInstr *MI = SU->getInstr();
547 assert(!MI->isDebugOrPseudoInstr());
548
549 const MachineOperand &MO = MI->getOperand(OperIdx);
550 Register Reg = MO.getReg();
551
552 // Remember the use. Data dependencies will be added when we find the def.
553 LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
554 : LaneBitmask::getAll();
555 CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
556
557 // Add antidependences to the following defs of the vreg.
558 for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
559 CurrentVRegDefs.end())) {
560 // Ignore defs for unrelated lanes.
561 LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
562 if ((PrevDefLaneMask & LaneMask).none())
563 continue;
564 if (V2SU.SU == SU)
565 continue;
566
567 V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
568 }
569 }
570
571
572 void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
573 unsigned Latency) {
574 if (SUa->getInstr()->mayAlias(getAAForDep(), *SUb->getInstr(), UseTBAA)) {
575 SDep Dep(SUa, SDep::MayAliasMem);
576 Dep.setLatency(Latency);
577 SUb->addPred(Dep);
578 }
579 }
580
581 /// Creates an SUnit for each real instruction, numbered in top-down
582 /// topological order. The instruction order A < B implies that no edge exists
583 /// from B to A.
584 ///
585 /// Map each real instruction to its SUnit.
586 ///
587 /// After initSUnits, the SUnits vector cannot be resized and the scheduler may
588 /// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
589 /// instead of pointers.
590 ///
591 /// MachineScheduler relies on initSUnits numbering the nodes by their order in
592 /// the original instruction list.
593 void ScheduleDAGInstrs::initSUnits() {
594 // We'll be allocating one SUnit for each real instruction in the region,
595 // which is contained within a basic block.
596 SUnits.reserve(NumRegionInstrs); 597 598 for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) { 599 if (MI.isDebugOrPseudoInstr()) 600 continue; 601 602 SUnit *SU = newSUnit(&MI); 603 MISUnitMap[&MI] = SU; 604 605 SU->isCall = MI.isCall(); 606 SU->isCommutable = MI.isCommutable(); 607 608 // Assign the Latency field of SU using target-provided information. 609 SU->Latency = SchedModel.computeInstrLatency(SU->getInstr()); 610 611 // If this SUnit uses a reserved or unbuffered resource, mark it as such. 612 // 613 // Reserved resources block an instruction from issuing and stall the 614 // entire pipeline. These are identified by BufferSize=0. 615 // 616 // Unbuffered resources prevent execution of subsequent instructions that 617 // require the same resources. This is used for in-order execution pipelines 618 // within an out-of-order core. These are identified by BufferSize=1. 619 if (SchedModel.hasInstrSchedModel()) { 620 const MCSchedClassDesc *SC = getSchedClass(SU); 621 for (const MCWriteProcResEntry &PRE : 622 make_range(SchedModel.getWriteProcResBegin(SC), 623 SchedModel.getWriteProcResEnd(SC))) { 624 switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) { 625 case 0: 626 SU->hasReservedResource = true; 627 break; 628 case 1: 629 SU->isUnbuffered = true; 630 break; 631 default: 632 break; 633 } 634 } 635 } 636 } 637 } 638 639 class ScheduleDAGInstrs::Value2SUsMap 640 : public SmallMapVector<ValueType, SUList, 4> { 641 /// Current total number of SUs in map. 642 unsigned NumNodes = 0; 643 644 /// 1 for loads, 0 for stores. (see comment in SUList) 645 unsigned TrueMemOrderLatency; 646 647 public: 648 Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {} 649 650 /// To keep NumNodes up to date, insert() is used instead of 651 /// this operator w/ push_back(). 652 ValueType &operator[](const SUList &Key) { 653 llvm_unreachable("Don't use. Use insert() instead."); }; 654 655 /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling 656 /// reduce(). 657 void inline insert(SUnit *SU, ValueType V) { 658 MapVector::operator[](V).push_back(SU); 659 NumNodes++; 660 } 661 662 /// Clears the list of SUs mapped to V. 663 void inline clearList(ValueType V) { 664 iterator Itr = find(V); 665 if (Itr != end()) { 666 assert(NumNodes >= Itr->second.size()); 667 NumNodes -= Itr->second.size(); 668 669 Itr->second.clear(); 670 } 671 } 672 673 /// Clears map from all contents. 674 void clear() { 675 SmallMapVector<ValueType, SUList, 4>::clear(); 676 NumNodes = 0; 677 } 678 679 unsigned inline size() const { return NumNodes; } 680 681 /// Counts the number of SUs in this map after a reduction. 
682 void reComputeSize() { 683 NumNodes = 0; 684 for (auto &I : *this) 685 NumNodes += I.second.size(); 686 } 687 688 unsigned inline getTrueMemOrderLatency() const { 689 return TrueMemOrderLatency; 690 } 691 692 void dump(); 693 }; 694 695 void ScheduleDAGInstrs::addChainDependencies(SUnit *SU, 696 Value2SUsMap &Val2SUsMap) { 697 for (auto &I : Val2SUsMap) 698 addChainDependencies(SU, I.second, 699 Val2SUsMap.getTrueMemOrderLatency()); 700 } 701 702 void ScheduleDAGInstrs::addChainDependencies(SUnit *SU, 703 Value2SUsMap &Val2SUsMap, 704 ValueType V) { 705 Value2SUsMap::iterator Itr = Val2SUsMap.find(V); 706 if (Itr != Val2SUsMap.end()) 707 addChainDependencies(SU, Itr->second, 708 Val2SUsMap.getTrueMemOrderLatency()); 709 } 710 711 void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) { 712 assert(BarrierChain != nullptr); 713 714 for (auto &[V, SUs] : map) { 715 (void)V; 716 for (auto *SU : SUs) 717 SU->addPredBarrier(BarrierChain); 718 } 719 map.clear(); 720 } 721 722 void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { 723 assert(BarrierChain != nullptr); 724 725 // Go through all lists of SUs. 726 for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) { 727 Value2SUsMap::iterator CurrItr = I++; 728 SUList &sus = CurrItr->second; 729 SUList::iterator SUItr = sus.begin(), SUEE = sus.end(); 730 for (; SUItr != SUEE; ++SUItr) { 731 // Stop on BarrierChain or any instruction above it. 732 if ((*SUItr)->NodeNum <= BarrierChain->NodeNum) 733 break; 734 735 (*SUItr)->addPredBarrier(BarrierChain); 736 } 737 738 // Remove also the BarrierChain from list if present. 739 if (SUItr != SUEE && *SUItr == BarrierChain) 740 SUItr++; 741 742 // Remove all SUs that are now successors of BarrierChain. 743 if (SUItr != sus.begin()) 744 sus.erase(sus.begin(), SUItr); 745 } 746 747 // Remove all entries with empty su lists. 748 map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) { 749 return (mapEntry.second.empty()); }); 750 751 // Recompute the size of the map (NumNodes). 752 map.reComputeSize(); 753 } 754 755 void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA, 756 RegPressureTracker *RPTracker, 757 PressureDiffs *PDiffs, 758 LiveIntervals *LIS, 759 bool TrackLaneMasks) { 760 const TargetSubtargetInfo &ST = MF.getSubtarget(); 761 bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI 762 : ST.useAA(); 763 if (UseAA && AA) 764 AAForDep.emplace(*AA); 765 766 BarrierChain = nullptr; 767 768 this->TrackLaneMasks = TrackLaneMasks; 769 MISUnitMap.clear(); 770 ScheduleDAG::clearDAG(); 771 772 // Create an SUnit for each real instruction. 773 initSUnits(); 774 775 if (PDiffs) 776 PDiffs->init(SUnits.size()); 777 778 // We build scheduling units by walking a block's instruction list 779 // from bottom to top. 780 781 // Each MIs' memory operand(s) is analyzed to a list of underlying 782 // objects. The SU is then inserted in the SUList(s) mapped from the 783 // Value(s). Each Value thus gets mapped to lists of SUs depending 784 // on it, stores and loads kept separately. Two SUs are trivially 785 // non-aliasing if they both depend on only identified Values and do 786 // not share any common Value. 787 Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/); 788 789 // Certain memory accesses are known to not alias any SU in Stores 790 // or Loads, and have therefore their own 'NonAlias' 791 // domain. E.g. spill / reload instructions never alias LLVM I/R 792 // Values. 
It would be nice to assume that this type of memory 793 // accesses always have a proper memory operand modelling, and are 794 // therefore never unanalyzable, but this is conservatively not 795 // done. 796 Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/); 797 798 // Track all instructions that may raise floating-point exceptions. 799 // These do not depend on one other (or normal loads or stores), but 800 // must not be rescheduled across global barriers. Note that we don't 801 // really need a "map" here since we don't track those MIs by value; 802 // using the same Value2SUsMap data type here is simply a matter of 803 // convenience. 804 Value2SUsMap FPExceptions; 805 806 // Remove any stale debug info; sometimes BuildSchedGraph is called again 807 // without emitting the info from the previous call. 808 DbgValues.clear(); 809 FirstDbgValue = nullptr; 810 811 assert(Defs.empty() && Uses.empty() && 812 "Only BuildGraph should update Defs/Uses"); 813 Defs.setUniverse(TRI->getNumRegs()); 814 Uses.setUniverse(TRI->getNumRegs()); 815 816 assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); 817 assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); 818 unsigned NumVirtRegs = MRI.getNumVirtRegs(); 819 CurrentVRegDefs.setUniverse(NumVirtRegs); 820 CurrentVRegUses.setUniverse(NumVirtRegs); 821 822 // Model data dependencies between instructions being scheduled and the 823 // ExitSU. 824 addSchedBarrierDeps(); 825 826 // Walk the list of instructions, from bottom moving up. 827 MachineInstr *DbgMI = nullptr; 828 for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin; 829 MII != MIE; --MII) { 830 MachineInstr &MI = *std::prev(MII); 831 if (DbgMI) { 832 DbgValues.emplace_back(DbgMI, &MI); 833 DbgMI = nullptr; 834 } 835 836 if (MI.isDebugValue() || MI.isDebugPHI()) { 837 DbgMI = &MI; 838 continue; 839 } 840 841 if (MI.isDebugLabel() || MI.isDebugRef() || MI.isPseudoProbe()) 842 continue; 843 844 SUnit *SU = MISUnitMap[&MI]; 845 assert(SU && "No SUnit mapped to this MI"); 846 847 if (RPTracker) { 848 RegisterOperands RegOpers; 849 RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false); 850 if (TrackLaneMasks) { 851 SlotIndex SlotIdx = LIS->getInstructionIndex(MI); 852 RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx); 853 } 854 if (PDiffs != nullptr) 855 PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI); 856 857 if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI) 858 RPTracker->recedeSkipDebugValues(); 859 assert(&*RPTracker->getPos() == &MI && "RPTracker in sync"); 860 RPTracker->recede(RegOpers); 861 } 862 863 assert( 864 (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) && 865 "Cannot schedule terminators or labels!"); 866 867 // Add register-based dependencies (data, anti, and output). 868 // For some instructions (calls, returns, inline-asm, etc.) there can 869 // be explicit uses and implicit defs, in which case the use will appear 870 // on the operand list before the def. Do two passes over the operand 871 // list to make sure that defs are processed before any uses. 872 bool HasVRegDef = false; 873 for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) { 874 const MachineOperand &MO = MI.getOperand(j); 875 if (!MO.isReg() || !MO.isDef()) 876 continue; 877 Register Reg = MO.getReg(); 878 if (Reg.isPhysical()) { 879 addPhysRegDeps(SU, j); 880 } else if (Reg.isVirtual()) { 881 HasVRegDef = true; 882 addVRegDefDeps(SU, j); 883 } 884 } 885 // Now process all uses. 
886 for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
887 const MachineOperand &MO = MI.getOperand(j);
888 // Only look at use operands.
889 // We do not need to check for MO.readsReg() here because subsequent
890 // subregister defs will get output dependence edges and need no
891 // additional use dependencies.
892 if (!MO.isReg() || !MO.isUse())
893 continue;
894 Register Reg = MO.getReg();
895 if (Reg.isPhysical()) {
896 addPhysRegDeps(SU, j);
897 } else if (Reg.isVirtual() && MO.readsReg()) {
898 addVRegUseDeps(SU, j);
899 }
900 }
901
902 // If we haven't seen any uses in this scheduling region, create a
903 // dependence edge to ExitSU to model the live-out latency. This is required
904 // for vreg defs with no in-region use, and prefetches with no vreg def.
905 //
906 // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
907 // check currently relies on being called before adding chain deps.
908 if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
909 SDep Dep(SU, SDep::Artificial);
910 Dep.setLatency(SU->Latency - 1);
911 ExitSU.addPred(Dep);
912 }
913
914 // Add memory dependencies (Note: isStoreToStackSlot and
915 // isLoadFromStackSlot are not usable after stack slots are lowered to
916 // actual addresses).
917
918 const TargetInstrInfo *TII = ST.getInstrInfo();
919 // This is a barrier event that acts as a pivotal node in the DAG.
920 if (TII->isGlobalMemoryObject(&MI)) {
921
922 // Become the barrier chain.
923 if (BarrierChain)
924 BarrierChain->addPredBarrier(SU);
925 BarrierChain = SU;
926
927 LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
928 << BarrierChain->NodeNum << ").\n");
929
930 // Add dependencies against everything below it and clear maps.
931 addBarrierChain(Stores);
932 addBarrierChain(Loads);
933 addBarrierChain(NonAliasStores);
934 addBarrierChain(NonAliasLoads);
935 addBarrierChain(FPExceptions);
936
937 continue;
938 }
939
940 // Instructions that may raise FP exceptions may not be moved
941 // across any global barriers.
942 if (MI.mayRaiseFPException()) {
943 if (BarrierChain)
944 BarrierChain->addPredBarrier(SU);
945
946 FPExceptions.insert(SU, UnknownValue);
947
948 if (FPExceptions.size() >= HugeRegion) {
949 LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n");
950 Value2SUsMap empty;
951 reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
952 }
953 }
954
955 // If it's not a store or a variant load, we're done.
956 if (!MI.mayStore() &&
957 !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
958 continue;
959
960 // Always add dependency edge to BarrierChain if present.
961 if (BarrierChain)
962 BarrierChain->addPredBarrier(SU);
963
964 // Find the underlying objects for MI. The Objs vector is either
965 // empty, or filled with the Values of memory locations which this
966 // SU depends on.
967 UnderlyingObjectsVector Objs;
968 bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
969 MF.getDataLayout());
970
971 if (MI.mayStore()) {
972 if (!ObjsFound) {
973 // An unknown store depends on all stores and loads.
974 addChainDependencies(SU, Stores);
975 addChainDependencies(SU, NonAliasStores);
976 addChainDependencies(SU, Loads);
977 addChainDependencies(SU, NonAliasLoads);
978
979 // Map this store to 'UnknownValue'.
980 Stores.insert(SU, UnknownValue);
981 } else {
982 // Add precise dependencies against all previously seen memory
983 // accesses mapped to the same Value(s).
984 for (const UnderlyingObject &UnderlObj : Objs) { 985 ValueType V = UnderlObj.getValue(); 986 bool ThisMayAlias = UnderlObj.mayAlias(); 987 988 // Add dependencies to previous stores and loads mapped to V. 989 addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V); 990 addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V); 991 } 992 // Update the store map after all chains have been added to avoid adding 993 // self-loop edge if multiple underlying objects are present. 994 for (const UnderlyingObject &UnderlObj : Objs) { 995 ValueType V = UnderlObj.getValue(); 996 bool ThisMayAlias = UnderlObj.mayAlias(); 997 998 // Map this store to V. 999 (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V); 1000 } 1001 // The store may have dependencies to unanalyzable loads and 1002 // stores. 1003 addChainDependencies(SU, Loads, UnknownValue); 1004 addChainDependencies(SU, Stores, UnknownValue); 1005 } 1006 } else { // SU is a load. 1007 if (!ObjsFound) { 1008 // An unknown load depends on all stores. 1009 addChainDependencies(SU, Stores); 1010 addChainDependencies(SU, NonAliasStores); 1011 1012 Loads.insert(SU, UnknownValue); 1013 } else { 1014 for (const UnderlyingObject &UnderlObj : Objs) { 1015 ValueType V = UnderlObj.getValue(); 1016 bool ThisMayAlias = UnderlObj.mayAlias(); 1017 1018 // Add precise dependencies against all previously seen stores 1019 // mapping to the same Value(s). 1020 addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V); 1021 1022 // Map this load to V. 1023 (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V); 1024 } 1025 // The load may have dependencies to unanalyzable stores. 1026 addChainDependencies(SU, Stores, UnknownValue); 1027 } 1028 } 1029 1030 // Reduce maps if they grow huge. 1031 if (Stores.size() + Loads.size() >= HugeRegion) { 1032 LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n"); 1033 reduceHugeMemNodeMaps(Stores, Loads, getReductionSize()); 1034 } 1035 if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) { 1036 LLVM_DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n"); 1037 reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize()); 1038 } 1039 } 1040 1041 if (DbgMI) 1042 FirstDbgValue = DbgMI; 1043 1044 Defs.clear(); 1045 Uses.clear(); 1046 CurrentVRegDefs.clear(); 1047 CurrentVRegUses.clear(); 1048 1049 Topo.MarkDirty(); 1050 } 1051 1052 raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) { 1053 PSV->printCustom(OS); 1054 return OS; 1055 } 1056 1057 void ScheduleDAGInstrs::Value2SUsMap::dump() { 1058 for (const auto &[ValType, SUs] : *this) { 1059 if (isa<const Value *>(ValType)) { 1060 const Value *V = cast<const Value *>(ValType); 1061 if (isa<UndefValue>(V)) 1062 dbgs() << "Unknown"; 1063 else 1064 V->printAsOperand(dbgs()); 1065 } else if (isa<const PseudoSourceValue *>(ValType)) 1066 dbgs() << cast<const PseudoSourceValue *>(ValType); 1067 else 1068 llvm_unreachable("Unknown Value type."); 1069 1070 dbgs() << " : "; 1071 dumpSUList(SUs); 1072 } 1073 } 1074 1075 void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, 1076 Value2SUsMap &loads, unsigned N) { 1077 LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump(); 1078 dbgs() << "Loading SUnits:\n"; loads.dump()); 1079 1080 // Insert all SU's NodeNums into a vector and sort it. 
1081 std::vector<unsigned> NodeNums; 1082 NodeNums.reserve(stores.size() + loads.size()); 1083 for (const auto &[V, SUs] : stores) { 1084 (void)V; 1085 for (const auto *SU : SUs) 1086 NodeNums.push_back(SU->NodeNum); 1087 } 1088 for (const auto &[V, SUs] : loads) { 1089 (void)V; 1090 for (const auto *SU : SUs) 1091 NodeNums.push_back(SU->NodeNum); 1092 } 1093 llvm::sort(NodeNums); 1094 1095 // The N last elements in NodeNums will be removed, and the SU with 1096 // the lowest NodeNum of them will become the new BarrierChain to 1097 // let the not yet seen SUs have a dependency to the removed SUs. 1098 assert(N <= NodeNums.size()); 1099 SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)]; 1100 if (BarrierChain) { 1101 // The aliasing and non-aliasing maps reduce independently of each 1102 // other, but share a common BarrierChain. Check if the 1103 // newBarrierChain is above the former one. If it is not, it may 1104 // introduce a loop to use newBarrierChain, so keep the old one. 1105 if (newBarrierChain->NodeNum < BarrierChain->NodeNum) { 1106 BarrierChain->addPredBarrier(newBarrierChain); 1107 BarrierChain = newBarrierChain; 1108 LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU(" 1109 << BarrierChain->NodeNum << ").\n"); 1110 } 1111 else 1112 LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU(" 1113 << BarrierChain->NodeNum << ").\n"); 1114 } 1115 else 1116 BarrierChain = newBarrierChain; 1117 1118 insertBarrierChain(stores); 1119 insertBarrierChain(loads); 1120 1121 LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump(); 1122 dbgs() << "Loading SUnits:\n"; loads.dump()); 1123 } 1124 1125 static void toggleKills(const MachineRegisterInfo &MRI, LiveRegUnits &LiveRegs, 1126 MachineInstr &MI, bool addToLiveRegs) { 1127 for (MachineOperand &MO : MI.operands()) { 1128 if (!MO.isReg() || !MO.readsReg()) 1129 continue; 1130 Register Reg = MO.getReg(); 1131 if (!Reg) 1132 continue; 1133 1134 // Things that are available after the instruction are killed by it. 1135 bool IsKill = LiveRegs.available(Reg); 1136 1137 // Exception: Do not kill reserved registers 1138 MO.setIsKill(IsKill && !MRI.isReserved(Reg)); 1139 if (addToLiveRegs) 1140 LiveRegs.addReg(Reg); 1141 } 1142 } 1143 1144 void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { 1145 LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n'); 1146 1147 LiveRegs.init(*TRI); 1148 LiveRegs.addLiveOuts(MBB); 1149 1150 // Examine block from end to start... 1151 for (MachineInstr &MI : llvm::reverse(MBB)) { 1152 if (MI.isDebugOrPseudoInstr()) 1153 continue; 1154 1155 // Update liveness. Registers that are defed but not used in this 1156 // instruction are now dead. Mark register and all subregs as they 1157 // are completely defined. 1158 for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { 1159 const MachineOperand &MO = *O; 1160 if (MO.isReg()) { 1161 if (!MO.isDef()) 1162 continue; 1163 Register Reg = MO.getReg(); 1164 if (!Reg) 1165 continue; 1166 LiveRegs.removeReg(Reg); 1167 } else if (MO.isRegMask()) { 1168 LiveRegs.removeRegsNotPreserved(MO.getRegMask()); 1169 } 1170 } 1171 1172 // If there is a bundle header fix it up first. 
1173 if (!MI.isBundled()) {
1174 toggleKills(MRI, LiveRegs, MI, true);
1175 } else {
1176 MachineBasicBlock::instr_iterator Bundle = MI.getIterator();
1177 if (MI.isBundle())
1178 toggleKills(MRI, LiveRegs, MI, false);
1179
1180 // Some targets make the (questionable) assumption that the instructions
1181 // inside the bundle are ordered and consequently only the last use of
1182 // a register inside the bundle can kill it.
1183 MachineBasicBlock::instr_iterator I = std::next(Bundle);
1184 while (I->isBundledWithSucc())
1185 ++I;
1186 do {
1187 if (!I->isDebugOrPseudoInstr())
1188 toggleKills(MRI, LiveRegs, *I, true);
1189 --I;
1190 } while (I != Bundle);
1191 }
1192 }
1193 }
1194
1195 void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
1196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1197 dumpNodeName(SU);
1198 if (SchedPrintCycles)
1199 dbgs() << " [TopReadyCycle = " << SU.TopReadyCycle
1200 << ", BottomReadyCycle = " << SU.BotReadyCycle << "]";
1201 dbgs() << ": ";
1202 SU.getInstr()->dump();
1203 #endif
1204 }
1205
1206 void ScheduleDAGInstrs::dump() const {
1207 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1208 if (EntrySU.getInstr() != nullptr)
1209 dumpNodeAll(EntrySU);
1210 for (const SUnit &SU : SUnits)
1211 dumpNodeAll(SU);
1212 if (ExitSU.getInstr() != nullptr)
1213 dumpNodeAll(ExitSU);
1214 #endif
1215 }
1216
1217 std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
1218 std::string s;
1219 raw_string_ostream oss(s);
1220 if (SU == &EntrySU)
1221 oss << "<entry>";
1222 else if (SU == &ExitSU)
1223 oss << "<exit>";
1224 else
1225 SU->getInstr()->print(oss, /*IsStandalone=*/true);
1226 return s;
1227 }
1228
1229 /// Return the basic block label. It is not necessarily unique because a block
1230 /// may contain multiple scheduling regions. But it is fine for visualization.
1231 std::string ScheduleDAGInstrs::getDAGName() const {
1232 return "dag." + BB->getFullName();
1233 }
1234
1235 bool ScheduleDAGInstrs::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
1236 return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
1237 }
1238
1239 bool ScheduleDAGInstrs::addEdge(SUnit *SuccSU, const SDep &PredDep) {
1240 if (SuccSU != &ExitSU) {
1241 // Do not use WillCreateCycle; it assumes SD scheduling.
1242 // If Pred is reachable from Succ, then the edge creates a cycle.
1243 if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
1244 return false;
1245 Topo.AddPredQueued(SuccSU, PredDep.getSUnit());
1246 }
1247 SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
1248 // Return true regardless of whether a new edge needed to be inserted.
1249 return true;
1250 }
1251
1252 //===----------------------------------------------------------------------===//
1253 // SchedDFSResult Implementation
1254 //===----------------------------------------------------------------------===//
1255
1256 namespace llvm {
1257
1258 /// Internal state used to compute SchedDFSResult.
1259 class SchedDFSImpl {
1260 SchedDFSResult &R;
1261
1262 /// Join DAG nodes into equivalence classes by their subtree.
1263 IntEqClasses SubtreeClasses;
1264 /// List PredSU, SuccSU pairs that represent data edges between subtrees.
1265 std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
1266
1267 struct RootData {
1268 unsigned NodeID;
1269 unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
1270 unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
1271 /// children.
1272
1273 RootData(unsigned id): NodeID(id),
1274 ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
1275
1276 unsigned getSparseSetIndex() const { return NodeID; }
1277 };
1278
1279 SparseSet<RootData> RootSet;
1280
1281 public:
1282 SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
1283 RootSet.setUniverse(R.DFSNodeData.size());
1284 }
1285
1286 /// Returns true if this node has been visited by the DFS traversal.
1287 ///
1288 /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
1289 /// ID. Later, SubtreeID is updated but remains valid.
1290 bool isVisited(const SUnit *SU) const {
1291 return R.DFSNodeData[SU->NodeNum].SubtreeID
1292 != SchedDFSResult::InvalidSubtreeID;
1293 }
1294
1295 /// Initializes this node's instruction count. We don't need to flag the node
1296 /// visited until visitPostorder because the DAG cannot have cycles.
1297 void visitPreorder(const SUnit *SU) {
1298 R.DFSNodeData[SU->NodeNum].InstrCount =
1299 SU->getInstr()->isTransient() ? 0 : 1;
1300 }
1301
1302 /// Called once for each node after all predecessors are visited. Revisit this
1303 /// node's predecessors and potentially join them now that we know the ILP of
1304 /// the other predecessors.
1305 void visitPostorderNode(const SUnit *SU) {
1306 // Mark this node as the root of a subtree. It may be joined with its
1307 // successors later.
1308 R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
1309 RootData RData(SU->NodeNum);
1310 RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
1311
1312 // If any predecessors are still in their own subtree, they either cannot be
1313 // joined or are large enough to remain separate. If this parent node's
1314 // total instruction count is not greater than a child subtree by at least
1315 // the subtree limit, then try to join it now since splitting subtrees is
1316 // only useful if multiple high-pressure paths are possible.
1317 unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
1318 for (const SDep &PredDep : SU->Preds) {
1319 if (PredDep.getKind() != SDep::Data)
1320 continue;
1321 unsigned PredNum = PredDep.getSUnit()->NodeNum;
1322 if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
1323 joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);
1324
1325 // Either link or merge the TreeData entry from the child to the parent.
1326 if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
1327 // If the predecessor's parent is invalid, this is a tree edge and the
1328 // current node is the parent.
1329 if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
1330 RootSet[PredNum].ParentNodeID = SU->NodeNum;
1331 }
1332 else if (RootSet.count(PredNum)) {
1333 // The predecessor is not a root, but is still in the root set. This
1334 // must be the new parent that it was just joined to. Note that
1335 // RootSet[PredNum].ParentNodeID may either be invalid or may still be
1336 // set to the original parent.
1337 RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
1338 RootSet.erase(PredNum);
1339 }
1340 }
1341 RootSet[SU->NodeNum] = RData;
1342 }
1343
1344 /// Called once for each tree edge after calling visitPostorderNode on
1345 /// the predecessor. Increment the parent node's instruction count and
1346 /// preemptively join this subtree to its parent's if it is small enough.
1347 void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) { 1348 R.DFSNodeData[Succ->NodeNum].InstrCount 1349 += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount; 1350 joinPredSubtree(PredDep, Succ); 1351 } 1352 1353 /// Adds a connection for cross edges. 1354 void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) { 1355 ConnectionPairs.emplace_back(PredDep.getSUnit(), Succ); 1356 } 1357 1358 /// Sets each node's subtree ID to the representative ID and record 1359 /// connections between trees. 1360 void finalize() { 1361 SubtreeClasses.compress(); 1362 R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); 1363 assert(SubtreeClasses.getNumClasses() == RootSet.size() 1364 && "number of roots should match trees"); 1365 for (const RootData &Root : RootSet) { 1366 unsigned TreeID = SubtreeClasses[Root.NodeID]; 1367 if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID) 1368 R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID]; 1369 R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount; 1370 // Note that SubInstrCount may be greater than InstrCount if we joined 1371 // subtrees across a cross edge. InstrCount will be attributed to the 1372 // original parent, while SubInstrCount will be attributed to the joined 1373 // parent. 1374 } 1375 R.SubtreeConnections.resize(SubtreeClasses.getNumClasses()); 1376 R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses()); 1377 LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n"); 1378 for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) { 1379 R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx]; 1380 LLVM_DEBUG(dbgs() << " SU(" << Idx << ") in tree " 1381 << R.DFSNodeData[Idx].SubtreeID << '\n'); 1382 } 1383 for (const auto &[Pred, Succ] : ConnectionPairs) { 1384 unsigned PredTree = SubtreeClasses[Pred->NodeNum]; 1385 unsigned SuccTree = SubtreeClasses[Succ->NodeNum]; 1386 if (PredTree == SuccTree) 1387 continue; 1388 unsigned Depth = Pred->getDepth(); 1389 addConnection(PredTree, SuccTree, Depth); 1390 addConnection(SuccTree, PredTree, Depth); 1391 } 1392 } 1393 1394 protected: 1395 /// Joins the predecessor subtree with the successor that is its DFS parent. 1396 /// Applies some heuristics before joining. 1397 bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ, 1398 bool CheckLimit = true) { 1399 assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); 1400 1401 // Check if the predecessor is already joined. 1402 const SUnit *PredSU = PredDep.getSUnit(); 1403 unsigned PredNum = PredSU->NodeNum; 1404 if (R.DFSNodeData[PredNum].SubtreeID != PredNum) 1405 return false; 1406 1407 // Four is the magic number of successors before a node is considered a 1408 // pinch point. 1409 unsigned NumDataSucs = 0; 1410 for (const SDep &SuccDep : PredSU->Succs) { 1411 if (SuccDep.getKind() == SDep::Data) { 1412 if (++NumDataSucs >= 4) 1413 return false; 1414 } 1415 } 1416 if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit) 1417 return false; 1418 R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum; 1419 SubtreeClasses.join(Succ->NodeNum, PredNum); 1420 return true; 1421 } 1422 1423 /// Called by finalize() to record a connection between trees. 
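/// The connection is recorded on FromTree and then propagated up its chain of
/// parent trees, so that enclosing subtrees also know about the cross-tree
/// edge.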
1424 void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) { 1425 if (!Depth) 1426 return; 1427 1428 do { 1429 SmallVectorImpl<SchedDFSResult::Connection> &Connections = 1430 R.SubtreeConnections[FromTree]; 1431 for (SchedDFSResult::Connection &C : Connections) { 1432 if (C.TreeID == ToTree) { 1433 C.Level = std::max(C.Level, Depth); 1434 return; 1435 } 1436 } 1437 Connections.push_back(SchedDFSResult::Connection(ToTree, Depth)); 1438 FromTree = R.DFSTreeData[FromTree].ParentTreeID; 1439 } while (FromTree != SchedDFSResult::InvalidSubtreeID); 1440 } 1441 }; 1442 1443 } // end namespace llvm 1444 1445 namespace { 1446 1447 /// Manage the stack used by a reverse depth-first search over the DAG. 1448 class SchedDAGReverseDFS { 1449 std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack; 1450 1451 public: 1452 bool isComplete() const { return DFSStack.empty(); } 1453 1454 void follow(const SUnit *SU) { 1455 DFSStack.emplace_back(SU, SU->Preds.begin()); 1456 } 1457 void advance() { ++DFSStack.back().second; } 1458 1459 const SDep *backtrack() { 1460 DFSStack.pop_back(); 1461 return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second); 1462 } 1463 1464 const SUnit *getCurr() const { return DFSStack.back().first; } 1465 1466 SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; } 1467 1468 SUnit::const_pred_iterator getPredEnd() const { 1469 return getCurr()->Preds.end(); 1470 } 1471 }; 1472 1473 } // end anonymous namespace 1474 1475 static bool hasDataSucc(const SUnit *SU) { 1476 for (const SDep &SuccDep : SU->Succs) { 1477 if (SuccDep.getKind() == SDep::Data && 1478 !SuccDep.getSUnit()->isBoundaryNode()) 1479 return true; 1480 } 1481 return false; 1482 } 1483 1484 /// Computes an ILP metric for all nodes in the subDAG reachable via depth-first 1485 /// search from this root. 1486 void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { 1487 if (!IsBottomUp) 1488 llvm_unreachable("Top-down ILP metric is unimplemented"); 1489 1490 SchedDFSImpl Impl(*this); 1491 for (const SUnit &SU : SUnits) { 1492 if (Impl.isVisited(&SU) || hasDataSucc(&SU)) 1493 continue; 1494 1495 SchedDAGReverseDFS DFS; 1496 Impl.visitPreorder(&SU); 1497 DFS.follow(&SU); 1498 while (true) { 1499 // Traverse the leftmost path as far as possible. 1500 while (DFS.getPred() != DFS.getPredEnd()) { 1501 const SDep &PredDep = *DFS.getPred(); 1502 DFS.advance(); 1503 // Ignore non-data edges. 1504 if (PredDep.getKind() != SDep::Data 1505 || PredDep.getSUnit()->isBoundaryNode()) { 1506 continue; 1507 } 1508 // An already visited edge is a cross edge, assuming an acyclic DAG. 1509 if (Impl.isVisited(PredDep.getSUnit())) { 1510 Impl.visitCrossEdge(PredDep, DFS.getCurr()); 1511 continue; 1512 } 1513 Impl.visitPreorder(PredDep.getSUnit()); 1514 DFS.follow(PredDep.getSUnit()); 1515 } 1516 // Visit the top of the stack in postorder and backtrack. 1517 const SUnit *Child = DFS.getCurr(); 1518 const SDep *PredDep = DFS.backtrack(); 1519 Impl.visitPostorderNode(Child); 1520 if (PredDep) 1521 Impl.visitPostorderEdge(*PredDep, DFS.getCurr()); 1522 if (DFS.isComplete()) 1523 break; 1524 } 1525 } 1526 Impl.finalize(); 1527 } 1528 1529 /// The root of the given SubtreeID was just scheduled. For all subtrees 1530 /// connected to this tree, record the depth of the connection so that the 1531 /// nearest connected subtrees can be prioritized. 
1532 void SchedDFSResult::scheduleTree(unsigned SubtreeID) { 1533 for (const Connection &C : SubtreeConnections[SubtreeID]) { 1534 SubtreeConnectLevels[C.TreeID] = 1535 std::max(SubtreeConnectLevels[C.TreeID], C.Level); 1536 LLVM_DEBUG(dbgs() << " Tree: " << C.TreeID << " @" 1537 << SubtreeConnectLevels[C.TreeID] << '\n'); 1538 } 1539 } 1540 1541 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1542 LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const { 1543 OS << InstrCount << " / " << Length << " = "; 1544 if (!Length) 1545 OS << "BADILP"; 1546 else 1547 OS << format("%g", ((double)InstrCount / Length)); 1548 } 1549 1550 LLVM_DUMP_METHOD void ILPValue::dump() const { 1551 dbgs() << *this << '\n'; 1552 } 1553 1554 namespace llvm { 1555 1556 LLVM_ATTRIBUTE_UNUSED 1557 raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { 1558 Val.print(OS); 1559 return OS; 1560 } 1561 1562 } // end namespace llvm 1563 1564 #endif 1565