//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds write group."));

typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
    CanAddMIFn;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  // Add an artificial edge making A a predecessor of B, provided doing so does
  // not create a cycle in the DAG.
  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs. B points to the first MI following the
    // BUNDLE header; E ends up one past the last bundled MI.
    const MachineBasicBlock *MBB = MI.getParent();
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and the
// corresponding builtin. The mutation adds edges to instructions in specific
// classes, determined by the SCHED_BARRIER mask, so that they cannot be
// scheduled around the SCHED_BARRIER.
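// SCHED_BARRIER is generated from the llvm.amdgcn.sched.barrier intrinsic
// (exposed to source as __builtin_amdgcn_sched_barrier); its immediate operand
// is the mask that getSchedGroupsFromMask() interprets below.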
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determines which instructions may not be
  // scheduled across the SCHED_BARRIER.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };

  // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
  // region.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of SchedGroups in this vector defines the order in which edges
  // will be added. In other words, given the present ordering, we will try to
  // make each VMEM instruction a predecessor of each DS read instruction, and
  // so on.
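  // The default pipeline is therefore VMEM -> DS read -> MFMA -> DS write; the
  // per-group instruction limits can be tuned with the
  // -amdgpu-igrouplp-*-group-size options declared above.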
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
  // Group members with a higher NodeNum than the SCHED_BARRIER (i.e., those
  // appearing later in the region) become its successors; the remaining
  // members become its predecessors, so neither side may move past the
  // barrier.
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
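  // In the checks below, a clear bit means the corresponding instruction class
  // may not be scheduled across the barrier, so a SchedGroup is built for it;
  // a set bit exempts that class. For example, a mask of 0 keeps every
  // classified instruction on its side of the SCHED_BARRIER.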
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  for (auto &P : SU.Preds)
    SU.removePred(P);

  for (auto &S : SU.Succs) {
    for (auto &SP : S.getSUnit()->Preds) {
      if (SP.getSUnit() == &SU) {
        S.getSUnit()->removePred(SP);
      }
    }
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm