//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds write group."));

typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
    CanAddMIFn;

// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use
  // the predicate to determine whether SU should be a predecessor (P = true)
  // or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

// Classifier functions used to populate SchedGroups.

bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

// DAG mutation that classifies instructions into SchedGroups and adds
// artificial edges so that the groups are emitted in a fixed pipeline order.
class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and the
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  // Components of the mask that determines which instructions may not be
  // scheduled across the SCHED_BARRIER.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };

  // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
  // region.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of InstructionGroups in this vector defines the
  // order in which edges will be added. In other words, given the
  // present ordering, we will try to make each VMEMRead instruction
  // a predecessor of each DSRead instruction, and so on.
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;

  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  for (auto &P : SU.Preds)
    SU.removePred(P);

  for (auto &S : SU.Succs) {
    for (auto &SP : S.getSUnit()->Preds) {
      if (SP.getSUnit() == &SU) {
        S.getSUnit()->removePred(SP);
      }
    }
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm
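// Usage note (illustrative sketch, not part of this file): the factory
// functions above are intended to be called when the target constructs its
// machine scheduler, registering the mutations via ScheduleDAGMI::addMutation.
// A hypothetical registration in an AMDGPU createMachineScheduler override
// could look roughly like the following; the GCNScheduleDAGMILive constructor
// arguments are elided and the exact wiring is an assumption, not a quote of
// AMDGPUTargetMachine.cpp:
//
//   ScheduleDAGInstrs *
//   GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
//     auto *DAG = new GCNScheduleDAGMILive(C, /* strategy elided */ ...);
//     // createIGroupLPDAGMutation() returns nullptr unless -amdgpu-igrouplp
//     // is set; addMutation ignores null mutations.
//     DAG->addMutation(createIGroupLPDAGMutation());
//     DAG->addMutation(createSchedBarrierDAGMutation());
//     return DAG;
//   }
//
// SchedBarrierDAGMutation only takes effect in regions that contain a
// SCHED_BARRIER pseudo; per the mask handling above, a class whose bit is not
// set in the mask receives artificial edges and therefore cannot be scheduled
// across the barrier, so a mask of 0 pins all of the listed classes in place.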