1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
37 #define DEBUG_TYPE "amdgpu-subtarget"
46 "amdgpu-enable-power-sched",
51 "amdgpu-vgpr-index-mode",
52 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
55 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
59 static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
68 // Determine default and user-specified characteristics in initializeSubtargetDependencies()
74   // Similarly, we want enable-prt-strict-null to be on by default and not to in initializeSubtargetDependencies()
77 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); in initializeSubtargetDependencies()
81 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; in initializeSubtargetDependencies()
83 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS in initializeSubtargetDependencies()
88 FullFS += "-wavefrontsize16,"; in initializeSubtargetDependencies()
90 FullFS += "-wavefrontsize32,"; in initializeSubtargetDependencies()
92 FullFS += "-wavefrontsize64,"; in initializeSubtargetDependencies()
100   // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to in initializeSubtargetDependencies()
119   // Targets must support 64-bit offsets for MUBUF instructions and/or in initializeSubtargetDependencies()
120 // support flat operations, otherwise they cannot access a 64-bit global in initializeSubtargetDependencies()
123 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets in initializeSubtargetDependencies()
127 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { in initializeSubtargetDependencies()
131 // Unless +-flat-for-global is specified, use MUBUF instructions for global in initializeSubtargetDependencies()
133 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { in initializeSubtargetDependencies()
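A minimal sketch (not the real initializeSubtargetDependencies(); names are made up) of the feature-string convention used above: defaults are appended first and the user's -mattr string last, so explicit user settings override the defaults when the comma-separated list is parsed.

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/StringRef.h"

    static llvm::SmallString<256> buildFeatureString(llvm::StringRef UserFS,
                                                     bool DefaultFlatForGlobal) {
      llvm::SmallString<256> FullFS("+promote-alloca,+load-store-opt,");
      // Default choice first; later "+feature"/"-feature" entries win.
      FullFS += DefaultFlatForGlobal ? "+flat-for-global," : "-flat-for-global,";
      FullFS += UserFS; // e.g. the contents of -mattr=...
      return FullFS;
    }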
194 : // clang-format off in GCNSubtarget()
203 // clang-format on in GCNSubtarget()
320 // On gfx10, all 16-bit instructions preserve the high bits. in zeroesHigh16BitsOfDest()
330   // In gfx9, the preferred handling of the unused high 16 bits changed. Most in zeroesHigh16BitsOfDest()
341 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
350 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize); in getMaxLocalMemSizeWithWaveCount()
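Worked example of the rounding above, with hypothetical numbers: the wave count is a ceiling division of the workgroup size by the wavefront size, clamped to at least one wave.

    #include <algorithm>

    // A 256-work-item group on a wave64 target occupies ceil(256 / 64) = 4 waves;
    // std::max guards the degenerate zero-size case.
    constexpr unsigned WorkGroupSize = 256, WaveSize = 64;
    static_assert(std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize) == 4,
                  "4 waves per workgroup");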
363 // size, and the per-workgroup LDS allocation size.
406 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); in getOccupancyWithLocalMemSize()
432 F, "amdgpu-flat-work-group-size", Default); in getFlatWorkGroupSizes()
454   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum in getEffectiveWavesPerEU()
485 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true); in getWavesPerEU()
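Both limits are read from string function attributes parsed as "min,max" integer pairs (the second value of "amdgpu-waves-per-eu" may be omitted). A front end or a test can request them explicitly; a sketch with illustrative values only:

    #include "llvm/IR/Function.h"

    void requestLaunchBounds(llvm::Function &F) { // hypothetical helper
      F.addFnAttr("amdgpu-flat-work-group-size", "1,256"); // min,max work items
      F.addFnAttr("amdgpu-waves-per-eu", "2,4");           // min,max waves per EU
    }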
489 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { in getReqdWorkGroupSize() argument
491 if (Node && Node->getNumOperands() == 3) in getReqdWorkGroupSize()
492 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue(); in getReqdWorkGroupSize()
504 return ReqdSize - 1; in getMaxWorkitemID()
505 return getFlatWorkGroupSizes(Kernel).second - 1; in getMaxWorkitemID()
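getReqdWorkGroupSize() reads the OpenCL-style !reqd_work_group_size function metadata, a node of three i32 constants (X, Y, Z). A standalone sketch of the same lookup, using the "unknown" sentinel the file checks against:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Metadata.h"
    #include <limits>

    // Assumes the kernel carries metadata such as:
    //   !reqd_work_group_size !0    with    !0 = !{i32 64, i32 1, i32 1}
    static unsigned readReqdWorkGroupSize(const llvm::Function &Kernel, unsigned Dim) {
      llvm::MDNode *Node = Kernel.getMetadata("reqd_work_group_size");
      if (Node && Node->getNumOperands() == 3 && Dim < 3)
        return llvm::mdconst::extract<llvm::ConstantInt>(Node->getOperand(Dim))
            ->getZExtValue();
      return std::numeric_limits<unsigned>::max(); // "unknown"
    }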
518 Function *Kernel = I->getParent()->getParent(); in makeLIDRangeMetadata()
525 const Function *F = CI->getCalledFunction(); in makeLIDRangeMetadata()
527 unsigned Dim = UINT_MAX; in makeLIDRangeMetadata() local
528 switch (F->getIntrinsicID()) { in makeLIDRangeMetadata()
534 Dim = 0; in makeLIDRangeMetadata()
541 Dim = 1; in makeLIDRangeMetadata()
548 Dim = 2; in makeLIDRangeMetadata()
554 if (Dim <= 3) { in makeLIDRangeMetadata()
555 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); in makeLIDRangeMetadata()
576 CI->addRangeRetAttr(Range); in makeLIDRangeMetadata()
578 MDBuilder MDB(I->getContext()); in makeLIDRangeMetadata()
580 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); in makeLIDRangeMetadata()
590 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) in getImplicitArgNumBytes()
600 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes", in getImplicitArgNumBytes()
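The byte count comes from a string attribute parsed as an integer, and is skipped entirely when "amdgpu-no-implicitarg-ptr" promises the pointer is unused. A sketch of how a front end might set it (the value 256 is only an assumed example):

    #include "llvm/IR/Function.h"

    void setImplicitArgBytes(llvm::Function &F) { // hypothetical helper
      // Reserve space after the explicit kernel arguments for the implicit
      // arguments the runtime appends (grid sizes, queue pointer, ...).
      F.addFnAttr("amdgpu-implicitarg-num-bytes", "256");
    }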
682 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; in hasMadF16()
752 // "amdgpu-num-sgpr" attribute. in getBaseMaxNumSGPRs()
753 if (F.hasFnAttribute("amdgpu-num-sgpr")) { in getBaseMaxNumSGPRs()
755 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); in getBaseMaxNumSGPRs()
765 // of reserved special registers in total. Theoretically you could re-use in getBaseMaxNumSGPRs()
787 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); in getBaseMaxNumSGPRs()
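The final budget is the requested count minus whatever the target reserves, capped by what is architecturally addressable. With made-up numbers:

    #include <algorithm>

    constexpr unsigned MaxNumSGPRs = 112;            // hypothetical "amdgpu-num-sgpr" request
    constexpr unsigned ReservedNumSGPRs = 6;         // hypothetical reserved registers
    constexpr unsigned MaxAddressableNumSGPRs = 102; // hypothetical architectural cap
    static_assert(std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs) == 102,
                  "min(112 - 6, 102) == 102 usable SGPRs");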
834 // "amdgpu-num-vgpr" attribute. in getBaseMaxNumVGPRs()
835 if (F.hasFnAttribute("amdgpu-num-vgpr")) { in getBaseMaxNumVGPRs()
837 F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); in getBaseMaxNumVGPRs()
871 !Def->isInstr() || !Use->isInstr()) in adjustSchedDependency()
874 MachineInstr *DefI = Def->getInstr(); in adjustSchedDependency()
875 MachineInstr *UseI = Use->getInstr(); in adjustSchedDependency()
877 if (DefI->isBundle()) { in adjustSchedDependency()
880 MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); in adjustSchedDependency()
881 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); in adjustSchedDependency()
883 for (++I; I != E && I->isBundledWithPred(); ++I) { in adjustSchedDependency()
884 if (I->modifiesRegister(Reg, TRI)) in adjustSchedDependency()
887 --Lat; in adjustSchedDependency()
890 } else if (UseI->isBundle()) { in adjustSchedDependency()
893 MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); in adjustSchedDependency()
894 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); in adjustSchedDependency()
896 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { in adjustSchedDependency()
897 if (I->readsRegister(Reg, TRI)) in adjustSchedDependency()
899 --Lat; in adjustSchedDependency()
921 const MachineInstr *MI = SU->getInstr(); in isSALU()
922 return MI && TII->isSALU(*MI) && !MI->isTerminator(); in isSALU()
926 const MachineInstr *MI = SU->getInstr(); in isVALU()
927 return MI && TII->isVALU(*MI); in isVALU()
937 while (!Worklist.empty() && MaxChain-- > 0) { in linkSALUChain()
942     LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From); in linkSALUChain()
943 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); in linkSALUChain()
945 if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) in linkSALUChain()
946 if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) in linkSALUChain()
949 for (SDep &SI : From->Succs) { in linkSALUChain()
951 if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && in linkSALUChain()
952 DAG->canAddEdge(SUv, SU)) in linkSALUChain()
953 DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); in linkSALUChain()
956 for (SDep &SI : SU->Succs) { in linkSALUChain()
967 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); in apply()
971 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); in apply()
972 if (!TSchedModel || DAG->SUnits.empty()) in apply()
979 auto LastSALU = DAG->SUnits.begin(); in apply()
980 auto E = DAG->SUnits.end(); in apply()
982 for (SUnit &SU : DAG->SUnits) { in apply()
984 if (!TII->isMAI(MAI) || in apply()
989 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; in apply()
991 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); in apply()
1001 if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || in apply()
1002 !DAG->canAddEdge(&*LastSALU, &SU)) in apply()
1005 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); in apply()
1031 "amdgpu-nsa-threshold", -1); in getNSAThreshold()
1059 const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); in GCNUserSGPRUsageInfo()
1062 const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); in GCNUserSGPRUsageInfo()
1074 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) in GCNUserSGPRUsageInfo()
1078 if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) in GCNUserSGPRUsageInfo()
1081 if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) in GCNUserSGPRUsageInfo()
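These "amdgpu-no-*" attributes are normally inferred (for example by the AMDGPU attributor) when the corresponding intrinsic provably is not used; each one that is present frees the user SGPRs that would otherwise hold that pointer. Setting them by hand, purely as an illustration:

    #include "llvm/IR/Function.h"

    void dropDispatchAndQueuePtr(llvm::Function &F) { // hypothetical helper
      // Promise the function never needs the dispatch or queue pointer.
      F.addFnAttr("amdgpu-no-dispatch-ptr");
      F.addFnAttr("amdgpu-no-queue-ptr");
    }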
1127 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; in getNumFreeUserSGPRs()
1132 return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); in getMaxNumWorkGroups()
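The attribute is read back as a vector of three integers, one per grid dimension. Assuming the comma-separated string form used by the other integer-list attributes above (illustrative values only):

    #include "llvm/IR/Function.h"

    void limitWorkGroups(llvm::Function &F) { // hypothetical helper
      F.addFnAttr("amdgpu-max-num-workgroups", "16,8,1"); // assumed X,Y,Z limits
    }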