Lines Matching +full:tri +full:- +full:state
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
16 /// non-strict WQM inactive lanes may control flow decisions.
37 /// S_OR_SAVEEXEC_B64 Tmp, -1
63 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
64 /// consist of exact and don't-care instructions, the switch only has to
68 //===----------------------------------------------------------------------===//
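
The header lines above describe the pass flipping between exact mode, whole quad mode (WQM) and the strict modes by rewriting the EXEC mask, e.g. the "S_OR_SAVEEXEC_B64 Tmp, -1" sequence quoted at source line 37. A minimal standalone sketch of those two ideas, treating EXEC and the masks as plain 64-bit integers (an assumption for illustration only; the helper names below are not from the pass):

    #include <cstdint>

    // Whole quad mode: a 2x2 pixel quad stays enabled as soon as any of its
    // lanes is active, so derivative/implicit-LOD instructions can read all
    // four neighbours.
    static uint64_t wholeQuadMode(uint64_t Exec) {
      uint64_t WQM = 0;
      for (int Quad = 0; Quad < 64; Quad += 4)
        if ((Exec >> Quad) & 0xFull)
          WQM |= 0xFull << Quad;
      return WQM;
    }

    // The "S_OR_SAVEEXEC_B64 Tmp, -1" pattern: remember the current EXEC in
    // Tmp, then enable every lane for strict whole wavefront mode.
    static uint64_t enterStrictWWM(uint64_t &Exec) {
      uint64_t Tmp = Exec;
      Exec |= ~0ull; // OR with -1: all 64 lanes active
      return Tmp;
    }
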
87 #define DEBUG_TYPE "si-wqm"
101 int State; member
103 explicit PrintState(int State) : State(State) {} in PrintState()
112 char State = PS.State; in operator <<() local
114 if (State & M.first) { in operator <<()
116 State &= ~M.first; in operator <<()
118 if (State) in operator <<()
122 assert(State == 0); in operator <<()
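
The operator<< lines above walk a list of (mask, name) pairs, print every flag present in State, clear it, and finally assert that no unknown bits remain. A self-contained version of that decoding loop; the state names match the ones used later in this listing, but the numeric flag values here are assumptions:

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    enum : int { StateWQM = 1, StateStrictWWM = 2, StateStrictWQM = 4, StateExact = 8 };

    // Turn a combined state mask into "WQM|StrictWWM"-style text.
    static std::string printState(int State) {
      static const std::vector<std::pair<int, const char *>> Names = {
          {StateWQM, "WQM"},
          {StateStrictWWM, "StrictWWM"},
          {StateStrictWQM, "StrictWQM"},
          {StateExact, "Exact"}};
      std::string Out;
      for (const auto &M : Names) {
        if (State & M.first) {
          if (!Out.empty())
            Out += '|';
          Out += M.second;
          State &= ~M.first; // clear the bit that was just printed
        }
      }
      assert(State == 0 && "unknown state bits"); // mirrors the assert above
      return Out;
    }
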
153 const SIRegisterInfo *TRI; member in __anonf56fbe7e0111::SIWholeQuadMode
173 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
284 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) in printInfo()
285 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; in printInfo()
320 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); in markDefs()
328 SubReg ? TRI->getSubRegIndexLaneMask(SubReg) in markDefs()
329 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) in markDefs()
332 // Perform a depth-first iteration of the LiveRange graph marking defs. in markDefs()
357 if (Value->isPHIDef()) { in markDefs()
359 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def); in markDefs()
360 assert(MBB && "Phi-def has no defining MBB"); in markDefs()
364 auto PI = MBB->pred_begin() + Idx; in markDefs()
365 auto PE = MBB->pred_end(); in markDefs()
367 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) { in markDefs()
377 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); in markDefs()
383 for (const MachineOperand &Op : MI->all_defs()) { in markDefs()
390 : TRI->getSubRegIndexLaneMask(Op.getSubReg()); in markDefs()
403 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI)); in markDefs()
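
markDefs() above walks backwards through the SSA values of a live range: a phi-def value fans out into the value live-out of each predecessor block, while an ordinary value points at a concrete defining instruction that gets marked. A toy worklist version of just that traversal shape, using a hand-rolled node type instead of LLVM's VNInfo/LiveRange machinery (every type and name below is an illustrative assumption):

    #include <functional>
    #include <set>
    #include <vector>

    struct ValueNode {
      bool IsPhiDef = false;
      std::vector<ValueNode *> IncomingValues; // value at the end of each pred
      int DefInstr = -1;                       // id of the defining instruction
    };

    // Visit every defining instruction reachable through phi-defs, once each.
    static void markDefs(ValueNode *Root, const std::function<void(int)> &MarkInstr) {
      std::set<ValueNode *> Visited;
      std::vector<ValueNode *> Worklist{Root};
      while (!Worklist.empty()) {
        ValueNode *V = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(V).second)
          continue; // this value was already handled
        if (V->IsPhiDef) {
          // Follow the value reaching the end of each predecessor block.
          for (ValueNode *In : V->IncomingValues)
            if (In)
              Worklist.push_back(In);
        } else {
          MarkInstr(V->DefInstr); // a real def: mark its instruction
        }
      }
    }

The real implementation additionally tracks which lanes (a LaneBitmask) still need marking, which this sketch leaves out.
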
450 LiveRange &LR = LIS->getInterval(Reg); in markOperand()
456 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { in markOperand()
457 LiveRange &LR = LIS->getRegUnit(Unit); in markOperand()
458 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); in markOperand()
480 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); in scanInstructions()
486 // We need to visit the basic blocks in reverse post-order so that we visit in scanInstructions()
499 if (TII->isWQM(Opcode)) { in scanInstructions()
504 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) { in scanInstructions()
527 TII->isDualSourceBlendEXP(MI)) { in scanInstructions()
537 // Dual source blend export acts as implicit strict-wqm, its sources in scanInstructions()
568 } else if (TII->isDisableWQM(MI)) { in scanInstructions()
595 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) { in scanInstructions()
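
scanInstructions() classifies every instruction and, per the comment at source line 486, visits the basic blocks in reverse post-order so each block is reached after its predecessors (loop back-edges aside). LLVM provides ReversePostOrderTraversal for this; purely as a refresher, the same order computed by hand over an adjacency-list CFG (the graph representation is an assumption of this sketch):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Succs[B] lists the successors of block B; Entry is the entry block.
    static std::vector<int> reversePostOrder(const std::vector<std::vector<int>> &Succs, int Entry) {
      std::vector<char> Seen(Succs.size(), 0);
      std::vector<int> PostOrder;
      std::vector<std::pair<int, size_t>> Stack{{Entry, 0}}; // (block, next successor)
      Seen[Entry] = 1;
      while (!Stack.empty()) {
        auto &[Block, NextSucc] = Stack.back();
        if (NextSucc < Succs[Block].size()) {
          int S = Succs[Block][NextSucc++];
          if (!Seen[S]) {
            Seen[S] = 1;
            Stack.push_back({S, 0}); // descend depth-first
          }
        } else {
          PostOrder.push_back(Block); // all successors done: post-order visit
          Stack.pop_back();
        }
      }
      std::reverse(PostOrder.begin(), PostOrder.end()); // post-order -> RPO
      return PostOrder;
    }
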
629 // Control flow-type instructions and stores to temporary memory that are in propagateInstruction()
632 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { in propagateInstruction()
649 if (!PrevMI->isPHI()) { in propagateInstruction()
728 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); in saveSCC()
731 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) in saveSCC()
734 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC) in saveSCC()
737 LIS->InsertMachineInstrInMaps(*Save); in saveSCC()
738 LIS->InsertMachineInstrInMaps(*Restore); in saveSCC()
739 LIS->createAndComputeVirtRegInterval(SaveReg); in saveSCC()
750 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS); in splitBlock()
755 switch (TermMI->getOpcode()) { in splitBlock()
772 TermMI->setDesc(TII->get(NewOpcode)); in splitBlock()
778 for (MachineBasicBlock *Succ : SplitBB->successors()) { in splitBlock()
784 MDT->getBase().applyUpdates(DTUpdates); in splitBlock()
786 PDT->applyUpdates(DTUpdates); in splitBlock()
790 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) in splitBlock()
792 LIS->InsertMachineInstrInMaps(*MI); in splitBlock()
873 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; in lowerKillF32()
875 if (TRI->isVGPR(*MRI, Op0.getReg())) { in lowerKillF32()
877 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); in lowerKillF32()
879 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) in lowerKillF32()
889 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) in lowerKillF32()
893 // State of SCC represents whether any lanes are live in mask, in lowerKillF32()
896 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); in lowerKillF32()
899 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); in lowerKillF32()
902 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) in lowerKillF32()
906 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI); in lowerKillF32()
909 LIS->InsertMachineInstrInMaps(*MaskUpdateMI); in lowerKillF32()
910 LIS->InsertMachineInstrInMaps(*ExecMaskMI); in lowerKillF32()
911 LIS->InsertMachineInstrInMaps(*EarlyTermMI); in lowerKillF32()
912 LIS->InsertMachineInstrInMaps(*NewTerm); in lowerKillF32()
935 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) in lowerKillI1()
942 LIS->RemoveMachineInstrFromMaps(MI); in lowerKillI1()
945 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) in lowerKillI1()
947 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); in lowerKillI1()
956 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); in lowerKillI1()
958 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); in lowerKillI1()
959 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) in lowerKillI1()
964 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) in lowerKillI1()
970 // State of SCC represents whether any lanes are live in mask, in lowerKillI1()
973 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); in lowerKillI1()
981 // Demote - deactivate quads with only helper lanes in lowerKillI1()
982 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); in lowerKillI1()
984 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); in lowerKillI1()
985 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) in lowerKillI1()
989 // Kill - deactivate lanes no longer in live mask in lowerKillI1()
991 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; in lowerKillI1()
992 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); in lowerKillI1()
994 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) in lowerKillI1()
1000 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); in lowerKillI1()
1005 LIS->RemoveMachineInstrFromMaps(MI); in lowerKillI1()
1011 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI); in lowerKillI1()
1012 LIS->InsertMachineInstrInMaps(*MaskUpdateMI); in lowerKillI1()
1013 LIS->InsertMachineInstrInMaps(*EarlyTermMI); in lowerKillI1()
1015 LIS->InsertMachineInstrInMaps(*WQMMaskMI); in lowerKillI1()
1016 LIS->InsertMachineInstrInMaps(*NewTerm); in lowerKillI1()
1019 LIS->removeInterval(CndReg); in lowerKillI1()
1020 LIS->createAndComputeVirtRegInterval(CndReg); in lowerKillI1()
1023 LIS->createAndComputeVirtRegInterval(TmpReg); in lowerKillI1()
1025 LIS->createAndComputeVirtRegInterval(LiveMaskWQM); in lowerKillI1()
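
lowerKillI1() above separates two behaviours: a demote keeps any quad that still contains a live lane (so helper lanes keep running for derivatives), while a kill drops the killed lanes from EXEC. In both cases the live mask is trimmed first (the ANDN2 updates above) and SI_EARLY_TERMINATE_SCC0 handles the no-lanes-left (SCC == 0) case. The same bookkeeping on plain 64-bit masks, as a simplified illustration rather than the pass's actual lowering (the immediate and in-WQM kill variants at source lines 991-1000 are folded away here):

    #include <cstdint>

    // A quad stays enabled if any of its four lanes is still live.
    static uint64_t wholeQuadMode(uint64_t Mask) {
      uint64_t WQM = 0;
      for (int Quad = 0; Quad < 64; Quad += 4)
        if ((Mask >> Quad) & 0xFull)
          WQM |= 0xFull << Quad;
      return WQM;
    }

    // Returns false when no lanes remain live, i.e. the SCC == 0 case that
    // SI_EARLY_TERMINATE_SCC0 turns into an early exit of the wave.
    static bool applyKillOrDemote(uint64_t &Exec, uint64_t &LiveMask,
                                  uint64_t KilledLanes, bool IsDemote) {
      LiveMask &= ~KilledLanes; // the ANDN2 on the live mask
      if (LiveMask == 0)
        return false;           // early terminate
      if (IsDemote)
        Exec &= wholeQuadMode(LiveMask); // demote: quads with a live lane survive
      else
        Exec &= ~KilledLanes;            // kill: drop the killed lanes outright
      return true;
    }
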
1032 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1038 const BlockInfo &BI = BII->second; in lowerBlock()
1045 char State = BI.InitialState; in lowerBlock() local
1050 State = StateTransition[&MI]; in lowerBlock()
1056 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM); in lowerBlock()
1087 LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); in prepareInsertion()
1089 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) in prepareInsertion()
1090 : LIS->getMBBEndIdx(&MBB); in prepareInsertion()
1092 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB); in prepareInsertion()
1102 SlotIndex Next = S->start.getBaseIndex(); in prepareInsertion()
1107 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex()); in prepareInsertion()
1109 auto NextI = std::next(EndMI->getIterator()); in prepareInsertion()
1112 SlotIndex Next = LIS->getInstructionIndex(*NextI); in prepareInsertion()
1121 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) in prepareInsertion()
1124 assert(Idx == LIS->getMBBEndIdx(&MBB)); in prepareInsertion()
1133 for (const MachineOperand &MO : MBBI->all_defs()) { in prepareInsertion()
1158 SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm); in toExact()
1159 SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before); in toExact()
1168 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) in toExact()
1172 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) in toExact()
1177 LIS->InsertMachineInstrInMaps(*MI); in toExact()
1187 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) in toWQM()
1190 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); in toWQM()
1193 LIS->InsertMachineInstrInMaps(*MI); in toWQM()
1206 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), in toStrictMode()
1208 .addImm(-1); in toStrictMode()
1210 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), in toStrictMode()
1212 .addImm(-1); in toStrictMode()
1214 LIS->InsertMachineInstrInMaps(*MI); in toStrictMode()
1229 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), in fromStrictMode()
1233 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), in fromStrictMode()
1237 LIS->InsertMachineInstrInMaps(*MI); in fromStrictMode()
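
toExact(), toWQM(), toStrictMode() and fromStrictMode() above reduce to a handful of EXEC rewrites: AND with the live mask (optionally via a save-exec form), re-deriving WQM from EXEC or restoring a saved copy, and the ENTER_/EXIT_STRICT pseudos that save EXEC before widening it. A scalar sketch of those transitions, again with EXEC as a plain integer and names of my own (not the pass's code):

    #include <cstdint>

    static uint64_t wholeQuadMode(uint64_t Mask) {
      uint64_t WQM = 0;
      for (int Quad = 0; Quad < 64; Quad += 4)
        if ((Mask >> Quad) & 0xFull)
          WQM |= 0xFull << Quad;
      return WQM;
    }

    // WQM -> Exact: optionally remember the WQM exec, then mask to live lanes
    // (the save-exec / plain AND choice at source lines 1168-1172).
    static void toExact(uint64_t &Exec, uint64_t *SavedWQM, uint64_t LiveMask) {
      if (SavedWQM)
        *SavedWQM = Exec;
      Exec &= LiveMask;
    }

    // Exact -> WQM: restore a saved exec if there is one, else recompute it
    // (the COPY vs. S_WQM choice at source lines 1187-1190).
    static void toWQM(uint64_t &Exec, const uint64_t *SavedWQM) {
      Exec = SavedWQM ? *SavedWQM : wholeQuadMode(Exec);
    }

    // Entering a strict mode saves the current exec; strict WWM then enables
    // every lane, strict WQM enables whole quads of the active lanes.
    static uint64_t enterStrict(uint64_t &Exec, bool IsWWM) {
      uint64_t Saved = Exec;
      Exec = IsWWM ? ~0ull : wholeQuadMode(Exec);
      return Saved;
    }

    // Leaving a strict mode restores whatever exec was saved on entry.
    static void exitStrict(uint64_t &Exec, uint64_t Saved) { Exec = Saved; }
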
1246 BlockInfo &BI = BII->second; in processBlock()
1248 // This is a non-entry block that is WQM throughout, so no need to do in processBlock()
1261 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; in processBlock() local
1263 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); in processBlock()
1268 if (II != IE && II->getOpcode() == AMDGPU::COPY && in processBlock()
1269 II->getOperand(1).getReg() == TRI->getExec()) in processBlock()
1283 // Record initial state in block information. in processBlock()
1284 BI.InitialState = State; in processBlock()
1302 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { in processBlock()
1305 if (III->second.Needs & StateStrictWWM) in processBlock()
1307 else if (III->second.Needs & StateStrictWQM) in processBlock()
1309 else if (III->second.Needs & StateWQM) in processBlock()
1312 Needs &= ~III->second.Disabled; in processBlock()
1313 OutNeeds = III->second.OutNeeds; in processBlock()
1337 if (!(Needs & State)) { in processBlock()
1339 if (State == StateStrictWWM || Needs == StateStrictWWM || in processBlock()
1340 State == StateStrictWQM || Needs == StateStrictWQM) { in processBlock()
1350 switch (State) { in processBlock()
1354 // Exact/Strict -> Strict: save SCC in processBlock()
1355 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec in processBlock()
1356 // Exact/Strict -> Exact: no save in processBlock()
1360 // WQM -> Exact/Strict: save SCC in processBlock()
1364 llvm_unreachable("Unknown state"); in processBlock()
1370 if (State & StateStrict) { in processBlock()
1371 assert(State == StateStrictWWM || State == StateStrictWQM); in processBlock()
1373 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State); in processBlock()
1375 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg); in processBlock()
1377 State = NonStrictState; in processBlock()
1381 NonStrictState = State; in processBlock()
1384 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC); in processBlock()
1387 State = Needs; in processBlock()
1390 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { in processBlock()
1393 SavedWQMReg = MRI->createVirtualRegister(BoolRC); in processBlock()
1397 State = StateExact; in processBlock()
1398 } else if (State == StateExact && (Needs & StateWQM) && in processBlock()
1405 LIS->createAndComputeVirtRegInterval(SavedWQMReg); in processBlock()
1408 State = StateWQM; in processBlock()
1411 // non-StrictWWM state that already matches our needs, but we in processBlock()
1413 assert(Needs & State); in processBlock()
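
The switch over State at source lines 1350-1364 decides when SCC must be saved around a state change: leaving WQM always needs a save, entering a strict mode always needs one, and entering WQM needs one only when the WQM mask is regenerated from EXEC (S_WQM writes SCC) instead of being restored from a saved register. The same rule as a small predicate, which is only a restatement of the comments quoted above (the flag values are illustrative):

    enum : char {
      StateWQM = 1,
      StateStrictWWM = 2,
      StateStrictWQM = 4,
      StateExact = 8,
      StateStrict = StateStrictWWM | StateStrictWQM
    };

    // Does switching from 'From' to 'To' require saving SCC first?
    // 'WQMFromExec' is true when WQM will be rebuilt with S_WQM from EXEC
    // rather than restored from a saved copy.
    static bool needsSCCSave(char From, char To, bool WQMFromExec) {
      if (From == StateWQM)
        return To != StateWQM; // WQM -> Exact/Strict: save SCC
      if (To & StateStrict)
        return true;           // Exact/Strict -> Strict: save SCC
      if (To == StateWQM)
        return WQMFromExec;    // Exact/Strict -> WQM: save if S_WQM is used
      return false;            // Exact/Strict -> Exact: no save
    }
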
1435 const DebugLoc &DL = MI->getDebugLoc(); in lowerLiveMaskQueries()
1436 Register Dest = MI->getOperand(0).getReg(); in lowerLiveMaskQueries()
1439 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) in lowerLiveMaskQueries()
1442 LIS->ReplaceMachineInstrInMaps(*MI, *Copy); in lowerLiveMaskQueries()
1443 MI->eraseFromParent(); in lowerLiveMaskQueries()
1450 assert(MI->getNumExplicitOperands() == 2); in lowerCopyInstrs()
1452 const Register Reg = MI->getOperand(0).getReg(); in lowerCopyInstrs()
1455 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0)); in lowerCopyInstrs()
1456 if (TRI->isVGPRClass(regClass)) { in lowerCopyInstrs()
1457 const unsigned MovOp = TII->getMovOpcode(regClass); in lowerCopyInstrs()
1458 MI->setDesc(TII->get(MovOp)); in lowerCopyInstrs()
1462 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) { in lowerCopyInstrs()
1466 // Remove early-clobber and exec dependency from simple SGPR copies. in lowerCopyInstrs()
1469 if (MI->getOperand(0).isEarlyClobber()) { in lowerCopyInstrs()
1470 LIS->removeInterval(Reg); in lowerCopyInstrs()
1471 MI->getOperand(0).setIsEarlyClobber(false); in lowerCopyInstrs()
1472 LIS->createAndComputeVirtRegInterval(Reg); in lowerCopyInstrs()
1474 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr); in lowerCopyInstrs()
1476 MI->removeOperand(Index); in lowerCopyInstrs()
1477 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr); in lowerCopyInstrs()
1479 MI->setDesc(TII->get(AMDGPU::COPY)); in lowerCopyInstrs()
1480 LLVM_DEBUG(dbgs() << " -> " << *MI); in lowerCopyInstrs()
1484 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || in lowerCopyInstrs()
1485 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { in lowerCopyInstrs()
1486 assert(MI->getNumExplicitOperands() == 3); in lowerCopyInstrs()
1490 assert(MI->getOperand(2).isUndef()); in lowerCopyInstrs()
1491 MI->removeOperand(2); in lowerCopyInstrs()
1492 MI->untieRegOperand(1); in lowerCopyInstrs()
1494 assert(MI->getNumExplicitOperands() == 2); in lowerCopyInstrs()
1497 unsigned CopyOp = MI->getOperand(1).isReg() in lowerCopyInstrs()
1499 : TII->getMovOpcode(TRI->getRegClassForOperandReg( in lowerCopyInstrs()
1500 *MRI, MI->getOperand(0))); in lowerCopyInstrs()
1501 MI->setDesc(TII->get(CopyOp)); in lowerCopyInstrs()
1508 MachineBasicBlock *MBB = MI->getParent(); in lowerKillInstrs()
1510 switch (MI->getOpcode()) { in lowerKillInstrs()
1527 bool IsWave32 = ST->isWave32(); in lowerInitExec()
1532 BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), in lowerInitExec()
1533 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), in lowerInitExec()
1537 LIS->RemoveMachineInstrFromMaps(MI); in lowerInitExec()
1538 LIS->InsertMachineInstrInMaps(*InitMI); in lowerInitExec()
1550 // S_CMOV_B64 exec, -1 in lowerInitExec()
1552 MachineInstr *FirstMI = &*MBB->begin(); in lowerInitExec()
1554 MachineInstr *DefInstr = MRI->getVRegDef(InputReg); in lowerInitExec()
1555 assert(DefInstr && DefInstr->isCopy()); in lowerInitExec()
1556 if (DefInstr->getParent() == MBB) { in lowerInitExec()
1560 DefInstr->removeFromParent(); in lowerInitExec()
1561 MBB->insert(FirstMI, DefInstr); in lowerInitExec()
1563 LIS->handleMove(*DefInstr); in lowerInitExec()
1566 FirstMI = &*std::next(FirstMI->getIterator()); in lowerInitExec()
1573 const unsigned WavefrontSize = ST->getWavefrontSize(); in lowerInitExec()
1574 const unsigned Mask = (WavefrontSize << 1) - 1; in lowerInitExec()
1575 Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); in lowerInitExec()
1576 auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) in lowerInitExec()
1581 TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) in lowerInitExec()
1584 auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) in lowerInitExec()
1589 TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), in lowerInitExec()
1591 .addImm(-1); in lowerInitExec()
1598 LIS->RemoveMachineInstrFromMaps(MI); in lowerInitExec()
1601 LIS->InsertMachineInstrInMaps(*BfeMI); in lowerInitExec()
1602 LIS->InsertMachineInstrInMaps(*BfmMI); in lowerInitExec()
1603 LIS->InsertMachineInstrInMaps(*CmpMI); in lowerInitExec()
1604 LIS->InsertMachineInstrInMaps(*CmovMI); in lowerInitExec()
1606 LIS->removeInterval(InputReg); in lowerInitExec()
1607 LIS->createAndComputeVirtRegInterval(InputReg); in lowerInitExec()
1608 LIS->createAndComputeVirtRegInterval(CountReg); in lowerInitExec()
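
The SI_INIT_EXEC_FROM_INPUT path in lowerInitExec() extracts a lane count from a packed SGPR input (S_BFE_U32), builds an EXEC mask with that many low bits set (S_BFM), and uses S_CMP_EQ plus the "S_CMOV_B64 exec, -1" shown at source line 1550 to switch to an all-ones mask when the count equals the wavefront size, which S_BFM alone cannot express. The same computation on plain integers; the bit-field width and parameter names are assumptions of this sketch:

    #include <cstdint>

    // Build the initial EXEC mask from a lane count packed into an input SGPR.
    static uint64_t initExecFromInput(uint32_t Input, unsigned BitOffset,
                                      unsigned WavefrontSize) {
      // S_BFE_U32: pull the count field out of the packed input register.
      uint32_t Count = (Input >> BitOffset) & 0x7F; // field wide enough for 0..64
      // S_BFM_B32/B64: a mask with 'Count' low bits set.
      uint64_t Exec = Count < 64 ? (1ull << Count) - 1 : 0;
      // S_CMP_EQ + S_CMOV: a full wave when the count says "all lanes".
      if (Count == WavefrontSize)
        Exec = ~0ull >> (64 - WavefrontSize);
      return Exec;
    }
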
1619 // - multiple INIT_EXEC instructions in lowerInitExecInstrs()
1620 // - INIT_EXEC instructions not in the entry block in lowerInitExecInstrs()
1621 if (MI->getParent() == &Entry) in lowerInitExecInstrs()
1622 InsertPt = std::next(MI->getIterator()); in lowerInitExecInstrs()
1633 << " ------------- \n"); in runOnMachineFunction()
1647 TII = ST->getInstrInfo(); in runOnMachineFunction()
1648 TRI = &TII->getRegisterInfo(); in runOnMachineFunction()
1652 MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; in runOnMachineFunction()
1655 PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr; in runOnMachineFunction()
1657 if (ST->isWave32()) { in runOnMachineFunction()
1691 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); in runOnMachineFunction()
1693 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) in runOnMachineFunction()
1695 LIS->InsertMachineInstrInMaps(*MI); in runOnMachineFunction()
1709 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) in runOnMachineFunction()
1711 LIS->InsertMachineInstrInMaps(*MI); in runOnMachineFunction()
1726 LIS->createAndComputeVirtRegInterval(LiveMaskReg); in runOnMachineFunction()
1731 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); in runOnMachineFunction()
1735 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); in runOnMachineFunction()
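
Finally, runOnMachineFunction() sets up the common entry sequence visible in the last hits: LiveMaskReg starts as a COPY of EXEC (source lines 1691-1695), and when the shader needs only WQM a single S_WQM of EXEC in the entry block (source lines 1709-1711) stands in for per-block switching. In the same scalar style as the sketches above (not the pass's code):

    #include <cstdint>

    static uint64_t wholeQuadMode(uint64_t Mask) {
      uint64_t WQM = 0;
      for (int Quad = 0; Quad < 64; Quad += 4)
        if ((Mask >> Quad) & 0xFull)
          WQM |= 0xFull << Quad;
      return WQM;
    }

    // Entry-block setup: remember which lanes were live on entry, and enable
    // whole quads up front if the entire function runs in WQM anyway.
    static uint64_t setupEntry(uint64_t &Exec, bool WholeFunctionWQM) {
      uint64_t LiveMask = Exec;     // the COPY of EXEC into LiveMaskReg
      if (WholeFunctionWQM)
        Exec = wholeQuadMode(Exec); // the single S_WQM in the entry block
      return LiveMask;
    }
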