10b57cec5SDimitry Andric //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric /// \file This pass tries to apply several peephole SDWA patterns. 100b57cec5SDimitry Andric /// 110b57cec5SDimitry Andric /// E.g. original: 120b57cec5SDimitry Andric /// V_LSHRREV_B32_e32 %0, 16, %1 13*e8d8bef9SDimitry Andric /// V_ADD_CO_U32_e32 %2, %0, %3 140b57cec5SDimitry Andric /// V_LSHLREV_B32_e32 %4, 16, %2 150b57cec5SDimitry Andric /// 160b57cec5SDimitry Andric /// Replace: 17*e8d8bef9SDimitry Andric /// V_ADD_CO_U32_sdwa %4, %1, %3 180b57cec5SDimitry Andric /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 190b57cec5SDimitry Andric /// 200b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 210b57cec5SDimitry Andric 220b57cec5SDimitry Andric #include "AMDGPU.h" 23*e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 240b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 25480093f4SDimitry Andric #include "llvm/ADT/MapVector.h" 260b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h" 270b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 280b57cec5SDimitry Andric 290b57cec5SDimitry Andric using namespace llvm; 300b57cec5SDimitry Andric 310b57cec5SDimitry Andric #define DEBUG_TYPE "si-peephole-sdwa" 320b57cec5SDimitry Andric 330b57cec5SDimitry Andric STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); 340b57cec5SDimitry Andric STATISTIC(NumSDWAInstructionsPeepholed, 350b57cec5SDimitry Andric "Number of instruction converted to SDWA."); 360b57cec5SDimitry Andric 370b57cec5SDimitry Andric namespace { 380b57cec5SDimitry Andric 390b57cec5SDimitry Andric class SDWAOperand; 400b57cec5SDimitry Andric class SDWADstOperand; 410b57cec5SDimitry Andric 420b57cec5SDimitry Andric class SIPeepholeSDWA : public MachineFunctionPass { 430b57cec5SDimitry Andric public: 440b57cec5SDimitry Andric using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; 450b57cec5SDimitry Andric 460b57cec5SDimitry Andric private: 470b57cec5SDimitry Andric MachineRegisterInfo *MRI; 480b57cec5SDimitry Andric const SIRegisterInfo *TRI; 490b57cec5SDimitry Andric const SIInstrInfo *TII; 500b57cec5SDimitry Andric 51480093f4SDimitry Andric MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; 52480093f4SDimitry Andric MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; 530b57cec5SDimitry Andric SmallVector<MachineInstr *, 8> ConvertedInstructions; 540b57cec5SDimitry Andric 550b57cec5SDimitry Andric Optional<int64_t> foldToImm(const MachineOperand &Op) const; 560b57cec5SDimitry Andric 570b57cec5SDimitry Andric public: 580b57cec5SDimitry Andric static char ID; 590b57cec5SDimitry Andric 600b57cec5SDimitry Andric SIPeepholeSDWA() : MachineFunctionPass(ID) { 610b57cec5SDimitry Andric initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); 620b57cec5SDimitry Andric } 630b57cec5SDimitry Andric 640b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 650b57cec5SDimitry Andric void matchSDWAOperands(MachineBasicBlock &MBB); 660b57cec5SDimitry Andric std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); 670b57cec5SDimitry Andric bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; 680b57cec5SDimitry Andric void pseudoOpConvertToVOP2(MachineInstr &MI, 690b57cec5SDimitry Andric const GCNSubtarget &ST) const; 700b57cec5SDimitry Andric bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); 710b57cec5SDimitry Andric void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric StringRef getPassName() const override { return "SI Peephole SDWA"; } 740b57cec5SDimitry Andric 750b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 760b57cec5SDimitry Andric AU.setPreservesCFG(); 770b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 780b57cec5SDimitry Andric } 790b57cec5SDimitry Andric }; 800b57cec5SDimitry Andric 810b57cec5SDimitry Andric class SDWAOperand { 820b57cec5SDimitry Andric private: 830b57cec5SDimitry Andric MachineOperand *Target; // Operand that would be used in converted instruction 840b57cec5SDimitry Andric MachineOperand *Replaced; // Operand that would be replace by Target 850b57cec5SDimitry Andric 860b57cec5SDimitry Andric public: 870b57cec5SDimitry Andric SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) 880b57cec5SDimitry Andric : Target(TargetOp), Replaced(ReplacedOp) { 890b57cec5SDimitry Andric assert(Target->isReg()); 900b57cec5SDimitry Andric assert(Replaced->isReg()); 910b57cec5SDimitry Andric } 920b57cec5SDimitry Andric 930b57cec5SDimitry Andric virtual ~SDWAOperand() = default; 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; 960b57cec5SDimitry Andric virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; 970b57cec5SDimitry Andric 980b57cec5SDimitry Andric MachineOperand *getTargetOperand() const { return Target; } 990b57cec5SDimitry Andric MachineOperand *getReplacedOperand() const { return Replaced; } 1000b57cec5SDimitry Andric MachineInstr *getParentInst() const { return Target->getParent(); } 1010b57cec5SDimitry Andric 1020b57cec5SDimitry Andric MachineRegisterInfo *getMRI() const { 1030b57cec5SDimitry Andric return &getParentInst()->getParent()->getParent()->getRegInfo(); 1040b57cec5SDimitry Andric } 1050b57cec5SDimitry Andric 1060b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1070b57cec5SDimitry Andric virtual void print(raw_ostream& OS) const = 0; 1080b57cec5SDimitry Andric void dump() const { print(dbgs()); } 1090b57cec5SDimitry Andric #endif 1100b57cec5SDimitry Andric }; 1110b57cec5SDimitry Andric 1120b57cec5SDimitry Andric using namespace AMDGPU::SDWA; 1130b57cec5SDimitry Andric 1140b57cec5SDimitry Andric class SDWASrcOperand : public SDWAOperand { 1150b57cec5SDimitry Andric private: 1160b57cec5SDimitry Andric SdwaSel SrcSel; 1170b57cec5SDimitry Andric bool Abs; 1180b57cec5SDimitry Andric bool Neg; 1190b57cec5SDimitry Andric bool Sext; 1200b57cec5SDimitry Andric 1210b57cec5SDimitry Andric public: 1220b57cec5SDimitry Andric SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 1230b57cec5SDimitry Andric SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, 1240b57cec5SDimitry Andric bool Sext_ = false) 1250b57cec5SDimitry Andric : SDWAOperand(TargetOp, ReplacedOp), 1260b57cec5SDimitry Andric SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} 1270b57cec5SDimitry Andric 1280b57cec5SDimitry Andric MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 1290b57cec5SDimitry Andric bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 1300b57cec5SDimitry Andric 1310b57cec5SDimitry Andric SdwaSel getSrcSel() const { return SrcSel; } 1320b57cec5SDimitry Andric bool getAbs() const { return Abs; } 1330b57cec5SDimitry Andric bool getNeg() const { return Neg; } 1340b57cec5SDimitry Andric bool getSext() const { return Sext; } 1350b57cec5SDimitry Andric 1360b57cec5SDimitry Andric uint64_t getSrcMods(const SIInstrInfo *TII, 1370b57cec5SDimitry Andric const MachineOperand *SrcOp) const; 1380b57cec5SDimitry Andric 1390b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1400b57cec5SDimitry Andric void print(raw_ostream& OS) const override; 1410b57cec5SDimitry Andric #endif 1420b57cec5SDimitry Andric }; 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric class SDWADstOperand : public SDWAOperand { 1450b57cec5SDimitry Andric private: 1460b57cec5SDimitry Andric SdwaSel DstSel; 1470b57cec5SDimitry Andric DstUnused DstUn; 1480b57cec5SDimitry Andric 1490b57cec5SDimitry Andric public: 1500b57cec5SDimitry Andric 1510b57cec5SDimitry Andric SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 1520b57cec5SDimitry Andric SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) 1530b57cec5SDimitry Andric : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} 1540b57cec5SDimitry Andric 1550b57cec5SDimitry Andric MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; 1560b57cec5SDimitry Andric bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 1570b57cec5SDimitry Andric 1580b57cec5SDimitry Andric SdwaSel getDstSel() const { return DstSel; } 1590b57cec5SDimitry Andric DstUnused getDstUnused() const { return DstUn; } 1600b57cec5SDimitry Andric 1610b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1620b57cec5SDimitry Andric void print(raw_ostream& OS) const override; 1630b57cec5SDimitry Andric #endif 1640b57cec5SDimitry Andric }; 1650b57cec5SDimitry Andric 1660b57cec5SDimitry Andric class SDWADstPreserveOperand : public SDWADstOperand { 1670b57cec5SDimitry Andric private: 1680b57cec5SDimitry Andric MachineOperand *Preserve; 1690b57cec5SDimitry Andric 1700b57cec5SDimitry Andric public: 1710b57cec5SDimitry Andric SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, 1720b57cec5SDimitry Andric MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) 1730b57cec5SDimitry Andric : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), 1740b57cec5SDimitry Andric Preserve(PreserveOp) {} 1750b57cec5SDimitry Andric 1760b57cec5SDimitry Andric bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; 1770b57cec5SDimitry Andric 1780b57cec5SDimitry Andric MachineOperand *getPreservedOperand() const { return Preserve; } 1790b57cec5SDimitry Andric 1800b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1810b57cec5SDimitry Andric void print(raw_ostream& OS) const override; 1820b57cec5SDimitry Andric #endif 1830b57cec5SDimitry Andric }; 1840b57cec5SDimitry Andric 1850b57cec5SDimitry Andric } // end anonymous namespace 1860b57cec5SDimitry Andric 1870b57cec5SDimitry Andric INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) 1880b57cec5SDimitry Andric 1890b57cec5SDimitry Andric char SIPeepholeSDWA::ID = 0; 1900b57cec5SDimitry Andric 1910b57cec5SDimitry Andric char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; 1920b57cec5SDimitry Andric 1930b57cec5SDimitry Andric FunctionPass *llvm::createSIPeepholeSDWAPass() { 1940b57cec5SDimitry Andric return new SIPeepholeSDWA(); 1950b57cec5SDimitry Andric } 1960b57cec5SDimitry Andric 1970b57cec5SDimitry Andric 1980b57cec5SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1990b57cec5SDimitry Andric static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { 2000b57cec5SDimitry Andric switch(Sel) { 2010b57cec5SDimitry Andric case BYTE_0: OS << "BYTE_0"; break; 2020b57cec5SDimitry Andric case BYTE_1: OS << "BYTE_1"; break; 2030b57cec5SDimitry Andric case BYTE_2: OS << "BYTE_2"; break; 2040b57cec5SDimitry Andric case BYTE_3: OS << "BYTE_3"; break; 2050b57cec5SDimitry Andric case WORD_0: OS << "WORD_0"; break; 2060b57cec5SDimitry Andric case WORD_1: OS << "WORD_1"; break; 2070b57cec5SDimitry Andric case DWORD: OS << "DWORD"; break; 2080b57cec5SDimitry Andric } 2090b57cec5SDimitry Andric return OS; 2100b57cec5SDimitry Andric } 2110b57cec5SDimitry Andric 2120b57cec5SDimitry Andric static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { 2130b57cec5SDimitry Andric switch(Un) { 2140b57cec5SDimitry Andric case UNUSED_PAD: OS << "UNUSED_PAD"; break; 2150b57cec5SDimitry Andric case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; 2160b57cec5SDimitry Andric case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; 2170b57cec5SDimitry Andric } 2180b57cec5SDimitry Andric return OS; 2190b57cec5SDimitry Andric } 2200b57cec5SDimitry Andric 2210b57cec5SDimitry Andric LLVM_DUMP_METHOD 2220b57cec5SDimitry Andric void SDWASrcOperand::print(raw_ostream& OS) const { 2230b57cec5SDimitry Andric OS << "SDWA src: " << *getTargetOperand() 2240b57cec5SDimitry Andric << " src_sel:" << getSrcSel() 2250b57cec5SDimitry Andric << " abs:" << getAbs() << " neg:" << getNeg() 2260b57cec5SDimitry Andric << " sext:" << getSext() << '\n'; 2270b57cec5SDimitry Andric } 2280b57cec5SDimitry Andric 2290b57cec5SDimitry Andric LLVM_DUMP_METHOD 2300b57cec5SDimitry Andric void SDWADstOperand::print(raw_ostream& OS) const { 2310b57cec5SDimitry Andric OS << "SDWA dst: " << *getTargetOperand() 2320b57cec5SDimitry Andric << " dst_sel:" << getDstSel() 2330b57cec5SDimitry Andric << " dst_unused:" << getDstUnused() << '\n'; 2340b57cec5SDimitry Andric } 2350b57cec5SDimitry Andric 2360b57cec5SDimitry Andric LLVM_DUMP_METHOD 2370b57cec5SDimitry Andric void SDWADstPreserveOperand::print(raw_ostream& OS) const { 2380b57cec5SDimitry Andric OS << "SDWA preserve dst: " << *getTargetOperand() 2390b57cec5SDimitry Andric << " dst_sel:" << getDstSel() 2400b57cec5SDimitry Andric << " preserve:" << *getPreservedOperand() << '\n'; 2410b57cec5SDimitry Andric } 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric #endif 2440b57cec5SDimitry Andric 2450b57cec5SDimitry Andric static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { 2460b57cec5SDimitry Andric assert(To.isReg() && From.isReg()); 2470b57cec5SDimitry Andric To.setReg(From.getReg()); 2480b57cec5SDimitry Andric To.setSubReg(From.getSubReg()); 2490b57cec5SDimitry Andric To.setIsUndef(From.isUndef()); 2500b57cec5SDimitry Andric if (To.isUse()) { 2510b57cec5SDimitry Andric To.setIsKill(From.isKill()); 2520b57cec5SDimitry Andric } else { 2530b57cec5SDimitry Andric To.setIsDead(From.isDead()); 2540b57cec5SDimitry Andric } 2550b57cec5SDimitry Andric } 2560b57cec5SDimitry Andric 2570b57cec5SDimitry Andric static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { 2580b57cec5SDimitry Andric return LHS.isReg() && 2590b57cec5SDimitry Andric RHS.isReg() && 2600b57cec5SDimitry Andric LHS.getReg() == RHS.getReg() && 2610b57cec5SDimitry Andric LHS.getSubReg() == RHS.getSubReg(); 2620b57cec5SDimitry Andric } 2630b57cec5SDimitry Andric 2640b57cec5SDimitry Andric static MachineOperand *findSingleRegUse(const MachineOperand *Reg, 2650b57cec5SDimitry Andric const MachineRegisterInfo *MRI) { 2660b57cec5SDimitry Andric if (!Reg->isReg() || !Reg->isDef()) 2670b57cec5SDimitry Andric return nullptr; 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric MachineOperand *ResMO = nullptr; 2700b57cec5SDimitry Andric for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { 2710b57cec5SDimitry Andric // If there exist use of subreg of Reg then return nullptr 2720b57cec5SDimitry Andric if (!isSameReg(UseMO, *Reg)) 2730b57cec5SDimitry Andric return nullptr; 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric // Check that there is only one instruction that uses Reg 2760b57cec5SDimitry Andric if (!ResMO) { 2770b57cec5SDimitry Andric ResMO = &UseMO; 2780b57cec5SDimitry Andric } else if (ResMO->getParent() != UseMO.getParent()) { 2790b57cec5SDimitry Andric return nullptr; 2800b57cec5SDimitry Andric } 2810b57cec5SDimitry Andric } 2820b57cec5SDimitry Andric 2830b57cec5SDimitry Andric return ResMO; 2840b57cec5SDimitry Andric } 2850b57cec5SDimitry Andric 2860b57cec5SDimitry Andric static MachineOperand *findSingleRegDef(const MachineOperand *Reg, 2870b57cec5SDimitry Andric const MachineRegisterInfo *MRI) { 2880b57cec5SDimitry Andric if (!Reg->isReg()) 2890b57cec5SDimitry Andric return nullptr; 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); 2920b57cec5SDimitry Andric if (!DefInstr) 2930b57cec5SDimitry Andric return nullptr; 2940b57cec5SDimitry Andric 2950b57cec5SDimitry Andric for (auto &DefMO : DefInstr->defs()) { 2960b57cec5SDimitry Andric if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) 2970b57cec5SDimitry Andric return &DefMO; 2980b57cec5SDimitry Andric } 2990b57cec5SDimitry Andric 3000b57cec5SDimitry Andric // Ignore implicit defs. 3010b57cec5SDimitry Andric return nullptr; 3020b57cec5SDimitry Andric } 3030b57cec5SDimitry Andric 3040b57cec5SDimitry Andric uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 3050b57cec5SDimitry Andric const MachineOperand *SrcOp) const { 3060b57cec5SDimitry Andric uint64_t Mods = 0; 3070b57cec5SDimitry Andric const auto *MI = SrcOp->getParent(); 3080b57cec5SDimitry Andric if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { 3090b57cec5SDimitry Andric if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 3100b57cec5SDimitry Andric Mods = Mod->getImm(); 3110b57cec5SDimitry Andric } 3120b57cec5SDimitry Andric } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { 3130b57cec5SDimitry Andric if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { 3140b57cec5SDimitry Andric Mods = Mod->getImm(); 3150b57cec5SDimitry Andric } 3160b57cec5SDimitry Andric } 3170b57cec5SDimitry Andric if (Abs || Neg) { 3180b57cec5SDimitry Andric assert(!Sext && 3190b57cec5SDimitry Andric "Float and integer src modifiers can't be set simulteniously"); 3200b57cec5SDimitry Andric Mods |= Abs ? SISrcMods::ABS : 0u; 3210b57cec5SDimitry Andric Mods ^= Neg ? SISrcMods::NEG : 0u; 3220b57cec5SDimitry Andric } else if (Sext) { 3230b57cec5SDimitry Andric Mods |= SISrcMods::SEXT; 3240b57cec5SDimitry Andric } 3250b57cec5SDimitry Andric 3260b57cec5SDimitry Andric return Mods; 3270b57cec5SDimitry Andric } 3280b57cec5SDimitry Andric 3290b57cec5SDimitry Andric MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { 3300b57cec5SDimitry Andric // For SDWA src operand potential instruction is one that use register 3310b57cec5SDimitry Andric // defined by parent instruction 3320b57cec5SDimitry Andric MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); 3330b57cec5SDimitry Andric if (!PotentialMO) 3340b57cec5SDimitry Andric return nullptr; 3350b57cec5SDimitry Andric 3360b57cec5SDimitry Andric return PotentialMO->getParent(); 3370b57cec5SDimitry Andric } 3380b57cec5SDimitry Andric 3390b57cec5SDimitry Andric bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 3400b57cec5SDimitry Andric // Find operand in instruction that matches source operand and replace it with 3410b57cec5SDimitry Andric // target operand. Set corresponding src_sel 3420b57cec5SDimitry Andric bool IsPreserveSrc = false; 3430b57cec5SDimitry Andric MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 3440b57cec5SDimitry Andric MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 3450b57cec5SDimitry Andric MachineOperand *SrcMods = 3460b57cec5SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 3470b57cec5SDimitry Andric assert(Src && (Src->isReg() || Src->isImm())); 3480b57cec5SDimitry Andric if (!isSameReg(*Src, *getReplacedOperand())) { 3490b57cec5SDimitry Andric // If this is not src0 then it could be src1 3500b57cec5SDimitry Andric Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 3510b57cec5SDimitry Andric SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 3520b57cec5SDimitry Andric SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 3530b57cec5SDimitry Andric 3540b57cec5SDimitry Andric if (!Src || 3550b57cec5SDimitry Andric !isSameReg(*Src, *getReplacedOperand())) { 3560b57cec5SDimitry Andric // It's possible this Src is a tied operand for 3570b57cec5SDimitry Andric // UNUSED_PRESERVE, in which case we can either 3580b57cec5SDimitry Andric // abandon the peephole attempt, or if legal we can 3590b57cec5SDimitry Andric // copy the target operand into the tied slot 3600b57cec5SDimitry Andric // if the preserve operation will effectively cause the same 3610b57cec5SDimitry Andric // result by overwriting the rest of the dst. 3620b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 3630b57cec5SDimitry Andric MachineOperand *DstUnused = 3640b57cec5SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3650b57cec5SDimitry Andric 3660b57cec5SDimitry Andric if (Dst && 3670b57cec5SDimitry Andric DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 3680b57cec5SDimitry Andric // This will work if the tied src is acessing WORD_0, and the dst is 3690b57cec5SDimitry Andric // writing WORD_1. Modifiers don't matter because all the bits that 3700b57cec5SDimitry Andric // would be impacted are being overwritten by the dst. 3710b57cec5SDimitry Andric // Any other case will not work. 3720b57cec5SDimitry Andric SdwaSel DstSel = static_cast<SdwaSel>( 3730b57cec5SDimitry Andric TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); 3740b57cec5SDimitry Andric if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && 3750b57cec5SDimitry Andric getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { 3760b57cec5SDimitry Andric IsPreserveSrc = true; 3770b57cec5SDimitry Andric auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3780b57cec5SDimitry Andric AMDGPU::OpName::vdst); 3790b57cec5SDimitry Andric auto TiedIdx = MI.findTiedOperandIdx(DstIdx); 3800b57cec5SDimitry Andric Src = &MI.getOperand(TiedIdx); 3810b57cec5SDimitry Andric SrcSel = nullptr; 3820b57cec5SDimitry Andric SrcMods = nullptr; 3830b57cec5SDimitry Andric } else { 3840b57cec5SDimitry Andric // Not legal to convert this src 3850b57cec5SDimitry Andric return false; 3860b57cec5SDimitry Andric } 3870b57cec5SDimitry Andric } 3880b57cec5SDimitry Andric } 3890b57cec5SDimitry Andric assert(Src && Src->isReg()); 3900b57cec5SDimitry Andric 3910b57cec5SDimitry Andric if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 3920b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 3930b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 3940b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 3950b57cec5SDimitry Andric !isSameReg(*Src, *getReplacedOperand())) { 3960b57cec5SDimitry Andric // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to 3970b57cec5SDimitry Andric // src2. This is not allowed. 3980b57cec5SDimitry Andric return false; 3990b57cec5SDimitry Andric } 4000b57cec5SDimitry Andric 4010b57cec5SDimitry Andric assert(isSameReg(*Src, *getReplacedOperand()) && 4020b57cec5SDimitry Andric (IsPreserveSrc || (SrcSel && SrcMods))); 4030b57cec5SDimitry Andric } 4040b57cec5SDimitry Andric copyRegOperand(*Src, *getTargetOperand()); 4050b57cec5SDimitry Andric if (!IsPreserveSrc) { 4060b57cec5SDimitry Andric SrcSel->setImm(getSrcSel()); 4070b57cec5SDimitry Andric SrcMods->setImm(getSrcMods(TII, Src)); 4080b57cec5SDimitry Andric } 4090b57cec5SDimitry Andric getTargetOperand()->setIsKill(false); 4100b57cec5SDimitry Andric return true; 4110b57cec5SDimitry Andric } 4120b57cec5SDimitry Andric 4130b57cec5SDimitry Andric MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { 4140b57cec5SDimitry Andric // For SDWA dst operand potential instruction is one that defines register 4150b57cec5SDimitry Andric // that this operand uses 4160b57cec5SDimitry Andric MachineRegisterInfo *MRI = getMRI(); 4170b57cec5SDimitry Andric MachineInstr *ParentMI = getParentInst(); 4180b57cec5SDimitry Andric 4190b57cec5SDimitry Andric MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); 4200b57cec5SDimitry Andric if (!PotentialMO) 4210b57cec5SDimitry Andric return nullptr; 4220b57cec5SDimitry Andric 4230b57cec5SDimitry Andric // Check that ParentMI is the only instruction that uses replaced register 4240b57cec5SDimitry Andric for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { 4250b57cec5SDimitry Andric if (&UseInst != ParentMI) 4260b57cec5SDimitry Andric return nullptr; 4270b57cec5SDimitry Andric } 4280b57cec5SDimitry Andric 4290b57cec5SDimitry Andric return PotentialMO->getParent(); 4300b57cec5SDimitry Andric } 4310b57cec5SDimitry Andric 4320b57cec5SDimitry Andric bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { 4330b57cec5SDimitry Andric // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused 4340b57cec5SDimitry Andric 4350b57cec5SDimitry Andric if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || 4360b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || 4370b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || 4380b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && 4390b57cec5SDimitry Andric getDstSel() != AMDGPU::SDWA::DWORD) { 4400b57cec5SDimitry Andric // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD 4410b57cec5SDimitry Andric return false; 4420b57cec5SDimitry Andric } 4430b57cec5SDimitry Andric 4440b57cec5SDimitry Andric MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 4450b57cec5SDimitry Andric assert(Operand && 4460b57cec5SDimitry Andric Operand->isReg() && 4470b57cec5SDimitry Andric isSameReg(*Operand, *getReplacedOperand())); 4480b57cec5SDimitry Andric copyRegOperand(*Operand, *getTargetOperand()); 4490b57cec5SDimitry Andric MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 4500b57cec5SDimitry Andric assert(DstSel); 4510b57cec5SDimitry Andric DstSel->setImm(getDstSel()); 4520b57cec5SDimitry Andric MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 4530b57cec5SDimitry Andric assert(DstUnused); 4540b57cec5SDimitry Andric DstUnused->setImm(getDstUnused()); 4550b57cec5SDimitry Andric 4560b57cec5SDimitry Andric // Remove original instruction because it would conflict with our new 4570b57cec5SDimitry Andric // instruction by register definition 4580b57cec5SDimitry Andric getParentInst()->eraseFromParent(); 4590b57cec5SDimitry Andric return true; 4600b57cec5SDimitry Andric } 4610b57cec5SDimitry Andric 4620b57cec5SDimitry Andric bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, 4630b57cec5SDimitry Andric const SIInstrInfo *TII) { 4640b57cec5SDimitry Andric // MI should be moved right before v_or_b32. 4650b57cec5SDimitry Andric // For this we should clear all kill flags on uses of MI src-operands or else 4660b57cec5SDimitry Andric // we can encounter problem with use of killed operand. 4670b57cec5SDimitry Andric for (MachineOperand &MO : MI.uses()) { 4680b57cec5SDimitry Andric if (!MO.isReg()) 4690b57cec5SDimitry Andric continue; 4700b57cec5SDimitry Andric getMRI()->clearKillFlags(MO.getReg()); 4710b57cec5SDimitry Andric } 4720b57cec5SDimitry Andric 4730b57cec5SDimitry Andric // Move MI before v_or_b32 4740b57cec5SDimitry Andric auto MBB = MI.getParent(); 4750b57cec5SDimitry Andric MBB->remove(&MI); 4760b57cec5SDimitry Andric MBB->insert(getParentInst(), &MI); 4770b57cec5SDimitry Andric 4780b57cec5SDimitry Andric // Add Implicit use of preserved register 4790b57cec5SDimitry Andric MachineInstrBuilder MIB(*MBB->getParent(), MI); 4800b57cec5SDimitry Andric MIB.addReg(getPreservedOperand()->getReg(), 4810b57cec5SDimitry Andric RegState::ImplicitKill, 4820b57cec5SDimitry Andric getPreservedOperand()->getSubReg()); 4830b57cec5SDimitry Andric 4840b57cec5SDimitry Andric // Tie dst to implicit use 4850b57cec5SDimitry Andric MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), 4860b57cec5SDimitry Andric MI.getNumOperands() - 1); 4870b57cec5SDimitry Andric 4880b57cec5SDimitry Andric // Convert MI as any other SDWADstOperand and remove v_or_b32 4890b57cec5SDimitry Andric return SDWADstOperand::convertToSDWA(MI, TII); 4900b57cec5SDimitry Andric } 4910b57cec5SDimitry Andric 4920b57cec5SDimitry Andric Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { 4930b57cec5SDimitry Andric if (Op.isImm()) { 4940b57cec5SDimitry Andric return Op.getImm(); 4950b57cec5SDimitry Andric } 4960b57cec5SDimitry Andric 4970b57cec5SDimitry Andric // If this is not immediate then it can be copy of immediate value, e.g.: 4980b57cec5SDimitry Andric // %1 = S_MOV_B32 255; 4990b57cec5SDimitry Andric if (Op.isReg()) { 5000b57cec5SDimitry Andric for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { 5010b57cec5SDimitry Andric if (!isSameReg(Op, Def)) 5020b57cec5SDimitry Andric continue; 5030b57cec5SDimitry Andric 5040b57cec5SDimitry Andric const MachineInstr *DefInst = Def.getParent(); 5050b57cec5SDimitry Andric if (!TII->isFoldableCopy(*DefInst)) 5060b57cec5SDimitry Andric return None; 5070b57cec5SDimitry Andric 5080b57cec5SDimitry Andric const MachineOperand &Copied = DefInst->getOperand(1); 5090b57cec5SDimitry Andric if (!Copied.isImm()) 5100b57cec5SDimitry Andric return None; 5110b57cec5SDimitry Andric 5120b57cec5SDimitry Andric return Copied.getImm(); 5130b57cec5SDimitry Andric } 5140b57cec5SDimitry Andric } 5150b57cec5SDimitry Andric 5160b57cec5SDimitry Andric return None; 5170b57cec5SDimitry Andric } 5180b57cec5SDimitry Andric 5190b57cec5SDimitry Andric std::unique_ptr<SDWAOperand> 5200b57cec5SDimitry Andric SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { 5210b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode(); 5220b57cec5SDimitry Andric switch (Opcode) { 5230b57cec5SDimitry Andric case AMDGPU::V_LSHRREV_B32_e32: 5240b57cec5SDimitry Andric case AMDGPU::V_ASHRREV_I32_e32: 5250b57cec5SDimitry Andric case AMDGPU::V_LSHLREV_B32_e32: 5260b57cec5SDimitry Andric case AMDGPU::V_LSHRREV_B32_e64: 5270b57cec5SDimitry Andric case AMDGPU::V_ASHRREV_I32_e64: 5280b57cec5SDimitry Andric case AMDGPU::V_LSHLREV_B32_e64: { 5290b57cec5SDimitry Andric // from: v_lshrrev_b32_e32 v1, 16/24, v0 5300b57cec5SDimitry Andric // to SDWA src:v0 src_sel:WORD_1/BYTE_3 5310b57cec5SDimitry Andric 5320b57cec5SDimitry Andric // from: v_ashrrev_i32_e32 v1, 16/24, v0 5330b57cec5SDimitry Andric // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 5340b57cec5SDimitry Andric 5350b57cec5SDimitry Andric // from: v_lshlrev_b32_e32 v1, 16/24, v0 5360b57cec5SDimitry Andric // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD 5370b57cec5SDimitry Andric MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 5380b57cec5SDimitry Andric auto Imm = foldToImm(*Src0); 5390b57cec5SDimitry Andric if (!Imm) 5400b57cec5SDimitry Andric break; 5410b57cec5SDimitry Andric 5420b57cec5SDimitry Andric if (*Imm != 16 && *Imm != 24) 5430b57cec5SDimitry Andric break; 5440b57cec5SDimitry Andric 5450b57cec5SDimitry Andric MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 5460b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 547*e8d8bef9SDimitry Andric if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) 5480b57cec5SDimitry Andric break; 5490b57cec5SDimitry Andric 5500b57cec5SDimitry Andric if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || 5510b57cec5SDimitry Andric Opcode == AMDGPU::V_LSHLREV_B32_e64) { 5528bcb0991SDimitry Andric return std::make_unique<SDWADstOperand>( 5530b57cec5SDimitry Andric Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); 5540b57cec5SDimitry Andric } else { 5558bcb0991SDimitry Andric return std::make_unique<SDWASrcOperand>( 5560b57cec5SDimitry Andric Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, 5570b57cec5SDimitry Andric Opcode != AMDGPU::V_LSHRREV_B32_e32 && 5580b57cec5SDimitry Andric Opcode != AMDGPU::V_LSHRREV_B32_e64); 5590b57cec5SDimitry Andric } 5600b57cec5SDimitry Andric break; 5610b57cec5SDimitry Andric } 5620b57cec5SDimitry Andric 5630b57cec5SDimitry Andric case AMDGPU::V_LSHRREV_B16_e32: 5640b57cec5SDimitry Andric case AMDGPU::V_ASHRREV_I16_e32: 5650b57cec5SDimitry Andric case AMDGPU::V_LSHLREV_B16_e32: 5660b57cec5SDimitry Andric case AMDGPU::V_LSHRREV_B16_e64: 5670b57cec5SDimitry Andric case AMDGPU::V_ASHRREV_I16_e64: 5680b57cec5SDimitry Andric case AMDGPU::V_LSHLREV_B16_e64: { 5690b57cec5SDimitry Andric // from: v_lshrrev_b16_e32 v1, 8, v0 5700b57cec5SDimitry Andric // to SDWA src:v0 src_sel:BYTE_1 5710b57cec5SDimitry Andric 5720b57cec5SDimitry Andric // from: v_ashrrev_i16_e32 v1, 8, v0 5730b57cec5SDimitry Andric // to SDWA src:v0 src_sel:BYTE_1 sext:1 5740b57cec5SDimitry Andric 5750b57cec5SDimitry Andric // from: v_lshlrev_b16_e32 v1, 8, v0 5760b57cec5SDimitry Andric // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 5770b57cec5SDimitry Andric MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 5780b57cec5SDimitry Andric auto Imm = foldToImm(*Src0); 5790b57cec5SDimitry Andric if (!Imm || *Imm != 8) 5800b57cec5SDimitry Andric break; 5810b57cec5SDimitry Andric 5820b57cec5SDimitry Andric MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 5830b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 5840b57cec5SDimitry Andric 585*e8d8bef9SDimitry Andric if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) 5860b57cec5SDimitry Andric break; 5870b57cec5SDimitry Andric 5880b57cec5SDimitry Andric if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || 5890b57cec5SDimitry Andric Opcode == AMDGPU::V_LSHLREV_B16_e64) { 5908bcb0991SDimitry Andric return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); 5910b57cec5SDimitry Andric } else { 5928bcb0991SDimitry Andric return std::make_unique<SDWASrcOperand>( 5930b57cec5SDimitry Andric Src1, Dst, BYTE_1, false, false, 5940b57cec5SDimitry Andric Opcode != AMDGPU::V_LSHRREV_B16_e32 && 5950b57cec5SDimitry Andric Opcode != AMDGPU::V_LSHRREV_B16_e64); 5960b57cec5SDimitry Andric } 5970b57cec5SDimitry Andric break; 5980b57cec5SDimitry Andric } 5990b57cec5SDimitry Andric 600*e8d8bef9SDimitry Andric case AMDGPU::V_BFE_I32_e64: 601*e8d8bef9SDimitry Andric case AMDGPU::V_BFE_U32_e64: { 6020b57cec5SDimitry Andric // e.g.: 6030b57cec5SDimitry Andric // from: v_bfe_u32 v1, v0, 8, 8 6040b57cec5SDimitry Andric // to SDWA src:v0 src_sel:BYTE_1 6050b57cec5SDimitry Andric 6060b57cec5SDimitry Andric // offset | width | src_sel 6070b57cec5SDimitry Andric // ------------------------ 6080b57cec5SDimitry Andric // 0 | 8 | BYTE_0 6090b57cec5SDimitry Andric // 0 | 16 | WORD_0 6100b57cec5SDimitry Andric // 0 | 32 | DWORD ? 6110b57cec5SDimitry Andric // 8 | 8 | BYTE_1 6120b57cec5SDimitry Andric // 16 | 8 | BYTE_2 6130b57cec5SDimitry Andric // 16 | 16 | WORD_1 6140b57cec5SDimitry Andric // 24 | 8 | BYTE_3 6150b57cec5SDimitry Andric 6160b57cec5SDimitry Andric MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 6170b57cec5SDimitry Andric auto Offset = foldToImm(*Src1); 6180b57cec5SDimitry Andric if (!Offset) 6190b57cec5SDimitry Andric break; 6200b57cec5SDimitry Andric 6210b57cec5SDimitry Andric MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 6220b57cec5SDimitry Andric auto Width = foldToImm(*Src2); 6230b57cec5SDimitry Andric if (!Width) 6240b57cec5SDimitry Andric break; 6250b57cec5SDimitry Andric 6260b57cec5SDimitry Andric SdwaSel SrcSel = DWORD; 6270b57cec5SDimitry Andric 6280b57cec5SDimitry Andric if (*Offset == 0 && *Width == 8) 6290b57cec5SDimitry Andric SrcSel = BYTE_0; 6300b57cec5SDimitry Andric else if (*Offset == 0 && *Width == 16) 6310b57cec5SDimitry Andric SrcSel = WORD_0; 6320b57cec5SDimitry Andric else if (*Offset == 0 && *Width == 32) 6330b57cec5SDimitry Andric SrcSel = DWORD; 6340b57cec5SDimitry Andric else if (*Offset == 8 && *Width == 8) 6350b57cec5SDimitry Andric SrcSel = BYTE_1; 6360b57cec5SDimitry Andric else if (*Offset == 16 && *Width == 8) 6370b57cec5SDimitry Andric SrcSel = BYTE_2; 6380b57cec5SDimitry Andric else if (*Offset == 16 && *Width == 16) 6390b57cec5SDimitry Andric SrcSel = WORD_1; 6400b57cec5SDimitry Andric else if (*Offset == 24 && *Width == 8) 6410b57cec5SDimitry Andric SrcSel = BYTE_3; 6420b57cec5SDimitry Andric else 6430b57cec5SDimitry Andric break; 6440b57cec5SDimitry Andric 6450b57cec5SDimitry Andric MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 6460b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 6470b57cec5SDimitry Andric 648*e8d8bef9SDimitry Andric if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical()) 6490b57cec5SDimitry Andric break; 6500b57cec5SDimitry Andric 6518bcb0991SDimitry Andric return std::make_unique<SDWASrcOperand>( 652*e8d8bef9SDimitry Andric Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64); 6530b57cec5SDimitry Andric } 6540b57cec5SDimitry Andric 6550b57cec5SDimitry Andric case AMDGPU::V_AND_B32_e32: 6560b57cec5SDimitry Andric case AMDGPU::V_AND_B32_e64: { 6570b57cec5SDimitry Andric // e.g.: 6580b57cec5SDimitry Andric // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 6590b57cec5SDimitry Andric // to SDWA src:v0 src_sel:WORD_0/BYTE_0 6600b57cec5SDimitry Andric 6610b57cec5SDimitry Andric MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 6620b57cec5SDimitry Andric MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 6630b57cec5SDimitry Andric auto ValSrc = Src1; 6640b57cec5SDimitry Andric auto Imm = foldToImm(*Src0); 6650b57cec5SDimitry Andric 6660b57cec5SDimitry Andric if (!Imm) { 6670b57cec5SDimitry Andric Imm = foldToImm(*Src1); 6680b57cec5SDimitry Andric ValSrc = Src0; 6690b57cec5SDimitry Andric } 6700b57cec5SDimitry Andric 6710b57cec5SDimitry Andric if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) 6720b57cec5SDimitry Andric break; 6730b57cec5SDimitry Andric 6740b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 6750b57cec5SDimitry Andric 676*e8d8bef9SDimitry Andric if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical()) 6770b57cec5SDimitry Andric break; 6780b57cec5SDimitry Andric 6798bcb0991SDimitry Andric return std::make_unique<SDWASrcOperand>( 6800b57cec5SDimitry Andric ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); 6810b57cec5SDimitry Andric } 6820b57cec5SDimitry Andric 6830b57cec5SDimitry Andric case AMDGPU::V_OR_B32_e32: 6840b57cec5SDimitry Andric case AMDGPU::V_OR_B32_e64: { 6850b57cec5SDimitry Andric // Patterns for dst_unused:UNUSED_PRESERVE. 6860b57cec5SDimitry Andric // e.g., from: 6870b57cec5SDimitry Andric // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 6880b57cec5SDimitry Andric // src1_sel:WORD_1 src2_sel:WORD1 6890b57cec5SDimitry Andric // v_add_f16_e32 v3, v1, v2 6900b57cec5SDimitry Andric // v_or_b32_e32 v4, v0, v3 6910b57cec5SDimitry Andric // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 6920b57cec5SDimitry Andric 6930b57cec5SDimitry Andric // Check if one of operands of v_or_b32 is SDWA instruction 6940b57cec5SDimitry Andric using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; 6950b57cec5SDimitry Andric auto CheckOROperandsForSDWA = 6960b57cec5SDimitry Andric [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { 6970b57cec5SDimitry Andric if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) 6980b57cec5SDimitry Andric return CheckRetType(None); 6990b57cec5SDimitry Andric 7000b57cec5SDimitry Andric MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); 7010b57cec5SDimitry Andric if (!Op1Def) 7020b57cec5SDimitry Andric return CheckRetType(None); 7030b57cec5SDimitry Andric 7040b57cec5SDimitry Andric MachineInstr *Op1Inst = Op1Def->getParent(); 7050b57cec5SDimitry Andric if (!TII->isSDWA(*Op1Inst)) 7060b57cec5SDimitry Andric return CheckRetType(None); 7070b57cec5SDimitry Andric 7080b57cec5SDimitry Andric MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); 7090b57cec5SDimitry Andric if (!Op2Def) 7100b57cec5SDimitry Andric return CheckRetType(None); 7110b57cec5SDimitry Andric 7120b57cec5SDimitry Andric return CheckRetType(std::make_pair(Op1Def, Op2Def)); 7130b57cec5SDimitry Andric }; 7140b57cec5SDimitry Andric 7150b57cec5SDimitry Andric MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 7160b57cec5SDimitry Andric MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 7170b57cec5SDimitry Andric assert(OrSDWA && OrOther); 7180b57cec5SDimitry Andric auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 7190b57cec5SDimitry Andric if (!Res) { 7200b57cec5SDimitry Andric OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 7210b57cec5SDimitry Andric OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 7220b57cec5SDimitry Andric assert(OrSDWA && OrOther); 7230b57cec5SDimitry Andric Res = CheckOROperandsForSDWA(OrSDWA, OrOther); 7240b57cec5SDimitry Andric if (!Res) 7250b57cec5SDimitry Andric break; 7260b57cec5SDimitry Andric } 7270b57cec5SDimitry Andric 7280b57cec5SDimitry Andric MachineOperand *OrSDWADef = Res->first; 7290b57cec5SDimitry Andric MachineOperand *OrOtherDef = Res->second; 7300b57cec5SDimitry Andric assert(OrSDWADef && OrOtherDef); 7310b57cec5SDimitry Andric 7320b57cec5SDimitry Andric MachineInstr *SDWAInst = OrSDWADef->getParent(); 7330b57cec5SDimitry Andric MachineInstr *OtherInst = OrOtherDef->getParent(); 7340b57cec5SDimitry Andric 7350b57cec5SDimitry Andric // Check that OtherInstr is actually bitwise compatible with SDWAInst = their 7360b57cec5SDimitry Andric // destination patterns don't overlap. Compatible instruction can be either 7370b57cec5SDimitry Andric // regular instruction with compatible bitness or SDWA instruction with 7380b57cec5SDimitry Andric // correct dst_sel 7390b57cec5SDimitry Andric // SDWAInst | OtherInst bitness / OtherInst dst_sel 7400b57cec5SDimitry Andric // ----------------------------------------------------- 7410b57cec5SDimitry Andric // DWORD | no / no 7420b57cec5SDimitry Andric // WORD_0 | no / BYTE_2/3, WORD_1 7430b57cec5SDimitry Andric // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 7440b57cec5SDimitry Andric // BYTE_0 | no / BYTE_1/2/3, WORD_1 7450b57cec5SDimitry Andric // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 7460b57cec5SDimitry Andric // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 7470b57cec5SDimitry Andric // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 7480b57cec5SDimitry Andric // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK 7490b57cec5SDimitry Andric // but v_add_f32 is not. 7500b57cec5SDimitry Andric 7510b57cec5SDimitry Andric // TODO: add support for non-SDWA instructions as OtherInst. 7520b57cec5SDimitry Andric // For now this only works with SDWA instructions. For regular instructions 7530b57cec5SDimitry Andric // there is no way to determine if the instruction writes only 8/16/24-bit 7540b57cec5SDimitry Andric // out of full register size and all registers are at min 32-bit wide. 7550b57cec5SDimitry Andric if (!TII->isSDWA(*OtherInst)) 7560b57cec5SDimitry Andric break; 7570b57cec5SDimitry Andric 7580b57cec5SDimitry Andric SdwaSel DstSel = static_cast<SdwaSel>( 7590b57cec5SDimitry Andric TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; 7600b57cec5SDimitry Andric SdwaSel OtherDstSel = static_cast<SdwaSel>( 7610b57cec5SDimitry Andric TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); 7620b57cec5SDimitry Andric 7630b57cec5SDimitry Andric bool DstSelAgree = false; 7640b57cec5SDimitry Andric switch (DstSel) { 7650b57cec5SDimitry Andric case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || 7660b57cec5SDimitry Andric (OtherDstSel == BYTE_3) || 7670b57cec5SDimitry Andric (OtherDstSel == WORD_1)); 7680b57cec5SDimitry Andric break; 7690b57cec5SDimitry Andric case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 7700b57cec5SDimitry Andric (OtherDstSel == BYTE_1) || 7710b57cec5SDimitry Andric (OtherDstSel == WORD_0)); 7720b57cec5SDimitry Andric break; 7730b57cec5SDimitry Andric case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || 7740b57cec5SDimitry Andric (OtherDstSel == BYTE_2) || 7750b57cec5SDimitry Andric (OtherDstSel == BYTE_3) || 7760b57cec5SDimitry Andric (OtherDstSel == WORD_1)); 7770b57cec5SDimitry Andric break; 7780b57cec5SDimitry Andric case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || 7790b57cec5SDimitry Andric (OtherDstSel == BYTE_2) || 7800b57cec5SDimitry Andric (OtherDstSel == BYTE_3) || 7810b57cec5SDimitry Andric (OtherDstSel == WORD_1)); 7820b57cec5SDimitry Andric break; 7830b57cec5SDimitry Andric case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || 7840b57cec5SDimitry Andric (OtherDstSel == BYTE_1) || 7850b57cec5SDimitry Andric (OtherDstSel == BYTE_3) || 7860b57cec5SDimitry Andric (OtherDstSel == WORD_0)); 7870b57cec5SDimitry Andric break; 7880b57cec5SDimitry Andric case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || 7890b57cec5SDimitry Andric (OtherDstSel == BYTE_1) || 7900b57cec5SDimitry Andric (OtherDstSel == BYTE_2) || 7910b57cec5SDimitry Andric (OtherDstSel == WORD_0)); 7920b57cec5SDimitry Andric break; 7930b57cec5SDimitry Andric default: DstSelAgree = false; 7940b57cec5SDimitry Andric } 7950b57cec5SDimitry Andric 7960b57cec5SDimitry Andric if (!DstSelAgree) 7970b57cec5SDimitry Andric break; 7980b57cec5SDimitry Andric 7990b57cec5SDimitry Andric // Also OtherInst dst_unused should be UNUSED_PAD 8000b57cec5SDimitry Andric DstUnused OtherDstUnused = static_cast<DstUnused>( 8010b57cec5SDimitry Andric TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); 8020b57cec5SDimitry Andric if (OtherDstUnused != DstUnused::UNUSED_PAD) 8030b57cec5SDimitry Andric break; 8040b57cec5SDimitry Andric 8050b57cec5SDimitry Andric // Create DstPreserveOperand 8060b57cec5SDimitry Andric MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 8070b57cec5SDimitry Andric assert(OrDst && OrDst->isReg()); 8080b57cec5SDimitry Andric 8098bcb0991SDimitry Andric return std::make_unique<SDWADstPreserveOperand>( 8100b57cec5SDimitry Andric OrDst, OrSDWADef, OrOtherDef, DstSel); 8110b57cec5SDimitry Andric 8120b57cec5SDimitry Andric } 8130b57cec5SDimitry Andric } 8140b57cec5SDimitry Andric 8150b57cec5SDimitry Andric return std::unique_ptr<SDWAOperand>(nullptr); 8160b57cec5SDimitry Andric } 8170b57cec5SDimitry Andric 81847395794SDimitry Andric #if !defined(NDEBUG) 81947395794SDimitry Andric static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { 82047395794SDimitry Andric Operand.print(OS); 82147395794SDimitry Andric return OS; 82247395794SDimitry Andric } 82347395794SDimitry Andric #endif 82447395794SDimitry Andric 8250b57cec5SDimitry Andric void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { 8260b57cec5SDimitry Andric for (MachineInstr &MI : MBB) { 8270b57cec5SDimitry Andric if (auto Operand = matchSDWAOperand(MI)) { 8280b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); 8290b57cec5SDimitry Andric SDWAOperands[&MI] = std::move(Operand); 8300b57cec5SDimitry Andric ++NumSDWAPatternsFound; 8310b57cec5SDimitry Andric } 8320b57cec5SDimitry Andric } 8330b57cec5SDimitry Andric } 8340b57cec5SDimitry Andric 8350b57cec5SDimitry Andric // Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and 836*e8d8bef9SDimitry Andric // V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA 837*e8d8bef9SDimitry Andric // to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa. 8380b57cec5SDimitry Andric // 8390b57cec5SDimitry Andric // We are transforming from a VOP3 into a VOP2 form of the instruction. 8400b57cec5SDimitry Andric // %19:vgpr_32 = V_AND_B32_e32 255, 8410b57cec5SDimitry Andric // killed %16:vgpr_32, implicit $exec 842*e8d8bef9SDimitry Andric // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 8430b57cec5SDimitry Andric // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec 8440b57cec5SDimitry Andric // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 8450b57cec5SDimitry Andric // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec 8460b57cec5SDimitry Andric // 8470b57cec5SDimitry Andric // becomes 848*e8d8bef9SDimitry Andric // %47:vgpr_32 = V_ADD_CO_U32_sdwa 8490b57cec5SDimitry Andric // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, 8500b57cec5SDimitry Andric // implicit-def $vcc, implicit $exec 8510b57cec5SDimitry Andric // %48:vgpr_32 = V_ADDC_U32_e32 8520b57cec5SDimitry Andric // 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec 8530b57cec5SDimitry Andric void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, 8540b57cec5SDimitry Andric const GCNSubtarget &ST) const { 8550b57cec5SDimitry Andric int Opc = MI.getOpcode(); 856*e8d8bef9SDimitry Andric assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) && 857*e8d8bef9SDimitry Andric "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64"); 8580b57cec5SDimitry Andric 8590b57cec5SDimitry Andric // Can the candidate MI be shrunk? 8600b57cec5SDimitry Andric if (!TII->canShrink(MI, *MRI)) 8610b57cec5SDimitry Andric return; 8620b57cec5SDimitry Andric Opc = AMDGPU::getVOPe32(Opc); 8630b57cec5SDimitry Andric // Find the related ADD instruction. 8640b57cec5SDimitry Andric const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 8650b57cec5SDimitry Andric if (!Sdst) 8660b57cec5SDimitry Andric return; 8670b57cec5SDimitry Andric MachineOperand *NextOp = findSingleRegUse(Sdst, MRI); 8680b57cec5SDimitry Andric if (!NextOp) 8690b57cec5SDimitry Andric return; 8700b57cec5SDimitry Andric MachineInstr &MISucc = *NextOp->getParent(); 8710b57cec5SDimitry Andric // Can the successor be shrunk? 8720b57cec5SDimitry Andric if (!TII->canShrink(MISucc, *MRI)) 8730b57cec5SDimitry Andric return; 8740b57cec5SDimitry Andric int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode()); 8750b57cec5SDimitry Andric // Make sure the carry in/out are subsequently unused. 8760b57cec5SDimitry Andric MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); 8770b57cec5SDimitry Andric if (!CarryIn) 8780b57cec5SDimitry Andric return; 8790b57cec5SDimitry Andric MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); 8800b57cec5SDimitry Andric if (!CarryOut) 8810b57cec5SDimitry Andric return; 8820b57cec5SDimitry Andric if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg())) 8830b57cec5SDimitry Andric return; 8840b57cec5SDimitry Andric // Make sure VCC or its subregs are dead before MI. 8850b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 8860b57cec5SDimitry Andric auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); 8870b57cec5SDimitry Andric if (Liveness != MachineBasicBlock::LQR_Dead) 8880b57cec5SDimitry Andric return; 8890b57cec5SDimitry Andric // Check if VCC is referenced in range of (MI,MISucc]. 8900b57cec5SDimitry Andric for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator(); 8910b57cec5SDimitry Andric I != E; ++I) { 8920b57cec5SDimitry Andric if (I->modifiesRegister(AMDGPU::VCC, TRI)) 8930b57cec5SDimitry Andric return; 8940b57cec5SDimitry Andric } 8955ffd83dbSDimitry Andric 8960b57cec5SDimitry Andric // Make the two new e32 instruction variants. 8970b57cec5SDimitry Andric // Replace MI with V_{SUB|ADD}_I32_e32 8985ffd83dbSDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)) 8995ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) 9005ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) 9015ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) 9025ffd83dbSDimitry Andric .setMIFlags(MI.getFlags()); 9035ffd83dbSDimitry Andric 9040b57cec5SDimitry Andric MI.eraseFromParent(); 9055ffd83dbSDimitry Andric 9060b57cec5SDimitry Andric // Replace MISucc with V_{SUBB|ADDC}_U32_e32 9075ffd83dbSDimitry Andric BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)) 9085ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)) 9095ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)) 9105ffd83dbSDimitry Andric .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)) 9115ffd83dbSDimitry Andric .setMIFlags(MISucc.getFlags()); 9125ffd83dbSDimitry Andric 9130b57cec5SDimitry Andric MISucc.eraseFromParent(); 9140b57cec5SDimitry Andric } 9150b57cec5SDimitry Andric 9160b57cec5SDimitry Andric bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, 9170b57cec5SDimitry Andric const GCNSubtarget &ST) const { 9180b57cec5SDimitry Andric // Check if this is already an SDWA instruction 9190b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 9200b57cec5SDimitry Andric if (TII->isSDWA(Opc)) 9210b57cec5SDimitry Andric return true; 9220b57cec5SDimitry Andric 9230b57cec5SDimitry Andric // Check if this instruction has opcode that supports SDWA 9240b57cec5SDimitry Andric if (AMDGPU::getSDWAOp(Opc) == -1) 9250b57cec5SDimitry Andric Opc = AMDGPU::getVOPe32(Opc); 9260b57cec5SDimitry Andric 9270b57cec5SDimitry Andric if (AMDGPU::getSDWAOp(Opc) == -1) 9280b57cec5SDimitry Andric return false; 9290b57cec5SDimitry Andric 9300b57cec5SDimitry Andric if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) 9310b57cec5SDimitry Andric return false; 9320b57cec5SDimitry Andric 9330b57cec5SDimitry Andric if (TII->isVOPC(Opc)) { 9340b57cec5SDimitry Andric if (!ST.hasSDWASdst()) { 9350b57cec5SDimitry Andric const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); 9360b57cec5SDimitry Andric if (SDst && (SDst->getReg() != AMDGPU::VCC && 9370b57cec5SDimitry Andric SDst->getReg() != AMDGPU::VCC_LO)) 9380b57cec5SDimitry Andric return false; 9390b57cec5SDimitry Andric } 9400b57cec5SDimitry Andric 9410b57cec5SDimitry Andric if (!ST.hasSDWAOutModsVOPC() && 9420b57cec5SDimitry Andric (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || 9430b57cec5SDimitry Andric TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) 9440b57cec5SDimitry Andric return false; 9450b57cec5SDimitry Andric 9460b57cec5SDimitry Andric } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || 9470b57cec5SDimitry Andric !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 9480b57cec5SDimitry Andric return false; 9490b57cec5SDimitry Andric } 9500b57cec5SDimitry Andric 9510b57cec5SDimitry Andric if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || 9520b57cec5SDimitry Andric Opc == AMDGPU::V_FMAC_F32_e32 || 9530b57cec5SDimitry Andric Opc == AMDGPU::V_MAC_F16_e32 || 9540b57cec5SDimitry Andric Opc == AMDGPU::V_MAC_F32_e32)) 9550b57cec5SDimitry Andric return false; 9560b57cec5SDimitry Andric 9570b57cec5SDimitry Andric // Check if target supports this SDWA opcode 9580b57cec5SDimitry Andric if (TII->pseudoToMCOpcode(Opc) == -1) 9590b57cec5SDimitry Andric return false; 9600b57cec5SDimitry Andric 9610b57cec5SDimitry Andric // FIXME: has SDWA but require handling of implicit VCC use 9620b57cec5SDimitry Andric if (Opc == AMDGPU::V_CNDMASK_B32_e32) 9630b57cec5SDimitry Andric return false; 9640b57cec5SDimitry Andric 965*e8d8bef9SDimitry Andric if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { 966*e8d8bef9SDimitry Andric if (!Src0->isReg() && !Src0->isImm()) 967*e8d8bef9SDimitry Andric return false; 968*e8d8bef9SDimitry Andric } 969*e8d8bef9SDimitry Andric 970*e8d8bef9SDimitry Andric if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) { 971*e8d8bef9SDimitry Andric if (!Src1->isReg() && !Src1->isImm()) 972*e8d8bef9SDimitry Andric return false; 973*e8d8bef9SDimitry Andric } 974*e8d8bef9SDimitry Andric 9750b57cec5SDimitry Andric return true; 9760b57cec5SDimitry Andric } 9770b57cec5SDimitry Andric 9780b57cec5SDimitry Andric bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, 9790b57cec5SDimitry Andric const SDWAOperandsVector &SDWAOperands) { 9800b57cec5SDimitry Andric 9810b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); 9820b57cec5SDimitry Andric 9830b57cec5SDimitry Andric // Convert to sdwa 9840b57cec5SDimitry Andric int SDWAOpcode; 9850b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode(); 9860b57cec5SDimitry Andric if (TII->isSDWA(Opcode)) { 9870b57cec5SDimitry Andric SDWAOpcode = Opcode; 9880b57cec5SDimitry Andric } else { 9890b57cec5SDimitry Andric SDWAOpcode = AMDGPU::getSDWAOp(Opcode); 9900b57cec5SDimitry Andric if (SDWAOpcode == -1) 9910b57cec5SDimitry Andric SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); 9920b57cec5SDimitry Andric } 9930b57cec5SDimitry Andric assert(SDWAOpcode != -1); 9940b57cec5SDimitry Andric 9950b57cec5SDimitry Andric const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); 9960b57cec5SDimitry Andric 9970b57cec5SDimitry Andric // Create SDWA version of instruction MI and initialize its operands 9980b57cec5SDimitry Andric MachineInstrBuilder SDWAInst = 9995ffd83dbSDimitry Andric BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc) 10005ffd83dbSDimitry Andric .setMIFlags(MI.getFlags()); 10010b57cec5SDimitry Andric 10020b57cec5SDimitry Andric // Copy dst, if it is present in original then should also be present in SDWA 10030b57cec5SDimitry Andric MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); 10040b57cec5SDimitry Andric if (Dst) { 10050b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); 10060b57cec5SDimitry Andric SDWAInst.add(*Dst); 10070b57cec5SDimitry Andric } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { 10080b57cec5SDimitry Andric assert(Dst && 10090b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 10100b57cec5SDimitry Andric SDWAInst.add(*Dst); 10110b57cec5SDimitry Andric } else { 10120b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); 10130b57cec5SDimitry Andric SDWAInst.addReg(TRI->getVCC(), RegState::Define); 10140b57cec5SDimitry Andric } 10150b57cec5SDimitry Andric 10160b57cec5SDimitry Andric // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and 10170b57cec5SDimitry Andric // src0_modifiers (except for v_nop_sdwa, but it can't get here) 10180b57cec5SDimitry Andric MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); 10190b57cec5SDimitry Andric assert( 10200b57cec5SDimitry Andric Src0 && 10210b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && 10220b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); 10230b57cec5SDimitry Andric if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) 10240b57cec5SDimitry Andric SDWAInst.addImm(Mod->getImm()); 10250b57cec5SDimitry Andric else 10260b57cec5SDimitry Andric SDWAInst.addImm(0); 10270b57cec5SDimitry Andric SDWAInst.add(*Src0); 10280b57cec5SDimitry Andric 10290b57cec5SDimitry Andric // Copy src1 if present, initialize src1_modifiers. 10300b57cec5SDimitry Andric MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); 10310b57cec5SDimitry Andric if (Src1) { 10320b57cec5SDimitry Andric assert( 10330b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && 10340b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); 10350b57cec5SDimitry Andric if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) 10360b57cec5SDimitry Andric SDWAInst.addImm(Mod->getImm()); 10370b57cec5SDimitry Andric else 10380b57cec5SDimitry Andric SDWAInst.addImm(0); 10390b57cec5SDimitry Andric SDWAInst.add(*Src1); 10400b57cec5SDimitry Andric } 10410b57cec5SDimitry Andric 10420b57cec5SDimitry Andric if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || 10430b57cec5SDimitry Andric SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || 10440b57cec5SDimitry Andric SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || 10450b57cec5SDimitry Andric SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { 10460b57cec5SDimitry Andric // v_mac_f16/32 has additional src2 operand tied to vdst 10470b57cec5SDimitry Andric MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); 10480b57cec5SDimitry Andric assert(Src2); 10490b57cec5SDimitry Andric SDWAInst.add(*Src2); 10500b57cec5SDimitry Andric } 10510b57cec5SDimitry Andric 10520b57cec5SDimitry Andric // Copy clamp if present, initialize otherwise 10530b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); 10540b57cec5SDimitry Andric MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); 10550b57cec5SDimitry Andric if (Clamp) { 10560b57cec5SDimitry Andric SDWAInst.add(*Clamp); 10570b57cec5SDimitry Andric } else { 10580b57cec5SDimitry Andric SDWAInst.addImm(0); 10590b57cec5SDimitry Andric } 10600b57cec5SDimitry Andric 10610b57cec5SDimitry Andric // Copy omod if present, initialize otherwise if needed 10620b57cec5SDimitry Andric if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { 10630b57cec5SDimitry Andric MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); 10640b57cec5SDimitry Andric if (OMod) { 10650b57cec5SDimitry Andric SDWAInst.add(*OMod); 10660b57cec5SDimitry Andric } else { 10670b57cec5SDimitry Andric SDWAInst.addImm(0); 10680b57cec5SDimitry Andric } 10690b57cec5SDimitry Andric } 10700b57cec5SDimitry Andric 10710b57cec5SDimitry Andric // Copy dst_sel if present, initialize otherwise if needed 10720b57cec5SDimitry Andric if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { 10730b57cec5SDimitry Andric MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); 10740b57cec5SDimitry Andric if (DstSel) { 10750b57cec5SDimitry Andric SDWAInst.add(*DstSel); 10760b57cec5SDimitry Andric } else { 10770b57cec5SDimitry Andric SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 10780b57cec5SDimitry Andric } 10790b57cec5SDimitry Andric } 10800b57cec5SDimitry Andric 10810b57cec5SDimitry Andric // Copy dst_unused if present, initialize otherwise if needed 10820b57cec5SDimitry Andric if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { 10830b57cec5SDimitry Andric MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 10840b57cec5SDimitry Andric if (DstUnused) { 10850b57cec5SDimitry Andric SDWAInst.add(*DstUnused); 10860b57cec5SDimitry Andric } else { 10870b57cec5SDimitry Andric SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); 10880b57cec5SDimitry Andric } 10890b57cec5SDimitry Andric } 10900b57cec5SDimitry Andric 10910b57cec5SDimitry Andric // Copy src0_sel if present, initialize otherwise 10920b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); 10930b57cec5SDimitry Andric MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); 10940b57cec5SDimitry Andric if (Src0Sel) { 10950b57cec5SDimitry Andric SDWAInst.add(*Src0Sel); 10960b57cec5SDimitry Andric } else { 10970b57cec5SDimitry Andric SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 10980b57cec5SDimitry Andric } 10990b57cec5SDimitry Andric 11000b57cec5SDimitry Andric // Copy src1_sel if present, initialize otherwise if needed 11010b57cec5SDimitry Andric if (Src1) { 11020b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); 11030b57cec5SDimitry Andric MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); 11040b57cec5SDimitry Andric if (Src1Sel) { 11050b57cec5SDimitry Andric SDWAInst.add(*Src1Sel); 11060b57cec5SDimitry Andric } else { 11070b57cec5SDimitry Andric SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); 11080b57cec5SDimitry Andric } 11090b57cec5SDimitry Andric } 11100b57cec5SDimitry Andric 11110b57cec5SDimitry Andric // Check for a preserved register that needs to be copied. 11120b57cec5SDimitry Andric auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); 11130b57cec5SDimitry Andric if (DstUnused && 11140b57cec5SDimitry Andric DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { 11150b57cec5SDimitry Andric // We expect, if we are here, that the instruction was already in it's SDWA form, 11160b57cec5SDimitry Andric // with a tied operand. 11170b57cec5SDimitry Andric assert(Dst && Dst->isTied()); 11180b57cec5SDimitry Andric assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); 11190b57cec5SDimitry Andric // We also expect a vdst, since sdst can't preserve. 11200b57cec5SDimitry Andric auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); 11210b57cec5SDimitry Andric assert(PreserveDstIdx != -1); 11220b57cec5SDimitry Andric 11230b57cec5SDimitry Andric auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); 11240b57cec5SDimitry Andric auto Tied = MI.getOperand(TiedIdx); 11250b57cec5SDimitry Andric 11260b57cec5SDimitry Andric SDWAInst.add(Tied); 11270b57cec5SDimitry Andric SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); 11280b57cec5SDimitry Andric } 11290b57cec5SDimitry Andric 11300b57cec5SDimitry Andric // Apply all sdwa operand patterns. 11310b57cec5SDimitry Andric bool Converted = false; 11320b57cec5SDimitry Andric for (auto &Operand : SDWAOperands) { 11330b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); 11340b57cec5SDimitry Andric // There should be no intesection between SDWA operands and potential MIs 11350b57cec5SDimitry Andric // e.g.: 11360b57cec5SDimitry Andric // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 11370b57cec5SDimitry Andric // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 11380b57cec5SDimitry Andric // v_add_u32 v3, v4, v2 11390b57cec5SDimitry Andric // 11400b57cec5SDimitry Andric // In that example it is possible that we would fold 2nd instruction into 3rd 11410b57cec5SDimitry Andric // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was 11420b57cec5SDimitry Andric // already destroyed). So if SDWAOperand is also a potential MI then do not 11430b57cec5SDimitry Andric // apply it. 11440b57cec5SDimitry Andric if (PotentialMatches.count(Operand->getParentInst()) == 0) 11450b57cec5SDimitry Andric Converted |= Operand->convertToSDWA(*SDWAInst, TII); 11460b57cec5SDimitry Andric } 11470b57cec5SDimitry Andric if (Converted) { 11480b57cec5SDimitry Andric ConvertedInstructions.push_back(SDWAInst); 11490b57cec5SDimitry Andric } else { 11500b57cec5SDimitry Andric SDWAInst->eraseFromParent(); 11510b57cec5SDimitry Andric return false; 11520b57cec5SDimitry Andric } 11530b57cec5SDimitry Andric 11540b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); 11550b57cec5SDimitry Andric ++NumSDWAInstructionsPeepholed; 11560b57cec5SDimitry Andric 11570b57cec5SDimitry Andric MI.eraseFromParent(); 11580b57cec5SDimitry Andric return true; 11590b57cec5SDimitry Andric } 11600b57cec5SDimitry Andric 11610b57cec5SDimitry Andric // If an instruction was converted to SDWA it should not have immediates or SGPR 11620b57cec5SDimitry Andric // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 11630b57cec5SDimitry Andric void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, 11640b57cec5SDimitry Andric const GCNSubtarget &ST) const { 11650b57cec5SDimitry Andric const MCInstrDesc &Desc = TII->get(MI.getOpcode()); 11660b57cec5SDimitry Andric unsigned ConstantBusCount = 0; 11670b57cec5SDimitry Andric for (MachineOperand &Op : MI.explicit_uses()) { 11680b57cec5SDimitry Andric if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) 11690b57cec5SDimitry Andric continue; 11700b57cec5SDimitry Andric 11710b57cec5SDimitry Andric unsigned I = MI.getOperandNo(&Op); 11720b57cec5SDimitry Andric if (Desc.OpInfo[I].RegClass == -1 || 11730b57cec5SDimitry Andric !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) 11740b57cec5SDimitry Andric continue; 11750b57cec5SDimitry Andric 11760b57cec5SDimitry Andric if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && 11770b57cec5SDimitry Andric TRI->isSGPRReg(*MRI, Op.getReg())) { 11780b57cec5SDimitry Andric ++ConstantBusCount; 11790b57cec5SDimitry Andric continue; 11800b57cec5SDimitry Andric } 11810b57cec5SDimitry Andric 11828bcb0991SDimitry Andric Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 11830b57cec5SDimitry Andric auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 11840b57cec5SDimitry Andric TII->get(AMDGPU::V_MOV_B32_e32), VGPR); 11850b57cec5SDimitry Andric if (Op.isImm()) 11860b57cec5SDimitry Andric Copy.addImm(Op.getImm()); 11870b57cec5SDimitry Andric else if (Op.isReg()) 11880b57cec5SDimitry Andric Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, 11890b57cec5SDimitry Andric Op.getSubReg()); 11900b57cec5SDimitry Andric Op.ChangeToRegister(VGPR, false); 11910b57cec5SDimitry Andric } 11920b57cec5SDimitry Andric } 11930b57cec5SDimitry Andric 11940b57cec5SDimitry Andric bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { 11950b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 11960b57cec5SDimitry Andric 11970b57cec5SDimitry Andric if (!ST.hasSDWA() || skipFunction(MF.getFunction())) 11980b57cec5SDimitry Andric return false; 11990b57cec5SDimitry Andric 12000b57cec5SDimitry Andric MRI = &MF.getRegInfo(); 12010b57cec5SDimitry Andric TRI = ST.getRegisterInfo(); 12020b57cec5SDimitry Andric TII = ST.getInstrInfo(); 12030b57cec5SDimitry Andric 12040b57cec5SDimitry Andric // Find all SDWA operands in MF. 12050b57cec5SDimitry Andric bool Ret = false; 12060b57cec5SDimitry Andric for (MachineBasicBlock &MBB : MF) { 12070b57cec5SDimitry Andric bool Changed = false; 12080b57cec5SDimitry Andric do { 12090b57cec5SDimitry Andric // Preprocess the ADD/SUB pairs so they could be SDWA'ed. 12100b57cec5SDimitry Andric // Look for a possible ADD or SUB that resulted from a previously lowered 12110b57cec5SDimitry Andric // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 12120b57cec5SDimitry Andric // lowers the pair of instructions into e32 form. 12130b57cec5SDimitry Andric matchSDWAOperands(MBB); 12140b57cec5SDimitry Andric for (const auto &OperandPair : SDWAOperands) { 12150b57cec5SDimitry Andric const auto &Operand = OperandPair.second; 12160b57cec5SDimitry Andric MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 12170b57cec5SDimitry Andric if (PotentialMI && 1218*e8d8bef9SDimitry Andric (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || 1219*e8d8bef9SDimitry Andric PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) 12200b57cec5SDimitry Andric pseudoOpConvertToVOP2(*PotentialMI, ST); 12210b57cec5SDimitry Andric } 12220b57cec5SDimitry Andric SDWAOperands.clear(); 12230b57cec5SDimitry Andric 12240b57cec5SDimitry Andric // Generate potential match list. 12250b57cec5SDimitry Andric matchSDWAOperands(MBB); 12260b57cec5SDimitry Andric 12270b57cec5SDimitry Andric for (const auto &OperandPair : SDWAOperands) { 12280b57cec5SDimitry Andric const auto &Operand = OperandPair.second; 12290b57cec5SDimitry Andric MachineInstr *PotentialMI = Operand->potentialToConvert(TII); 12300b57cec5SDimitry Andric if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { 12310b57cec5SDimitry Andric PotentialMatches[PotentialMI].push_back(Operand.get()); 12320b57cec5SDimitry Andric } 12330b57cec5SDimitry Andric } 12340b57cec5SDimitry Andric 12350b57cec5SDimitry Andric for (auto &PotentialPair : PotentialMatches) { 12360b57cec5SDimitry Andric MachineInstr &PotentialMI = *PotentialPair.first; 12370b57cec5SDimitry Andric convertToSDWA(PotentialMI, PotentialPair.second); 12380b57cec5SDimitry Andric } 12390b57cec5SDimitry Andric 12400b57cec5SDimitry Andric PotentialMatches.clear(); 12410b57cec5SDimitry Andric SDWAOperands.clear(); 12420b57cec5SDimitry Andric 12430b57cec5SDimitry Andric Changed = !ConvertedInstructions.empty(); 12440b57cec5SDimitry Andric 12450b57cec5SDimitry Andric if (Changed) 12460b57cec5SDimitry Andric Ret = true; 12470b57cec5SDimitry Andric while (!ConvertedInstructions.empty()) 12480b57cec5SDimitry Andric legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); 12490b57cec5SDimitry Andric } while (Changed); 12500b57cec5SDimitry Andric } 12510b57cec5SDimitry Andric 12520b57cec5SDimitry Andric return Ret; 12530b57cec5SDimitry Andric } 1254