//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
    "amdgpu-fix-16-bit-physreg-copies",
    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
    cl::init(true),
    cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
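  // For example, with a single result, a named operand at MachineInstr
  // operand index 3 corresponds to MachineSDNode operand index 2.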
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

static bool canRemat(const MachineInstr &MI) {

  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
           });
  }

  return false;
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {

  if (canRemat(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is fine here since every VALU instruction has one.
    // We really want all of the generic logic except for that check.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere
    // in the function; otherwise it is safe since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are virtual register uses. We allow it, so
    // this method covers SOP instructions as well.
    if (!MI.hasImplicitDef() &&
        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
        !MI.mayRaiseFPException())
      return true;
  }

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

// Returns true if the scalar result of a VALU instruction depends on exec.
static bool resultDependsOnExec(const MachineInstr &MI) {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
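  // A VALU compare writes 0 for inactive lanes, so if every user masks the
  // result with exec anyway, the observed value is the same for any exec mask
  // and the result is effectively exec-independent.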
  if (MI.isCompare()) {
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    Register DstReg = MI.getOperand(0).getReg();
    if (!DstReg.isVirtual())
      return true;
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
  // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
    return true;

  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  // Check if sinking of MI would create a temporal divergent use.
  for (auto Op : MI.uses()) {
    if (Op.isReg() && Op.getReg().isVirtual() &&
        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());

      // SgprDef defined inside cycle
      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
      if (FromCycle == nullptr)
        continue;

      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
      // Check if there is a FromCycle that contains SgprDef's basic block but
      // does not contain SuccToSinkTo and also has a divergent exit condition.
      while (FromCycle && !FromCycle->contains(ToCycle)) {
        // After structurize-cfg, there should be exactly one cycle exit.
        SmallVector<MachineBasicBlock *, 1> ExitBlocks;
        FromCycle->getExitBlocks(ExitBlocks);
        assert(ExitBlocks.size() == 1);
        assert(ExitBlocks[0]->getSinglePredecessor());

        // FromCycle has divergent exit condition.
        if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
          return false;
        }

        FromCycle = FromCycle->getParentCycle();
      }
    }
  }

  return true;
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  // A mayLoad instruction without a def is not a load. Likely a prefetch.
  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = Load0->getConstantOperandVal(Offset0Idx);
    Offset1 = Load1->getConstantOperandVal(Offset1Idx);
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
      return false;

    unsigned NumOps = getNumOperandsNoGlue(Load0);
    if (NumOps != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Match register offsets, if both register and immediate offsets present.
    assert(NumOps == 4 || NumOps == 5);
    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = Off0->getAsZExtVal();
    Offset1 = Off1->getAsZExtVal();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can
      // treat these as a load with a single offset if the 2 offsets are
      // consecutive. We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm() & 0xff;
      unsigned Offset1 = Offset1Op->getImm() & 0xff;
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width += getOpSize(LdSt, DataOpIdx);
      } else {
        Width = getOpSize(LdSt, DataOpIdx);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isMIMG(LdSt)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    if (DataOpIdx == -1)
      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr
    return false;
  }

  // In order to avoid register pressure, on average, the number of DWORDS
  // loaded together by all clustered mem ops should not exceed 8. This is an
  // empirical value based on certain observations and performance-related
  // experiments.
  // The good thing about this heuristic is that it avoids clustering of too
  // many sub-word loads, and also avoids clustering of wide loads. Below is a
  // brief summary of how the heuristic behaves for various `LoadSize`.
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / ClusterSize;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
  return NumDWORDs <= 8;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two 16-store batches.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
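  // E.g. loads at offsets 0 and 48 fall within one cacheline and are
  // scheduled together, while offsets 0 and 72 are not.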
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find a defining accvgpr_write to avoid temporary registers.
  // In the case of copies of overlapping AGPRs, we conservatively do not
  // reuse previous accvgpr_writes.
  // Otherwise, we may incorrectly pick up
  // an accvgpr_write used for this same copy due to implicit-defs.
  if (!RegsOverlap) {
    for (auto Def = MI, E = MBB.begin(); Def != E; ) {
      --Def;

      if (!Def->modifiesRegister(SrcReg, &RI))
        continue;

      if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          Def->getOperand(0).getReg() != SrcReg)
        break;

      MachineOperand &DefOp = Def->getOperand(1);
      assert(DefOp.isReg() || DefOp.isImm());

      if (DefOp.isReg()) {
        bool SafeToPropagate = true;
        // Check that register source operand is not clobbered before MI.
        // Immediate operands are always safe to propagate.
        for (auto I = Def; I != MI && SafeToPropagate; ++I)
          if (I->modifiesRegister(DefOp.getReg(), &RI))
            SafeToPropagate = false;

        if (!SafeToPropagate)
          break;

        DefOp.setIsKill(false);
      }

      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
              .add(DefOp);
      if (ImpDefSuperReg)
        Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

      if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }

      return;
    }
  }

  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left. We don't want to
  // spill.
  while (RegNo--) {
    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                                 /* RestoreAfter */ false, 0,
                                                 /* AllowSpill */ false);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
      break;
    Tmp = Tmp2;
    RS.setRegUsed(Tmp);
  }

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
                                       .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}

static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is SGPR aligned? If so try to combine with next.
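    // A 64-bit SGPR move requires an even-aligned register pair, so only
    // merge two consecutive 32-bit pieces when both halves start on an even
    // SGPR.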
    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use SGPR64 copy
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      DestSubReg = RI.getSubReg(DestReg, SubIdx);
      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
      Opcode = AMDGPU::S_MOV_B64;
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
                 .addReg(SrcSubReg)
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  if (!Forward)
    std::swap(FirstMI, LastMI);

  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
  unsigned Size = RI.getRegSizeInBits(*RC);
  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);

  // The rest of copyPhysReg assumes Src and Dst are the same size.
  // TODO-GFX11_16BIT: once all true 16-bit instruction patterns are completed,
  // can we remove Fix16BitCopies and this code block?
  if (Fix16BitCopies) {
    if (((Size == 16) != (SrcSize == 16))) {
      // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.hasTrue16BitInsts());
      MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
      RegToFix = SubReg;

      if (DestReg == SrcReg) {
        // Identity copy. Insert empty bundle since ExpandPostRA expects an
        // instruction here.
        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
        return;
      }
      RC = RI.getPhysRegBaseClass(DestReg);
      Size = RI.getRegSizeInBits(*RC);
      SrcRC = RI.getPhysRegBaseClass(SrcReg);
      SrcSize = RI.getRegSizeInBits(*SrcRC);
    }
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
            .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
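        // Materialize the i1 value into VCC_LO by comparing the VGPR against
        // zero; V_CMP_NE_U32_e32 writes its lane-mask result to VCC
        // implicitly.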
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
            .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
            .addImm(0)
            .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    // but SelectionDAG emits such copies for i1 sources.
    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      // This copy can only be produced by patterns
      // with explicit SCC, which are known to be enabled
      // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    } else {
      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    }

    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    // FIXME: Pass should maintain scavenger to avoid scan through the block on
    // every AGPR spill.
    RegScavenger RS;
    const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
    return;
  }

  if (Size == 16) {
    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = !AMDGPU::isHi(DestReg, RI);
    bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
          .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
9755ffd83dbSDimitry Andric if (!DstLow || !SrcLow) { 9765ffd83dbSDimitry Andric reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 9775ffd83dbSDimitry Andric "Cannot use hi16 subreg with an AGPR!"); 9785ffd83dbSDimitry Andric } 9795ffd83dbSDimitry Andric 9805ffd83dbSDimitry Andric copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); 9815ffd83dbSDimitry Andric return; 9825ffd83dbSDimitry Andric } 9835ffd83dbSDimitry Andric 9845f757f3fSDimitry Andric if (ST.hasTrue16BitInsts()) { 9855f757f3fSDimitry Andric if (IsSGPRSrc) { 9865f757f3fSDimitry Andric assert(SrcLow); 9875f757f3fSDimitry Andric SrcReg = NewSrcReg; 9885f757f3fSDimitry Andric } 9895f757f3fSDimitry Andric // Use the smaller instruction encoding if possible. 9905f757f3fSDimitry Andric if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) && 9915f757f3fSDimitry Andric (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) { 9925f757f3fSDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg) 9935f757f3fSDimitry Andric .addReg(SrcReg); 9945f757f3fSDimitry Andric } else { 9955f757f3fSDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg) 9965f757f3fSDimitry Andric .addImm(0) // src0_modifiers 9975f757f3fSDimitry Andric .addReg(SrcReg) 9985f757f3fSDimitry Andric .addImm(0); // op_sel 9995f757f3fSDimitry Andric } 10005f757f3fSDimitry Andric return; 10015f757f3fSDimitry Andric } 10025f757f3fSDimitry Andric 10035ffd83dbSDimitry Andric if (IsSGPRSrc && !ST.hasSDWAScalar()) { 10045ffd83dbSDimitry Andric if (!DstLow || !SrcLow) { 10055ffd83dbSDimitry Andric reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 10065ffd83dbSDimitry Andric "Cannot use hi16 subreg on VI!"); 10075ffd83dbSDimitry Andric } 10085ffd83dbSDimitry Andric 10095ffd83dbSDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) 10105ffd83dbSDimitry Andric .addReg(NewSrcReg, getKillRegState(KillSrc)); 10115ffd83dbSDimitry Andric return; 10125ffd83dbSDimitry Andric } 10135ffd83dbSDimitry Andric 10145ffd83dbSDimitry Andric auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) 10155ffd83dbSDimitry Andric .addImm(0) // src0_modifiers 10165ffd83dbSDimitry Andric .addReg(NewSrcReg) 10175ffd83dbSDimitry Andric .addImm(0) // clamp 10185ffd83dbSDimitry Andric .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 10195ffd83dbSDimitry Andric : AMDGPU::SDWA::SdwaSel::WORD_1) 10205ffd83dbSDimitry Andric .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) 10215ffd83dbSDimitry Andric .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 10225ffd83dbSDimitry Andric : AMDGPU::SDWA::SdwaSel::WORD_1) 10235ffd83dbSDimitry Andric .addReg(NewDestReg, RegState::Implicit | RegState::Undef); 10245ffd83dbSDimitry Andric // First implicit operand is $exec. 
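      // The SDWA write only updates one half and preserves the rest
      // (UNUSED_PRESERVE), so the trailing implicit use of NewDestReg is tied
      // to the def to model the read-modify-write.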
10255ffd83dbSDimitry Andric MIB->tieOperands(0, MIB->getNumOperands() - 1); 10265ffd83dbSDimitry Andric return; 10275ffd83dbSDimitry Andric } 10285ffd83dbSDimitry Andric 1029fe6060f1SDimitry Andric if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { 103081ad6265SDimitry Andric if (ST.hasMovB64()) { 103181ad6265SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) 103281ad6265SDimitry Andric .addReg(SrcReg, getKillRegState(KillSrc)); 103381ad6265SDimitry Andric return; 103481ad6265SDimitry Andric } 10355f757f3fSDimitry Andric if (ST.hasPkMovB32()) { 1036fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) 1037fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_1) 1038fe6060f1SDimitry Andric .addReg(SrcReg) 1039fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 1040fe6060f1SDimitry Andric .addReg(SrcReg) 1041fe6060f1SDimitry Andric .addImm(0) // op_sel_lo 1042fe6060f1SDimitry Andric .addImm(0) // op_sel_hi 1043fe6060f1SDimitry Andric .addImm(0) // neg_lo 1044fe6060f1SDimitry Andric .addImm(0) // neg_hi 1045fe6060f1SDimitry Andric .addImm(0) // clamp 1046fe6060f1SDimitry Andric .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 1047fe6060f1SDimitry Andric return; 1048fe6060f1SDimitry Andric } 1049fe6060f1SDimitry Andric } 1050fe6060f1SDimitry Andric 1051e8d8bef9SDimitry Andric const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 10520b57cec5SDimitry Andric if (RI.isSGPRClass(RC)) { 1053fe6060f1SDimitry Andric if (!RI.isSGPRClass(SrcRC)) { 10540b57cec5SDimitry Andric reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 10550b57cec5SDimitry Andric return; 10560b57cec5SDimitry Andric } 105781ad6265SDimitry Andric const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); 105881ad6265SDimitry Andric expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, 105981ad6265SDimitry Andric Forward); 1060e8d8bef9SDimitry Andric return; 10610b57cec5SDimitry Andric } 10620b57cec5SDimitry Andric 1063fe6060f1SDimitry Andric unsigned EltSize = 4; 1064e8d8bef9SDimitry Andric unsigned Opcode = AMDGPU::V_MOV_B32_e32; 10654824e7fdSDimitry Andric if (RI.isAGPRClass(RC)) { 10660eae32dcSDimitry Andric if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) 10670eae32dcSDimitry Andric Opcode = AMDGPU::V_ACCVGPR_MOV_B32; 106881ad6265SDimitry Andric else if (RI.hasVGPRs(SrcRC) || 106981ad6265SDimitry Andric (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC))) 10700eae32dcSDimitry Andric Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; 10710eae32dcSDimitry Andric else 10720eae32dcSDimitry Andric Opcode = AMDGPU::INSTRUCTION_LIST_END; 10734824e7fdSDimitry Andric } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { 1074e8d8bef9SDimitry Andric Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; 1075fe6060f1SDimitry Andric } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && 1076fe6060f1SDimitry Andric (RI.isProperlyAlignedRC(*RC) && 1077fe6060f1SDimitry Andric (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { 1078fe6060f1SDimitry Andric // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. 
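    // Prefer a real 64-bit move when the subtarget has one; otherwise the
    // packed 32-bit move still copies a full 64-bit element per instruction.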
107981ad6265SDimitry Andric if (ST.hasMovB64()) { 108081ad6265SDimitry Andric Opcode = AMDGPU::V_MOV_B64_e32; 108181ad6265SDimitry Andric EltSize = 8; 10825f757f3fSDimitry Andric } else if (ST.hasPkMovB32()) { 1083fe6060f1SDimitry Andric Opcode = AMDGPU::V_PK_MOV_B32; 1084fe6060f1SDimitry Andric EltSize = 8; 1085fe6060f1SDimitry Andric } 1086e8d8bef9SDimitry Andric } 1087e8d8bef9SDimitry Andric 1088e8d8bef9SDimitry Andric // For the cases where we need an intermediate instruction/temporary register 1089e8d8bef9SDimitry Andric // (destination is an AGPR), we need a scavenger. 1090e8d8bef9SDimitry Andric // 1091e8d8bef9SDimitry Andric // FIXME: The pass should maintain this for us so we don't have to re-scan the 1092e8d8bef9SDimitry Andric // whole block for every handled copy. 1093e8d8bef9SDimitry Andric std::unique_ptr<RegScavenger> RS; 1094e8d8bef9SDimitry Andric if (Opcode == AMDGPU::INSTRUCTION_LIST_END) 1095e8d8bef9SDimitry Andric RS.reset(new RegScavenger()); 1096e8d8bef9SDimitry Andric 1097fe6060f1SDimitry Andric ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 1098e8d8bef9SDimitry Andric 1099e8d8bef9SDimitry Andric // If there is an overlap, we can't kill the super-register on the last 1100e8d8bef9SDimitry Andric // instruction, since it will also kill the components made live by this def. 1101bdd1243dSDimitry Andric const bool Overlap = RI.regsOverlap(SrcReg, DestReg); 1102bdd1243dSDimitry Andric const bool CanKillSuperReg = KillSrc && !Overlap; 11030b57cec5SDimitry Andric 11040b57cec5SDimitry Andric for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 11050b57cec5SDimitry Andric unsigned SubIdx; 11060b57cec5SDimitry Andric if (Forward) 11070b57cec5SDimitry Andric SubIdx = SubIndices[Idx]; 11080b57cec5SDimitry Andric else 11090b57cec5SDimitry Andric SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 11105f757f3fSDimitry Andric Register DestSubReg = RI.getSubReg(DestReg, SubIdx); 11115f757f3fSDimitry Andric Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); 11125f757f3fSDimitry Andric assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); 11130b57cec5SDimitry Andric 1114bdd1243dSDimitry Andric bool IsFirstSubreg = Idx == 0; 1115e8d8bef9SDimitry Andric bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; 11160b57cec5SDimitry Andric 1117e8d8bef9SDimitry Andric if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { 1118bdd1243dSDimitry Andric Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); 1119e8d8bef9SDimitry Andric Register ImpUseSuper = SrcReg; 11205f757f3fSDimitry Andric indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill, 11215f757f3fSDimitry Andric *RS, Overlap, ImpDefSuper, ImpUseSuper); 1122fe6060f1SDimitry Andric } else if (Opcode == AMDGPU::V_PK_MOV_B32) { 1123fe6060f1SDimitry Andric MachineInstrBuilder MIB = 11245f757f3fSDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) 1125fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_1) 1126fe6060f1SDimitry Andric .addReg(SrcSubReg) 1127fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 1128fe6060f1SDimitry Andric .addReg(SrcSubReg) 1129fe6060f1SDimitry Andric .addImm(0) // op_sel_lo 1130fe6060f1SDimitry Andric .addImm(0) // op_sel_hi 1131fe6060f1SDimitry Andric .addImm(0) // neg_lo 1132fe6060f1SDimitry Andric .addImm(0) // neg_hi 1133fe6060f1SDimitry Andric .addImm(0) // clamp 1134fe6060f1SDimitry Andric .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 1135bdd1243dSDimitry Andric if (IsFirstSubreg) 1136fe6060f1SDimitry Andric MIB.addReg(DestReg, RegState::Define | RegState::Implicit); 1137e8d8bef9SDimitry Andric } else { 1138e8d8bef9SDimitry Andric MachineInstrBuilder Builder = 11395f757f3fSDimitry Andric BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); 1140bdd1243dSDimitry Andric if (IsFirstSubreg) 11410b57cec5SDimitry Andric Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 11420b57cec5SDimitry Andric 11430b57cec5SDimitry Andric Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 11440b57cec5SDimitry Andric } 11450b57cec5SDimitry Andric } 1146e8d8bef9SDimitry Andric } 11470b57cec5SDimitry Andric 11480b57cec5SDimitry Andric int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 11490b57cec5SDimitry Andric int NewOpc; 11500b57cec5SDimitry Andric 11510b57cec5SDimitry Andric // Try to map original to commuted opcode 11520b57cec5SDimitry Andric NewOpc = AMDGPU::getCommuteRev(Opcode); 11530b57cec5SDimitry Andric if (NewOpc != -1) 11540b57cec5SDimitry Andric // Check if the commuted (REV) opcode exists on the target. 11550b57cec5SDimitry Andric return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 11560b57cec5SDimitry Andric 11570b57cec5SDimitry Andric // Try to map commuted to original opcode 11580b57cec5SDimitry Andric NewOpc = AMDGPU::getCommuteOrig(Opcode); 11590b57cec5SDimitry Andric if (NewOpc != -1) 11600b57cec5SDimitry Andric // Check if the original (non-REV) opcode exists on the target. 11610b57cec5SDimitry Andric return pseudoToMCOpcode(NewOpc) != -1 ? 
NewOpc : -1; 11620b57cec5SDimitry Andric 11630b57cec5SDimitry Andric return Opcode; 11640b57cec5SDimitry Andric } 11650b57cec5SDimitry Andric 11660b57cec5SDimitry Andric void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 11670b57cec5SDimitry Andric MachineBasicBlock::iterator MI, 1168bdd1243dSDimitry Andric const DebugLoc &DL, Register DestReg, 11690b57cec5SDimitry Andric int64_t Value) const { 11700b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 11710b57cec5SDimitry Andric const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 11720b57cec5SDimitry Andric if (RegClass == &AMDGPU::SReg_32RegClass || 11730b57cec5SDimitry Andric RegClass == &AMDGPU::SGPR_32RegClass || 11740b57cec5SDimitry Andric RegClass == &AMDGPU::SReg_32_XM0RegClass || 11750b57cec5SDimitry Andric RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 11760b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 11770b57cec5SDimitry Andric .addImm(Value); 11780b57cec5SDimitry Andric return; 11790b57cec5SDimitry Andric } 11800b57cec5SDimitry Andric 11810b57cec5SDimitry Andric if (RegClass == &AMDGPU::SReg_64RegClass || 11820b57cec5SDimitry Andric RegClass == &AMDGPU::SGPR_64RegClass || 11830b57cec5SDimitry Andric RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 11840b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 11850b57cec5SDimitry Andric .addImm(Value); 11860b57cec5SDimitry Andric return; 11870b57cec5SDimitry Andric } 11880b57cec5SDimitry Andric 11890b57cec5SDimitry Andric if (RegClass == &AMDGPU::VGPR_32RegClass) { 11900b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 11910b57cec5SDimitry Andric .addImm(Value); 11920b57cec5SDimitry Andric return; 11930b57cec5SDimitry Andric } 1194fe6060f1SDimitry Andric if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { 11950b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 11960b57cec5SDimitry Andric .addImm(Value); 11970b57cec5SDimitry Andric return; 11980b57cec5SDimitry Andric } 11990b57cec5SDimitry Andric 12000b57cec5SDimitry Andric unsigned EltSize = 4; 12010b57cec5SDimitry Andric unsigned Opcode = AMDGPU::V_MOV_B32_e32; 12020b57cec5SDimitry Andric if (RI.isSGPRClass(RegClass)) { 12030b57cec5SDimitry Andric if (RI.getRegSizeInBits(*RegClass) > 32) { 12040b57cec5SDimitry Andric Opcode = AMDGPU::S_MOV_B64; 12050b57cec5SDimitry Andric EltSize = 8; 12060b57cec5SDimitry Andric } else { 12070b57cec5SDimitry Andric Opcode = AMDGPU::S_MOV_B32; 12080b57cec5SDimitry Andric EltSize = 4; 12090b57cec5SDimitry Andric } 12100b57cec5SDimitry Andric } 12110b57cec5SDimitry Andric 12120b57cec5SDimitry Andric ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 12130b57cec5SDimitry Andric for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 12140b57cec5SDimitry Andric int64_t IdxValue = Idx == 0 ? 
Value : 0; 12150b57cec5SDimitry Andric 12160b57cec5SDimitry Andric MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 12175ffd83dbSDimitry Andric get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); 12180b57cec5SDimitry Andric Builder.addImm(IdxValue); 12190b57cec5SDimitry Andric } 12200b57cec5SDimitry Andric } 12210b57cec5SDimitry Andric 12220b57cec5SDimitry Andric const TargetRegisterClass * 12230b57cec5SDimitry Andric SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 12240b57cec5SDimitry Andric return &AMDGPU::VGPR_32RegClass; 12250b57cec5SDimitry Andric } 12260b57cec5SDimitry Andric 12270b57cec5SDimitry Andric void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 12280b57cec5SDimitry Andric MachineBasicBlock::iterator I, 12295ffd83dbSDimitry Andric const DebugLoc &DL, Register DstReg, 12300b57cec5SDimitry Andric ArrayRef<MachineOperand> Cond, 12315ffd83dbSDimitry Andric Register TrueReg, 12325ffd83dbSDimitry Andric Register FalseReg) const { 12330b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 12340b57cec5SDimitry Andric const TargetRegisterClass *BoolXExecRC = 12350b57cec5SDimitry Andric RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 12360b57cec5SDimitry Andric assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 12370b57cec5SDimitry Andric "Not a VGPR32 reg"); 12380b57cec5SDimitry Andric 12390b57cec5SDimitry Andric if (Cond.size() == 1) { 12408bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 12410b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 12420b57cec5SDimitry Andric .add(Cond[0]); 12430b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 12440b57cec5SDimitry Andric .addImm(0) 12450b57cec5SDimitry Andric .addReg(FalseReg) 12460b57cec5SDimitry Andric .addImm(0) 12470b57cec5SDimitry Andric .addReg(TrueReg) 12480b57cec5SDimitry Andric .addReg(SReg); 12490b57cec5SDimitry Andric } else if (Cond.size() == 2) { 12500b57cec5SDimitry Andric assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 12510b57cec5SDimitry Andric switch (Cond[0].getImm()) { 12520b57cec5SDimitry Andric case SIInstrInfo::SCC_TRUE: { 12538bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 12540b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 12550b57cec5SDimitry Andric : AMDGPU::S_CSELECT_B64), SReg) 1256480093f4SDimitry Andric .addImm(1) 12570b57cec5SDimitry Andric .addImm(0); 12580b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 12590b57cec5SDimitry Andric .addImm(0) 12600b57cec5SDimitry Andric .addReg(FalseReg) 12610b57cec5SDimitry Andric .addImm(0) 12620b57cec5SDimitry Andric .addReg(TrueReg) 12630b57cec5SDimitry Andric .addReg(SReg); 12640b57cec5SDimitry Andric break; 12650b57cec5SDimitry Andric } 12660b57cec5SDimitry Andric case SIInstrInfo::SCC_FALSE: { 12678bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 12680b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 12690b57cec5SDimitry Andric : AMDGPU::S_CSELECT_B64), SReg) 12700b57cec5SDimitry Andric .addImm(0) 1271480093f4SDimitry Andric .addImm(1); 12720b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 12730b57cec5SDimitry Andric .addImm(0) 12740b57cec5SDimitry Andric .addReg(FalseReg) 12750b57cec5SDimitry Andric .addImm(0) 12760b57cec5SDimitry Andric .addReg(TrueReg) 12770b57cec5SDimitry Andric .addReg(SReg); 12780b57cec5SDimitry Andric break; 12790b57cec5SDimitry Andric } 12800b57cec5SDimitry Andric case SIInstrInfo::VCCNZ: { 12810b57cec5SDimitry Andric MachineOperand RegOp = Cond[1]; 12820b57cec5SDimitry Andric RegOp.setImplicit(false); 12838bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 12840b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 12850b57cec5SDimitry Andric .add(RegOp); 12860b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 12870b57cec5SDimitry Andric .addImm(0) 12880b57cec5SDimitry Andric .addReg(FalseReg) 12890b57cec5SDimitry Andric .addImm(0) 12900b57cec5SDimitry Andric .addReg(TrueReg) 12910b57cec5SDimitry Andric .addReg(SReg); 12920b57cec5SDimitry Andric break; 12930b57cec5SDimitry Andric } 12940b57cec5SDimitry Andric case SIInstrInfo::VCCZ: { 12950b57cec5SDimitry Andric MachineOperand RegOp = Cond[1]; 12960b57cec5SDimitry Andric RegOp.setImplicit(false); 12978bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 12980b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 12990b57cec5SDimitry Andric .add(RegOp); 13000b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 13010b57cec5SDimitry Andric .addImm(0) 13020b57cec5SDimitry Andric .addReg(TrueReg) 13030b57cec5SDimitry Andric .addImm(0) 13040b57cec5SDimitry Andric .addReg(FalseReg) 13050b57cec5SDimitry Andric .addReg(SReg); 13060b57cec5SDimitry Andric break; 13070b57cec5SDimitry Andric } 13080b57cec5SDimitry Andric case SIInstrInfo::EXECNZ: { 13098bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 13108bcb0991SDimitry Andric Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 13110b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 13120b57cec5SDimitry Andric : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 13130b57cec5SDimitry Andric .addImm(0); 13140b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 13150b57cec5SDimitry Andric : AMDGPU::S_CSELECT_B64), SReg) 1316480093f4SDimitry Andric .addImm(1) 13170b57cec5SDimitry Andric .addImm(0); 13180b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 13190b57cec5SDimitry Andric .addImm(0) 13200b57cec5SDimitry Andric .addReg(FalseReg) 13210b57cec5SDimitry Andric .addImm(0) 13220b57cec5SDimitry Andric .addReg(TrueReg) 13230b57cec5SDimitry Andric .addReg(SReg); 13240b57cec5SDimitry Andric break; 13250b57cec5SDimitry Andric } 13260b57cec5SDimitry Andric case SIInstrInfo::EXECZ: { 13278bcb0991SDimitry Andric Register SReg = MRI.createVirtualRegister(BoolXExecRC); 13288bcb0991SDimitry Andric Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 13290b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 13300b57cec5SDimitry Andric : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 13310b57cec5SDimitry Andric .addImm(0); 13320b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 13330b57cec5SDimitry Andric : AMDGPU::S_CSELECT_B64), SReg) 13340b57cec5SDimitry Andric .addImm(0) 1335480093f4SDimitry Andric .addImm(1); 13360b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 13370b57cec5SDimitry Andric .addImm(0) 13380b57cec5SDimitry Andric .addReg(FalseReg) 13390b57cec5SDimitry Andric .addImm(0) 13400b57cec5SDimitry Andric .addReg(TrueReg) 13410b57cec5SDimitry Andric .addReg(SReg); 13420b57cec5SDimitry Andric llvm_unreachable("Unhandled branch predicate EXECZ"); 13430b57cec5SDimitry Andric break; 13440b57cec5SDimitry Andric } 13450b57cec5SDimitry Andric default: 13460b57cec5SDimitry Andric llvm_unreachable("invalid branch predicate"); 13470b57cec5SDimitry Andric } 13480b57cec5SDimitry Andric } else { 13490b57cec5SDimitry Andric llvm_unreachable("Can only handle Cond size 1 or 2"); 13500b57cec5SDimitry Andric } 13510b57cec5SDimitry Andric } 13520b57cec5SDimitry Andric 13535ffd83dbSDimitry Andric Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 13540b57cec5SDimitry Andric MachineBasicBlock::iterator I, 13550b57cec5SDimitry Andric const DebugLoc &DL, 13565ffd83dbSDimitry Andric Register SrcReg, int Value) const { 13570b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 13588bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 13590b57cec5SDimitry Andric BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 13600b57cec5SDimitry Andric .addImm(Value) 13610b57cec5SDimitry Andric .addReg(SrcReg); 13620b57cec5SDimitry Andric 13630b57cec5SDimitry Andric return Reg; 13640b57cec5SDimitry Andric } 13650b57cec5SDimitry Andric 13665ffd83dbSDimitry Andric Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, 13670b57cec5SDimitry Andric MachineBasicBlock::iterator I, 13680b57cec5SDimitry Andric const DebugLoc &DL, 13695ffd83dbSDimitry Andric Register SrcReg, int Value) const { 13700b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 13718bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 13720b57cec5SDimitry Andric BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 13730b57cec5SDimitry Andric .addImm(Value) 13740b57cec5SDimitry Andric .addReg(SrcReg); 13750b57cec5SDimitry Andric 13760b57cec5SDimitry Andric return Reg; 13770b57cec5SDimitry Andric } 13780b57cec5SDimitry Andric 13790b57cec5SDimitry Andric unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 13800b57cec5SDimitry Andric 13814824e7fdSDimitry Andric if (RI.isAGPRClass(DstRC)) 13820b57cec5SDimitry Andric return AMDGPU::COPY; 13835f757f3fSDimitry Andric if (RI.getRegSizeInBits(*DstRC) == 16) { 13845f757f3fSDimitry Andric // Assume hi bits are unneeded. Only _e64 true16 instructions are legal 13855f757f3fSDimitry Andric // before RA. 13865f757f3fSDimitry Andric return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64; 13875f757f3fSDimitry Andric } else if (RI.getRegSizeInBits(*DstRC) == 32) { 13880b57cec5SDimitry Andric return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 13890b57cec5SDimitry Andric } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 13900b57cec5SDimitry Andric return AMDGPU::S_MOV_B64; 13910b57cec5SDimitry Andric } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 13920b57cec5SDimitry Andric return AMDGPU::V_MOV_B64_PSEUDO; 13930b57cec5SDimitry Andric } 13940b57cec5SDimitry Andric return AMDGPU::COPY; 13950b57cec5SDimitry Andric } 13960b57cec5SDimitry Andric 1397e8d8bef9SDimitry Andric const MCInstrDesc & 1398e8d8bef9SDimitry Andric SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, 1399e8d8bef9SDimitry Andric bool IsIndirectSrc) const { 1400e8d8bef9SDimitry Andric if (IsIndirectSrc) { 14015ffd83dbSDimitry Andric if (VecSize <= 32) // 4 bytes 1402e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); 14035ffd83dbSDimitry Andric if (VecSize <= 64) // 8 bytes 1404e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); 14055ffd83dbSDimitry Andric if (VecSize <= 96) // 12 bytes 1406e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); 14075ffd83dbSDimitry Andric if (VecSize <= 128) // 16 bytes 1408e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); 14095ffd83dbSDimitry Andric if (VecSize <= 160) // 20 bytes 1410e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); 14115ffd83dbSDimitry Andric if (VecSize <= 256) // 32 bytes 1412e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); 1413bdd1243dSDimitry Andric if (VecSize <= 288) // 36 bytes 1414bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9); 1415bdd1243dSDimitry Andric if (VecSize <= 320) // 40 bytes 1416bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10); 1417bdd1243dSDimitry Andric if (VecSize <= 352) // 44 bytes 1418bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11); 1419bdd1243dSDimitry Andric if (VecSize <= 384) // 48 bytes 1420bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12); 14215ffd83dbSDimitry Andric if (VecSize <= 512) // 64 bytes 1422e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); 14235ffd83dbSDimitry Andric if (VecSize <= 1024) // 128 bytes 1424e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); 14255ffd83dbSDimitry Andric 1426e8d8bef9SDimitry Andric llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); 14275ffd83dbSDimitry Andric } 14285ffd83dbSDimitry Andric 14295ffd83dbSDimitry Andric if (VecSize <= 32) // 4 bytes 1430e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); 14315ffd83dbSDimitry Andric if (VecSize <= 64) // 8 bytes 1432e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); 14335ffd83dbSDimitry Andric if (VecSize <= 96) // 12 bytes 1434e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); 14355ffd83dbSDimitry Andric if (VecSize <= 128) // 16 bytes 1436e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); 14375ffd83dbSDimitry Andric if (VecSize <= 160) // 20 bytes 1438e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); 14395ffd83dbSDimitry Andric if (VecSize <= 256) // 32 bytes 1440e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); 1441bdd1243dSDimitry Andric if (VecSize 
<= 288) // 36 bytes 1442bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9); 1443bdd1243dSDimitry Andric if (VecSize <= 320) // 40 bytes 1444bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10); 1445bdd1243dSDimitry Andric if (VecSize <= 352) // 44 bytes 1446bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11); 1447bdd1243dSDimitry Andric if (VecSize <= 384) // 48 bytes 1448bdd1243dSDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12); 14495ffd83dbSDimitry Andric if (VecSize <= 512) // 64 bytes 1450e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); 14515ffd83dbSDimitry Andric if (VecSize <= 1024) // 128 bytes 1452e8d8bef9SDimitry Andric return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); 14535ffd83dbSDimitry Andric 1454e8d8bef9SDimitry Andric llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); 14555ffd83dbSDimitry Andric } 14565ffd83dbSDimitry Andric 1457e8d8bef9SDimitry Andric static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { 1458e8d8bef9SDimitry Andric if (VecSize <= 32) // 4 bytes 1459e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; 14605ffd83dbSDimitry Andric if (VecSize <= 64) // 8 bytes 1461e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1462e8d8bef9SDimitry Andric if (VecSize <= 96) // 12 bytes 1463e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; 14645ffd83dbSDimitry Andric if (VecSize <= 128) // 16 bytes 1465e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1466e8d8bef9SDimitry Andric if (VecSize <= 160) // 20 bytes 1467e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; 14685ffd83dbSDimitry Andric if (VecSize <= 256) // 32 bytes 1469e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; 1470bdd1243dSDimitry Andric if (VecSize <= 288) // 36 bytes 1471bdd1243dSDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9; 1472bdd1243dSDimitry Andric if (VecSize <= 320) // 40 bytes 1473bdd1243dSDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10; 1474bdd1243dSDimitry Andric if (VecSize <= 352) // 44 bytes 1475bdd1243dSDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11; 1476bdd1243dSDimitry Andric if (VecSize <= 384) // 48 bytes 1477bdd1243dSDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12; 14785ffd83dbSDimitry Andric if (VecSize <= 512) // 64 bytes 1479e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; 14805ffd83dbSDimitry Andric if (VecSize <= 1024) // 128 bytes 1481e8d8bef9SDimitry Andric return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; 14825ffd83dbSDimitry Andric 14835ffd83dbSDimitry Andric llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 14845ffd83dbSDimitry Andric } 14855ffd83dbSDimitry Andric 1486e8d8bef9SDimitry Andric static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { 1487e8d8bef9SDimitry Andric if (VecSize <= 32) // 4 bytes 1488e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; 1489e8d8bef9SDimitry Andric if (VecSize <= 64) // 8 bytes 1490e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1491e8d8bef9SDimitry Andric if (VecSize <= 96) // 12 bytes 1492e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; 1493e8d8bef9SDimitry Andric if (VecSize <= 128) // 16 bytes 
1494e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1495e8d8bef9SDimitry Andric if (VecSize <= 160) // 20 bytes 1496e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; 1497e8d8bef9SDimitry Andric if (VecSize <= 256) // 32 bytes 1498e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; 149906c3fb27SDimitry Andric if (VecSize <= 288) // 36 bytes 150006c3fb27SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9; 150106c3fb27SDimitry Andric if (VecSize <= 320) // 40 bytes 150206c3fb27SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10; 150306c3fb27SDimitry Andric if (VecSize <= 352) // 44 bytes 150406c3fb27SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11; 150506c3fb27SDimitry Andric if (VecSize <= 384) // 48 bytes 150606c3fb27SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12; 1507e8d8bef9SDimitry Andric if (VecSize <= 512) // 64 bytes 1508e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; 1509e8d8bef9SDimitry Andric if (VecSize <= 1024) // 128 bytes 1510e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; 1511e8d8bef9SDimitry Andric 1512e8d8bef9SDimitry Andric llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1513e8d8bef9SDimitry Andric } 1514e8d8bef9SDimitry Andric 1515e8d8bef9SDimitry Andric static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { 1516e8d8bef9SDimitry Andric if (VecSize <= 64) // 8 bytes 1517e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; 1518e8d8bef9SDimitry Andric if (VecSize <= 128) // 16 bytes 1519e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; 1520e8d8bef9SDimitry Andric if (VecSize <= 256) // 32 bytes 1521e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; 1522e8d8bef9SDimitry Andric if (VecSize <= 512) // 64 bytes 1523e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; 1524e8d8bef9SDimitry Andric if (VecSize <= 1024) // 128 bytes 1525e8d8bef9SDimitry Andric return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; 1526e8d8bef9SDimitry Andric 1527e8d8bef9SDimitry Andric llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1528e8d8bef9SDimitry Andric } 1529e8d8bef9SDimitry Andric 1530e8d8bef9SDimitry Andric const MCInstrDesc & 1531e8d8bef9SDimitry Andric SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, 1532e8d8bef9SDimitry Andric bool IsSGPR) const { 15335ffd83dbSDimitry Andric if (IsSGPR) { 15345ffd83dbSDimitry Andric switch (EltSize) { 15355ffd83dbSDimitry Andric case 32: 1536e8d8bef9SDimitry Andric return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); 15375ffd83dbSDimitry Andric case 64: 1538e8d8bef9SDimitry Andric return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); 15395ffd83dbSDimitry Andric default: 15405ffd83dbSDimitry Andric llvm_unreachable("invalid reg indexing elt size"); 15415ffd83dbSDimitry Andric } 15425ffd83dbSDimitry Andric } 15435ffd83dbSDimitry Andric 15445ffd83dbSDimitry Andric assert(EltSize == 32 && "invalid reg indexing elt size"); 1545e8d8bef9SDimitry Andric return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); 15465ffd83dbSDimitry Andric } 15475ffd83dbSDimitry Andric 15480b57cec5SDimitry Andric static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 15490b57cec5SDimitry Andric switch (Size) { 15500b57cec5SDimitry Andric case 4: 15510b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S32_SAVE; 
15520b57cec5SDimitry Andric case 8: 15530b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S64_SAVE; 15540b57cec5SDimitry Andric case 12: 15550b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S96_SAVE; 15560b57cec5SDimitry Andric case 16: 15570b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S128_SAVE; 15580b57cec5SDimitry Andric case 20: 15590b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S160_SAVE; 15605ffd83dbSDimitry Andric case 24: 15615ffd83dbSDimitry Andric return AMDGPU::SI_SPILL_S192_SAVE; 1562fe6060f1SDimitry Andric case 28: 1563fe6060f1SDimitry Andric return AMDGPU::SI_SPILL_S224_SAVE; 15640b57cec5SDimitry Andric case 32: 15650b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S256_SAVE; 1566bdd1243dSDimitry Andric case 36: 1567bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_S288_SAVE; 1568bdd1243dSDimitry Andric case 40: 1569bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_S320_SAVE; 1570bdd1243dSDimitry Andric case 44: 1571bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_S352_SAVE; 1572bdd1243dSDimitry Andric case 48: 1573bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_S384_SAVE; 15740b57cec5SDimitry Andric case 64: 15750b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S512_SAVE; 15760b57cec5SDimitry Andric case 128: 15770b57cec5SDimitry Andric return AMDGPU::SI_SPILL_S1024_SAVE; 15780b57cec5SDimitry Andric default: 15790b57cec5SDimitry Andric llvm_unreachable("unknown register size"); 15800b57cec5SDimitry Andric } 15810b57cec5SDimitry Andric } 15820b57cec5SDimitry Andric 15830b57cec5SDimitry Andric static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 15840b57cec5SDimitry Andric switch (Size) { 15850b57cec5SDimitry Andric case 4: 15860b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V32_SAVE; 15870b57cec5SDimitry Andric case 8: 15880b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V64_SAVE; 15890b57cec5SDimitry Andric case 12: 15900b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V96_SAVE; 15910b57cec5SDimitry Andric case 16: 15920b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V128_SAVE; 15930b57cec5SDimitry Andric case 20: 15940b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V160_SAVE; 15955ffd83dbSDimitry Andric case 24: 15965ffd83dbSDimitry Andric return AMDGPU::SI_SPILL_V192_SAVE; 1597fe6060f1SDimitry Andric case 28: 1598fe6060f1SDimitry Andric return AMDGPU::SI_SPILL_V224_SAVE; 15990b57cec5SDimitry Andric case 32: 16000b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V256_SAVE; 1601bdd1243dSDimitry Andric case 36: 1602bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V288_SAVE; 1603bdd1243dSDimitry Andric case 40: 1604bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V320_SAVE; 1605bdd1243dSDimitry Andric case 44: 1606bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V352_SAVE; 1607bdd1243dSDimitry Andric case 48: 1608bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V384_SAVE; 16090b57cec5SDimitry Andric case 64: 16100b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V512_SAVE; 16110b57cec5SDimitry Andric case 128: 16120b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V1024_SAVE; 16130b57cec5SDimitry Andric default: 16140b57cec5SDimitry Andric llvm_unreachable("unknown register size"); 16150b57cec5SDimitry Andric } 16160b57cec5SDimitry Andric } 16170b57cec5SDimitry Andric 16180b57cec5SDimitry Andric static unsigned getAGPRSpillSaveOpcode(unsigned Size) { 16190b57cec5SDimitry Andric switch (Size) { 16200b57cec5SDimitry Andric case 4: 16210b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A32_SAVE; 16220b57cec5SDimitry Andric case 8: 16230b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A64_SAVE; 
1624e8d8bef9SDimitry Andric   case 12:
1625e8d8bef9SDimitry Andric     return AMDGPU::SI_SPILL_A96_SAVE;
16260b57cec5SDimitry Andric   case 16:
16270b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_A128_SAVE;
1628e8d8bef9SDimitry Andric   case 20:
1629e8d8bef9SDimitry Andric     return AMDGPU::SI_SPILL_A160_SAVE;
1630e8d8bef9SDimitry Andric   case 24:
1631e8d8bef9SDimitry Andric     return AMDGPU::SI_SPILL_A192_SAVE;
1632fe6060f1SDimitry Andric   case 28:
1633fe6060f1SDimitry Andric     return AMDGPU::SI_SPILL_A224_SAVE;
1634e8d8bef9SDimitry Andric   case 32:
1635e8d8bef9SDimitry Andric     return AMDGPU::SI_SPILL_A256_SAVE;
1636bdd1243dSDimitry Andric   case 36:
1637bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_A288_SAVE;
1638bdd1243dSDimitry Andric   case 40:
1639bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_A320_SAVE;
1640bdd1243dSDimitry Andric   case 44:
1641bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_A352_SAVE;
1642bdd1243dSDimitry Andric   case 48:
1643bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_A384_SAVE;
16440b57cec5SDimitry Andric   case 64:
16450b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_A512_SAVE;
16460b57cec5SDimitry Andric   case 128:
16470b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_A1024_SAVE;
16480b57cec5SDimitry Andric   default:
16490b57cec5SDimitry Andric     llvm_unreachable("unknown register size");
16500b57cec5SDimitry Andric   }
16510b57cec5SDimitry Andric }
16520b57cec5SDimitry Andric 
16530eae32dcSDimitry Andric static unsigned getAVSpillSaveOpcode(unsigned Size) {
16540eae32dcSDimitry Andric   switch (Size) {
16550eae32dcSDimitry Andric   case 4:
16560eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV32_SAVE;
16570eae32dcSDimitry Andric   case 8:
16580eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV64_SAVE;
16590eae32dcSDimitry Andric   case 12:
16600eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV96_SAVE;
16610eae32dcSDimitry Andric   case 16:
16620eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV128_SAVE;
16630eae32dcSDimitry Andric   case 20:
16640eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV160_SAVE;
16650eae32dcSDimitry Andric   case 24:
16660eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV192_SAVE;
16670eae32dcSDimitry Andric   case 28:
16680eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV224_SAVE;
16690eae32dcSDimitry Andric   case 32:
16700eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV256_SAVE;
1671bdd1243dSDimitry Andric   case 36:
1672bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV288_SAVE;
1673bdd1243dSDimitry Andric   case 40:
1674bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV320_SAVE;
1675bdd1243dSDimitry Andric   case 44:
1676bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV352_SAVE;
1677bdd1243dSDimitry Andric   case 48:
1678bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV384_SAVE;
16790eae32dcSDimitry Andric   case 64:
16800eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV512_SAVE;
16810eae32dcSDimitry Andric   case 128:
16820eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV1024_SAVE;
16830eae32dcSDimitry Andric   default:
16840eae32dcSDimitry Andric     llvm_unreachable("unknown register size");
16850eae32dcSDimitry Andric   }
16860eae32dcSDimitry Andric }
16870eae32dcSDimitry Andric 
16885f757f3fSDimitry Andric static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
16895f757f3fSDimitry Andric                                          bool IsVectorSuperClass) {
169006c3fb27SDimitry Andric   // Currently, only 32-bit WWM register spills are needed.
169106c3fb27SDimitry Andric   if (Size != 4)
169206c3fb27SDimitry Andric     llvm_unreachable("unknown wwm register spill size");
169306c3fb27SDimitry Andric 
16945f757f3fSDimitry Andric   if (IsVectorSuperClass)
16955f757f3fSDimitry Andric     return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
16965f757f3fSDimitry Andric 
169706c3fb27SDimitry Andric   return AMDGPU::SI_SPILL_WWM_V32_SAVE;
169806c3fb27SDimitry Andric }
169906c3fb27SDimitry Andric 
170006c3fb27SDimitry Andric static unsigned getVectorRegSpillSaveOpcode(Register Reg,
170106c3fb27SDimitry Andric                                             const TargetRegisterClass *RC,
170206c3fb27SDimitry Andric                                             unsigned Size,
170306c3fb27SDimitry Andric                                             const SIRegisterInfo &TRI,
170406c3fb27SDimitry Andric                                             const SIMachineFunctionInfo &MFI) {
17055f757f3fSDimitry Andric   bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
17065f757f3fSDimitry Andric 
170706c3fb27SDimitry Andric   // Choose the right opcode if spilling a WWM register.
170806c3fb27SDimitry Andric   if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
17095f757f3fSDimitry Andric     return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
171006c3fb27SDimitry Andric 
17115f757f3fSDimitry Andric   if (IsVectorSuperClass)
171206c3fb27SDimitry Andric     return getAVSpillSaveOpcode(Size);
171306c3fb27SDimitry Andric 
171406c3fb27SDimitry Andric   return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
171506c3fb27SDimitry Andric                              : getVGPRSpillSaveOpcode(Size);
171606c3fb27SDimitry Andric }
171706c3fb27SDimitry Andric 
1718bdd1243dSDimitry Andric void SIInstrInfo::storeRegToStackSlot(
1719bdd1243dSDimitry Andric     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720bdd1243dSDimitry Andric     bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721bdd1243dSDimitry Andric     const TargetRegisterInfo *TRI, Register VReg) const {
17220b57cec5SDimitry Andric   MachineFunction *MF = MBB.getParent();
17230b57cec5SDimitry Andric   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
17240b57cec5SDimitry Andric   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
17250b57cec5SDimitry Andric   const DebugLoc &DL = MBB.findDebugLoc(MI);
17260b57cec5SDimitry Andric 
17270b57cec5SDimitry Andric   MachinePointerInfo PtrInfo
17280b57cec5SDimitry Andric     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
17295ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF->getMachineMemOperand(
17305ffd83dbSDimitry Andric       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
17315ffd83dbSDimitry Andric       FrameInfo.getObjectAlign(FrameIndex));
17320b57cec5SDimitry Andric   unsigned SpillSize = TRI->getSpillSize(*RC);
17330b57cec5SDimitry Andric 
17344824e7fdSDimitry Andric   MachineRegisterInfo &MRI = MF->getRegInfo();
17350b57cec5SDimitry Andric   if (RI.isSGPRClass(RC)) {
17360b57cec5SDimitry Andric     MFI->setHasSpilledSGPRs();
1737480093f4SDimitry Andric     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
17385ffd83dbSDimitry Andric     assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
17395ffd83dbSDimitry Andric            SrcReg != AMDGPU::EXEC && "exec should not be spilled");
17400b57cec5SDimitry Andric 
17410b57cec5SDimitry Andric     // We are only allowed to create one new instruction when spilling
17420b57cec5SDimitry Andric     // registers, so we need to use a pseudo instruction for spilling SGPRs.
17430b57cec5SDimitry Andric     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
17440b57cec5SDimitry Andric 
17450b57cec5SDimitry Andric     // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
17460b57cec5SDimitry Andric     // to make sure we are using the correct register class.
1747e8d8bef9SDimitry Andric     if (SrcReg.isVirtual() && SpillSize == 4) {
17485ffd83dbSDimitry Andric       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
17490b57cec5SDimitry Andric     }
17500b57cec5SDimitry Andric 
17518bcb0991SDimitry Andric     BuildMI(MBB, MI, DL, OpDesc)
17520b57cec5SDimitry Andric       .addReg(SrcReg, getKillRegState(isKill)) // data
17530b57cec5SDimitry Andric       .addFrameIndex(FrameIndex)               // addr
17540b57cec5SDimitry Andric       .addMemOperand(MMO)
17550b57cec5SDimitry Andric       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756e8d8bef9SDimitry Andric 
17570b57cec5SDimitry Andric     if (RI.spillSGPRToVGPR())
17580b57cec5SDimitry Andric       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
17590b57cec5SDimitry Andric     return;
17600b57cec5SDimitry Andric   }
17610b57cec5SDimitry Andric 
176206c3fb27SDimitry Andric   unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
176306c3fb27SDimitry Andric                                                 SpillSize, RI, *MFI);
17640b57cec5SDimitry Andric   MFI->setHasSpilledVGPRs();
17650b57cec5SDimitry Andric 
1766e8d8bef9SDimitry Andric   BuildMI(MBB, MI, DL, get(Opcode))
1767e8d8bef9SDimitry Andric     .addReg(SrcReg, getKillRegState(isKill)) // data
17680b57cec5SDimitry Andric     .addFrameIndex(FrameIndex)               // addr
17690b57cec5SDimitry Andric     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
17700b57cec5SDimitry Andric     .addImm(0)                               // offset
17710b57cec5SDimitry Andric     .addMemOperand(MMO);
17720b57cec5SDimitry Andric }
17730b57cec5SDimitry Andric 
17740b57cec5SDimitry Andric static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
17750b57cec5SDimitry Andric   switch (Size) {
17760b57cec5SDimitry Andric   case 4:
17770b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S32_RESTORE;
17780b57cec5SDimitry Andric   case 8:
17790b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S64_RESTORE;
17800b57cec5SDimitry Andric   case 12:
17810b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S96_RESTORE;
17820b57cec5SDimitry Andric   case 16:
17830b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S128_RESTORE;
17840b57cec5SDimitry Andric   case 20:
17850b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S160_RESTORE;
17865ffd83dbSDimitry Andric   case 24:
17875ffd83dbSDimitry Andric     return AMDGPU::SI_SPILL_S192_RESTORE;
1788fe6060f1SDimitry Andric   case 28:
1789fe6060f1SDimitry Andric     return AMDGPU::SI_SPILL_S224_RESTORE;
17900b57cec5SDimitry Andric   case 32:
17910b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S256_RESTORE;
1792bdd1243dSDimitry Andric   case 36:
1793bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_S288_RESTORE;
1794bdd1243dSDimitry Andric   case 40:
1795bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_S320_RESTORE;
1796bdd1243dSDimitry Andric   case 44:
1797bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_S352_RESTORE;
1798bdd1243dSDimitry Andric   case 48:
1799bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_S384_RESTORE;
18000b57cec5SDimitry Andric   case 64:
18010b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S512_RESTORE;
18020b57cec5SDimitry Andric   case 128:
18030b57cec5SDimitry Andric     return AMDGPU::SI_SPILL_S1024_RESTORE;
18040b57cec5SDimitry Andric   default:
18050b57cec5SDimitry Andric     llvm_unreachable("unknown register size");
18060b57cec5SDimitry Andric   }
18070b57cec5SDimitry Andric }
18080b57cec5SDimitry Andric 18090b57cec5SDimitry Andric static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 18100b57cec5SDimitry Andric switch (Size) { 18110b57cec5SDimitry Andric case 4: 18120b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V32_RESTORE; 18130b57cec5SDimitry Andric case 8: 18140b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V64_RESTORE; 18150b57cec5SDimitry Andric case 12: 18160b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V96_RESTORE; 18170b57cec5SDimitry Andric case 16: 18180b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V128_RESTORE; 18190b57cec5SDimitry Andric case 20: 18200b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V160_RESTORE; 18215ffd83dbSDimitry Andric case 24: 18225ffd83dbSDimitry Andric return AMDGPU::SI_SPILL_V192_RESTORE; 1823fe6060f1SDimitry Andric case 28: 1824fe6060f1SDimitry Andric return AMDGPU::SI_SPILL_V224_RESTORE; 18250b57cec5SDimitry Andric case 32: 18260b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V256_RESTORE; 1827bdd1243dSDimitry Andric case 36: 1828bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V288_RESTORE; 1829bdd1243dSDimitry Andric case 40: 1830bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V320_RESTORE; 1831bdd1243dSDimitry Andric case 44: 1832bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V352_RESTORE; 1833bdd1243dSDimitry Andric case 48: 1834bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_V384_RESTORE; 18350b57cec5SDimitry Andric case 64: 18360b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V512_RESTORE; 18370b57cec5SDimitry Andric case 128: 18380b57cec5SDimitry Andric return AMDGPU::SI_SPILL_V1024_RESTORE; 18390b57cec5SDimitry Andric default: 18400b57cec5SDimitry Andric llvm_unreachable("unknown register size"); 18410b57cec5SDimitry Andric } 18420b57cec5SDimitry Andric } 18430b57cec5SDimitry Andric 18440b57cec5SDimitry Andric static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { 18450b57cec5SDimitry Andric switch (Size) { 18460b57cec5SDimitry Andric case 4: 18470b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A32_RESTORE; 18480b57cec5SDimitry Andric case 8: 18490b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A64_RESTORE; 1850e8d8bef9SDimitry Andric case 12: 1851e8d8bef9SDimitry Andric return AMDGPU::SI_SPILL_A96_RESTORE; 18520b57cec5SDimitry Andric case 16: 18530b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A128_RESTORE; 1854e8d8bef9SDimitry Andric case 20: 1855e8d8bef9SDimitry Andric return AMDGPU::SI_SPILL_A160_RESTORE; 1856e8d8bef9SDimitry Andric case 24: 1857e8d8bef9SDimitry Andric return AMDGPU::SI_SPILL_A192_RESTORE; 1858fe6060f1SDimitry Andric case 28: 1859fe6060f1SDimitry Andric return AMDGPU::SI_SPILL_A224_RESTORE; 1860e8d8bef9SDimitry Andric case 32: 1861e8d8bef9SDimitry Andric return AMDGPU::SI_SPILL_A256_RESTORE; 1862bdd1243dSDimitry Andric case 36: 1863bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_A288_RESTORE; 1864bdd1243dSDimitry Andric case 40: 1865bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_A320_RESTORE; 1866bdd1243dSDimitry Andric case 44: 1867bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_A352_RESTORE; 1868bdd1243dSDimitry Andric case 48: 1869bdd1243dSDimitry Andric return AMDGPU::SI_SPILL_A384_RESTORE; 18700b57cec5SDimitry Andric case 64: 18710b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A512_RESTORE; 18720b57cec5SDimitry Andric case 128: 18730b57cec5SDimitry Andric return AMDGPU::SI_SPILL_A1024_RESTORE; 18740b57cec5SDimitry Andric default: 18750b57cec5SDimitry Andric llvm_unreachable("unknown register size"); 18760b57cec5SDimitry Andric } 18770b57cec5SDimitry Andric } 18780b57cec5SDimitry 
Andric 
18790eae32dcSDimitry Andric static unsigned getAVSpillRestoreOpcode(unsigned Size) {
18800eae32dcSDimitry Andric   switch (Size) {
18810eae32dcSDimitry Andric   case 4:
18820eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV32_RESTORE;
18830eae32dcSDimitry Andric   case 8:
18840eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV64_RESTORE;
18850eae32dcSDimitry Andric   case 12:
18860eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV96_RESTORE;
18870eae32dcSDimitry Andric   case 16:
18880eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV128_RESTORE;
18890eae32dcSDimitry Andric   case 20:
18900eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV160_RESTORE;
18910eae32dcSDimitry Andric   case 24:
18920eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV192_RESTORE;
18930eae32dcSDimitry Andric   case 28:
18940eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV224_RESTORE;
18950eae32dcSDimitry Andric   case 32:
18960eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV256_RESTORE;
1897bdd1243dSDimitry Andric   case 36:
1898bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV288_RESTORE;
1899bdd1243dSDimitry Andric   case 40:
1900bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV320_RESTORE;
1901bdd1243dSDimitry Andric   case 44:
1902bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV352_RESTORE;
1903bdd1243dSDimitry Andric   case 48:
1904bdd1243dSDimitry Andric     return AMDGPU::SI_SPILL_AV384_RESTORE;
19050eae32dcSDimitry Andric   case 64:
19060eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV512_RESTORE;
19070eae32dcSDimitry Andric   case 128:
19080eae32dcSDimitry Andric     return AMDGPU::SI_SPILL_AV1024_RESTORE;
19090eae32dcSDimitry Andric   default:
19100eae32dcSDimitry Andric     llvm_unreachable("unknown register size");
19110eae32dcSDimitry Andric   }
19120eae32dcSDimitry Andric }
19130eae32dcSDimitry Andric 
19145f757f3fSDimitry Andric static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
19155f757f3fSDimitry Andric                                             bool IsVectorSuperClass) {
191606c3fb27SDimitry Andric   // Currently, only 32-bit WWM register spills are needed.
191706c3fb27SDimitry Andric   if (Size != 4)
191806c3fb27SDimitry Andric     llvm_unreachable("unknown wwm register spill size");
191906c3fb27SDimitry Andric 
19205f757f3fSDimitry Andric   if (IsVectorSuperClass)
19215f757f3fSDimitry Andric     return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
19225f757f3fSDimitry Andric 
192306c3fb27SDimitry Andric   return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
192406c3fb27SDimitry Andric }
192506c3fb27SDimitry Andric 
192606c3fb27SDimitry Andric static unsigned
192706c3fb27SDimitry Andric getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
192806c3fb27SDimitry Andric                                unsigned Size, const SIRegisterInfo &TRI,
192906c3fb27SDimitry Andric                                const SIMachineFunctionInfo &MFI) {
19305f757f3fSDimitry Andric   bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
19315f757f3fSDimitry Andric 
193206c3fb27SDimitry Andric   // Choose the right opcode if restoring a WWM register.
193306c3fb27SDimitry Andric   if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
19345f757f3fSDimitry Andric     return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
193506c3fb27SDimitry Andric 
19365f757f3fSDimitry Andric   if (IsVectorSuperClass)
193706c3fb27SDimitry Andric     return getAVSpillRestoreOpcode(Size);
193806c3fb27SDimitry Andric 
193906c3fb27SDimitry Andric   return TRI.isAGPRClass(RC) ?
getAGPRSpillRestoreOpcode(Size) 194006c3fb27SDimitry Andric : getVGPRSpillRestoreOpcode(Size); 194106c3fb27SDimitry Andric } 194206c3fb27SDimitry Andric 19430b57cec5SDimitry Andric void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 19440b57cec5SDimitry Andric MachineBasicBlock::iterator MI, 19455ffd83dbSDimitry Andric Register DestReg, int FrameIndex, 19460b57cec5SDimitry Andric const TargetRegisterClass *RC, 1947bdd1243dSDimitry Andric const TargetRegisterInfo *TRI, 1948bdd1243dSDimitry Andric Register VReg) const { 19490b57cec5SDimitry Andric MachineFunction *MF = MBB.getParent(); 19500b57cec5SDimitry Andric SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 19510b57cec5SDimitry Andric MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 19520b57cec5SDimitry Andric const DebugLoc &DL = MBB.findDebugLoc(MI); 19530b57cec5SDimitry Andric unsigned SpillSize = TRI->getSpillSize(*RC); 19540b57cec5SDimitry Andric 19550b57cec5SDimitry Andric MachinePointerInfo PtrInfo 19560b57cec5SDimitry Andric = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 19570b57cec5SDimitry Andric 19580b57cec5SDimitry Andric MachineMemOperand *MMO = MF->getMachineMemOperand( 19595ffd83dbSDimitry Andric PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), 19605ffd83dbSDimitry Andric FrameInfo.getObjectAlign(FrameIndex)); 19610b57cec5SDimitry Andric 19620b57cec5SDimitry Andric if (RI.isSGPRClass(RC)) { 19630b57cec5SDimitry Andric MFI->setHasSpilledSGPRs(); 1964480093f4SDimitry Andric assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); 19655ffd83dbSDimitry Andric assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && 19665ffd83dbSDimitry Andric DestReg != AMDGPU::EXEC && "exec should not be spilled"); 19670b57cec5SDimitry Andric 19680b57cec5SDimitry Andric // FIXME: Maybe this should not include a memoperand because it will be 19690b57cec5SDimitry Andric // lowered to non-memory instructions. 19700b57cec5SDimitry Andric const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 19715ffd83dbSDimitry Andric if (DestReg.isVirtual() && SpillSize == 4) { 19720b57cec5SDimitry Andric MachineRegisterInfo &MRI = MF->getRegInfo(); 19735ffd83dbSDimitry Andric MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 19740b57cec5SDimitry Andric } 19750b57cec5SDimitry Andric 19760b57cec5SDimitry Andric if (RI.spillSGPRToVGPR()) 19770b57cec5SDimitry Andric FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); 19788bcb0991SDimitry Andric BuildMI(MBB, MI, DL, OpDesc, DestReg) 19790b57cec5SDimitry Andric .addFrameIndex(FrameIndex) // addr 19800b57cec5SDimitry Andric .addMemOperand(MMO) 19810b57cec5SDimitry Andric .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); 1982e8d8bef9SDimitry Andric 19830b57cec5SDimitry Andric return; 19840b57cec5SDimitry Andric } 19850b57cec5SDimitry Andric 198606c3fb27SDimitry Andric unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? 
VReg : DestReg, RC, 198706c3fb27SDimitry Andric SpillSize, RI, *MFI); 1988e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, get(Opcode), DestReg) 1989e8d8bef9SDimitry Andric .addFrameIndex(FrameIndex) // vaddr 19900b57cec5SDimitry Andric .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset 19910b57cec5SDimitry Andric .addImm(0) // offset 19920b57cec5SDimitry Andric .addMemOperand(MMO); 19930b57cec5SDimitry Andric } 19940b57cec5SDimitry Andric 19950b57cec5SDimitry Andric void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, 19960b57cec5SDimitry Andric MachineBasicBlock::iterator MI) const { 1997e8d8bef9SDimitry Andric insertNoops(MBB, MI, 1); 1998e8d8bef9SDimitry Andric } 1999e8d8bef9SDimitry Andric 2000e8d8bef9SDimitry Andric void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, 2001e8d8bef9SDimitry Andric MachineBasicBlock::iterator MI, 2002e8d8bef9SDimitry Andric unsigned Quantity) const { 2003e8d8bef9SDimitry Andric DebugLoc DL = MBB.findDebugLoc(MI); 2004e8d8bef9SDimitry Andric while (Quantity > 0) { 2005e8d8bef9SDimitry Andric unsigned Arg = std::min(Quantity, 8u); 2006e8d8bef9SDimitry Andric Quantity -= Arg; 2007e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); 2008e8d8bef9SDimitry Andric } 20090b57cec5SDimitry Andric } 20100b57cec5SDimitry Andric 20110b57cec5SDimitry Andric void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { 20120b57cec5SDimitry Andric auto MF = MBB.getParent(); 20130b57cec5SDimitry Andric SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 20140b57cec5SDimitry Andric 20150b57cec5SDimitry Andric assert(Info->isEntryFunction()); 20160b57cec5SDimitry Andric 20170b57cec5SDimitry Andric if (MBB.succ_empty()) { 20180b57cec5SDimitry Andric bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); 20190b57cec5SDimitry Andric if (HasNoTerminator) { 20200b57cec5SDimitry Andric if (Info->returnsVoid()) { 20210b57cec5SDimitry Andric BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); 20220b57cec5SDimitry Andric } else { 20230b57cec5SDimitry Andric BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); 20240b57cec5SDimitry Andric } 20250b57cec5SDimitry Andric } 20260b57cec5SDimitry Andric } 20270b57cec5SDimitry Andric } 20280b57cec5SDimitry Andric 20290b57cec5SDimitry Andric unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { 20300b57cec5SDimitry Andric switch (MI.getOpcode()) { 2031349cc55cSDimitry Andric default: 2032349cc55cSDimitry Andric if (MI.isMetaInstruction()) 2033349cc55cSDimitry Andric return 0; 2034349cc55cSDimitry Andric return 1; // FIXME: Do wait states equal cycles? 20350b57cec5SDimitry Andric 20360b57cec5SDimitry Andric case AMDGPU::S_NOP: 20370b57cec5SDimitry Andric return MI.getOperand(0).getImm() + 1; 2038349cc55cSDimitry Andric // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The 2039349cc55cSDimitry Andric // hazard, even if one exists, won't really be visible. Should we handle it?
20400b57cec5SDimitry Andric } 20410b57cec5SDimitry Andric } 20420b57cec5SDimitry Andric 20430b57cec5SDimitry Andric bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 2044fe6060f1SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 20450b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 20460b57cec5SDimitry Andric DebugLoc DL = MBB.findDebugLoc(MI); 20470b57cec5SDimitry Andric switch (MI.getOpcode()) { 20480b57cec5SDimitry Andric default: return TargetInstrInfo::expandPostRAPseudo(MI); 20490b57cec5SDimitry Andric case AMDGPU::S_MOV_B64_term: 20500b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20510b57cec5SDimitry Andric // register allocation. 20520b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_MOV_B64)); 20530b57cec5SDimitry Andric break; 20540b57cec5SDimitry Andric 20550b57cec5SDimitry Andric case AMDGPU::S_MOV_B32_term: 20560b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20570b57cec5SDimitry Andric // register allocation. 20580b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_MOV_B32)); 20590b57cec5SDimitry Andric break; 20600b57cec5SDimitry Andric 20610b57cec5SDimitry Andric case AMDGPU::S_XOR_B64_term: 20620b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20630b57cec5SDimitry Andric // register allocation. 20640b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_XOR_B64)); 20650b57cec5SDimitry Andric break; 20660b57cec5SDimitry Andric 20670b57cec5SDimitry Andric case AMDGPU::S_XOR_B32_term: 20680b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20690b57cec5SDimitry Andric // register allocation. 20700b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_XOR_B32)); 20710b57cec5SDimitry Andric break; 2072e8d8bef9SDimitry Andric case AMDGPU::S_OR_B64_term: 2073e8d8bef9SDimitry Andric // This is only a terminator to get the correct spill code placement during 2074e8d8bef9SDimitry Andric // register allocation. 2075e8d8bef9SDimitry Andric MI.setDesc(get(AMDGPU::S_OR_B64)); 2076e8d8bef9SDimitry Andric break; 20770b57cec5SDimitry Andric case AMDGPU::S_OR_B32_term: 20780b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20790b57cec5SDimitry Andric // register allocation. 20800b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_OR_B32)); 20810b57cec5SDimitry Andric break; 20820b57cec5SDimitry Andric 20830b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B64_term: 20840b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20850b57cec5SDimitry Andric // register allocation. 20860b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 20870b57cec5SDimitry Andric break; 20880b57cec5SDimitry Andric 20890b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B32_term: 20900b57cec5SDimitry Andric // This is only a terminator to get the correct spill code placement during 20910b57cec5SDimitry Andric // register allocation. 20920b57cec5SDimitry Andric MI.setDesc(get(AMDGPU::S_ANDN2_B32)); 20930b57cec5SDimitry Andric break; 20940b57cec5SDimitry Andric 2095fe6060f1SDimitry Andric case AMDGPU::S_AND_B64_term: 2096fe6060f1SDimitry Andric // This is only a terminator to get the correct spill code placement during 2097fe6060f1SDimitry Andric // register allocation. 
2098fe6060f1SDimitry Andric MI.setDesc(get(AMDGPU::S_AND_B64)); 2099fe6060f1SDimitry Andric break; 2100fe6060f1SDimitry Andric 2101fe6060f1SDimitry Andric case AMDGPU::S_AND_B32_term: 2102fe6060f1SDimitry Andric // This is only a terminator to get the correct spill code placement during 2103fe6060f1SDimitry Andric // register allocation. 2104fe6060f1SDimitry Andric MI.setDesc(get(AMDGPU::S_AND_B32)); 2105fe6060f1SDimitry Andric break; 2106fe6060f1SDimitry Andric 210706c3fb27SDimitry Andric case AMDGPU::S_AND_SAVEEXEC_B64_term: 210806c3fb27SDimitry Andric // This is only a terminator to get the correct spill code placement during 210906c3fb27SDimitry Andric // register allocation. 211006c3fb27SDimitry Andric MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64)); 211106c3fb27SDimitry Andric break; 211206c3fb27SDimitry Andric 211306c3fb27SDimitry Andric case AMDGPU::S_AND_SAVEEXEC_B32_term: 211406c3fb27SDimitry Andric // This is only a terminator to get the correct spill code placement during 211506c3fb27SDimitry Andric // register allocation. 211606c3fb27SDimitry Andric MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); 211706c3fb27SDimitry Andric break; 211806c3fb27SDimitry Andric 21195f757f3fSDimitry Andric case AMDGPU::SI_SPILL_S32_TO_VGPR: 21205f757f3fSDimitry Andric MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); 21215f757f3fSDimitry Andric break; 21225f757f3fSDimitry Andric 21235f757f3fSDimitry Andric case AMDGPU::SI_RESTORE_S32_FROM_VGPR: 21245f757f3fSDimitry Andric MI.setDesc(get(AMDGPU::V_READLANE_B32)); 21255f757f3fSDimitry Andric break; 21265f757f3fSDimitry Andric 21270b57cec5SDimitry Andric case AMDGPU::V_MOV_B64_PSEUDO: { 21288bcb0991SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 21298bcb0991SDimitry Andric Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 21308bcb0991SDimitry Andric Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 21310b57cec5SDimitry Andric 21320b57cec5SDimitry Andric const MachineOperand &SrcOp = MI.getOperand(1); 21330b57cec5SDimitry Andric // FIXME: Will this work for 64-bit floating point immediates? 
21340b57cec5SDimitry Andric assert(!SrcOp.isFPImm()); 213581ad6265SDimitry Andric if (ST.hasMovB64()) { 213681ad6265SDimitry Andric MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); 2137bdd1243dSDimitry Andric if (SrcOp.isReg() || isInlineConstant(MI, 1) || 2138bdd1243dSDimitry Andric isUInt<32>(SrcOp.getImm())) 213981ad6265SDimitry Andric break; 214081ad6265SDimitry Andric } 21410b57cec5SDimitry Andric if (SrcOp.isImm()) { 21420b57cec5SDimitry Andric APInt Imm(64, SrcOp.getImm()); 2143fe6060f1SDimitry Andric APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 2144fe6060f1SDimitry Andric APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 21455f757f3fSDimitry Andric if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) { 2146fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 2147fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_1) 2148fe6060f1SDimitry Andric .addImm(Lo.getSExtValue()) 2149fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_1) 2150fe6060f1SDimitry Andric .addImm(Lo.getSExtValue()) 2151fe6060f1SDimitry Andric .addImm(0) // op_sel_lo 2152fe6060f1SDimitry Andric .addImm(0) // op_sel_hi 2153fe6060f1SDimitry Andric .addImm(0) // neg_lo 2154fe6060f1SDimitry Andric .addImm(0) // neg_hi 2155fe6060f1SDimitry Andric .addImm(0); // clamp 2156fe6060f1SDimitry Andric } else { 21570b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 2158fe6060f1SDimitry Andric .addImm(Lo.getSExtValue()) 21590b57cec5SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 21600b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 2161fe6060f1SDimitry Andric .addImm(Hi.getSExtValue()) 21620b57cec5SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 2163fe6060f1SDimitry Andric } 21640b57cec5SDimitry Andric } else { 21650b57cec5SDimitry Andric assert(SrcOp.isReg()); 21665f757f3fSDimitry Andric if (ST.hasPkMovB32() && 2167fe6060f1SDimitry Andric !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { 2168fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 2169fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_1) // src0_mod 2170fe6060f1SDimitry Andric .addReg(SrcOp.getReg()) 2171fe6060f1SDimitry Andric .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod 2172fe6060f1SDimitry Andric .addReg(SrcOp.getReg()) 2173fe6060f1SDimitry Andric .addImm(0) // op_sel_lo 2174fe6060f1SDimitry Andric .addImm(0) // op_sel_hi 2175fe6060f1SDimitry Andric .addImm(0) // neg_lo 2176fe6060f1SDimitry Andric .addImm(0) // neg_hi 2177fe6060f1SDimitry Andric .addImm(0); // clamp 2178fe6060f1SDimitry Andric } else { 21790b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 21800b57cec5SDimitry Andric .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 21810b57cec5SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 21820b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 21830b57cec5SDimitry Andric .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 21840b57cec5SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 21850b57cec5SDimitry Andric } 2186fe6060f1SDimitry Andric } 21870b57cec5SDimitry Andric MI.eraseFromParent(); 21880b57cec5SDimitry Andric break; 21890b57cec5SDimitry Andric } 21908bcb0991SDimitry Andric case AMDGPU::V_MOV_B64_DPP_PSEUDO: { 21918bcb0991SDimitry Andric expandMovDPP64(MI); 21928bcb0991SDimitry Andric break; 21938bcb0991SDimitry Andric } 2194fe6060f1SDimitry Andric case AMDGPU::S_MOV_B64_IMM_PSEUDO: { 
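// Materialize a 64-bit scalar immediate: a single S_MOV_B64 suffices when the value fits in 32 bits or is an inline constant; otherwise the expansion below splits it into two S_MOV_B32 writes of the sub0/sub1 halves.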
2195fe6060f1SDimitry Andric const MachineOperand &SrcOp = MI.getOperand(1); 2196fe6060f1SDimitry Andric assert(!SrcOp.isFPImm()); 2197fe6060f1SDimitry Andric APInt Imm(64, SrcOp.getImm()); 2198fe6060f1SDimitry Andric if (Imm.isIntN(32) || isInlineConstant(Imm)) { 2199fe6060f1SDimitry Andric MI.setDesc(get(AMDGPU::S_MOV_B64)); 2200fe6060f1SDimitry Andric break; 2201fe6060f1SDimitry Andric } 2202fe6060f1SDimitry Andric 2203fe6060f1SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2204fe6060f1SDimitry Andric Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 2205fe6060f1SDimitry Andric Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 2206fe6060f1SDimitry Andric 2207fe6060f1SDimitry Andric APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 2208fe6060f1SDimitry Andric APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 2209fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) 2210fe6060f1SDimitry Andric .addImm(Lo.getSExtValue()) 2211fe6060f1SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 2212fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) 2213fe6060f1SDimitry Andric .addImm(Hi.getSExtValue()) 2214fe6060f1SDimitry Andric .addReg(Dst, RegState::Implicit | RegState::Define); 2215fe6060f1SDimitry Andric MI.eraseFromParent(); 2216fe6060f1SDimitry Andric break; 2217fe6060f1SDimitry Andric } 22180b57cec5SDimitry Andric case AMDGPU::V_SET_INACTIVE_B32: { 22190b57cec5SDimitry Andric unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 22200b57cec5SDimitry Andric unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 222181ad6265SDimitry Andric // FIXME: We may possibly optimize the COPY once we find ways to make LLVM 222281ad6265SDimitry Andric // optimizations (mainly Register Coalescer) aware of WWM register liveness. 222381ad6265SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 222481ad6265SDimitry Andric .add(MI.getOperand(1)); 2225fe6060f1SDimitry Andric auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 2226fe6060f1SDimitry Andric FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 22270b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 22280b57cec5SDimitry Andric .add(MI.getOperand(2)); 22290b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(NotOpc), Exec) 22300b57cec5SDimitry Andric .addReg(Exec); 22310b57cec5SDimitry Andric MI.eraseFromParent(); 22320b57cec5SDimitry Andric break; 22330b57cec5SDimitry Andric } 22340b57cec5SDimitry Andric case AMDGPU::V_SET_INACTIVE_B64: { 22350b57cec5SDimitry Andric unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 22360b57cec5SDimitry Andric unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 223781ad6265SDimitry Andric MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 223881ad6265SDimitry Andric MI.getOperand(0).getReg()) 223981ad6265SDimitry Andric .add(MI.getOperand(1)); 224081ad6265SDimitry Andric expandPostRAPseudo(*Copy); 2241fe6060f1SDimitry Andric auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 2242fe6060f1SDimitry Andric FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 224381ad6265SDimitry Andric Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 22440b57cec5SDimitry Andric MI.getOperand(0).getReg()) 22450b57cec5SDimitry Andric .add(MI.getOperand(2)); 22460b57cec5SDimitry Andric expandPostRAPseudo(*Copy); 22470b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(NotOpc), Exec) 22480b57cec5SDimitry Andric .addReg(Exec); 22490b57cec5SDimitry Andric MI.eraseFromParent(); 22500b57cec5SDimitry Andric break; 22510b57cec5SDimitry Andric } 2252e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: 2253e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: 2254e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: 2255e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: 2256e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: 2257e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: 2258bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: 2259bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: 2260bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11: 2261bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12: 2262e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: 2263e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: 2264e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: 2265e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: 2266e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: 2267e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: 2268e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: 2269e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: 227006c3fb27SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9: 227106c3fb27SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10: 227206c3fb27SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11: 227306c3fb27SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12: 2274e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: 2275e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: 2276e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: 2277e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: 2278e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: 2279e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: 2280e8d8bef9SDimitry Andric case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { 22815ffd83dbSDimitry Andric const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 22825ffd83dbSDimitry Andric 22835ffd83dbSDimitry Andric unsigned Opc; 22845ffd83dbSDimitry Andric if (RI.hasVGPRs(EltRC)) { 2285e8d8bef9SDimitry Andric Opc = AMDGPU::V_MOVRELD_B32_e32; 22865ffd83dbSDimitry Andric } else { 2287e8d8bef9SDimitry Andric Opc = 
RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 2288e8d8bef9SDimitry Andric : AMDGPU::S_MOVRELD_B32; 22895ffd83dbSDimitry Andric } 22905ffd83dbSDimitry Andric 22915ffd83dbSDimitry Andric const MCInstrDesc &OpDesc = get(Opc); 22928bcb0991SDimitry Andric Register VecReg = MI.getOperand(0).getReg(); 22930b57cec5SDimitry Andric bool IsUndef = MI.getOperand(1).isUndef(); 22945ffd83dbSDimitry Andric unsigned SubReg = MI.getOperand(3).getImm(); 22950b57cec5SDimitry Andric assert(VecReg == MI.getOperand(1).getReg()); 22960b57cec5SDimitry Andric 22975ffd83dbSDimitry Andric MachineInstrBuilder MIB = 22985ffd83dbSDimitry Andric BuildMI(MBB, MI, DL, OpDesc) 22990b57cec5SDimitry Andric .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 23000b57cec5SDimitry Andric .add(MI.getOperand(2)) 23010b57cec5SDimitry Andric .addReg(VecReg, RegState::ImplicitDefine) 23025ffd83dbSDimitry Andric .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 23030b57cec5SDimitry Andric 23040b57cec5SDimitry Andric const int ImpDefIdx = 2305bdd1243dSDimitry Andric OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); 23060b57cec5SDimitry Andric const int ImpUseIdx = ImpDefIdx + 1; 23075ffd83dbSDimitry Andric MIB->tieOperands(ImpDefIdx, ImpUseIdx); 23080b57cec5SDimitry Andric MI.eraseFromParent(); 23090b57cec5SDimitry Andric break; 23100b57cec5SDimitry Andric } 2311e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: 2312e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: 2313e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: 2314e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: 2315e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: 2316e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: 2317bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: 2318bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: 2319bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11: 2320bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12: 2321e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: 2322e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { 2323e8d8bef9SDimitry Andric assert(ST.useVGPRIndexMode()); 2324e8d8bef9SDimitry Andric Register VecReg = MI.getOperand(0).getReg(); 2325e8d8bef9SDimitry Andric bool IsUndef = MI.getOperand(1).isUndef(); 2326e8d8bef9SDimitry Andric Register Idx = MI.getOperand(3).getReg(); 2327e8d8bef9SDimitry Andric Register SubReg = MI.getOperand(4).getImm(); 2328e8d8bef9SDimitry Andric 2329e8d8bef9SDimitry Andric MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2330e8d8bef9SDimitry Andric .addReg(Idx) 2331e8d8bef9SDimitry Andric .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2332e8d8bef9SDimitry Andric SetOn->getOperand(3).setIsUndef(); 2333e8d8bef9SDimitry Andric 2334349cc55cSDimitry Andric const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); 2335e8d8bef9SDimitry Andric MachineInstrBuilder MIB = 2336e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, OpDesc) 2337e8d8bef9SDimitry Andric .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2338e8d8bef9SDimitry Andric .add(MI.getOperand(2)) 2339e8d8bef9SDimitry Andric .addReg(VecReg, RegState::ImplicitDefine) 2340e8d8bef9SDimitry Andric .addReg(VecReg, 2341e8d8bef9SDimitry Andric RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); 2342e8d8bef9SDimitry Andric 2343bdd1243dSDimitry Andric const int ImpDefIdx = 2344bdd1243dSDimitry Andric OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); 2345e8d8bef9SDimitry Andric const int ImpUseIdx = ImpDefIdx + 1; 2346e8d8bef9SDimitry Andric MIB->tieOperands(ImpDefIdx, ImpUseIdx); 2347e8d8bef9SDimitry Andric 2348e8d8bef9SDimitry Andric MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); 2349e8d8bef9SDimitry Andric 2350e8d8bef9SDimitry Andric finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); 2351e8d8bef9SDimitry Andric 2352e8d8bef9SDimitry Andric MI.eraseFromParent(); 2353e8d8bef9SDimitry Andric break; 2354e8d8bef9SDimitry Andric } 2355e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: 2356e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: 2357e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: 2358e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: 2359e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: 2360e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: 2361bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: 2362bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: 2363bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11: 2364bdd1243dSDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12: 2365e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: 2366e8d8bef9SDimitry Andric case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { 2367e8d8bef9SDimitry Andric assert(ST.useVGPRIndexMode()); 2368e8d8bef9SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2369e8d8bef9SDimitry Andric Register VecReg = MI.getOperand(1).getReg(); 2370e8d8bef9SDimitry Andric bool IsUndef = MI.getOperand(1).isUndef(); 2371e8d8bef9SDimitry Andric Register Idx = MI.getOperand(2).getReg(); 2372e8d8bef9SDimitry Andric Register SubReg = MI.getOperand(3).getImm(); 2373e8d8bef9SDimitry Andric 2374e8d8bef9SDimitry Andric MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2375e8d8bef9SDimitry Andric .addReg(Idx) 2376e8d8bef9SDimitry Andric .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2377e8d8bef9SDimitry Andric SetOn->getOperand(3).setIsUndef(); 2378e8d8bef9SDimitry Andric 2379349cc55cSDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) 2380e8d8bef9SDimitry Andric .addDef(Dst) 2381e8d8bef9SDimitry Andric .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2382349cc55cSDimitry Andric .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); 2383e8d8bef9SDimitry Andric 2384e8d8bef9SDimitry Andric MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); 2385e8d8bef9SDimitry Andric 2386e8d8bef9SDimitry Andric finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); 2387e8d8bef9SDimitry Andric 2388e8d8bef9SDimitry Andric MI.eraseFromParent(); 2389e8d8bef9SDimitry Andric break; 2390e8d8bef9SDimitry Andric } 23910b57cec5SDimitry Andric case AMDGPU::SI_PC_ADD_REL_OFFSET: { 23920b57cec5SDimitry Andric MachineFunction &MF = *MBB.getParent(); 23938bcb0991SDimitry Andric Register Reg = MI.getOperand(0).getReg(); 23948bcb0991SDimitry Andric Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 23958bcb0991SDimitry Andric Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 23965f757f3fSDimitry Andric MachineOperand OpLo = MI.getOperand(1); 23975f757f3fSDimitry Andric MachineOperand OpHi = MI.getOperand(2); 23980b57cec5SDimitry Andric 23990b57cec5SDimitry Andric // Create a bundle so these instructions won't be re-ordered by the 24000b57cec5SDimitry Andric // post-RA scheduler. 24010b57cec5SDimitry Andric MIBundleBuilder Bundler(MBB, MI); 24020b57cec5SDimitry Andric Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 24030b57cec5SDimitry Andric 24045f757f3fSDimitry Andric // What we want here is an offset from the value returned by s_getpc (which 24055f757f3fSDimitry Andric // is the address of the s_add_u32 instruction) to the global variable, but 24065f757f3fSDimitry Andric // since the encoding of $symbol starts 4 bytes after the start of the 24075f757f3fSDimitry Andric // s_add_u32 instruction, we end up with an offset that is 4 bytes too 24085f757f3fSDimitry Andric // small. This requires us to add 4 to the global variable offset in order 24095f757f3fSDimitry Andric // to compute the correct address. Similarly for the s_addc_u32 instruction, 24105f757f3fSDimitry Andric // the encoding of $symbol starts 12 bytes after the start of the s_add_u32 24115f757f3fSDimitry Andric // instruction. 24120b57cec5SDimitry Andric 24135f757f3fSDimitry Andric if (OpLo.isGlobal()) 24145f757f3fSDimitry Andric OpLo.setOffset(OpLo.getOffset() + 4); 24155f757f3fSDimitry Andric Bundler.append( 24165f757f3fSDimitry Andric BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo)); 24170b57cec5SDimitry Andric 24185f757f3fSDimitry Andric if (OpHi.isGlobal()) 24195f757f3fSDimitry Andric OpHi.setOffset(OpHi.getOffset() + 12); 24205f757f3fSDimitry Andric Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 24215f757f3fSDimitry Andric .addReg(RegHi) 24225f757f3fSDimitry Andric .add(OpHi)); 24235f757f3fSDimitry Andric 24240b57cec5SDimitry Andric finalizeBundle(MBB, Bundler.begin()); 24250b57cec5SDimitry Andric 24260b57cec5SDimitry Andric MI.eraseFromParent(); 24270b57cec5SDimitry Andric break; 24280b57cec5SDimitry Andric } 2429fe6060f1SDimitry Andric case AMDGPU::ENTER_STRICT_WWM: { 24300b57cec5SDimitry Andric // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 2431fe6060f1SDimitry Andric // Whole Wave Mode is entered. 24320b57cec5SDimitry Andric MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 24330b57cec5SDimitry Andric : AMDGPU::S_OR_SAVEEXEC_B64)); 24340b57cec5SDimitry Andric break; 24350b57cec5SDimitry Andric } 2436fe6060f1SDimitry Andric case AMDGPU::ENTER_STRICT_WQM: { 24370b57cec5SDimitry Andric // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 2438fe6060f1SDimitry Andric // STRICT_WQM is entered. 
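// The expansion below saves the current exec mask into the destination operand and then switches exec to its whole-quad-mode image; the matching EXIT pseudo further down restores the saved mask with a plain move.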
2439fe6060f1SDimitry Andric const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 2440fe6060f1SDimitry Andric const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; 2441fe6060f1SDimitry Andric const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 2442fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); 2443fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); 2444fe6060f1SDimitry Andric 2445fe6060f1SDimitry Andric MI.eraseFromParent(); 2446fe6060f1SDimitry Andric break; 2447fe6060f1SDimitry Andric } 2448fe6060f1SDimitry Andric case AMDGPU::EXIT_STRICT_WWM: 2449fe6060f1SDimitry Andric case AMDGPU::EXIT_STRICT_WQM: { 2450fe6060f1SDimitry Andric // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 2451fe6060f1SDimitry Andric // WWM/STRICT_WQM is exited. 24520b57cec5SDimitry Andric MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); 24530b57cec5SDimitry Andric break; 24540b57cec5SDimitry Andric } 2455bdd1243dSDimitry Andric case AMDGPU::ENTER_PSEUDO_WM: 2456bdd1243dSDimitry Andric case AMDGPU::EXIT_PSEUDO_WM: { 2457bdd1243dSDimitry Andric // These do nothing. 2458bdd1243dSDimitry Andric MI.eraseFromParent(); 2459bdd1243dSDimitry Andric break; 2460bdd1243dSDimitry Andric } 246181ad6265SDimitry Andric case AMDGPU::SI_RETURN: { 246281ad6265SDimitry Andric const MachineFunction *MF = MBB.getParent(); 246381ad6265SDimitry Andric const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 246481ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 246581ad6265SDimitry Andric // Hiding the return address use with SI_RETURN may lead to extra kills in 246681ad6265SDimitry Andric // the function and missing live-ins. We are fine in practice because callee 246781ad6265SDimitry Andric // saved register handling ensures the register value is restored before 246881ad6265SDimitry Andric // RET, but we need the undef flag here to appease the MachineVerifier 246981ad6265SDimitry Andric // liveness checks. 247081ad6265SDimitry Andric MachineInstrBuilder MIB = 247181ad6265SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) 247281ad6265SDimitry Andric .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); 247381ad6265SDimitry Andric 247481ad6265SDimitry Andric MIB.copyImplicitOps(MI); 247581ad6265SDimitry Andric MI.eraseFromParent(); 247681ad6265SDimitry Andric break; 247781ad6265SDimitry Andric } 2478*1db9f3b2SDimitry Andric 2479*1db9f3b2SDimitry Andric case AMDGPU::S_MUL_U64_U32_PSEUDO: 2480*1db9f3b2SDimitry Andric case AMDGPU::S_MUL_I64_I32_PSEUDO: 2481*1db9f3b2SDimitry Andric MI.setDesc(get(AMDGPU::S_MUL_U64)); 2482*1db9f3b2SDimitry Andric break; 24830b57cec5SDimitry Andric } 24840b57cec5SDimitry Andric return true; 24850b57cec5SDimitry Andric } 24860b57cec5SDimitry Andric 24875f757f3fSDimitry Andric void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, 24885f757f3fSDimitry Andric MachineBasicBlock::iterator I, Register DestReg, 24895f757f3fSDimitry Andric unsigned SubIdx, const MachineInstr &Orig, 24905f757f3fSDimitry Andric const TargetRegisterInfo &RI) const { 24915f757f3fSDimitry Andric 24925f757f3fSDimitry Andric // Try shrinking the instruction to remat only the part needed for the current 24935f757f3fSDimitry Andric // context. 24945f757f3fSDimitry Andric // TODO: Handle more cases.
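// For a wide scalar load whose rematerialized value is only consumed through a single subregister, the cases below clone a narrower S_LOAD_DWORDX8 or S_LOAD_DWORDX4 of just that part instead of re-issuing the full-width load.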
24955f757f3fSDimitry Andric unsigned Opcode = Orig.getOpcode(); 24965f757f3fSDimitry Andric switch (Opcode) { 24975f757f3fSDimitry Andric case AMDGPU::S_LOAD_DWORDX16_IMM: 24985f757f3fSDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM: { 24995f757f3fSDimitry Andric if (SubIdx != 0) 25005f757f3fSDimitry Andric break; 25015f757f3fSDimitry Andric 25025f757f3fSDimitry Andric if (I == MBB.end()) 25035f757f3fSDimitry Andric break; 25045f757f3fSDimitry Andric 25055f757f3fSDimitry Andric if (I->isBundled()) 25065f757f3fSDimitry Andric break; 25075f757f3fSDimitry Andric 25085f757f3fSDimitry Andric // Look for a single use of the register that is also a subreg. 25095f757f3fSDimitry Andric Register RegToFind = Orig.getOperand(0).getReg(); 25105f757f3fSDimitry Andric MachineOperand *UseMO = nullptr; 25115f757f3fSDimitry Andric for (auto &CandMO : I->operands()) { 25125f757f3fSDimitry Andric if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef()) 25135f757f3fSDimitry Andric continue; 25145f757f3fSDimitry Andric if (UseMO) { 25155f757f3fSDimitry Andric UseMO = nullptr; 25165f757f3fSDimitry Andric break; 25175f757f3fSDimitry Andric } 25185f757f3fSDimitry Andric UseMO = &CandMO; 25195f757f3fSDimitry Andric } 25205f757f3fSDimitry Andric if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister) 25215f757f3fSDimitry Andric break; 25225f757f3fSDimitry Andric 25235f757f3fSDimitry Andric unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg()); 25245f757f3fSDimitry Andric unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg()); 25255f757f3fSDimitry Andric 25265f757f3fSDimitry Andric MachineFunction *MF = MBB.getParent(); 25275f757f3fSDimitry Andric MachineRegisterInfo &MRI = MF->getRegInfo(); 25285f757f3fSDimitry Andric assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet."); 25295f757f3fSDimitry Andric 25305f757f3fSDimitry Andric unsigned NewOpcode = -1; 25315f757f3fSDimitry Andric if (SubregSize == 256) 25325f757f3fSDimitry Andric NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM; 25335f757f3fSDimitry Andric else if (SubregSize == 128) 25345f757f3fSDimitry Andric NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM; 25355f757f3fSDimitry Andric else 25365f757f3fSDimitry Andric break; 25375f757f3fSDimitry Andric 25385f757f3fSDimitry Andric const MCInstrDesc &TID = get(NewOpcode); 25395f757f3fSDimitry Andric const TargetRegisterClass *NewRC = 25405f757f3fSDimitry Andric RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); 25415f757f3fSDimitry Andric MRI.setRegClass(DestReg, NewRC); 25425f757f3fSDimitry Andric 25435f757f3fSDimitry Andric UseMO->setReg(DestReg); 25445f757f3fSDimitry Andric UseMO->setSubReg(AMDGPU::NoSubRegister); 25455f757f3fSDimitry Andric 25465f757f3fSDimitry Andric // Use a smaller load with the desired size, possibly with updated offset. 
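// getSubRegIdxOffset and getSubRegIdxSize report bits, hence the divisions by 8 below when adjusting the immediate offset and the memory operand size.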
25475f757f3fSDimitry Andric MachineInstr *MI = MF->CloneMachineInstr(&Orig); 25485f757f3fSDimitry Andric MI->setDesc(TID); 25495f757f3fSDimitry Andric MI->getOperand(0).setReg(DestReg); 25505f757f3fSDimitry Andric MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister); 25515f757f3fSDimitry Andric if (Offset) { 25525f757f3fSDimitry Andric MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset); 25535f757f3fSDimitry Andric int64_t FinalOffset = OffsetMO->getImm() + Offset / 8; 25545f757f3fSDimitry Andric OffsetMO->setImm(FinalOffset); 25555f757f3fSDimitry Andric } 25565f757f3fSDimitry Andric SmallVector<MachineMemOperand *> NewMMOs; 25575f757f3fSDimitry Andric for (const MachineMemOperand *MemOp : Orig.memoperands()) 25585f757f3fSDimitry Andric NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(), 25595f757f3fSDimitry Andric SubregSize / 8)); 25605f757f3fSDimitry Andric MI->setMemRefs(*MF, NewMMOs); 25615f757f3fSDimitry Andric 25625f757f3fSDimitry Andric MBB.insert(I, MI); 25635f757f3fSDimitry Andric return; 25645f757f3fSDimitry Andric } 25655f757f3fSDimitry Andric 25665f757f3fSDimitry Andric default: 25675f757f3fSDimitry Andric break; 25685f757f3fSDimitry Andric } 25695f757f3fSDimitry Andric 25705f757f3fSDimitry Andric TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); 25715f757f3fSDimitry Andric } 25725f757f3fSDimitry Andric 25738bcb0991SDimitry Andric std::pair<MachineInstr*, MachineInstr*> 25748bcb0991SDimitry Andric SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { 25758bcb0991SDimitry Andric assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); 25768bcb0991SDimitry Andric 257781ad6265SDimitry Andric if (ST.hasMovB64() && 25785f757f3fSDimitry Andric AMDGPU::isLegalDPALU_DPPControl( 257981ad6265SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { 258081ad6265SDimitry Andric MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); 2581bdd1243dSDimitry Andric return std::pair(&MI, nullptr); 258281ad6265SDimitry Andric } 258381ad6265SDimitry Andric 25848bcb0991SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 25858bcb0991SDimitry Andric DebugLoc DL = MBB.findDebugLoc(MI); 25868bcb0991SDimitry Andric MachineFunction *MF = MBB.getParent(); 25878bcb0991SDimitry Andric MachineRegisterInfo &MRI = MF->getRegInfo(); 25888bcb0991SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 25898bcb0991SDimitry Andric unsigned Part = 0; 25908bcb0991SDimitry Andric MachineInstr *Split[2]; 25918bcb0991SDimitry Andric 25928bcb0991SDimitry Andric for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { 25938bcb0991SDimitry Andric auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); 25948bcb0991SDimitry Andric if (Dst.isPhysical()) { 25958bcb0991SDimitry Andric MovDPP.addDef(RI.getSubReg(Dst, Sub)); 25968bcb0991SDimitry Andric } else { 25978bcb0991SDimitry Andric assert(MRI.isSSA()); 25988bcb0991SDimitry Andric auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 25998bcb0991SDimitry Andric MovDPP.addDef(Tmp); 26008bcb0991SDimitry Andric } 26018bcb0991SDimitry Andric 26028bcb0991SDimitry Andric for (unsigned I = 1; I <= 2; ++I) { // old and src operands. 
26038bcb0991SDimitry Andric const MachineOperand &SrcOp = MI.getOperand(I); 26048bcb0991SDimitry Andric assert(!SrcOp.isFPImm()); 26058bcb0991SDimitry Andric if (SrcOp.isImm()) { 26068bcb0991SDimitry Andric APInt Imm(64, SrcOp.getImm()); 26078bcb0991SDimitry Andric Imm.ashrInPlace(Part * 32); 26088bcb0991SDimitry Andric MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); 26098bcb0991SDimitry Andric } else { 26108bcb0991SDimitry Andric assert(SrcOp.isReg()); 26118bcb0991SDimitry Andric Register Src = SrcOp.getReg(); 26128bcb0991SDimitry Andric if (Src.isPhysical()) 26138bcb0991SDimitry Andric MovDPP.addReg(RI.getSubReg(Src, Sub)); 26148bcb0991SDimitry Andric else 26158bcb0991SDimitry Andric MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); 26168bcb0991SDimitry Andric } 26178bcb0991SDimitry Andric } 26188bcb0991SDimitry Andric 2619bdd1243dSDimitry Andric for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3)) 2620bdd1243dSDimitry Andric MovDPP.addImm(MO.getImm()); 26218bcb0991SDimitry Andric 26228bcb0991SDimitry Andric Split[Part] = MovDPP; 26238bcb0991SDimitry Andric ++Part; 26248bcb0991SDimitry Andric } 26258bcb0991SDimitry Andric 26268bcb0991SDimitry Andric if (Dst.isVirtual()) 26278bcb0991SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 26288bcb0991SDimitry Andric .addReg(Split[0]->getOperand(0).getReg()) 26298bcb0991SDimitry Andric .addImm(AMDGPU::sub0) 26308bcb0991SDimitry Andric .addReg(Split[1]->getOperand(0).getReg()) 26318bcb0991SDimitry Andric .addImm(AMDGPU::sub1); 26328bcb0991SDimitry Andric 26338bcb0991SDimitry Andric MI.eraseFromParent(); 2634bdd1243dSDimitry Andric return std::pair(Split[0], Split[1]); 26358bcb0991SDimitry Andric } 26368bcb0991SDimitry Andric 26375f757f3fSDimitry Andric std::optional<DestSourcePair> 26385f757f3fSDimitry Andric SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 26395f757f3fSDimitry Andric if (MI.getOpcode() == AMDGPU::WWM_COPY) 26405f757f3fSDimitry Andric return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; 26415f757f3fSDimitry Andric 26425f757f3fSDimitry Andric return std::nullopt; 26435f757f3fSDimitry Andric } 26445f757f3fSDimitry Andric 26450b57cec5SDimitry Andric bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 26460b57cec5SDimitry Andric MachineOperand &Src0, 26470b57cec5SDimitry Andric unsigned Src0OpName, 26480b57cec5SDimitry Andric MachineOperand &Src1, 26490b57cec5SDimitry Andric unsigned Src1OpName) const { 26500b57cec5SDimitry Andric MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 26510b57cec5SDimitry Andric if (!Src0Mods) 26520b57cec5SDimitry Andric return false; 26530b57cec5SDimitry Andric 26540b57cec5SDimitry Andric MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 26550b57cec5SDimitry Andric assert(Src1Mods && 26560b57cec5SDimitry Andric "All commutable instructions have both src0 and src1 modifiers"); 26570b57cec5SDimitry Andric 26580b57cec5SDimitry Andric int Src0ModsVal = Src0Mods->getImm(); 26590b57cec5SDimitry Andric int Src1ModsVal = Src1Mods->getImm(); 26600b57cec5SDimitry Andric 26610b57cec5SDimitry Andric Src1Mods->setImm(Src0ModsVal); 26620b57cec5SDimitry Andric Src0Mods->setImm(Src1ModsVal); 26630b57cec5SDimitry Andric return true; 26640b57cec5SDimitry Andric } 26650b57cec5SDimitry Andric 26660b57cec5SDimitry Andric static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 26670b57cec5SDimitry Andric MachineOperand &RegOp, 26680b57cec5SDimitry Andric MachineOperand &NonRegOp) { 26698bcb0991SDimitry Andric Register Reg 
= RegOp.getReg(); 26700b57cec5SDimitry Andric unsigned SubReg = RegOp.getSubReg(); 26710b57cec5SDimitry Andric bool IsKill = RegOp.isKill(); 26720b57cec5SDimitry Andric bool IsDead = RegOp.isDead(); 26730b57cec5SDimitry Andric bool IsUndef = RegOp.isUndef(); 26740b57cec5SDimitry Andric bool IsDebug = RegOp.isDebug(); 26750b57cec5SDimitry Andric 26760b57cec5SDimitry Andric if (NonRegOp.isImm()) 26770b57cec5SDimitry Andric RegOp.ChangeToImmediate(NonRegOp.getImm()); 26780b57cec5SDimitry Andric else if (NonRegOp.isFI()) 26790b57cec5SDimitry Andric RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 26805ffd83dbSDimitry Andric else if (NonRegOp.isGlobal()) { 26815ffd83dbSDimitry Andric RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), 26825ffd83dbSDimitry Andric NonRegOp.getTargetFlags()); 26835ffd83dbSDimitry Andric } else 26840b57cec5SDimitry Andric return nullptr; 26850b57cec5SDimitry Andric 26865ffd83dbSDimitry Andric // Make sure we don't reinterpret a subreg index in the target flags. 26875ffd83dbSDimitry Andric RegOp.setTargetFlags(NonRegOp.getTargetFlags()); 26885ffd83dbSDimitry Andric 26890b57cec5SDimitry Andric NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 26900b57cec5SDimitry Andric NonRegOp.setSubReg(SubReg); 26910b57cec5SDimitry Andric 26920b57cec5SDimitry Andric return &MI; 26930b57cec5SDimitry Andric } 26940b57cec5SDimitry Andric 26950b57cec5SDimitry Andric MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 26960b57cec5SDimitry Andric unsigned Src0Idx, 26970b57cec5SDimitry Andric unsigned Src1Idx) const { 26980b57cec5SDimitry Andric assert(!NewMI && "this should never be used"); 26990b57cec5SDimitry Andric 27000b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 27010b57cec5SDimitry Andric int CommutedOpcode = commuteOpcode(Opc); 27020b57cec5SDimitry Andric if (CommutedOpcode == -1) 27030b57cec5SDimitry Andric return nullptr; 27040b57cec5SDimitry Andric 27055f757f3fSDimitry Andric if (Src0Idx > Src1Idx) 27065f757f3fSDimitry Andric std::swap(Src0Idx, Src1Idx); 27075f757f3fSDimitry Andric 27080b57cec5SDimitry Andric assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 27090b57cec5SDimitry Andric static_cast<int>(Src0Idx) && 27100b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 27110b57cec5SDimitry Andric static_cast<int>(Src1Idx) && 27120b57cec5SDimitry Andric "inconsistency with findCommutedOpIndices"); 27130b57cec5SDimitry Andric 27140b57cec5SDimitry Andric MachineOperand &Src0 = MI.getOperand(Src0Idx); 27150b57cec5SDimitry Andric MachineOperand &Src1 = MI.getOperand(Src1Idx); 27160b57cec5SDimitry Andric 27170b57cec5SDimitry Andric MachineInstr *CommutedMI = nullptr; 27180b57cec5SDimitry Andric if (Src0.isReg() && Src1.isReg()) { 27190b57cec5SDimitry Andric if (isOperandLegal(MI, Src1Idx, &Src0)) { 27200b57cec5SDimitry Andric // Be sure to copy the source modifiers to the right place. 27210b57cec5SDimitry Andric CommutedMI 27220b57cec5SDimitry Andric = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 27230b57cec5SDimitry Andric } 27240b57cec5SDimitry Andric 27250b57cec5SDimitry Andric } else if (Src0.isReg() && !Src1.isReg()) { 27260b57cec5SDimitry Andric // src0 should always be able to support any operand type, so no need to 27270b57cec5SDimitry Andric // check operand legality. 
27280b57cec5SDimitry Andric CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 27290b57cec5SDimitry Andric } else if (!Src0.isReg() && Src1.isReg()) { 27300b57cec5SDimitry Andric if (isOperandLegal(MI, Src1Idx, &Src0)) 27310b57cec5SDimitry Andric CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 27320b57cec5SDimitry Andric } else { 27330b57cec5SDimitry Andric // FIXME: Found two non registers to commute. This does happen. 27340b57cec5SDimitry Andric return nullptr; 27350b57cec5SDimitry Andric } 27360b57cec5SDimitry Andric 27370b57cec5SDimitry Andric if (CommutedMI) { 27380b57cec5SDimitry Andric swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 27390b57cec5SDimitry Andric Src1, AMDGPU::OpName::src1_modifiers); 27400b57cec5SDimitry Andric 27410b57cec5SDimitry Andric CommutedMI->setDesc(get(CommutedOpcode)); 27420b57cec5SDimitry Andric } 27430b57cec5SDimitry Andric 27440b57cec5SDimitry Andric return CommutedMI; 27450b57cec5SDimitry Andric } 27460b57cec5SDimitry Andric 27470b57cec5SDimitry Andric // This needs to be implemented because the source modifiers may be inserted 27480b57cec5SDimitry Andric // between the true commutable operands, and the base 27490b57cec5SDimitry Andric // TargetInstrInfo::commuteInstruction uses it. 27508bcb0991SDimitry Andric bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, 27518bcb0991SDimitry Andric unsigned &SrcOpIdx0, 27520b57cec5SDimitry Andric unsigned &SrcOpIdx1) const { 27530b57cec5SDimitry Andric return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 27540b57cec5SDimitry Andric } 27550b57cec5SDimitry Andric 2756bdd1243dSDimitry Andric bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, 2757bdd1243dSDimitry Andric unsigned &SrcOpIdx0, 27580b57cec5SDimitry Andric unsigned &SrcOpIdx1) const { 27590b57cec5SDimitry Andric if (!Desc.isCommutable()) 27600b57cec5SDimitry Andric return false; 27610b57cec5SDimitry Andric 27620b57cec5SDimitry Andric unsigned Opc = Desc.getOpcode(); 27630b57cec5SDimitry Andric int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 27640b57cec5SDimitry Andric if (Src0Idx == -1) 27650b57cec5SDimitry Andric return false; 27660b57cec5SDimitry Andric 27670b57cec5SDimitry Andric int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 27680b57cec5SDimitry Andric if (Src1Idx == -1) 27690b57cec5SDimitry Andric return false; 27700b57cec5SDimitry Andric 27710b57cec5SDimitry Andric return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 27720b57cec5SDimitry Andric } 27730b57cec5SDimitry Andric 27740b57cec5SDimitry Andric bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 27750b57cec5SDimitry Andric int64_t BrOffset) const { 27760b57cec5SDimitry Andric // BranchRelaxation should never have to check s_setpc_b64 because its dest 27770b57cec5SDimitry Andric // block is unanalyzable. 27780b57cec5SDimitry Andric assert(BranchOp != AMDGPU::S_SETPC_B64); 27790b57cec5SDimitry Andric 27800b57cec5SDimitry Andric // Convert to dwords. 27810b57cec5SDimitry Andric BrOffset /= 4; 27820b57cec5SDimitry Andric 27830b57cec5SDimitry Andric // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 27840b57cec5SDimitry Andric // from the next instruction. 
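// For example, assuming BrOffset is the byte distance from the branch to its destination, a branch targeting the very next instruction sees 4 bytes == 1 dword here, which the adjustment below turns into an encoded SIMM16 of 0.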
27850b57cec5SDimitry Andric BrOffset -= 1; 27860b57cec5SDimitry Andric 27870b57cec5SDimitry Andric return isIntN(BranchOffsetBits, BrOffset); 27880b57cec5SDimitry Andric } 27890b57cec5SDimitry Andric 27905f757f3fSDimitry Andric MachineBasicBlock * 27915f757f3fSDimitry Andric SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 27920b57cec5SDimitry Andric return MI.getOperand(0).getMBB(); 27930b57cec5SDimitry Andric } 27940b57cec5SDimitry Andric 2795bdd1243dSDimitry Andric bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const { 2796bdd1243dSDimitry Andric for (const MachineInstr &MI : MBB->terminators()) { 2797bdd1243dSDimitry Andric if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || 2798bdd1243dSDimitry Andric MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || 2799bdd1243dSDimitry Andric MI.getOpcode() == AMDGPU::SI_LOOP) 2800bdd1243dSDimitry Andric return true; 2801bdd1243dSDimitry Andric } 2802bdd1243dSDimitry Andric return false; 2803bdd1243dSDimitry Andric } 2804bdd1243dSDimitry Andric 2805349cc55cSDimitry Andric void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 28060b57cec5SDimitry Andric MachineBasicBlock &DestBB, 2807349cc55cSDimitry Andric MachineBasicBlock &RestoreBB, 2808349cc55cSDimitry Andric const DebugLoc &DL, int64_t BrOffset, 28090b57cec5SDimitry Andric RegScavenger *RS) const { 28100b57cec5SDimitry Andric assert(RS && "RegScavenger required for long branching"); 28110b57cec5SDimitry Andric assert(MBB.empty() && 28120b57cec5SDimitry Andric "new block should be inserted for expanding unconditional branch"); 28130b57cec5SDimitry Andric assert(MBB.pred_size() == 1); 2814349cc55cSDimitry Andric assert(RestoreBB.empty() && 2815349cc55cSDimitry Andric "restore block should be inserted for restoring clobbered registers"); 28160b57cec5SDimitry Andric 28170b57cec5SDimitry Andric MachineFunction *MF = MBB.getParent(); 28180b57cec5SDimitry Andric MachineRegisterInfo &MRI = MF->getRegInfo(); 281906c3fb27SDimitry Andric const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 28200b57cec5SDimitry Andric 28210b57cec5SDimitry Andric // FIXME: Virtual register workaround for RegScavenger not working with empty 28220b57cec5SDimitry Andric // blocks. 28238bcb0991SDimitry Andric Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 28240b57cec5SDimitry Andric 28250b57cec5SDimitry Andric auto I = MBB.end(); 28260b57cec5SDimitry Andric 28270b57cec5SDimitry Andric // We need to compute the offset relative to the instruction immediately after 28280b57cec5SDimitry Andric // s_getpc_b64. Insert pc arithmetic code before last terminator. 
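// The s_getpc_b64 built below is tagged with a post_getpc label; the branch distance is later expressed as (destination - post_getpc) and fed to the two 32-bit adds through the offset_lo/offset_hi symbols.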
28290b57cec5SDimitry Andric MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 28300b57cec5SDimitry Andric 2831fe6060f1SDimitry Andric auto &MCCtx = MF->getContext(); 2832fe6060f1SDimitry Andric MCSymbol *PostGetPCLabel = 2833fe6060f1SDimitry Andric MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); 2834fe6060f1SDimitry Andric GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); 2835fe6060f1SDimitry Andric 2836fe6060f1SDimitry Andric MCSymbol *OffsetLo = 2837fe6060f1SDimitry Andric MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); 2838fe6060f1SDimitry Andric MCSymbol *OffsetHi = 2839fe6060f1SDimitry Andric MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); 28400b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 28410b57cec5SDimitry Andric .addReg(PCReg, RegState::Define, AMDGPU::sub0) 28420b57cec5SDimitry Andric .addReg(PCReg, 0, AMDGPU::sub0) 2843fe6060f1SDimitry Andric .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); 28440b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 28450b57cec5SDimitry Andric .addReg(PCReg, RegState::Define, AMDGPU::sub1) 28460b57cec5SDimitry Andric .addReg(PCReg, 0, AMDGPU::sub1) 2847fe6060f1SDimitry Andric .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); 28480b57cec5SDimitry Andric 28490b57cec5SDimitry Andric // Insert the indirect branch after the other terminator. 28500b57cec5SDimitry Andric BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 28510b57cec5SDimitry Andric .addReg(PCReg); 28520b57cec5SDimitry Andric 28530b57cec5SDimitry Andric // If a spill is needed for the pc register pair, we need to insert a spill 28540b57cec5SDimitry Andric // restore block right before the destination block, and insert a short branch 28550b57cec5SDimitry Andric // into the old destination block's fallthrough predecessor. 28560b57cec5SDimitry Andric // e.g.: 28570b57cec5SDimitry Andric // 28580b57cec5SDimitry Andric // s_cbranch_scc0 skip_long_branch: 28590b57cec5SDimitry Andric // 28600b57cec5SDimitry Andric // long_branch_bb: 28610b57cec5SDimitry Andric // spill s[8:9] 28620b57cec5SDimitry Andric // s_getpc_b64 s[8:9] 28630b57cec5SDimitry Andric // s_add_u32 s8, s8, restore_bb 28640b57cec5SDimitry Andric // s_addc_u32 s9, s9, 0 28650b57cec5SDimitry Andric // s_setpc_b64 s[8:9] 28660b57cec5SDimitry Andric // 28670b57cec5SDimitry Andric // skip_long_branch: 28680b57cec5SDimitry Andric // foo; 28690b57cec5SDimitry Andric // 28700b57cec5SDimitry Andric // ..... 
28710b57cec5SDimitry Andric // 28720b57cec5SDimitry Andric // dest_bb_fallthrough_predecessor: 28730b57cec5SDimitry Andric // bar; 28740b57cec5SDimitry Andric // s_branch dest_bb 28750b57cec5SDimitry Andric // 28760b57cec5SDimitry Andric // restore_bb: 28770b57cec5SDimitry Andric // restore s[8:9] 28780b57cec5SDimitry Andric // fallthrough dest_bb 28790b57cec5SDimitry Andric /// 28800b57cec5SDimitry Andric // dest_bb: 28810b57cec5SDimitry Andric // buzz; 28820b57cec5SDimitry Andric 288306c3fb27SDimitry Andric Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); 288406c3fb27SDimitry Andric Register Scav; 288506c3fb27SDimitry Andric 288606c3fb27SDimitry Andric // If we've previously reserved a register for long branches 288706c3fb27SDimitry Andric // avoid running the scavenger and just use those registers 288806c3fb27SDimitry Andric if (LongBranchReservedReg) { 288906c3fb27SDimitry Andric RS->enterBasicBlock(MBB); 289006c3fb27SDimitry Andric Scav = LongBranchReservedReg; 289106c3fb27SDimitry Andric } else { 28920b57cec5SDimitry Andric RS->enterBasicBlockEnd(MBB); 289306c3fb27SDimitry Andric Scav = RS->scavengeRegisterBackwards( 2894349cc55cSDimitry Andric AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), 2895349cc55cSDimitry Andric /* RestoreAfter */ false, 0, /* AllowSpill */ false); 289606c3fb27SDimitry Andric } 2897349cc55cSDimitry Andric if (Scav) { 2898349cc55cSDimitry Andric RS->setRegUsed(Scav); 28990b57cec5SDimitry Andric MRI.replaceRegWith(PCReg, Scav); 29000b57cec5SDimitry Andric MRI.clearVirtRegs(); 2901349cc55cSDimitry Andric } else { 2902349cc55cSDimitry Andric // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for 2903349cc55cSDimitry Andric // SGPR spill. 2904349cc55cSDimitry Andric const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 2905349cc55cSDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2906349cc55cSDimitry Andric TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); 2907349cc55cSDimitry Andric MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); 2908349cc55cSDimitry Andric MRI.clearVirtRegs(); 2909349cc55cSDimitry Andric } 29100b57cec5SDimitry Andric 2911349cc55cSDimitry Andric MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); 2912fe6060f1SDimitry Andric // Now, the distance could be defined. 2913fe6060f1SDimitry Andric auto *Offset = MCBinaryExpr::createSub( 2914349cc55cSDimitry Andric MCSymbolRefExpr::create(DestLabel, MCCtx), 2915fe6060f1SDimitry Andric MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); 2916fe6060f1SDimitry Andric // Add offset assignments. 
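// offset_lo receives the low 32 bits of (destination - post_getpc) and offset_hi the value arithmetically shifted right by 32, matching the s_add_u32/s_addc_u32 pair emitted above.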
2917fe6060f1SDimitry Andric auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); 2918fe6060f1SDimitry Andric OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); 2919fe6060f1SDimitry Andric auto *ShAmt = MCConstantExpr::create(32, MCCtx); 2920fe6060f1SDimitry Andric OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); 29210b57cec5SDimitry Andric } 29220b57cec5SDimitry Andric 29230b57cec5SDimitry Andric unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 29240b57cec5SDimitry Andric switch (Cond) { 29250b57cec5SDimitry Andric case SIInstrInfo::SCC_TRUE: 29260b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_SCC1; 29270b57cec5SDimitry Andric case SIInstrInfo::SCC_FALSE: 29280b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_SCC0; 29290b57cec5SDimitry Andric case SIInstrInfo::VCCNZ: 29300b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_VCCNZ; 29310b57cec5SDimitry Andric case SIInstrInfo::VCCZ: 29320b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_VCCZ; 29330b57cec5SDimitry Andric case SIInstrInfo::EXECNZ: 29340b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_EXECNZ; 29350b57cec5SDimitry Andric case SIInstrInfo::EXECZ: 29360b57cec5SDimitry Andric return AMDGPU::S_CBRANCH_EXECZ; 29370b57cec5SDimitry Andric default: 29380b57cec5SDimitry Andric llvm_unreachable("invalid branch predicate"); 29390b57cec5SDimitry Andric } 29400b57cec5SDimitry Andric } 29410b57cec5SDimitry Andric 29420b57cec5SDimitry Andric SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 29430b57cec5SDimitry Andric switch (Opcode) { 29440b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_SCC0: 29450b57cec5SDimitry Andric return SCC_FALSE; 29460b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_SCC1: 29470b57cec5SDimitry Andric return SCC_TRUE; 29480b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_VCCNZ: 29490b57cec5SDimitry Andric return VCCNZ; 29500b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_VCCZ: 29510b57cec5SDimitry Andric return VCCZ; 29520b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_EXECNZ: 29530b57cec5SDimitry Andric return EXECNZ; 29540b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_EXECZ: 29550b57cec5SDimitry Andric return EXECZ; 29560b57cec5SDimitry Andric default: 29570b57cec5SDimitry Andric return INVALID_BR; 29580b57cec5SDimitry Andric } 29590b57cec5SDimitry Andric } 29600b57cec5SDimitry Andric 29610b57cec5SDimitry Andric bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 29620b57cec5SDimitry Andric MachineBasicBlock::iterator I, 29630b57cec5SDimitry Andric MachineBasicBlock *&TBB, 29640b57cec5SDimitry Andric MachineBasicBlock *&FBB, 29650b57cec5SDimitry Andric SmallVectorImpl<MachineOperand> &Cond, 29660b57cec5SDimitry Andric bool AllowModify) const { 29670b57cec5SDimitry Andric if (I->getOpcode() == AMDGPU::S_BRANCH) { 29680b57cec5SDimitry Andric // Unconditional Branch 29690b57cec5SDimitry Andric TBB = I->getOperand(0).getMBB(); 29700b57cec5SDimitry Andric return false; 29710b57cec5SDimitry Andric } 29720b57cec5SDimitry Andric 29730b57cec5SDimitry Andric MachineBasicBlock *CondBB = nullptr; 29740b57cec5SDimitry Andric 29750b57cec5SDimitry Andric if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 29760b57cec5SDimitry Andric CondBB = I->getOperand(1).getMBB(); 29770b57cec5SDimitry Andric Cond.push_back(I->getOperand(0)); 29780b57cec5SDimitry Andric } else { 29790b57cec5SDimitry Andric BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 29800b57cec5SDimitry Andric if (Pred == INVALID_BR) 29810b57cec5SDimitry 
Andric return true; 29820b57cec5SDimitry Andric 29830b57cec5SDimitry Andric CondBB = I->getOperand(0).getMBB(); 29840b57cec5SDimitry Andric Cond.push_back(MachineOperand::CreateImm(Pred)); 29850b57cec5SDimitry Andric Cond.push_back(I->getOperand(1)); // Save the branch register. 29860b57cec5SDimitry Andric } 29870b57cec5SDimitry Andric ++I; 29880b57cec5SDimitry Andric 29890b57cec5SDimitry Andric if (I == MBB.end()) { 29900b57cec5SDimitry Andric // Conditional branch followed by fall-through. 29910b57cec5SDimitry Andric TBB = CondBB; 29920b57cec5SDimitry Andric return false; 29930b57cec5SDimitry Andric } 29940b57cec5SDimitry Andric 29950b57cec5SDimitry Andric if (I->getOpcode() == AMDGPU::S_BRANCH) { 29960b57cec5SDimitry Andric TBB = CondBB; 29970b57cec5SDimitry Andric FBB = I->getOperand(0).getMBB(); 29980b57cec5SDimitry Andric return false; 29990b57cec5SDimitry Andric } 30000b57cec5SDimitry Andric 30010b57cec5SDimitry Andric return true; 30020b57cec5SDimitry Andric } 30030b57cec5SDimitry Andric 30040b57cec5SDimitry Andric bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 30050b57cec5SDimitry Andric MachineBasicBlock *&FBB, 30060b57cec5SDimitry Andric SmallVectorImpl<MachineOperand> &Cond, 30070b57cec5SDimitry Andric bool AllowModify) const { 30080b57cec5SDimitry Andric MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 30090b57cec5SDimitry Andric auto E = MBB.end(); 30100b57cec5SDimitry Andric if (I == E) 30110b57cec5SDimitry Andric return false; 30120b57cec5SDimitry Andric 30130b57cec5SDimitry Andric // Skip over the instructions that are artificially terminators for special 30140b57cec5SDimitry Andric // exec management. 3015fe6060f1SDimitry Andric while (I != E && !I->isBranch() && !I->isReturn()) { 30160b57cec5SDimitry Andric switch (I->getOpcode()) { 30170b57cec5SDimitry Andric case AMDGPU::S_MOV_B64_term: 30180b57cec5SDimitry Andric case AMDGPU::S_XOR_B64_term: 3019e8d8bef9SDimitry Andric case AMDGPU::S_OR_B64_term: 30200b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B64_term: 3021fe6060f1SDimitry Andric case AMDGPU::S_AND_B64_term: 302206c3fb27SDimitry Andric case AMDGPU::S_AND_SAVEEXEC_B64_term: 30230b57cec5SDimitry Andric case AMDGPU::S_MOV_B32_term: 30240b57cec5SDimitry Andric case AMDGPU::S_XOR_B32_term: 30250b57cec5SDimitry Andric case AMDGPU::S_OR_B32_term: 30260b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B32_term: 3027fe6060f1SDimitry Andric case AMDGPU::S_AND_B32_term: 302806c3fb27SDimitry Andric case AMDGPU::S_AND_SAVEEXEC_B32_term: 30290b57cec5SDimitry Andric break; 30300b57cec5SDimitry Andric case AMDGPU::SI_IF: 30310b57cec5SDimitry Andric case AMDGPU::SI_ELSE: 30320b57cec5SDimitry Andric case AMDGPU::SI_KILL_I1_TERMINATOR: 30330b57cec5SDimitry Andric case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 30340b57cec5SDimitry Andric // FIXME: It's messy that these need to be considered here at all. 
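// Returning true reports the block as unanalyzable, which keeps generic
// branch folding from rearranging control flow around terminators that also
// manipulate EXEC.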
30350b57cec5SDimitry Andric return true; 30360b57cec5SDimitry Andric default: 30370b57cec5SDimitry Andric llvm_unreachable("unexpected non-branch terminator inst"); 30380b57cec5SDimitry Andric } 30390b57cec5SDimitry Andric 30400b57cec5SDimitry Andric ++I; 30410b57cec5SDimitry Andric } 30420b57cec5SDimitry Andric 30430b57cec5SDimitry Andric if (I == E) 30440b57cec5SDimitry Andric return false; 30450b57cec5SDimitry Andric 30460b57cec5SDimitry Andric return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 30470b57cec5SDimitry Andric } 30480b57cec5SDimitry Andric 30490b57cec5SDimitry Andric unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 30500b57cec5SDimitry Andric int *BytesRemoved) const { 30510b57cec5SDimitry Andric unsigned Count = 0; 30520b57cec5SDimitry Andric unsigned RemovedSize = 0; 3053349cc55cSDimitry Andric for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) { 3054349cc55cSDimitry Andric // Skip over artificial terminators when removing instructions. 3055349cc55cSDimitry Andric if (MI.isBranch() || MI.isReturn()) { 3056349cc55cSDimitry Andric RemovedSize += getInstSizeInBytes(MI); 3057349cc55cSDimitry Andric MI.eraseFromParent(); 30580b57cec5SDimitry Andric ++Count; 3059349cc55cSDimitry Andric } 30600b57cec5SDimitry Andric } 30610b57cec5SDimitry Andric 30620b57cec5SDimitry Andric if (BytesRemoved) 30630b57cec5SDimitry Andric *BytesRemoved = RemovedSize; 30640b57cec5SDimitry Andric 30650b57cec5SDimitry Andric return Count; 30660b57cec5SDimitry Andric } 30670b57cec5SDimitry Andric 30680b57cec5SDimitry Andric // Copy the flags onto the implicit condition register operand. 30690b57cec5SDimitry Andric static void preserveCondRegFlags(MachineOperand &CondReg, 30700b57cec5SDimitry Andric const MachineOperand &OrigCond) { 30710b57cec5SDimitry Andric CondReg.setIsUndef(OrigCond.isUndef()); 30720b57cec5SDimitry Andric CondReg.setIsKill(OrigCond.isKill()); 30730b57cec5SDimitry Andric } 30740b57cec5SDimitry Andric 30750b57cec5SDimitry Andric unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 30760b57cec5SDimitry Andric MachineBasicBlock *TBB, 30770b57cec5SDimitry Andric MachineBasicBlock *FBB, 30780b57cec5SDimitry Andric ArrayRef<MachineOperand> Cond, 30790b57cec5SDimitry Andric const DebugLoc &DL, 30800b57cec5SDimitry Andric int *BytesAdded) const { 30810b57cec5SDimitry Andric if (!FBB && Cond.empty()) { 30820b57cec5SDimitry Andric BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 30830b57cec5SDimitry Andric .addMBB(TBB); 30840b57cec5SDimitry Andric if (BytesAdded) 3085e8d8bef9SDimitry Andric *BytesAdded = ST.hasOffset3fBug() ? 
8 : 4; 30860b57cec5SDimitry Andric return 1; 30870b57cec5SDimitry Andric } 30880b57cec5SDimitry Andric 30890b57cec5SDimitry Andric if(Cond.size() == 1 && Cond[0].isReg()) { 30900b57cec5SDimitry Andric BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 30910b57cec5SDimitry Andric .add(Cond[0]) 30920b57cec5SDimitry Andric .addMBB(TBB); 30930b57cec5SDimitry Andric return 1; 30940b57cec5SDimitry Andric } 30950b57cec5SDimitry Andric 30960b57cec5SDimitry Andric assert(TBB && Cond[0].isImm()); 30970b57cec5SDimitry Andric 30980b57cec5SDimitry Andric unsigned Opcode 30990b57cec5SDimitry Andric = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 31000b57cec5SDimitry Andric 31010b57cec5SDimitry Andric if (!FBB) { 31020b57cec5SDimitry Andric MachineInstr *CondBr = 31030b57cec5SDimitry Andric BuildMI(&MBB, DL, get(Opcode)) 31040b57cec5SDimitry Andric .addMBB(TBB); 31050b57cec5SDimitry Andric 31060b57cec5SDimitry Andric // Copy the flags onto the implicit condition register operand. 31070b57cec5SDimitry Andric preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 31085ffd83dbSDimitry Andric fixImplicitOperands(*CondBr); 31090b57cec5SDimitry Andric 31100b57cec5SDimitry Andric if (BytesAdded) 3111e8d8bef9SDimitry Andric *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; 31120b57cec5SDimitry Andric return 1; 31130b57cec5SDimitry Andric } 31140b57cec5SDimitry Andric 31150b57cec5SDimitry Andric assert(TBB && FBB); 31160b57cec5SDimitry Andric 31170b57cec5SDimitry Andric MachineInstr *CondBr = 31180b57cec5SDimitry Andric BuildMI(&MBB, DL, get(Opcode)) 31190b57cec5SDimitry Andric .addMBB(TBB); 3120fe6060f1SDimitry Andric fixImplicitOperands(*CondBr); 31210b57cec5SDimitry Andric BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 31220b57cec5SDimitry Andric .addMBB(FBB); 31230b57cec5SDimitry Andric 31240b57cec5SDimitry Andric MachineOperand &CondReg = CondBr->getOperand(1); 31250b57cec5SDimitry Andric CondReg.setIsUndef(Cond[1].isUndef()); 31260b57cec5SDimitry Andric CondReg.setIsKill(Cond[1].isKill()); 31270b57cec5SDimitry Andric 31280b57cec5SDimitry Andric if (BytesAdded) 3129e8d8bef9SDimitry Andric *BytesAdded = ST.hasOffset3fBug() ? 
16 : 8; 31300b57cec5SDimitry Andric 31310b57cec5SDimitry Andric return 2; 31320b57cec5SDimitry Andric } 31330b57cec5SDimitry Andric 31340b57cec5SDimitry Andric bool SIInstrInfo::reverseBranchCondition( 31350b57cec5SDimitry Andric SmallVectorImpl<MachineOperand> &Cond) const { 31360b57cec5SDimitry Andric if (Cond.size() != 2) { 31370b57cec5SDimitry Andric return true; 31380b57cec5SDimitry Andric } 31390b57cec5SDimitry Andric 31400b57cec5SDimitry Andric if (Cond[0].isImm()) { 31410b57cec5SDimitry Andric Cond[0].setImm(-Cond[0].getImm()); 31420b57cec5SDimitry Andric return false; 31430b57cec5SDimitry Andric } 31440b57cec5SDimitry Andric 31450b57cec5SDimitry Andric return true; 31460b57cec5SDimitry Andric } 31470b57cec5SDimitry Andric 31480b57cec5SDimitry Andric bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 31490b57cec5SDimitry Andric ArrayRef<MachineOperand> Cond, 31505ffd83dbSDimitry Andric Register DstReg, Register TrueReg, 31515ffd83dbSDimitry Andric Register FalseReg, int &CondCycles, 31520b57cec5SDimitry Andric int &TrueCycles, int &FalseCycles) const { 31530b57cec5SDimitry Andric switch (Cond[0].getImm()) { 31540b57cec5SDimitry Andric case VCCNZ: 31550b57cec5SDimitry Andric case VCCZ: { 31560b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 31570b57cec5SDimitry Andric const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 3158e8d8bef9SDimitry Andric if (MRI.getRegClass(FalseReg) != RC) 3159e8d8bef9SDimitry Andric return false; 31600b57cec5SDimitry Andric 316106c3fb27SDimitry Andric int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; 31620b57cec5SDimitry Andric CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 31630b57cec5SDimitry Andric 31640b57cec5SDimitry Andric // Limit to equal cost for branch vs. N v_cndmask_b32s. 31650b57cec5SDimitry Andric return RI.hasVGPRs(RC) && NumInsts <= 6; 31660b57cec5SDimitry Andric } 31670b57cec5SDimitry Andric case SCC_TRUE: 31680b57cec5SDimitry Andric case SCC_FALSE: { 31690b57cec5SDimitry Andric // FIXME: We could insert for VGPRs if we could replace the original compare 31700b57cec5SDimitry Andric // with a vector one. 31710b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 31720b57cec5SDimitry Andric const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 3173e8d8bef9SDimitry Andric if (MRI.getRegClass(FalseReg) != RC) 3174e8d8bef9SDimitry Andric return false; 31750b57cec5SDimitry Andric 317606c3fb27SDimitry Andric int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; 31770b57cec5SDimitry Andric 31780b57cec5SDimitry Andric // Multiples of 8 can do s_cselect_b64 31790b57cec5SDimitry Andric if (NumInsts % 2 == 0) 31800b57cec5SDimitry Andric NumInsts /= 2; 31810b57cec5SDimitry Andric 31820b57cec5SDimitry Andric CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 
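// For example, a 128-bit SGPR select (NumInsts = 4) is costed as two
// s_cselect_b64 operations, while a 96-bit one stays at three s_cselect_b32
// operations (illustrative cost estimates).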
31830b57cec5SDimitry Andric return RI.isSGPRClass(RC); 31840b57cec5SDimitry Andric } 31850b57cec5SDimitry Andric default: 31860b57cec5SDimitry Andric return false; 31870b57cec5SDimitry Andric } 31880b57cec5SDimitry Andric } 31890b57cec5SDimitry Andric 31900b57cec5SDimitry Andric void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 31910b57cec5SDimitry Andric MachineBasicBlock::iterator I, const DebugLoc &DL, 31925ffd83dbSDimitry Andric Register DstReg, ArrayRef<MachineOperand> Cond, 31935ffd83dbSDimitry Andric Register TrueReg, Register FalseReg) const { 31940b57cec5SDimitry Andric BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 31950b57cec5SDimitry Andric if (Pred == VCCZ || Pred == SCC_FALSE) { 31960b57cec5SDimitry Andric Pred = static_cast<BranchPredicate>(-Pred); 31970b57cec5SDimitry Andric std::swap(TrueReg, FalseReg); 31980b57cec5SDimitry Andric } 31990b57cec5SDimitry Andric 32000b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 32010b57cec5SDimitry Andric const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 32020b57cec5SDimitry Andric unsigned DstSize = RI.getRegSizeInBits(*DstRC); 32030b57cec5SDimitry Andric 32040b57cec5SDimitry Andric if (DstSize == 32) { 32055ffd83dbSDimitry Andric MachineInstr *Select; 32065ffd83dbSDimitry Andric if (Pred == SCC_TRUE) { 32075ffd83dbSDimitry Andric Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 32085ffd83dbSDimitry Andric .addReg(TrueReg) 32095ffd83dbSDimitry Andric .addReg(FalseReg); 32105ffd83dbSDimitry Andric } else { 32110b57cec5SDimitry Andric // Instruction's operands are backwards from what is expected. 32125ffd83dbSDimitry Andric Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 32130b57cec5SDimitry Andric .addReg(FalseReg) 32140b57cec5SDimitry Andric .addReg(TrueReg); 32155ffd83dbSDimitry Andric } 32160b57cec5SDimitry Andric 32170b57cec5SDimitry Andric preserveCondRegFlags(Select->getOperand(3), Cond[1]); 32180b57cec5SDimitry Andric return; 32190b57cec5SDimitry Andric } 32200b57cec5SDimitry Andric 32210b57cec5SDimitry Andric if (DstSize == 64 && Pred == SCC_TRUE) { 32220b57cec5SDimitry Andric MachineInstr *Select = 32230b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 32245ffd83dbSDimitry Andric .addReg(TrueReg) 32255ffd83dbSDimitry Andric .addReg(FalseReg); 32260b57cec5SDimitry Andric 32270b57cec5SDimitry Andric preserveCondRegFlags(Select->getOperand(3), Cond[1]); 32280b57cec5SDimitry Andric return; 32290b57cec5SDimitry Andric } 32300b57cec5SDimitry Andric 32310b57cec5SDimitry Andric static const int16_t Sub0_15[] = { 32320b57cec5SDimitry Andric AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 32330b57cec5SDimitry Andric AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 32340b57cec5SDimitry Andric AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 32350b57cec5SDimitry Andric AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 32360b57cec5SDimitry Andric }; 32370b57cec5SDimitry Andric 32380b57cec5SDimitry Andric static const int16_t Sub0_15_64[] = { 32390b57cec5SDimitry Andric AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 32400b57cec5SDimitry Andric AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 32410b57cec5SDimitry Andric AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 32420b57cec5SDimitry Andric AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 32430b57cec5SDimitry Andric }; 32440b57cec5SDimitry Andric 32450b57cec5SDimitry Andric unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 32460b57cec5SDimitry Andric const 
TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 32470b57cec5SDimitry Andric const int16_t *SubIndices = Sub0_15; 32480b57cec5SDimitry Andric int NElts = DstSize / 32; 32490b57cec5SDimitry Andric 32500b57cec5SDimitry Andric // 64-bit select is only available for SALU. 32510b57cec5SDimitry Andric // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 32520b57cec5SDimitry Andric if (Pred == SCC_TRUE) { 32530b57cec5SDimitry Andric if (NElts % 2) { 32540b57cec5SDimitry Andric SelOp = AMDGPU::S_CSELECT_B32; 32550b57cec5SDimitry Andric EltRC = &AMDGPU::SGPR_32RegClass; 32560b57cec5SDimitry Andric } else { 32570b57cec5SDimitry Andric SelOp = AMDGPU::S_CSELECT_B64; 32580b57cec5SDimitry Andric EltRC = &AMDGPU::SGPR_64RegClass; 32590b57cec5SDimitry Andric SubIndices = Sub0_15_64; 32600b57cec5SDimitry Andric NElts /= 2; 32610b57cec5SDimitry Andric } 32620b57cec5SDimitry Andric } 32630b57cec5SDimitry Andric 32640b57cec5SDimitry Andric MachineInstrBuilder MIB = BuildMI( 32650b57cec5SDimitry Andric MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 32660b57cec5SDimitry Andric 32670b57cec5SDimitry Andric I = MIB->getIterator(); 32680b57cec5SDimitry Andric 32695ffd83dbSDimitry Andric SmallVector<Register, 8> Regs; 32700b57cec5SDimitry Andric for (int Idx = 0; Idx != NElts; ++Idx) { 32718bcb0991SDimitry Andric Register DstElt = MRI.createVirtualRegister(EltRC); 32720b57cec5SDimitry Andric Regs.push_back(DstElt); 32730b57cec5SDimitry Andric 32740b57cec5SDimitry Andric unsigned SubIdx = SubIndices[Idx]; 32750b57cec5SDimitry Andric 32765ffd83dbSDimitry Andric MachineInstr *Select; 32775ffd83dbSDimitry Andric if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 32785ffd83dbSDimitry Andric Select = 32790b57cec5SDimitry Andric BuildMI(MBB, I, DL, get(SelOp), DstElt) 32800b57cec5SDimitry Andric .addReg(FalseReg, 0, SubIdx) 32810b57cec5SDimitry Andric .addReg(TrueReg, 0, SubIdx); 32825ffd83dbSDimitry Andric } else { 32835ffd83dbSDimitry Andric Select = 32845ffd83dbSDimitry Andric BuildMI(MBB, I, DL, get(SelOp), DstElt) 32855ffd83dbSDimitry Andric .addReg(TrueReg, 0, SubIdx) 32865ffd83dbSDimitry Andric .addReg(FalseReg, 0, SubIdx); 32875ffd83dbSDimitry Andric } 32885ffd83dbSDimitry Andric 32890b57cec5SDimitry Andric preserveCondRegFlags(Select->getOperand(3), Cond[1]); 32900b57cec5SDimitry Andric fixImplicitOperands(*Select); 32910b57cec5SDimitry Andric 32920b57cec5SDimitry Andric MIB.addReg(DstElt) 32930b57cec5SDimitry Andric .addImm(SubIdx); 32940b57cec5SDimitry Andric } 32950b57cec5SDimitry Andric } 32960b57cec5SDimitry Andric 3297349cc55cSDimitry Andric bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { 32980b57cec5SDimitry Andric switch (MI.getOpcode()) { 32990b57cec5SDimitry Andric case AMDGPU::V_MOV_B32_e32: 33000b57cec5SDimitry Andric case AMDGPU::V_MOV_B32_e64: 3301349cc55cSDimitry Andric case AMDGPU::V_MOV_B64_PSEUDO: 330281ad6265SDimitry Andric case AMDGPU::V_MOV_B64_e32: 330381ad6265SDimitry Andric case AMDGPU::V_MOV_B64_e64: 33040b57cec5SDimitry Andric case AMDGPU::S_MOV_B32: 33050b57cec5SDimitry Andric case AMDGPU::S_MOV_B64: 33065f757f3fSDimitry Andric case AMDGPU::S_MOV_B64_IMM_PSEUDO: 33070b57cec5SDimitry Andric case AMDGPU::COPY: 33085f757f3fSDimitry Andric case AMDGPU::WWM_COPY: 3309e8d8bef9SDimitry Andric case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 3310e8d8bef9SDimitry Andric case AMDGPU::V_ACCVGPR_READ_B32_e64: 3311fe6060f1SDimitry Andric case AMDGPU::V_ACCVGPR_MOV_B32: 33120b57cec5SDimitry Andric return true; 33130b57cec5SDimitry Andric default: 33140b57cec5SDimitry Andric return false; 
33150b57cec5SDimitry Andric } 33160b57cec5SDimitry Andric } 33170b57cec5SDimitry Andric 331881ad6265SDimitry Andric static constexpr unsigned ModifierOpNames[] = { 331981ad6265SDimitry Andric AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, 332081ad6265SDimitry Andric AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, 3321bdd1243dSDimitry Andric AMDGPU::OpName::omod, AMDGPU::OpName::op_sel}; 33220b57cec5SDimitry Andric 332381ad6265SDimitry Andric void SIInstrInfo::removeModOperands(MachineInstr &MI) const { 33240b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 3325bdd1243dSDimitry Andric for (unsigned Name : reverse(ModifierOpNames)) { 3326bdd1243dSDimitry Andric int Idx = AMDGPU::getNamedOperandIdx(Opc, Name); 3327bdd1243dSDimitry Andric if (Idx >= 0) 3328bdd1243dSDimitry Andric MI.removeOperand(Idx); 3329bdd1243dSDimitry Andric } 33300b57cec5SDimitry Andric } 33310b57cec5SDimitry Andric 33320b57cec5SDimitry Andric bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 33335ffd83dbSDimitry Andric Register Reg, MachineRegisterInfo *MRI) const { 33340b57cec5SDimitry Andric if (!MRI->hasOneNonDBGUse(Reg)) 33350b57cec5SDimitry Andric return false; 33360b57cec5SDimitry Andric 33370b57cec5SDimitry Andric switch (DefMI.getOpcode()) { 33380b57cec5SDimitry Andric default: 33390b57cec5SDimitry Andric return false; 33405f757f3fSDimitry Andric case AMDGPU::V_MOV_B64_e32: 33410b57cec5SDimitry Andric case AMDGPU::S_MOV_B64: 33425f757f3fSDimitry Andric case AMDGPU::V_MOV_B64_PSEUDO: 33435f757f3fSDimitry Andric case AMDGPU::S_MOV_B64_IMM_PSEUDO: 33440b57cec5SDimitry Andric case AMDGPU::V_MOV_B32_e32: 33450b57cec5SDimitry Andric case AMDGPU::S_MOV_B32: 3346e8d8bef9SDimitry Andric case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 33470b57cec5SDimitry Andric break; 33480b57cec5SDimitry Andric } 33490b57cec5SDimitry Andric 33500b57cec5SDimitry Andric const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 33510b57cec5SDimitry Andric assert(ImmOp); 33520b57cec5SDimitry Andric // FIXME: We could handle FrameIndex values here. 
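// A typical candidate looks like (illustrative MIR):
//   %0:sreg_32 = S_MOV_B32 42
//   %1:vgpr_32 = COPY %0
// where %0 has no other non-debug use; the COPY is rewritten below into a
// V_MOV_B32_e32 with the immediate 42 folded in.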
33530b57cec5SDimitry Andric if (!ImmOp->isImm()) 33540b57cec5SDimitry Andric return false; 33550b57cec5SDimitry Andric 33565f757f3fSDimitry Andric auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t { 33575f757f3fSDimitry Andric int64_t Imm = ImmOp->getImm(); 33585f757f3fSDimitry Andric switch (UseOp.getSubReg()) { 33595f757f3fSDimitry Andric default: 33605f757f3fSDimitry Andric return Imm; 33615f757f3fSDimitry Andric case AMDGPU::sub0: 33625f757f3fSDimitry Andric return Lo_32(Imm); 33635f757f3fSDimitry Andric case AMDGPU::sub1: 33645f757f3fSDimitry Andric return Hi_32(Imm); 33655f757f3fSDimitry Andric case AMDGPU::lo16: 33665f757f3fSDimitry Andric return APInt(16, Imm).getSExtValue(); 33675f757f3fSDimitry Andric case AMDGPU::hi16: 33685f757f3fSDimitry Andric return APInt(32, Imm).ashr(16).getSExtValue(); 33695f757f3fSDimitry Andric case AMDGPU::sub1_lo16: 33705f757f3fSDimitry Andric return APInt(16, Hi_32(Imm)).getSExtValue(); 33715f757f3fSDimitry Andric case AMDGPU::sub1_hi16: 33725f757f3fSDimitry Andric return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue(); 33735f757f3fSDimitry Andric } 33745f757f3fSDimitry Andric }; 33755f757f3fSDimitry Andric 33765f757f3fSDimitry Andric assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); 33775f757f3fSDimitry Andric 33780b57cec5SDimitry Andric unsigned Opc = UseMI.getOpcode(); 33790b57cec5SDimitry Andric if (Opc == AMDGPU::COPY) { 33805f757f3fSDimitry Andric assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form"); 33815ffd83dbSDimitry Andric 33825f757f3fSDimitry Andric Register DstReg = UseMI.getOperand(0).getReg(); 33835f757f3fSDimitry Andric unsigned OpSize = getOpSize(UseMI, 0); 33845f757f3fSDimitry Andric bool Is16Bit = OpSize == 2; 33855f757f3fSDimitry Andric bool Is64Bit = OpSize == 8; 33865f757f3fSDimitry Andric bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); 33875f757f3fSDimitry Andric unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO 33885f757f3fSDimitry Andric : AMDGPU::V_MOV_B32_e32 33895f757f3fSDimitry Andric : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO 33905f757f3fSDimitry Andric : AMDGPU::S_MOV_B32; 33915f757f3fSDimitry Andric APInt Imm(Is64Bit ? 
64 : 32, getImmFor(UseMI.getOperand(1))); 33925ffd83dbSDimitry Andric 33935ffd83dbSDimitry Andric if (RI.isAGPR(*MRI, DstReg)) { 33945f757f3fSDimitry Andric if (Is64Bit || !isInlineConstant(Imm)) 33950b57cec5SDimitry Andric return false; 3396e8d8bef9SDimitry Andric NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; 33970b57cec5SDimitry Andric } 33985ffd83dbSDimitry Andric 33995ffd83dbSDimitry Andric if (Is16Bit) { 34005ffd83dbSDimitry Andric if (isVGPRCopy) 34015ffd83dbSDimitry Andric return false; // Do not clobber vgpr_hi16 34025ffd83dbSDimitry Andric 34034824e7fdSDimitry Andric if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 34045ffd83dbSDimitry Andric return false; 34055ffd83dbSDimitry Andric 34065ffd83dbSDimitry Andric UseMI.getOperand(0).setSubReg(0); 34075ffd83dbSDimitry Andric if (DstReg.isPhysical()) { 34085ffd83dbSDimitry Andric DstReg = RI.get32BitRegister(DstReg); 34095ffd83dbSDimitry Andric UseMI.getOperand(0).setReg(DstReg); 34105ffd83dbSDimitry Andric } 34115ffd83dbSDimitry Andric assert(UseMI.getOperand(1).getReg().isVirtual()); 34125ffd83dbSDimitry Andric } 34135ffd83dbSDimitry Andric 341406c3fb27SDimitry Andric const MCInstrDesc &NewMCID = get(NewOpc); 341506c3fb27SDimitry Andric if (DstReg.isPhysical() && 341606c3fb27SDimitry Andric !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg)) 341706c3fb27SDimitry Andric return false; 341806c3fb27SDimitry Andric 341906c3fb27SDimitry Andric UseMI.setDesc(NewMCID); 34205ffd83dbSDimitry Andric UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 34210b57cec5SDimitry Andric UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 34220b57cec5SDimitry Andric return true; 34230b57cec5SDimitry Andric } 34240b57cec5SDimitry Andric 3425e8d8bef9SDimitry Andric if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 3426e8d8bef9SDimitry Andric Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3427e8d8bef9SDimitry Andric Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 3428bdd1243dSDimitry Andric Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || 3429bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F16_t16_e64) { 34300b57cec5SDimitry Andric // Don't fold if we are using source or output modifiers. The new VOP2 34310b57cec5SDimitry Andric // instructions don't have them. 34320b57cec5SDimitry Andric if (hasAnyModifiersSet(UseMI)) 34330b57cec5SDimitry Andric return false; 34340b57cec5SDimitry Andric 34350b57cec5SDimitry Andric // If this is a free constant, there's no reason to do this. 34360b57cec5SDimitry Andric // TODO: We could fold this here instead of letting SIFoldOperands do it 34370b57cec5SDimitry Andric // later. 34380b57cec5SDimitry Andric MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 34390b57cec5SDimitry Andric 34400b57cec5SDimitry Andric // Any src operand can be used for the legality check. 
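// For example, folding 1.0 (an inline constant for f32) would gain nothing
// since it never occupies a literal slot, while folding an arbitrary value
// such as 123.0f (0x42f60000) is exactly what the madmk/madak forms are for
// (illustrative values).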
34410b57cec5SDimitry Andric if (isInlineConstant(UseMI, *Src0, *ImmOp)) 34420b57cec5SDimitry Andric return false; 34430b57cec5SDimitry Andric 3444e8d8bef9SDimitry Andric bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 3445e8d8bef9SDimitry Andric Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; 3446bdd1243dSDimitry Andric bool IsFMA = 3447bdd1243dSDimitry Andric Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 3448bdd1243dSDimitry Andric Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || 3449bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F16_t16_e64; 34500b57cec5SDimitry Andric MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 34510b57cec5SDimitry Andric MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 34520b57cec5SDimitry Andric 34530b57cec5SDimitry Andric // Multiplied part is the constant: Use v_madmk_{f16, f32}. 34545f757f3fSDimitry Andric if ((Src0->isReg() && Src0->getReg() == Reg) || 34555f757f3fSDimitry Andric (Src1->isReg() && Src1->getReg() == Reg)) { 34565f757f3fSDimitry Andric MachineOperand *RegSrc = 34575f757f3fSDimitry Andric Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1; 34585f757f3fSDimitry Andric if (!RegSrc->isReg()) 34595f757f3fSDimitry Andric return false; 34605f757f3fSDimitry Andric if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) && 34615f757f3fSDimitry Andric ST.getConstantBusLimit(Opc) < 2) 34620b57cec5SDimitry Andric return false; 34630b57cec5SDimitry Andric 34640b57cec5SDimitry Andric if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 34650b57cec5SDimitry Andric return false; 34660b57cec5SDimitry Andric 34675f757f3fSDimitry Andric // If src2 is also a literal constant then we have to choose which one to 34685f757f3fSDimitry Andric // fold. In general it is better to choose madak so that the other literal 34695f757f3fSDimitry Andric // can be materialized in an sgpr instead of a vgpr: 34705f757f3fSDimitry Andric // s_mov_b32 s0, literal 34715f757f3fSDimitry Andric // v_madak_f32 v0, s0, v0, literal 34725f757f3fSDimitry Andric // Instead of: 34735f757f3fSDimitry Andric // v_mov_b32 v1, literal 34745f757f3fSDimitry Andric // v_madmk_f32 v0, v0, literal, v1 34755f757f3fSDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg()); 34765f757f3fSDimitry Andric if (Def && Def->isMoveImmediate() && 34775f757f3fSDimitry Andric !isInlineConstant(Def->getOperand(1))) 34785f757f3fSDimitry Andric return false; 34795f757f3fSDimitry Andric 34800b57cec5SDimitry Andric unsigned NewOpc = 3481bdd1243dSDimitry Andric IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 3482bdd1243dSDimitry Andric : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 3483bdd1243dSDimitry Andric : AMDGPU::V_FMAMK_F16) 34840b57cec5SDimitry Andric : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 34850b57cec5SDimitry Andric if (pseudoToMCOpcode(NewOpc) == -1) 34860b57cec5SDimitry Andric return false; 34870b57cec5SDimitry Andric 34885f757f3fSDimitry Andric // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite 34895f757f3fSDimitry Andric // would also require restricting their register classes. For now 34905f757f3fSDimitry Andric // just bail out. 34915f757f3fSDimitry Andric if (NewOpc == AMDGPU::V_FMAMK_F16_t16) 34925f757f3fSDimitry Andric return false; 34930b57cec5SDimitry Andric 34945f757f3fSDimitry Andric const int64_t Imm = getImmFor(RegSrc == Src1 ? 
*Src0 : *Src1); 34950b57cec5SDimitry Andric 34960b57cec5SDimitry Andric // FIXME: This would be a lot easier if we could return a new instruction 34970b57cec5SDimitry Andric // instead of having to modify in place. 34980b57cec5SDimitry Andric 34995f757f3fSDimitry Andric Register SrcReg = RegSrc->getReg(); 35005f757f3fSDimitry Andric unsigned SrcSubReg = RegSrc->getSubReg(); 35015f757f3fSDimitry Andric Src0->setReg(SrcReg); 35025f757f3fSDimitry Andric Src0->setSubReg(SrcSubReg); 35035f757f3fSDimitry Andric Src0->setIsKill(RegSrc->isKill()); 35040b57cec5SDimitry Andric 3505bdd1243dSDimitry Andric if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3506bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || 35070b57cec5SDimitry Andric Opc == AMDGPU::V_FMAC_F16_e64) 35080b57cec5SDimitry Andric UseMI.untieRegOperand( 35090b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 35100b57cec5SDimitry Andric 35110b57cec5SDimitry Andric Src1->ChangeToImmediate(Imm); 35120b57cec5SDimitry Andric 35130b57cec5SDimitry Andric removeModOperands(UseMI); 35140b57cec5SDimitry Andric UseMI.setDesc(get(NewOpc)); 35150b57cec5SDimitry Andric 351681ad6265SDimitry Andric bool DeleteDef = MRI->use_nodbg_empty(Reg); 35170b57cec5SDimitry Andric if (DeleteDef) 35180b57cec5SDimitry Andric DefMI.eraseFromParent(); 35190b57cec5SDimitry Andric 35200b57cec5SDimitry Andric return true; 35210b57cec5SDimitry Andric } 35220b57cec5SDimitry Andric 35230b57cec5SDimitry Andric // Added part is the constant: Use v_madak_{f16, f32}. 35240b57cec5SDimitry Andric if (Src2->isReg() && Src2->getReg() == Reg) { 35255f757f3fSDimitry Andric if (ST.getConstantBusLimit(Opc) < 2) { 35260b57cec5SDimitry Andric // Not allowed to use constant bus for another operand. 35270b57cec5SDimitry Andric // We can however allow an inline immediate as src0. 35280b57cec5SDimitry Andric bool Src0Inlined = false; 35290b57cec5SDimitry Andric if (Src0->isReg()) { 35300b57cec5SDimitry Andric // Try to inline constant if possible. 35310b57cec5SDimitry Andric // If the Def moves immediate and the use is single 35320b57cec5SDimitry Andric // We are saving VGPR here. 
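// E.g. if src0 comes from "%x = V_MOV_B32_e32 2" and that def has a single
// use, src0 becomes the inline constant 2 and the one available
// constant-bus slot is left for the V_MADAK literal (illustrative snippet).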
35330b57cec5SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 35340b57cec5SDimitry Andric if (Def && Def->isMoveImmediate() && 35350b57cec5SDimitry Andric isInlineConstant(Def->getOperand(1)) && 35360b57cec5SDimitry Andric MRI->hasOneUse(Src0->getReg())) { 35370b57cec5SDimitry Andric Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 35380b57cec5SDimitry Andric Src0Inlined = true; 35395f757f3fSDimitry Andric } else if (ST.getConstantBusLimit(Opc) <= 1 && 35405f757f3fSDimitry Andric RI.isSGPRReg(*MRI, Src0->getReg())) { 35410b57cec5SDimitry Andric return false; 35425f757f3fSDimitry Andric } 35430b57cec5SDimitry Andric // VGPR is okay as Src0 - fallthrough 35440b57cec5SDimitry Andric } 35450b57cec5SDimitry Andric 35460b57cec5SDimitry Andric if (Src1->isReg() && !Src0Inlined) { 35470b57cec5SDimitry Andric // We have one slot for inlinable constant so far - try to fill it 35480b57cec5SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 35490b57cec5SDimitry Andric if (Def && Def->isMoveImmediate() && 35500b57cec5SDimitry Andric isInlineConstant(Def->getOperand(1)) && 35515f757f3fSDimitry Andric MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) 35520b57cec5SDimitry Andric Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 35535f757f3fSDimitry Andric else if (RI.isSGPRReg(*MRI, Src1->getReg())) 35540b57cec5SDimitry Andric return false; 35550b57cec5SDimitry Andric // VGPR is okay as Src1 - fallthrough 35560b57cec5SDimitry Andric } 35575f757f3fSDimitry Andric } 35580b57cec5SDimitry Andric 35590b57cec5SDimitry Andric unsigned NewOpc = 3560bdd1243dSDimitry Andric IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 3561bdd1243dSDimitry Andric : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 3562bdd1243dSDimitry Andric : AMDGPU::V_FMAAK_F16) 35630b57cec5SDimitry Andric : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 35640b57cec5SDimitry Andric if (pseudoToMCOpcode(NewOpc) == -1) 35650b57cec5SDimitry Andric return false; 35660b57cec5SDimitry Andric 35675f757f3fSDimitry Andric // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite 35685f757f3fSDimitry Andric // would also require restricting their register classes. For now 35695f757f3fSDimitry Andric // just bail out. 35705f757f3fSDimitry Andric if (NewOpc == AMDGPU::V_FMAAK_F16_t16) 35715f757f3fSDimitry Andric return false; 35720b57cec5SDimitry Andric 35730b57cec5SDimitry Andric // FIXME: This would be a lot easier if we could return a new instruction 35740b57cec5SDimitry Andric // instead of having to modify in place. 35750b57cec5SDimitry Andric 3576bdd1243dSDimitry Andric if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3577bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || 35780b57cec5SDimitry Andric Opc == AMDGPU::V_FMAC_F16_e64) 35790b57cec5SDimitry Andric UseMI.untieRegOperand( 35800b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 35810b57cec5SDimitry Andric 35820b57cec5SDimitry Andric // ChangingToImmediate adds Src2 back to the instruction. 35835f757f3fSDimitry Andric Src2->ChangeToImmediate(getImmFor(*Src2)); 35840b57cec5SDimitry Andric 35850b57cec5SDimitry Andric // These come before src2. 35860b57cec5SDimitry Andric removeModOperands(UseMI); 35870b57cec5SDimitry Andric UseMI.setDesc(get(NewOpc)); 35880b57cec5SDimitry Andric // It might happen that UseMI was commuted 35890b57cec5SDimitry Andric // and we now have SGPR as SRC1. 
If so 2 inlined 35900b57cec5SDimitry Andric // constant and SGPR are illegal. 35910b57cec5SDimitry Andric legalizeOperands(UseMI); 35920b57cec5SDimitry Andric 359381ad6265SDimitry Andric bool DeleteDef = MRI->use_nodbg_empty(Reg); 35940b57cec5SDimitry Andric if (DeleteDef) 35950b57cec5SDimitry Andric DefMI.eraseFromParent(); 35960b57cec5SDimitry Andric 35970b57cec5SDimitry Andric return true; 35980b57cec5SDimitry Andric } 35990b57cec5SDimitry Andric } 36000b57cec5SDimitry Andric 36010b57cec5SDimitry Andric return false; 36020b57cec5SDimitry Andric } 36030b57cec5SDimitry Andric 36045ffd83dbSDimitry Andric static bool 36055ffd83dbSDimitry Andric memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, 36065ffd83dbSDimitry Andric ArrayRef<const MachineOperand *> BaseOps2) { 36075ffd83dbSDimitry Andric if (BaseOps1.size() != BaseOps2.size()) 36085ffd83dbSDimitry Andric return false; 36095ffd83dbSDimitry Andric for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { 36105ffd83dbSDimitry Andric if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) 36115ffd83dbSDimitry Andric return false; 36125ffd83dbSDimitry Andric } 36135ffd83dbSDimitry Andric return true; 36145ffd83dbSDimitry Andric } 36155ffd83dbSDimitry Andric 36160b57cec5SDimitry Andric static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 36170b57cec5SDimitry Andric int WidthB, int OffsetB) { 36180b57cec5SDimitry Andric int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 36190b57cec5SDimitry Andric int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 36200b57cec5SDimitry Andric int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 36210b57cec5SDimitry Andric return LowOffset + LowWidth <= HighOffset; 36220b57cec5SDimitry Andric } 36230b57cec5SDimitry Andric 36240b57cec5SDimitry Andric bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 36250b57cec5SDimitry Andric const MachineInstr &MIb) const { 36265ffd83dbSDimitry Andric SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 36270b57cec5SDimitry Andric int64_t Offset0, Offset1; 36285ffd83dbSDimitry Andric unsigned Dummy0, Dummy1; 36295ffd83dbSDimitry Andric bool Offset0IsScalable, Offset1IsScalable; 36305ffd83dbSDimitry Andric if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, 36315ffd83dbSDimitry Andric Dummy0, &RI) || 36325ffd83dbSDimitry Andric !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, 36335ffd83dbSDimitry Andric Dummy1, &RI)) 36345ffd83dbSDimitry Andric return false; 36350b57cec5SDimitry Andric 36365ffd83dbSDimitry Andric if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 36370b57cec5SDimitry Andric return false; 36380b57cec5SDimitry Andric 36390b57cec5SDimitry Andric if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 36400b57cec5SDimitry Andric // FIXME: Handle ds_read2 / ds_write2. 
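// Instructions such as ds_read2_b32 carry two memory operands with
// independent offsets, so the single-width overlap check below would not be
// sound for them; conservatively report a possible overlap instead.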
36410b57cec5SDimitry Andric return false; 36420b57cec5SDimitry Andric } 36435ffd83dbSDimitry Andric unsigned Width0 = MIa.memoperands().front()->getSize(); 36445ffd83dbSDimitry Andric unsigned Width1 = MIb.memoperands().front()->getSize(); 36455ffd83dbSDimitry Andric return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 36460b57cec5SDimitry Andric } 36470b57cec5SDimitry Andric 36480b57cec5SDimitry Andric bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 36498bcb0991SDimitry Andric const MachineInstr &MIb) const { 3650480093f4SDimitry Andric assert(MIa.mayLoadOrStore() && 36510b57cec5SDimitry Andric "MIa must load from or modify a memory location"); 3652480093f4SDimitry Andric assert(MIb.mayLoadOrStore() && 36530b57cec5SDimitry Andric "MIb must load from or modify a memory location"); 36540b57cec5SDimitry Andric 36550b57cec5SDimitry Andric if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 36560b57cec5SDimitry Andric return false; 36570b57cec5SDimitry Andric 36580b57cec5SDimitry Andric // XXX - Can we relax this between address spaces? 36590b57cec5SDimitry Andric if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 36600b57cec5SDimitry Andric return false; 36610b57cec5SDimitry Andric 3662cb14a3feSDimitry Andric if (isLDSDMA(MIa) || isLDSDMA(MIb)) 3663cb14a3feSDimitry Andric return false; 3664cb14a3feSDimitry Andric 36650b57cec5SDimitry Andric // TODO: Should we check the address space from the MachineMemOperand? That 36660b57cec5SDimitry Andric // would allow us to distinguish objects we know don't alias based on the 36670b57cec5SDimitry Andric // underlying address space, even if it was lowered to a different one, 36680b57cec5SDimitry Andric // e.g. private accesses lowered to use MUBUF instructions on a scratch 36690b57cec5SDimitry Andric // buffer. 
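// The checks below pair up access kinds that can never alias: a DS (LDS)
// access is disjoint from anything that cannot touch LDS, scratch and
// global FLAT accesses are disjoint from each other, and only accesses of a
// matching kind fall through to the base-and-offset comparison. For
// example, two DS instructions off the same base with offsets 0 and 16 and
// widths of 16 bytes each are reported as disjoint (illustrative values).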
36700b57cec5SDimitry Andric if (isDS(MIa)) { 36710b57cec5SDimitry Andric if (isDS(MIb)) 36720b57cec5SDimitry Andric return checkInstOffsetsDoNotOverlap(MIa, MIb); 36730b57cec5SDimitry Andric 36740b57cec5SDimitry Andric return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 36750b57cec5SDimitry Andric } 36760b57cec5SDimitry Andric 36770b57cec5SDimitry Andric if (isMUBUF(MIa) || isMTBUF(MIa)) { 36780b57cec5SDimitry Andric if (isMUBUF(MIb) || isMTBUF(MIb)) 36790b57cec5SDimitry Andric return checkInstOffsetsDoNotOverlap(MIa, MIb); 36800b57cec5SDimitry Andric 36815f757f3fSDimitry Andric if (isFLAT(MIb)) 36825f757f3fSDimitry Andric return isFLATScratch(MIb); 36835f757f3fSDimitry Andric 36845f757f3fSDimitry Andric return !isSMRD(MIb); 36850b57cec5SDimitry Andric } 36860b57cec5SDimitry Andric 36870b57cec5SDimitry Andric if (isSMRD(MIa)) { 36880b57cec5SDimitry Andric if (isSMRD(MIb)) 36890b57cec5SDimitry Andric return checkInstOffsetsDoNotOverlap(MIa, MIb); 36900b57cec5SDimitry Andric 36915f757f3fSDimitry Andric if (isFLAT(MIb)) 36925f757f3fSDimitry Andric return isFLATScratch(MIb); 36935f757f3fSDimitry Andric 36945f757f3fSDimitry Andric return !isMUBUF(MIb) && !isMTBUF(MIb); 36950b57cec5SDimitry Andric } 36960b57cec5SDimitry Andric 36970b57cec5SDimitry Andric if (isFLAT(MIa)) { 36985f757f3fSDimitry Andric if (isFLAT(MIb)) { 36995f757f3fSDimitry Andric if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) || 37005f757f3fSDimitry Andric (isFLATGlobal(MIa) && isFLATScratch(MIb))) 37015f757f3fSDimitry Andric return true; 37025f757f3fSDimitry Andric 37030b57cec5SDimitry Andric return checkInstOffsetsDoNotOverlap(MIa, MIb); 37045f757f3fSDimitry Andric } 37050b57cec5SDimitry Andric 37060b57cec5SDimitry Andric return false; 37070b57cec5SDimitry Andric } 37080b57cec5SDimitry Andric 37090b57cec5SDimitry Andric return false; 37100b57cec5SDimitry Andric } 37110b57cec5SDimitry Andric 3712349cc55cSDimitry Andric static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, 37130eae32dcSDimitry Andric int64_t &Imm, MachineInstr **DefMI = nullptr) { 3714349cc55cSDimitry Andric if (Reg.isPhysical()) 3715349cc55cSDimitry Andric return false; 3716349cc55cSDimitry Andric auto *Def = MRI.getUniqueVRegDef(Reg); 3717349cc55cSDimitry Andric if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { 3718349cc55cSDimitry Andric Imm = Def->getOperand(1).getImm(); 37190eae32dcSDimitry Andric if (DefMI) 37200eae32dcSDimitry Andric *DefMI = Def; 3721349cc55cSDimitry Andric return true; 3722349cc55cSDimitry Andric } 3723349cc55cSDimitry Andric return false; 3724349cc55cSDimitry Andric } 3725349cc55cSDimitry Andric 37260eae32dcSDimitry Andric static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, 37270eae32dcSDimitry Andric MachineInstr **DefMI = nullptr) { 37280b57cec5SDimitry Andric if (!MO->isReg()) 37290b57cec5SDimitry Andric return false; 37300b57cec5SDimitry Andric const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 37310b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF->getRegInfo(); 37320eae32dcSDimitry Andric return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); 37330b57cec5SDimitry Andric } 37340b57cec5SDimitry Andric 3735e8d8bef9SDimitry Andric static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, 3736e8d8bef9SDimitry Andric MachineInstr &NewMI) { 3737e8d8bef9SDimitry Andric if (LV) { 3738e8d8bef9SDimitry Andric unsigned NumOps = MI.getNumOperands(); 3739e8d8bef9SDimitry Andric for (unsigned I = 1; I < NumOps; ++I) { 3740e8d8bef9SDimitry Andric 
MachineOperand &Op = MI.getOperand(I); 3741e8d8bef9SDimitry Andric if (Op.isReg() && Op.isKill()) 3742e8d8bef9SDimitry Andric LV->replaceKillInstruction(Op.getReg(), MI, NewMI); 3743e8d8bef9SDimitry Andric } 3744e8d8bef9SDimitry Andric } 3745e8d8bef9SDimitry Andric } 3746e8d8bef9SDimitry Andric 3747349cc55cSDimitry Andric MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, 3748349cc55cSDimitry Andric LiveVariables *LV, 3749349cc55cSDimitry Andric LiveIntervals *LIS) const { 375004eeddc0SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 375181ad6265SDimitry Andric unsigned Opc = MI.getOpcode(); 375204eeddc0SDimitry Andric 375381ad6265SDimitry Andric // Handle MFMA. 375481ad6265SDimitry Andric int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); 375504eeddc0SDimitry Andric if (NewMFMAOpc != -1) { 375681ad6265SDimitry Andric MachineInstrBuilder MIB = 375781ad6265SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); 375804eeddc0SDimitry Andric for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) 375904eeddc0SDimitry Andric MIB.add(MI.getOperand(I)); 376004eeddc0SDimitry Andric updateLiveVariables(LV, MI, *MIB); 376104eeddc0SDimitry Andric if (LIS) 376204eeddc0SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 376304eeddc0SDimitry Andric return MIB; 376404eeddc0SDimitry Andric } 376504eeddc0SDimitry Andric 376681ad6265SDimitry Andric if (SIInstrInfo::isWMMA(MI)) { 376781ad6265SDimitry Andric unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); 376881ad6265SDimitry Andric MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 376981ad6265SDimitry Andric .setMIFlags(MI.getFlags()); 377081ad6265SDimitry Andric for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) 377181ad6265SDimitry Andric MIB->addOperand(MI.getOperand(I)); 377281ad6265SDimitry Andric 377381ad6265SDimitry Andric updateLiveVariables(LV, MI, *MIB); 377481ad6265SDimitry Andric if (LIS) 377581ad6265SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 377681ad6265SDimitry Andric 377781ad6265SDimitry Andric return MIB; 377881ad6265SDimitry Andric } 377981ad6265SDimitry Andric 3780bdd1243dSDimitry Andric assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && 3781bdd1243dSDimitry Andric "V_FMAC_F16_t16_e32 is not supported and not expected to be present " 3782bdd1243dSDimitry Andric "pre-RA"); 3783bdd1243dSDimitry Andric 378481ad6265SDimitry Andric // Handle MAC/FMAC. 
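// The VOP2 V_MAC/V_FMAC forms tie src2 to the destination; when the
// two-address pass would rather keep them separate, the code below rewrites
// them into the untied VOP3 V_MAD/V_FMA forms, or into
// V_MADAK/V_MADMK/V_FMAAK/V_FMAMK when an operand is a foldable immediate.
// E.g. (illustrative MIR, operand details simplified):
//   %d = V_MAC_F32_e32 %a, %b, %d
// becomes
//   %e = V_MAD_F32_e64 0, %a, 0, %b, 0, %d, 0, 0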
378581ad6265SDimitry Andric bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || 3786bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3787bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F16_t16_e64; 378881ad6265SDimitry Andric bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 378981ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 379081ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || 379181ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3792bdd1243dSDimitry Andric Opc == AMDGPU::V_FMAC_F16_t16_e64 || 379381ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 379481ad6265SDimitry Andric bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 379581ad6265SDimitry Andric bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || 379681ad6265SDimitry Andric Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || 379781ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 379881ad6265SDimitry Andric Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; 379981ad6265SDimitry Andric bool Src0Literal = false; 380081ad6265SDimitry Andric 380181ad6265SDimitry Andric switch (Opc) { 380281ad6265SDimitry Andric default: 380381ad6265SDimitry Andric return nullptr; 380481ad6265SDimitry Andric case AMDGPU::V_MAC_F16_e64: 380581ad6265SDimitry Andric case AMDGPU::V_FMAC_F16_e64: 3806bdd1243dSDimitry Andric case AMDGPU::V_FMAC_F16_t16_e64: 380781ad6265SDimitry Andric case AMDGPU::V_MAC_F32_e64: 380881ad6265SDimitry Andric case AMDGPU::V_MAC_LEGACY_F32_e64: 380981ad6265SDimitry Andric case AMDGPU::V_FMAC_F32_e64: 381081ad6265SDimitry Andric case AMDGPU::V_FMAC_LEGACY_F32_e64: 381181ad6265SDimitry Andric case AMDGPU::V_FMAC_F64_e64: 381281ad6265SDimitry Andric break; 381381ad6265SDimitry Andric case AMDGPU::V_MAC_F16_e32: 381481ad6265SDimitry Andric case AMDGPU::V_FMAC_F16_e32: 381581ad6265SDimitry Andric case AMDGPU::V_MAC_F32_e32: 381681ad6265SDimitry Andric case AMDGPU::V_MAC_LEGACY_F32_e32: 381781ad6265SDimitry Andric case AMDGPU::V_FMAC_F32_e32: 381881ad6265SDimitry Andric case AMDGPU::V_FMAC_LEGACY_F32_e32: 381981ad6265SDimitry Andric case AMDGPU::V_FMAC_F64_e32: { 382081ad6265SDimitry Andric int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 382181ad6265SDimitry Andric AMDGPU::OpName::src0); 382281ad6265SDimitry Andric const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 382381ad6265SDimitry Andric if (!Src0->isReg() && !Src0->isImm()) 382481ad6265SDimitry Andric return nullptr; 382581ad6265SDimitry Andric 382681ad6265SDimitry Andric if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 382781ad6265SDimitry Andric Src0Literal = true; 382881ad6265SDimitry Andric 382981ad6265SDimitry Andric break; 383081ad6265SDimitry Andric } 383181ad6265SDimitry Andric } 383281ad6265SDimitry Andric 383381ad6265SDimitry Andric MachineInstrBuilder MIB; 38340b57cec5SDimitry Andric const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 38350b57cec5SDimitry Andric const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 38360b57cec5SDimitry Andric const MachineOperand *Src0Mods = 38370b57cec5SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 38380b57cec5SDimitry Andric const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 38390b57cec5SDimitry Andric const MachineOperand *Src1Mods = 38400b57cec5SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 
38410b57cec5SDimitry Andric const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 384281ad6265SDimitry Andric const MachineOperand *Src2Mods = 384381ad6265SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); 38440b57cec5SDimitry Andric const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 38450b57cec5SDimitry Andric const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 3846bdd1243dSDimitry Andric const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); 38470b57cec5SDimitry Andric 384881ad6265SDimitry Andric if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && 384981ad6265SDimitry Andric !IsLegacy && 38500b57cec5SDimitry Andric // If we have an SGPR input, we will violate the constant bus restriction. 3851e8d8bef9SDimitry Andric (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || 3852349cc55cSDimitry Andric !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { 38530eae32dcSDimitry Andric MachineInstr *DefMI; 3854753f127fSDimitry Andric const auto killDef = [&]() -> void { 38550eae32dcSDimitry Andric const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 38560eae32dcSDimitry Andric // The only user is the instruction which will be killed. 3857753f127fSDimitry Andric Register DefReg = DefMI->getOperand(0).getReg(); 3858753f127fSDimitry Andric if (!MRI.hasOneNonDBGUse(DefReg)) 38590eae32dcSDimitry Andric return; 38600eae32dcSDimitry Andric // We cannot just remove the DefMI here, calling pass will crash. 38610eae32dcSDimitry Andric DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); 38620eae32dcSDimitry Andric for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) 386381ad6265SDimitry Andric DefMI->removeOperand(I); 3864753f127fSDimitry Andric if (LV) 3865753f127fSDimitry Andric LV->getVarInfo(DefReg).AliveBlocks.clear(); 38660eae32dcSDimitry Andric }; 38670eae32dcSDimitry Andric 3868349cc55cSDimitry Andric int64_t Imm; 386981ad6265SDimitry Andric if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { 38700b57cec5SDimitry Andric unsigned NewOpc = 3871bdd1243dSDimitry Andric IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 3872bdd1243dSDimitry Andric : AMDGPU::V_FMAAK_F16) 3873bdd1243dSDimitry Andric : AMDGPU::V_FMAAK_F32) 38740b57cec5SDimitry Andric : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 3875e8d8bef9SDimitry Andric if (pseudoToMCOpcode(NewOpc) != -1) { 3876349cc55cSDimitry Andric MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 38770b57cec5SDimitry Andric .add(*Dst) 38780b57cec5SDimitry Andric .add(*Src0) 38790b57cec5SDimitry Andric .add(*Src1) 38800b57cec5SDimitry Andric .addImm(Imm); 3881e8d8bef9SDimitry Andric updateLiveVariables(LV, MI, *MIB); 3882349cc55cSDimitry Andric if (LIS) 3883349cc55cSDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 38840eae32dcSDimitry Andric killDef(); 3885e8d8bef9SDimitry Andric return MIB; 38860b57cec5SDimitry Andric } 3887e8d8bef9SDimitry Andric } 3888bdd1243dSDimitry Andric unsigned NewOpc = 3889bdd1243dSDimitry Andric IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 3890bdd1243dSDimitry Andric : AMDGPU::V_FMAMK_F16) 3891bdd1243dSDimitry Andric : AMDGPU::V_FMAMK_F32) 38920b57cec5SDimitry Andric : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 389381ad6265SDimitry Andric if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { 3894e8d8bef9SDimitry Andric if (pseudoToMCOpcode(NewOpc) != -1) { 3895349cc55cSDimitry Andric MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 38960b57cec5SDimitry Andric .add(*Dst) 38970b57cec5SDimitry Andric .add(*Src0) 38980b57cec5SDimitry Andric .addImm(Imm) 38990b57cec5SDimitry Andric .add(*Src2); 3900e8d8bef9SDimitry Andric updateLiveVariables(LV, MI, *MIB); 3901349cc55cSDimitry Andric if (LIS) 3902349cc55cSDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 39030eae32dcSDimitry Andric killDef(); 3904e8d8bef9SDimitry Andric return MIB; 3905e8d8bef9SDimitry Andric } 39060b57cec5SDimitry Andric } 390781ad6265SDimitry Andric if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { 390881ad6265SDimitry Andric if (Src0Literal) { 390981ad6265SDimitry Andric Imm = Src0->getImm(); 391081ad6265SDimitry Andric DefMI = nullptr; 391181ad6265SDimitry Andric } 39120b57cec5SDimitry Andric if (pseudoToMCOpcode(NewOpc) != -1 && 3913e8d8bef9SDimitry Andric isOperandLegal( 3914e8d8bef9SDimitry Andric MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), 3915e8d8bef9SDimitry Andric Src1)) { 3916349cc55cSDimitry Andric MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 39170b57cec5SDimitry Andric .add(*Dst) 39180b57cec5SDimitry Andric .add(*Src1) 39190b57cec5SDimitry Andric .addImm(Imm) 39200b57cec5SDimitry Andric .add(*Src2); 3921e8d8bef9SDimitry Andric updateLiveVariables(LV, MI, *MIB); 3922349cc55cSDimitry Andric if (LIS) 3923349cc55cSDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 392481ad6265SDimitry Andric if (DefMI) 39250eae32dcSDimitry Andric killDef(); 3926e8d8bef9SDimitry Andric return MIB; 3927e8d8bef9SDimitry Andric } 39280b57cec5SDimitry Andric } 39290b57cec5SDimitry Andric } 39300b57cec5SDimitry Andric 393181ad6265SDimitry Andric // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma 3932bdd1243dSDimitry Andric // if VOP3 does not allow a literal operand. 3933bdd1243dSDimitry Andric if (Src0Literal && !ST.hasVOP3Literal()) 393481ad6265SDimitry Andric return nullptr; 393581ad6265SDimitry Andric 393681ad6265SDimitry Andric unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 3937fe6060f1SDimitry Andric : IsF64 ? AMDGPU::V_FMA_F64_e64 393881ad6265SDimitry Andric : IsLegacy 393981ad6265SDimitry Andric ? AMDGPU::V_FMA_LEGACY_F32_e64 394081ad6265SDimitry Andric : AMDGPU::V_FMA_F32_e64 394181ad6265SDimitry Andric : IsF16 ? AMDGPU::V_MAD_F16_e64 394281ad6265SDimitry Andric : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 394381ad6265SDimitry Andric : AMDGPU::V_MAD_F32_e64; 39440b57cec5SDimitry Andric if (pseudoToMCOpcode(NewOpc) == -1) 39450b57cec5SDimitry Andric return nullptr; 39460b57cec5SDimitry Andric 3947349cc55cSDimitry Andric MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 39480b57cec5SDimitry Andric .add(*Dst) 39490b57cec5SDimitry Andric .addImm(Src0Mods ? Src0Mods->getImm() : 0) 39500b57cec5SDimitry Andric .add(*Src0) 39510b57cec5SDimitry Andric .addImm(Src1Mods ? Src1Mods->getImm() : 0) 39520b57cec5SDimitry Andric .add(*Src1) 395381ad6265SDimitry Andric .addImm(Src2Mods ? Src2Mods->getImm() : 0) 39540b57cec5SDimitry Andric .add(*Src2) 39550b57cec5SDimitry Andric .addImm(Clamp ? Clamp->getImm() : 0) 39560b57cec5SDimitry Andric .addImm(Omod ? Omod->getImm() : 0); 3957bdd1243dSDimitry Andric if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) 3958bdd1243dSDimitry Andric MIB.addImm(OpSel ? 
OpSel->getImm() : 0); 3959e8d8bef9SDimitry Andric updateLiveVariables(LV, MI, *MIB); 3960349cc55cSDimitry Andric if (LIS) 3961349cc55cSDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3962e8d8bef9SDimitry Andric return MIB; 39630b57cec5SDimitry Andric } 39640b57cec5SDimitry Andric 39650b57cec5SDimitry Andric // It's not generally safe to move VALU instructions across these since it will 39660b57cec5SDimitry Andric // start using the register as a base index rather than directly. 39670b57cec5SDimitry Andric // XXX - Why isn't hasSideEffects sufficient for these? 39680b57cec5SDimitry Andric static bool changesVGPRIndexingMode(const MachineInstr &MI) { 39690b57cec5SDimitry Andric switch (MI.getOpcode()) { 39700b57cec5SDimitry Andric case AMDGPU::S_SET_GPR_IDX_ON: 39710b57cec5SDimitry Andric case AMDGPU::S_SET_GPR_IDX_MODE: 39720b57cec5SDimitry Andric case AMDGPU::S_SET_GPR_IDX_OFF: 39730b57cec5SDimitry Andric return true; 39740b57cec5SDimitry Andric default: 39750b57cec5SDimitry Andric return false; 39760b57cec5SDimitry Andric } 39770b57cec5SDimitry Andric } 39780b57cec5SDimitry Andric 39790b57cec5SDimitry Andric bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 39800b57cec5SDimitry Andric const MachineBasicBlock *MBB, 39810b57cec5SDimitry Andric const MachineFunction &MF) const { 39825ffd83dbSDimitry Andric // Skipping the check for SP writes in the base implementation. The reason it 39835ffd83dbSDimitry Andric // was added was apparently due to compile time concerns. 39845ffd83dbSDimitry Andric // 39855ffd83dbSDimitry Andric // TODO: Do we really want this barrier? It triggers unnecessary hazard nops 39865ffd83dbSDimitry Andric // but is probably avoidable. 39875ffd83dbSDimitry Andric 39885ffd83dbSDimitry Andric // Copied from base implementation. 39895ffd83dbSDimitry Andric // Terminators and labels can't be scheduled around. 39905ffd83dbSDimitry Andric if (MI.isTerminator() || MI.isPosition()) 39915ffd83dbSDimitry Andric return true; 39925ffd83dbSDimitry Andric 39935ffd83dbSDimitry Andric // INLINEASM_BR can jump to another block 39945ffd83dbSDimitry Andric if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) 39955ffd83dbSDimitry Andric return true; 39960b57cec5SDimitry Andric 399781ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) 399881ad6265SDimitry Andric return true; 399981ad6265SDimitry Andric 40000b57cec5SDimitry Andric // Target-independent instructions do not have an implicit-use of EXEC, even 40010b57cec5SDimitry Andric // when they operate on VGPRs. Treating EXEC modifications as scheduling 40020b57cec5SDimitry Andric // boundaries prevents incorrect movements of such instructions. 
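// Moving a VALU instruction across S_SET_GPR_IDX_ON, for instance, would
// turn a direct VGPR access into an M0-indexed one, so such mode changes are
// treated as boundaries here as well (illustrative example).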
40035ffd83dbSDimitry Andric return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 40040b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 40050b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::S_SETREG_B32 || 4006bdd1243dSDimitry Andric MI.getOpcode() == AMDGPU::S_SETPRIO || 40070b57cec5SDimitry Andric changesVGPRIndexingMode(MI); 40080b57cec5SDimitry Andric } 40090b57cec5SDimitry Andric 40100b57cec5SDimitry Andric bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 40115f757f3fSDimitry Andric return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode); 40120b57cec5SDimitry Andric } 40130b57cec5SDimitry Andric 40145ffd83dbSDimitry Andric bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { 40155ffd83dbSDimitry Andric // Skip the full operand and register alias search modifiesRegister 40165ffd83dbSDimitry Andric // does. There's only a handful of instructions that touch this, it's only an 40175ffd83dbSDimitry Andric // implicit def, and doesn't alias any other registers. 4018bdd1243dSDimitry Andric return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE); 40195ffd83dbSDimitry Andric } 40205ffd83dbSDimitry Andric 40210b57cec5SDimitry Andric bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 40220b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode(); 40230b57cec5SDimitry Andric 40240b57cec5SDimitry Andric if (MI.mayStore() && isSMRD(MI)) 40250b57cec5SDimitry Andric return true; // scalar store or atomic 40260b57cec5SDimitry Andric 40270b57cec5SDimitry Andric // This will terminate the function when other lanes may need to continue. 40280b57cec5SDimitry Andric if (MI.isReturn()) 40290b57cec5SDimitry Andric return true; 40300b57cec5SDimitry Andric 40310b57cec5SDimitry Andric // These instructions cause shader I/O that may cause hardware lockups 40320b57cec5SDimitry Andric // when executed with an empty EXEC mask. 40330b57cec5SDimitry Andric // 40340b57cec5SDimitry Andric // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 40350b57cec5SDimitry Andric // EXEC = 0, but checking for that case here seems not worth it 40360b57cec5SDimitry Andric // given the typical code patterns. 40370b57cec5SDimitry Andric if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 4038e8d8bef9SDimitry Andric isEXP(Opcode) || 40390b57cec5SDimitry Andric Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 40400b57cec5SDimitry Andric Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 40410b57cec5SDimitry Andric return true; 40420b57cec5SDimitry Andric 40430b57cec5SDimitry Andric if (MI.isCall() || MI.isInlineAsm()) 40440b57cec5SDimitry Andric return true; // conservative assumption 40450b57cec5SDimitry Andric 40465ffd83dbSDimitry Andric // A mode change is a scalar operation that influences vector instructions. 40475ffd83dbSDimitry Andric if (modifiesModeRegister(MI)) 40485ffd83dbSDimitry Andric return true; 40495ffd83dbSDimitry Andric 40500b57cec5SDimitry Andric // These are like SALU instructions in terms of effects, so it's questionable 40510b57cec5SDimitry Andric // whether we should return true for those. 40520b57cec5SDimitry Andric // 40530b57cec5SDimitry Andric // However, executing them with EXEC = 0 causes them to operate on undefined 40540b57cec5SDimitry Andric // data, which we avoid by returning true here. 
4055e8d8bef9SDimitry Andric if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || 40565f757f3fSDimitry Andric Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 || 40575f757f3fSDimitry Andric Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR || 40585f757f3fSDimitry Andric Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR) 40590b57cec5SDimitry Andric return true; 40600b57cec5SDimitry Andric 40610b57cec5SDimitry Andric return false; 40620b57cec5SDimitry Andric } 40630b57cec5SDimitry Andric 40640b57cec5SDimitry Andric bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 40650b57cec5SDimitry Andric const MachineInstr &MI) const { 40660b57cec5SDimitry Andric if (MI.isMetaInstruction()) 40670b57cec5SDimitry Andric return false; 40680b57cec5SDimitry Andric 40690b57cec5SDimitry Andric // This won't read exec if this is an SGPR->SGPR copy. 40700b57cec5SDimitry Andric if (MI.isCopyLike()) { 40710b57cec5SDimitry Andric if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 40720b57cec5SDimitry Andric return true; 40730b57cec5SDimitry Andric 40740b57cec5SDimitry Andric // Make sure this isn't copying exec as a normal operand 40750b57cec5SDimitry Andric return MI.readsRegister(AMDGPU::EXEC, &RI); 40760b57cec5SDimitry Andric } 40770b57cec5SDimitry Andric 40780b57cec5SDimitry Andric // Make a conservative assumption about the callee. 40790b57cec5SDimitry Andric if (MI.isCall()) 40800b57cec5SDimitry Andric return true; 40810b57cec5SDimitry Andric 40820b57cec5SDimitry Andric // Be conservative with any unhandled generic opcodes. 40830b57cec5SDimitry Andric if (!isTargetSpecificOpcode(MI.getOpcode())) 40840b57cec5SDimitry Andric return true; 40850b57cec5SDimitry Andric 40860b57cec5SDimitry Andric return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 40870b57cec5SDimitry Andric } 40880b57cec5SDimitry Andric 40890b57cec5SDimitry Andric bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 40900b57cec5SDimitry Andric switch (Imm.getBitWidth()) { 40910b57cec5SDimitry Andric case 1: // This likely will be a condition code mask. 40920b57cec5SDimitry Andric return true; 40930b57cec5SDimitry Andric 40940b57cec5SDimitry Andric case 32: 40950b57cec5SDimitry Andric return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 40960b57cec5SDimitry Andric ST.hasInv2PiInlineImm()); 40970b57cec5SDimitry Andric case 64: 40980b57cec5SDimitry Andric return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 40990b57cec5SDimitry Andric ST.hasInv2PiInlineImm()); 41000b57cec5SDimitry Andric case 16: 41010b57cec5SDimitry Andric return ST.has16BitInsts() && 41020b57cec5SDimitry Andric AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 41030b57cec5SDimitry Andric ST.hasInv2PiInlineImm()); 41040b57cec5SDimitry Andric default: 41050b57cec5SDimitry Andric llvm_unreachable("invalid bitwidth"); 41060b57cec5SDimitry Andric } 41070b57cec5SDimitry Andric } 41080b57cec5SDimitry Andric 41090b57cec5SDimitry Andric bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 41100b57cec5SDimitry Andric uint8_t OperandType) const { 4111bdd1243dSDimitry Andric assert(!MO.isReg() && "isInlineConstant called on register operand!"); 41125f757f3fSDimitry Andric if (!MO.isImm()) 41130b57cec5SDimitry Andric return false; 41140b57cec5SDimitry Andric 41150b57cec5SDimitry Andric // MachineOperand provides no way to tell the true operand size, since it only 41160b57cec5SDimitry Andric // records a 64-bit value. 
We need to know the size to determine if a 32-bit
41170b57cec5SDimitry Andric // floating point immediate bit pattern is legal for an integer immediate. It
41180b57cec5SDimitry Andric // would be for any 32-bit integer operand, but would not be for a 64-bit one.
41190b57cec5SDimitry Andric 
41200b57cec5SDimitry Andric int64_t Imm = MO.getImm();
41210b57cec5SDimitry Andric switch (OperandType) {
41220b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_INT32:
41230b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP32:
4124349cc55cSDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
41250b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT32:
41260b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4127fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_IMM_V2FP32:
4128fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
4129fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_IMM_V2INT32:
4130fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
41310b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
41325f757f3fSDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
41335f757f3fSDimitry Andric case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
41340b57cec5SDimitry Andric int32_t Trunc = static_cast<int32_t>(Imm);
41350b57cec5SDimitry Andric return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
41360b57cec5SDimitry Andric }
41370b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_INT64:
41380b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP64:
41390b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT64:
41400b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4141fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
41420b57cec5SDimitry Andric return AMDGPU::isInlinableLiteral64(MO.getImm(),
41430b57cec5SDimitry Andric ST.hasInv2PiInlineImm());
41440b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_INT16:
41450b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT16:
41460b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
41475ffd83dbSDimitry Andric // We would expect inline immediates to not be concerned with an integer/fp
41485ffd83dbSDimitry Andric // distinction. However, in the case of 16-bit integer operations, the
41495ffd83dbSDimitry Andric // "floating point" values do not appear to work. The hardware seems to read
41505ffd83dbSDimitry Andric // the low 16 bits of 32-bit immediates, which happens to always work for
41515ffd83dbSDimitry Andric // the integer values.
41525ffd83dbSDimitry Andric //
41535ffd83dbSDimitry Andric // See llvm bugzilla 46302.
41545ffd83dbSDimitry Andric //
41555ffd83dbSDimitry Andric // TODO: Theoretically we could use op-sel to use the high bits of the
41565ffd83dbSDimitry Andric // 32-bit FP values.
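// For example, for an INT16 operand an immediate such as 64 or -16 falls in
// the plain integer inline range accepted below, while the f16 bit pattern
// for 0.5 (0x3800) does not, even though the FP16 operand types later in
// this switch treat 0.5 as an inline constant.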
41575ffd83dbSDimitry Andric return AMDGPU::isInlinableIntLiteral(Imm); 41585ffd83dbSDimitry Andric case AMDGPU::OPERAND_REG_IMM_V2INT16: 41595ffd83dbSDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 41605ffd83dbSDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 4161*1db9f3b2SDimitry Andric return AMDGPU::isInlinableLiteralV2I16(Imm); 4162*1db9f3b2SDimitry Andric case AMDGPU::OPERAND_REG_IMM_V2FP16: 4163*1db9f3b2SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 4164*1db9f3b2SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: 4165*1db9f3b2SDimitry Andric return AMDGPU::isInlinableLiteralV2F16(Imm); 41665ffd83dbSDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP16: 4167349cc55cSDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: 41685ffd83dbSDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP16: 4169*1db9f3b2SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 41700b57cec5SDimitry Andric if (isInt<16>(Imm) || isUInt<16>(Imm)) { 41710b57cec5SDimitry Andric // A few special case instructions have 16-bit operands on subtargets 41720b57cec5SDimitry Andric // where 16-bit instructions are not legal. 41730b57cec5SDimitry Andric // TODO: Do the 32-bit immediates work? We shouldn't really need to handle 41740b57cec5SDimitry Andric // constants in these cases 41750b57cec5SDimitry Andric int16_t Trunc = static_cast<int16_t>(Imm); 41760b57cec5SDimitry Andric return ST.has16BitInsts() && 41770b57cec5SDimitry Andric AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 41780b57cec5SDimitry Andric } 41790b57cec5SDimitry Andric 41800b57cec5SDimitry Andric return false; 41810b57cec5SDimitry Andric } 4182349cc55cSDimitry Andric case AMDGPU::OPERAND_KIMM32: 4183349cc55cSDimitry Andric case AMDGPU::OPERAND_KIMM16: 4184349cc55cSDimitry Andric return false; 41855f757f3fSDimitry Andric case AMDGPU::OPERAND_INPUT_MODS: 41865f757f3fSDimitry Andric case MCOI::OPERAND_IMMEDIATE: 41875f757f3fSDimitry Andric // Always embedded in the instruction for free. 41885f757f3fSDimitry Andric return true; 41895f757f3fSDimitry Andric case MCOI::OPERAND_UNKNOWN: 41905f757f3fSDimitry Andric case MCOI::OPERAND_REGISTER: 41915f757f3fSDimitry Andric case MCOI::OPERAND_PCREL: 41925f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_0: 41935f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_1: 41945f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_2: 41955f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_3: 41965f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_4: 41975f757f3fSDimitry Andric case MCOI::OPERAND_GENERIC_5: 41985f757f3fSDimitry Andric // Just ignore anything else. 
41995f757f3fSDimitry Andric return true; 42000b57cec5SDimitry Andric default: 42015f757f3fSDimitry Andric llvm_unreachable("invalid operand type"); 42020b57cec5SDimitry Andric } 42030b57cec5SDimitry Andric } 42040b57cec5SDimitry Andric 42050b57cec5SDimitry Andric static bool compareMachineOp(const MachineOperand &Op0, 42060b57cec5SDimitry Andric const MachineOperand &Op1) { 42070b57cec5SDimitry Andric if (Op0.getType() != Op1.getType()) 42080b57cec5SDimitry Andric return false; 42090b57cec5SDimitry Andric 42100b57cec5SDimitry Andric switch (Op0.getType()) { 42110b57cec5SDimitry Andric case MachineOperand::MO_Register: 42120b57cec5SDimitry Andric return Op0.getReg() == Op1.getReg(); 42130b57cec5SDimitry Andric case MachineOperand::MO_Immediate: 42140b57cec5SDimitry Andric return Op0.getImm() == Op1.getImm(); 42150b57cec5SDimitry Andric default: 42160b57cec5SDimitry Andric llvm_unreachable("Didn't expect to be comparing these operand types"); 42170b57cec5SDimitry Andric } 42180b57cec5SDimitry Andric } 42190b57cec5SDimitry Andric 42200b57cec5SDimitry Andric bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 42210b57cec5SDimitry Andric const MachineOperand &MO) const { 42220b57cec5SDimitry Andric const MCInstrDesc &InstDesc = MI.getDesc(); 4223bdd1243dSDimitry Andric const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 42240b57cec5SDimitry Andric 42250b57cec5SDimitry Andric assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 42260b57cec5SDimitry Andric 42270b57cec5SDimitry Andric if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 42280b57cec5SDimitry Andric return true; 42290b57cec5SDimitry Andric 42300b57cec5SDimitry Andric if (OpInfo.RegClass < 0) 42310b57cec5SDimitry Andric return false; 42320b57cec5SDimitry Andric 42338bcb0991SDimitry Andric if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 42348bcb0991SDimitry Andric if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 42358bcb0991SDimitry Andric OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 42368bcb0991SDimitry Andric AMDGPU::OpName::src2)) 42378bcb0991SDimitry Andric return false; 42380b57cec5SDimitry Andric return RI.opCanUseInlineConstant(OpInfo.OperandType); 42398bcb0991SDimitry Andric } 42400b57cec5SDimitry Andric 42410b57cec5SDimitry Andric if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 42420b57cec5SDimitry Andric return false; 42430b57cec5SDimitry Andric 42440b57cec5SDimitry Andric if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 42450b57cec5SDimitry Andric return true; 42460b57cec5SDimitry Andric 42470b57cec5SDimitry Andric return ST.hasVOP3Literal(); 42480b57cec5SDimitry Andric } 42490b57cec5SDimitry Andric 42500b57cec5SDimitry Andric bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 4251fe6060f1SDimitry Andric // GFX90A does not have V_MUL_LEGACY_F32_e32. 4252fe6060f1SDimitry Andric if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) 4253fe6060f1SDimitry Andric return false; 4254fe6060f1SDimitry Andric 42550b57cec5SDimitry Andric int Op32 = AMDGPU::getVOPe32(Opcode); 42560b57cec5SDimitry Andric if (Op32 == -1) 42570b57cec5SDimitry Andric return false; 42580b57cec5SDimitry Andric 42590b57cec5SDimitry Andric return pseudoToMCOpcode(Op32) != -1; 42600b57cec5SDimitry Andric } 42610b57cec5SDimitry Andric 42620b57cec5SDimitry Andric bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 42630b57cec5SDimitry Andric // The src0_modifier operand is present on all instructions 42640b57cec5SDimitry Andric // that have modifiers. 
42650b57cec5SDimitry Andric 4266bdd1243dSDimitry Andric return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers); 42670b57cec5SDimitry Andric } 42680b57cec5SDimitry Andric 42690b57cec5SDimitry Andric bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 42700b57cec5SDimitry Andric unsigned OpName) const { 42710b57cec5SDimitry Andric const MachineOperand *Mods = getNamedOperand(MI, OpName); 42720b57cec5SDimitry Andric return Mods && Mods->getImm(); 42730b57cec5SDimitry Andric } 42740b57cec5SDimitry Andric 42750b57cec5SDimitry Andric bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 427681ad6265SDimitry Andric return any_of(ModifierOpNames, 427781ad6265SDimitry Andric [&](unsigned Name) { return hasModifiersSet(MI, Name); }); 42780b57cec5SDimitry Andric } 42790b57cec5SDimitry Andric 42800b57cec5SDimitry Andric bool SIInstrInfo::canShrink(const MachineInstr &MI, 42810b57cec5SDimitry Andric const MachineRegisterInfo &MRI) const { 42820b57cec5SDimitry Andric const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 42830b57cec5SDimitry Andric // Can't shrink instruction with three operands. 42840b57cec5SDimitry Andric if (Src2) { 42850b57cec5SDimitry Andric switch (MI.getOpcode()) { 42860b57cec5SDimitry Andric default: return false; 42870b57cec5SDimitry Andric 42880b57cec5SDimitry Andric case AMDGPU::V_ADDC_U32_e64: 42890b57cec5SDimitry Andric case AMDGPU::V_SUBB_U32_e64: 42900b57cec5SDimitry Andric case AMDGPU::V_SUBBREV_U32_e64: { 42910b57cec5SDimitry Andric const MachineOperand *Src1 42920b57cec5SDimitry Andric = getNamedOperand(MI, AMDGPU::OpName::src1); 42930b57cec5SDimitry Andric if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 42940b57cec5SDimitry Andric return false; 42950b57cec5SDimitry Andric // Additional verification is needed for sdst/src2. 42960b57cec5SDimitry Andric return true; 42970b57cec5SDimitry Andric } 42980b57cec5SDimitry Andric case AMDGPU::V_MAC_F16_e64: 4299349cc55cSDimitry Andric case AMDGPU::V_MAC_F32_e64: 4300349cc55cSDimitry Andric case AMDGPU::V_MAC_LEGACY_F32_e64: 43010b57cec5SDimitry Andric case AMDGPU::V_FMAC_F16_e64: 4302bdd1243dSDimitry Andric case AMDGPU::V_FMAC_F16_t16_e64: 4303349cc55cSDimitry Andric case AMDGPU::V_FMAC_F32_e64: 4304fe6060f1SDimitry Andric case AMDGPU::V_FMAC_F64_e64: 4305349cc55cSDimitry Andric case AMDGPU::V_FMAC_LEGACY_F32_e64: 43060b57cec5SDimitry Andric if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 43070b57cec5SDimitry Andric hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 43080b57cec5SDimitry Andric return false; 43090b57cec5SDimitry Andric break; 43100b57cec5SDimitry Andric 43110b57cec5SDimitry Andric case AMDGPU::V_CNDMASK_B32_e64: 43120b57cec5SDimitry Andric break; 43130b57cec5SDimitry Andric } 43140b57cec5SDimitry Andric } 43150b57cec5SDimitry Andric 43160b57cec5SDimitry Andric const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 43170b57cec5SDimitry Andric if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 43180b57cec5SDimitry Andric hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 43190b57cec5SDimitry Andric return false; 43200b57cec5SDimitry Andric 43210b57cec5SDimitry Andric // We don't need to check src0, all input types are legal, so just make sure 43220b57cec5SDimitry Andric // src0 isn't using any modifiers. 
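// For instance, a V_ADD_F32_e64 whose src0 carries a neg or abs modifier
// cannot be shrunk, since the 32-bit encoding has no modifier bits to
// express it.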
43230b57cec5SDimitry Andric if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 43240b57cec5SDimitry Andric return false; 43250b57cec5SDimitry Andric 43260b57cec5SDimitry Andric // Can it be shrunk to a valid 32 bit opcode? 43270b57cec5SDimitry Andric if (!hasVALU32BitEncoding(MI.getOpcode())) 43280b57cec5SDimitry Andric return false; 43290b57cec5SDimitry Andric 43300b57cec5SDimitry Andric // Check output modifiers 43310b57cec5SDimitry Andric return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 43320b57cec5SDimitry Andric !hasModifiersSet(MI, AMDGPU::OpName::clamp); 43330b57cec5SDimitry Andric } 43340b57cec5SDimitry Andric 43350b57cec5SDimitry Andric // Set VCC operand with all flags from \p Orig, except for setting it as 43360b57cec5SDimitry Andric // implicit. 43370b57cec5SDimitry Andric static void copyFlagsToImplicitVCC(MachineInstr &MI, 43380b57cec5SDimitry Andric const MachineOperand &Orig) { 43390b57cec5SDimitry Andric 43400b57cec5SDimitry Andric for (MachineOperand &Use : MI.implicit_operands()) { 43415ffd83dbSDimitry Andric if (Use.isUse() && 43425ffd83dbSDimitry Andric (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { 43430b57cec5SDimitry Andric Use.setIsUndef(Orig.isUndef()); 43440b57cec5SDimitry Andric Use.setIsKill(Orig.isKill()); 43450b57cec5SDimitry Andric return; 43460b57cec5SDimitry Andric } 43470b57cec5SDimitry Andric } 43480b57cec5SDimitry Andric } 43490b57cec5SDimitry Andric 43500b57cec5SDimitry Andric MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 43510b57cec5SDimitry Andric unsigned Op32) const { 435281ad6265SDimitry Andric MachineBasicBlock *MBB = MI.getParent(); 43530b57cec5SDimitry Andric MachineInstrBuilder Inst32 = 43545ffd83dbSDimitry Andric BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 43555ffd83dbSDimitry Andric .setMIFlags(MI.getFlags()); 43560b57cec5SDimitry Andric 43570b57cec5SDimitry Andric // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 43580b57cec5SDimitry Andric // For VOPC instructions, this is replaced by an implicit def of vcc. 4359bdd1243dSDimitry Andric if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) { 43600b57cec5SDimitry Andric // dst 43610b57cec5SDimitry Andric Inst32.add(MI.getOperand(0)); 4362bdd1243dSDimitry Andric } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) { 436381ad6265SDimitry Andric // VOPCX instructions won't be writing to an explicit dst, so this should 436481ad6265SDimitry Andric // not fail for these instructions. 
43650b57cec5SDimitry Andric assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 43660b57cec5SDimitry Andric (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 43670b57cec5SDimitry Andric "Unexpected case"); 43680b57cec5SDimitry Andric } 43690b57cec5SDimitry Andric 43700b57cec5SDimitry Andric Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 43710b57cec5SDimitry Andric 43720b57cec5SDimitry Andric const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 43730b57cec5SDimitry Andric if (Src1) 43740b57cec5SDimitry Andric Inst32.add(*Src1); 43750b57cec5SDimitry Andric 43760b57cec5SDimitry Andric const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 43770b57cec5SDimitry Andric 43780b57cec5SDimitry Andric if (Src2) { 43790b57cec5SDimitry Andric int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 43800b57cec5SDimitry Andric if (Op32Src2Idx != -1) { 43810b57cec5SDimitry Andric Inst32.add(*Src2); 43820b57cec5SDimitry Andric } else { 43830b57cec5SDimitry Andric // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 4384e8d8bef9SDimitry Andric // replaced with an implicit read of vcc or vcc_lo. The implicit read 4385e8d8bef9SDimitry Andric // of vcc was already added during the initial BuildMI, but we 4386e8d8bef9SDimitry Andric // 1) may need to change vcc to vcc_lo to preserve the original register 4387e8d8bef9SDimitry Andric // 2) have to preserve the original flags. 4388e8d8bef9SDimitry Andric fixImplicitOperands(*Inst32); 43890b57cec5SDimitry Andric copyFlagsToImplicitVCC(*Inst32, *Src2); 43900b57cec5SDimitry Andric } 43910b57cec5SDimitry Andric } 43920b57cec5SDimitry Andric 43930b57cec5SDimitry Andric return Inst32; 43940b57cec5SDimitry Andric } 43950b57cec5SDimitry Andric 43960b57cec5SDimitry Andric bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 43970b57cec5SDimitry Andric const MachineOperand &MO, 43980b57cec5SDimitry Andric const MCOperandInfo &OpInfo) const { 43990b57cec5SDimitry Andric // Literal constants use the constant bus. 44000b57cec5SDimitry Andric if (!MO.isReg()) 4401bdd1243dSDimitry Andric return !isInlineConstant(MO, OpInfo); 44020b57cec5SDimitry Andric 44030b57cec5SDimitry Andric if (!MO.isUse()) 44040b57cec5SDimitry Andric return false; 44050b57cec5SDimitry Andric 4406e8d8bef9SDimitry Andric if (MO.getReg().isVirtual()) 44070b57cec5SDimitry Andric return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 44080b57cec5SDimitry Andric 44090b57cec5SDimitry Andric // Null is free 441081ad6265SDimitry Andric if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) 44110b57cec5SDimitry Andric return false; 44120b57cec5SDimitry Andric 44130b57cec5SDimitry Andric // SGPRs use the constant bus 44140b57cec5SDimitry Andric if (MO.isImplicit()) { 44150b57cec5SDimitry Andric return MO.getReg() == AMDGPU::M0 || 44160b57cec5SDimitry Andric MO.getReg() == AMDGPU::VCC || 44170b57cec5SDimitry Andric MO.getReg() == AMDGPU::VCC_LO; 44180b57cec5SDimitry Andric } else { 44190b57cec5SDimitry Andric return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 44200b57cec5SDimitry Andric AMDGPU::SReg_64RegClass.contains(MO.getReg()); 44210b57cec5SDimitry Andric } 44220b57cec5SDimitry Andric } 44230b57cec5SDimitry Andric 44245ffd83dbSDimitry Andric static Register findImplicitSGPRRead(const MachineInstr &MI) { 44250b57cec5SDimitry Andric for (const MachineOperand &MO : MI.implicit_operands()) { 44260b57cec5SDimitry Andric // We only care about reads. 
44270b57cec5SDimitry Andric if (MO.isDef()) 44280b57cec5SDimitry Andric continue; 44290b57cec5SDimitry Andric 44300b57cec5SDimitry Andric switch (MO.getReg()) { 44310b57cec5SDimitry Andric case AMDGPU::VCC: 44320b57cec5SDimitry Andric case AMDGPU::VCC_LO: 44330b57cec5SDimitry Andric case AMDGPU::VCC_HI: 44340b57cec5SDimitry Andric case AMDGPU::M0: 44350b57cec5SDimitry Andric case AMDGPU::FLAT_SCR: 44360b57cec5SDimitry Andric return MO.getReg(); 44370b57cec5SDimitry Andric 44380b57cec5SDimitry Andric default: 44390b57cec5SDimitry Andric break; 44400b57cec5SDimitry Andric } 44410b57cec5SDimitry Andric } 44420b57cec5SDimitry Andric 4443bdd1243dSDimitry Andric return Register(); 44440b57cec5SDimitry Andric } 44450b57cec5SDimitry Andric 44460b57cec5SDimitry Andric static bool shouldReadExec(const MachineInstr &MI) { 44470b57cec5SDimitry Andric if (SIInstrInfo::isVALU(MI)) { 44480b57cec5SDimitry Andric switch (MI.getOpcode()) { 44490b57cec5SDimitry Andric case AMDGPU::V_READLANE_B32: 44505f757f3fSDimitry Andric case AMDGPU::SI_RESTORE_S32_FROM_VGPR: 44510b57cec5SDimitry Andric case AMDGPU::V_WRITELANE_B32: 44525f757f3fSDimitry Andric case AMDGPU::SI_SPILL_S32_TO_VGPR: 44530b57cec5SDimitry Andric return false; 44540b57cec5SDimitry Andric } 44550b57cec5SDimitry Andric 44560b57cec5SDimitry Andric return true; 44570b57cec5SDimitry Andric } 44580b57cec5SDimitry Andric 44598bcb0991SDimitry Andric if (MI.isPreISelOpcode() || 44608bcb0991SDimitry Andric SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 44610b57cec5SDimitry Andric SIInstrInfo::isSALU(MI) || 44620b57cec5SDimitry Andric SIInstrInfo::isSMRD(MI)) 44630b57cec5SDimitry Andric return false; 44640b57cec5SDimitry Andric 44650b57cec5SDimitry Andric return true; 44660b57cec5SDimitry Andric } 44670b57cec5SDimitry Andric 44680b57cec5SDimitry Andric static bool isSubRegOf(const SIRegisterInfo &TRI, 44690b57cec5SDimitry Andric const MachineOperand &SuperVec, 44700b57cec5SDimitry Andric const MachineOperand &SubReg) { 4471e8d8bef9SDimitry Andric if (SubReg.getReg().isPhysical()) 44720b57cec5SDimitry Andric return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 44730b57cec5SDimitry Andric 44740b57cec5SDimitry Andric return SubReg.getSubReg() != AMDGPU::NoSubRegister && 44750b57cec5SDimitry Andric SubReg.getReg() == SuperVec.getReg(); 44760b57cec5SDimitry Andric } 44770b57cec5SDimitry Andric 44780b57cec5SDimitry Andric bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 44790b57cec5SDimitry Andric StringRef &ErrInfo) const { 44800b57cec5SDimitry Andric uint16_t Opcode = MI.getOpcode(); 44810b57cec5SDimitry Andric if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 44820b57cec5SDimitry Andric return true; 44830b57cec5SDimitry Andric 44840b57cec5SDimitry Andric const MachineFunction *MF = MI.getParent()->getParent(); 44850b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF->getRegInfo(); 44860b57cec5SDimitry Andric 44870b57cec5SDimitry Andric int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 44880b57cec5SDimitry Andric int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 44890b57cec5SDimitry Andric int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 4490753f127fSDimitry Andric int Src3Idx = -1; 4491753f127fSDimitry Andric if (Src0Idx == -1) { 4492753f127fSDimitry Andric // VOPD V_DUAL_* instructions use different operand names. 
4493753f127fSDimitry Andric Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); 4494753f127fSDimitry Andric Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); 4495753f127fSDimitry Andric Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); 4496753f127fSDimitry Andric Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); 4497753f127fSDimitry Andric } 44980b57cec5SDimitry Andric 44990b57cec5SDimitry Andric // Make sure the number of operands is correct. 45000b57cec5SDimitry Andric const MCInstrDesc &Desc = get(Opcode); 45010b57cec5SDimitry Andric if (!Desc.isVariadic() && 45020b57cec5SDimitry Andric Desc.getNumOperands() != MI.getNumExplicitOperands()) { 45030b57cec5SDimitry Andric ErrInfo = "Instruction has wrong number of operands."; 45040b57cec5SDimitry Andric return false; 45050b57cec5SDimitry Andric } 45060b57cec5SDimitry Andric 45070b57cec5SDimitry Andric if (MI.isInlineAsm()) { 45080b57cec5SDimitry Andric // Verify register classes for inlineasm constraints. 45090b57cec5SDimitry Andric for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 45100b57cec5SDimitry Andric I != E; ++I) { 45110b57cec5SDimitry Andric const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 45120b57cec5SDimitry Andric if (!RC) 45130b57cec5SDimitry Andric continue; 45140b57cec5SDimitry Andric 45150b57cec5SDimitry Andric const MachineOperand &Op = MI.getOperand(I); 45160b57cec5SDimitry Andric if (!Op.isReg()) 45170b57cec5SDimitry Andric continue; 45180b57cec5SDimitry Andric 45198bcb0991SDimitry Andric Register Reg = Op.getReg(); 4520e8d8bef9SDimitry Andric if (!Reg.isVirtual() && !RC->contains(Reg)) { 45210b57cec5SDimitry Andric ErrInfo = "inlineasm operand has incorrect register class."; 45220b57cec5SDimitry Andric return false; 45230b57cec5SDimitry Andric } 45240b57cec5SDimitry Andric } 45250b57cec5SDimitry Andric 45260b57cec5SDimitry Andric return true; 45270b57cec5SDimitry Andric } 45280b57cec5SDimitry Andric 45295f757f3fSDimitry Andric if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 45305f757f3fSDimitry Andric ErrInfo = "missing memory operand from image instruction."; 45315ffd83dbSDimitry Andric return false; 45325ffd83dbSDimitry Andric } 45335ffd83dbSDimitry Andric 45340b57cec5SDimitry Andric // Make sure the register classes are correct. 45350b57cec5SDimitry Andric for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 4536fe6060f1SDimitry Andric const MachineOperand &MO = MI.getOperand(i); 4537fe6060f1SDimitry Andric if (MO.isFPImm()) { 45380b57cec5SDimitry Andric ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 45390b57cec5SDimitry Andric "all fp values to integers."; 45400b57cec5SDimitry Andric return false; 45410b57cec5SDimitry Andric } 45420b57cec5SDimitry Andric 4543bdd1243dSDimitry Andric int RegClass = Desc.operands()[i].RegClass; 45440b57cec5SDimitry Andric 4545bdd1243dSDimitry Andric switch (Desc.operands()[i].OperandType) { 45460b57cec5SDimitry Andric case MCOI::OPERAND_REGISTER: 45470b57cec5SDimitry Andric if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 45480b57cec5SDimitry Andric ErrInfo = "Illegal immediate value for operand."; 45490b57cec5SDimitry Andric return false; 45500b57cec5SDimitry Andric } 45510b57cec5SDimitry Andric break; 45520b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_INT32: 45530b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP32: 4554349cc55cSDimitry Andric case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: 455581ad6265SDimitry Andric case AMDGPU::OPERAND_REG_IMM_V2FP32: 45560b57cec5SDimitry Andric break; 45570b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT32: 45580b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP32: 45590b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT64: 45600b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP64: 45610b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_INT16: 45620b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_C_FP16: 45630b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 45640b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 45650b57cec5SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 4566fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP16: 4567fe6060f1SDimitry Andric case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { 45680b57cec5SDimitry Andric if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 45690b57cec5SDimitry Andric ErrInfo = "Illegal immediate value for operand."; 45700b57cec5SDimitry Andric return false; 45710b57cec5SDimitry Andric } 45720b57cec5SDimitry Andric break; 45730b57cec5SDimitry Andric } 45745f757f3fSDimitry Andric case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: 45755f757f3fSDimitry Andric if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) { 45765f757f3fSDimitry Andric ErrInfo = "Expected inline constant for operand."; 45775f757f3fSDimitry Andric return false; 45785f757f3fSDimitry Andric } 45795f757f3fSDimitry Andric break; 45800b57cec5SDimitry Andric case MCOI::OPERAND_IMMEDIATE: 45810b57cec5SDimitry Andric case AMDGPU::OPERAND_KIMM32: 45820b57cec5SDimitry Andric // Check if this operand is an immediate. 45830b57cec5SDimitry Andric // FrameIndex operands will be replaced by immediates, so they are 45840b57cec5SDimitry Andric // allowed. 45850b57cec5SDimitry Andric if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 45860b57cec5SDimitry Andric ErrInfo = "Expected immediate, but got non-immediate"; 45870b57cec5SDimitry Andric return false; 45880b57cec5SDimitry Andric } 4589bdd1243dSDimitry Andric [[fallthrough]]; 45900b57cec5SDimitry Andric default: 45910b57cec5SDimitry Andric continue; 45920b57cec5SDimitry Andric } 45930b57cec5SDimitry Andric 4594fe6060f1SDimitry Andric if (!MO.isReg()) 4595fe6060f1SDimitry Andric continue; 4596fe6060f1SDimitry Andric Register Reg = MO.getReg(); 4597fe6060f1SDimitry Andric if (!Reg) 45980b57cec5SDimitry Andric continue; 45990b57cec5SDimitry Andric 4600fe6060f1SDimitry Andric // FIXME: Ideally we would have separate instruction definitions with the 4601fe6060f1SDimitry Andric // aligned register constraint. 
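// As a rough example of the alignment check below: on subtargets that
// require aligned VGPRs, a 64-bit tuple such as v[1:2] would be rejected
// while v[2:3] is accepted, since wide vector operands must start on an
// even register.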
4602fe6060f1SDimitry Andric // FIXME: We do not verify inline asm operands, but custom inline asm 4603fe6060f1SDimitry Andric // verification is broken anyway 4604fe6060f1SDimitry Andric if (ST.needsAlignedVGPRs()) { 4605fe6060f1SDimitry Andric const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); 46064824e7fdSDimitry Andric if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { 4607fe6060f1SDimitry Andric const TargetRegisterClass *SubRC = 4608bdd1243dSDimitry Andric RI.getSubRegisterClass(RC, MO.getSubReg()); 4609fe6060f1SDimitry Andric RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); 4610fe6060f1SDimitry Andric if (RC) 4611fe6060f1SDimitry Andric RC = SubRC; 4612fe6060f1SDimitry Andric } 4613fe6060f1SDimitry Andric 4614fe6060f1SDimitry Andric // Check that this is the aligned version of the class. 4615fe6060f1SDimitry Andric if (!RC || !RI.isProperlyAlignedRC(*RC)) { 4616fe6060f1SDimitry Andric ErrInfo = "Subtarget requires even aligned vector registers"; 4617fe6060f1SDimitry Andric return false; 4618fe6060f1SDimitry Andric } 4619fe6060f1SDimitry Andric } 4620fe6060f1SDimitry Andric 46210b57cec5SDimitry Andric if (RegClass != -1) { 4622fe6060f1SDimitry Andric if (Reg.isVirtual()) 46230b57cec5SDimitry Andric continue; 46240b57cec5SDimitry Andric 46250b57cec5SDimitry Andric const TargetRegisterClass *RC = RI.getRegClass(RegClass); 46260b57cec5SDimitry Andric if (!RC->contains(Reg)) { 46270b57cec5SDimitry Andric ErrInfo = "Operand has incorrect register class."; 46280b57cec5SDimitry Andric return false; 46290b57cec5SDimitry Andric } 46300b57cec5SDimitry Andric } 46310b57cec5SDimitry Andric } 46320b57cec5SDimitry Andric 46330b57cec5SDimitry Andric // Verify SDWA 46340b57cec5SDimitry Andric if (isSDWA(MI)) { 46350b57cec5SDimitry Andric if (!ST.hasSDWA()) { 46360b57cec5SDimitry Andric ErrInfo = "SDWA is not supported on this target"; 46370b57cec5SDimitry Andric return false; 46380b57cec5SDimitry Andric } 46390b57cec5SDimitry Andric 46400b57cec5SDimitry Andric int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 46410b57cec5SDimitry Andric 464281ad6265SDimitry Andric for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { 46430b57cec5SDimitry Andric if (OpIdx == -1) 46440b57cec5SDimitry Andric continue; 46450b57cec5SDimitry Andric const MachineOperand &MO = MI.getOperand(OpIdx); 46460b57cec5SDimitry Andric 46470b57cec5SDimitry Andric if (!ST.hasSDWAScalar()) { 46480b57cec5SDimitry Andric // Only VGPRS on VI 46490b57cec5SDimitry Andric if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 46500b57cec5SDimitry Andric ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 46510b57cec5SDimitry Andric return false; 46520b57cec5SDimitry Andric } 46530b57cec5SDimitry Andric } else { 46540b57cec5SDimitry Andric // No immediates on GFX9 46550b57cec5SDimitry Andric if (!MO.isReg()) { 4656e8d8bef9SDimitry Andric ErrInfo = 4657e8d8bef9SDimitry Andric "Only reg allowed as operands in SDWA instructions on GFX9+"; 46580b57cec5SDimitry Andric return false; 46590b57cec5SDimitry Andric } 46600b57cec5SDimitry Andric } 46610b57cec5SDimitry Andric } 46620b57cec5SDimitry Andric 46630b57cec5SDimitry Andric if (!ST.hasSDWAOmod()) { 46640b57cec5SDimitry Andric // No omod allowed on VI 46650b57cec5SDimitry Andric const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 46660b57cec5SDimitry Andric if (OMod != nullptr && 46670b57cec5SDimitry Andric (!OMod->isImm() || OMod->getImm() != 0)) { 46680b57cec5SDimitry Andric ErrInfo = "OMod 
not allowed in SDWA instructions on VI"; 46690b57cec5SDimitry Andric return false; 46700b57cec5SDimitry Andric } 46710b57cec5SDimitry Andric } 46720b57cec5SDimitry Andric 46730b57cec5SDimitry Andric uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 46740b57cec5SDimitry Andric if (isVOPC(BasicOpcode)) { 46750b57cec5SDimitry Andric if (!ST.hasSDWASdst() && DstIdx != -1) { 46760b57cec5SDimitry Andric // Only vcc allowed as dst on VI for VOPC 46770b57cec5SDimitry Andric const MachineOperand &Dst = MI.getOperand(DstIdx); 46780b57cec5SDimitry Andric if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 46790b57cec5SDimitry Andric ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 46800b57cec5SDimitry Andric return false; 46810b57cec5SDimitry Andric } 46820b57cec5SDimitry Andric } else if (!ST.hasSDWAOutModsVOPC()) { 46830b57cec5SDimitry Andric // No clamp allowed on GFX9 for VOPC 46840b57cec5SDimitry Andric const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 46850b57cec5SDimitry Andric if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 46860b57cec5SDimitry Andric ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 46870b57cec5SDimitry Andric return false; 46880b57cec5SDimitry Andric } 46890b57cec5SDimitry Andric 46900b57cec5SDimitry Andric // No omod allowed on GFX9 for VOPC 46910b57cec5SDimitry Andric const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 46920b57cec5SDimitry Andric if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 46930b57cec5SDimitry Andric ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 46940b57cec5SDimitry Andric return false; 46950b57cec5SDimitry Andric } 46960b57cec5SDimitry Andric } 46970b57cec5SDimitry Andric } 46980b57cec5SDimitry Andric 46990b57cec5SDimitry Andric const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 47000b57cec5SDimitry Andric if (DstUnused && DstUnused->isImm() && 47010b57cec5SDimitry Andric DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 47020b57cec5SDimitry Andric const MachineOperand &Dst = MI.getOperand(DstIdx); 47030b57cec5SDimitry Andric if (!Dst.isReg() || !Dst.isTied()) { 47040b57cec5SDimitry Andric ErrInfo = "Dst register should have tied register"; 47050b57cec5SDimitry Andric return false; 47060b57cec5SDimitry Andric } 47070b57cec5SDimitry Andric 47080b57cec5SDimitry Andric const MachineOperand &TiedMO = 47090b57cec5SDimitry Andric MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 47100b57cec5SDimitry Andric if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 47110b57cec5SDimitry Andric ErrInfo = 47120b57cec5SDimitry Andric "Dst register should be tied to implicit use of preserved register"; 47130b57cec5SDimitry Andric return false; 4714e8d8bef9SDimitry Andric } else if (TiedMO.getReg().isPhysical() && 47150b57cec5SDimitry Andric Dst.getReg() != TiedMO.getReg()) { 47160b57cec5SDimitry Andric ErrInfo = "Dst register should use same physical register as preserved"; 47170b57cec5SDimitry Andric return false; 47180b57cec5SDimitry Andric } 47190b57cec5SDimitry Andric } 47200b57cec5SDimitry Andric } 47210b57cec5SDimitry Andric 47225f757f3fSDimitry Andric // Verify MIMG / VIMAGE / VSAMPLE 47235f757f3fSDimitry Andric if (isImage(MI.getOpcode()) && !MI.mayStore()) { 47240b57cec5SDimitry Andric // Ensure that the return type used is large enough for all the options 47250b57cec5SDimitry Andric // being used TFE/LWE require an extra result register. 
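// For example, a gather4 always produces 4 data registers; otherwise a
// dmask of 0b0111 needs 3, packed D16 halves that to ceil(3/2) = 2, and
// setting TFE or LWE adds one more register on top.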
47260b57cec5SDimitry Andric const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 47270b57cec5SDimitry Andric if (DMask) { 47280b57cec5SDimitry Andric uint64_t DMaskImm = DMask->getImm(); 47290b57cec5SDimitry Andric uint32_t RegCount = 4730bdd1243dSDimitry Andric isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm); 47310b57cec5SDimitry Andric const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 47320b57cec5SDimitry Andric const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 47330b57cec5SDimitry Andric const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 47340b57cec5SDimitry Andric 47350b57cec5SDimitry Andric // Adjust for packed 16 bit values 47360b57cec5SDimitry Andric if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 473706c3fb27SDimitry Andric RegCount = divideCeil(RegCount, 2); 47380b57cec5SDimitry Andric 47390b57cec5SDimitry Andric // Adjust if using LWE or TFE 47400b57cec5SDimitry Andric if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 47410b57cec5SDimitry Andric RegCount += 1; 47420b57cec5SDimitry Andric 47430b57cec5SDimitry Andric const uint32_t DstIdx = 47440b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 47450b57cec5SDimitry Andric const MachineOperand &Dst = MI.getOperand(DstIdx); 47460b57cec5SDimitry Andric if (Dst.isReg()) { 47470b57cec5SDimitry Andric const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 47480b57cec5SDimitry Andric uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 47490b57cec5SDimitry Andric if (RegCount > DstSize) { 475006c3fb27SDimitry Andric ErrInfo = "Image instruction returns too many registers for dst " 47510b57cec5SDimitry Andric "register class"; 47520b57cec5SDimitry Andric return false; 47530b57cec5SDimitry Andric } 47540b57cec5SDimitry Andric } 47550b57cec5SDimitry Andric } 47560b57cec5SDimitry Andric } 47570b57cec5SDimitry Andric 47580b57cec5SDimitry Andric // Verify VOP*. Ignore multiple sgpr operands on writelane. 475981ad6265SDimitry Andric if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { 47600b57cec5SDimitry Andric unsigned ConstantBusCount = 0; 4761fe6060f1SDimitry Andric bool UsesLiteral = false; 4762fe6060f1SDimitry Andric const MachineOperand *LiteralVal = nullptr; 47630b57cec5SDimitry Andric 476481ad6265SDimitry Andric int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm); 476581ad6265SDimitry Andric if (ImmIdx != -1) { 47660b57cec5SDimitry Andric ++ConstantBusCount; 476781ad6265SDimitry Andric UsesLiteral = true; 476881ad6265SDimitry Andric LiteralVal = &MI.getOperand(ImmIdx); 476981ad6265SDimitry Andric } 47700b57cec5SDimitry Andric 47715ffd83dbSDimitry Andric SmallVector<Register, 2> SGPRsUsed; 4772e8d8bef9SDimitry Andric Register SGPRUsed; 47730b57cec5SDimitry Andric 477481ad6265SDimitry Andric // Only look at the true operands. Only a real operand can use the constant 477581ad6265SDimitry Andric // bus, and we don't want to check pseudo-operands like the source modifier 477681ad6265SDimitry Andric // flags. 
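// Roughly, two different SGPR sources count as two constant bus reads, the
// same SGPR used twice counts once, and an inline constant counts as zero,
// while a literal takes one slot; the allowed total comes from
// ST.getConstantBusLimit() and is larger on GFX10+ than on older targets.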
4777753f127fSDimitry Andric for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { 47780b57cec5SDimitry Andric if (OpIdx == -1) 4779753f127fSDimitry Andric continue; 47800b57cec5SDimitry Andric const MachineOperand &MO = MI.getOperand(OpIdx); 4781bdd1243dSDimitry Andric if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { 47820b57cec5SDimitry Andric if (MO.isReg()) { 47830b57cec5SDimitry Andric SGPRUsed = MO.getReg(); 4784bdd1243dSDimitry Andric if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) { 47850b57cec5SDimitry Andric ++ConstantBusCount; 47860b57cec5SDimitry Andric SGPRsUsed.push_back(SGPRUsed); 47870b57cec5SDimitry Andric } 47880b57cec5SDimitry Andric } else { 4789fe6060f1SDimitry Andric if (!UsesLiteral) { 47900b57cec5SDimitry Andric ++ConstantBusCount; 4791fe6060f1SDimitry Andric UsesLiteral = true; 4792fe6060f1SDimitry Andric LiteralVal = &MO; 4793fe6060f1SDimitry Andric } else if (!MO.isIdenticalTo(*LiteralVal)) { 479481ad6265SDimitry Andric assert(isVOP2(MI) || isVOP3(MI)); 479581ad6265SDimitry Andric ErrInfo = "VOP2/VOP3 instruction uses more than one literal"; 4796fe6060f1SDimitry Andric return false; 4797fe6060f1SDimitry Andric } 47980b57cec5SDimitry Andric } 47990b57cec5SDimitry Andric } 48000b57cec5SDimitry Andric } 4801e8d8bef9SDimitry Andric 4802e8d8bef9SDimitry Andric SGPRUsed = findImplicitSGPRRead(MI); 4803bdd1243dSDimitry Andric if (SGPRUsed) { 480481ad6265SDimitry Andric // Implicit uses may safely overlap true operands 4805e8d8bef9SDimitry Andric if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 4806e8d8bef9SDimitry Andric return !RI.regsOverlap(SGPRUsed, SGPR); 4807e8d8bef9SDimitry Andric })) { 4808e8d8bef9SDimitry Andric ++ConstantBusCount; 4809e8d8bef9SDimitry Andric SGPRsUsed.push_back(SGPRUsed); 4810e8d8bef9SDimitry Andric } 4811e8d8bef9SDimitry Andric } 4812e8d8bef9SDimitry Andric 48130b57cec5SDimitry Andric // v_writelane_b32 is an exception from constant bus restriction: 48140b57cec5SDimitry Andric // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 48150b57cec5SDimitry Andric if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 48160b57cec5SDimitry Andric Opcode != AMDGPU::V_WRITELANE_B32) { 48170b57cec5SDimitry Andric ErrInfo = "VOP* instruction violates constant bus restriction"; 48180b57cec5SDimitry Andric return false; 48190b57cec5SDimitry Andric } 48200b57cec5SDimitry Andric 4821fe6060f1SDimitry Andric if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { 48220b57cec5SDimitry Andric ErrInfo = "VOP3 instruction uses literal"; 48230b57cec5SDimitry Andric return false; 48240b57cec5SDimitry Andric } 48250b57cec5SDimitry Andric } 48260b57cec5SDimitry Andric 48278bcb0991SDimitry Andric // Special case for writelane - this can break the multiple constant bus rule, 48288bcb0991SDimitry Andric // but still can't use more than one SGPR register 48298bcb0991SDimitry Andric if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 48308bcb0991SDimitry Andric unsigned SGPRCount = 0; 4831bdd1243dSDimitry Andric Register SGPRUsed; 48328bcb0991SDimitry Andric 483381ad6265SDimitry Andric for (int OpIdx : {Src0Idx, Src1Idx}) { 48348bcb0991SDimitry Andric if (OpIdx == -1) 48358bcb0991SDimitry Andric break; 48368bcb0991SDimitry Andric 48378bcb0991SDimitry Andric const MachineOperand &MO = MI.getOperand(OpIdx); 48388bcb0991SDimitry Andric 4839bdd1243dSDimitry Andric if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { 48408bcb0991SDimitry Andric if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 48418bcb0991SDimitry 
Andric if (MO.getReg() != SGPRUsed) 48428bcb0991SDimitry Andric ++SGPRCount; 48438bcb0991SDimitry Andric SGPRUsed = MO.getReg(); 48448bcb0991SDimitry Andric } 48458bcb0991SDimitry Andric } 48468bcb0991SDimitry Andric if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 48478bcb0991SDimitry Andric ErrInfo = "WRITELANE instruction violates constant bus restriction"; 48488bcb0991SDimitry Andric return false; 48498bcb0991SDimitry Andric } 48508bcb0991SDimitry Andric } 48518bcb0991SDimitry Andric } 48528bcb0991SDimitry Andric 48530b57cec5SDimitry Andric // Verify misc. restrictions on specific instructions. 4854e8d8bef9SDimitry Andric if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || 4855e8d8bef9SDimitry Andric Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { 48560b57cec5SDimitry Andric const MachineOperand &Src0 = MI.getOperand(Src0Idx); 48570b57cec5SDimitry Andric const MachineOperand &Src1 = MI.getOperand(Src1Idx); 48580b57cec5SDimitry Andric const MachineOperand &Src2 = MI.getOperand(Src2Idx); 48590b57cec5SDimitry Andric if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 48600b57cec5SDimitry Andric if (!compareMachineOp(Src0, Src1) && 48610b57cec5SDimitry Andric !compareMachineOp(Src0, Src2)) { 48620b57cec5SDimitry Andric ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 48630b57cec5SDimitry Andric return false; 48640b57cec5SDimitry Andric } 48650b57cec5SDimitry Andric } 4866e8d8bef9SDimitry Andric if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & 4867e8d8bef9SDimitry Andric SISrcMods::ABS) || 4868e8d8bef9SDimitry Andric (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & 4869e8d8bef9SDimitry Andric SISrcMods::ABS) || 4870e8d8bef9SDimitry Andric (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & 4871e8d8bef9SDimitry Andric SISrcMods::ABS)) { 4872e8d8bef9SDimitry Andric ErrInfo = "ABS not allowed in VOP3B instructions"; 4873e8d8bef9SDimitry Andric return false; 4874e8d8bef9SDimitry Andric } 48750b57cec5SDimitry Andric } 48760b57cec5SDimitry Andric 48770b57cec5SDimitry Andric if (isSOP2(MI) || isSOPC(MI)) { 48780b57cec5SDimitry Andric const MachineOperand &Src0 = MI.getOperand(Src0Idx); 48790b57cec5SDimitry Andric const MachineOperand &Src1 = MI.getOperand(Src1Idx); 48800b57cec5SDimitry Andric 488181ad6265SDimitry Andric if (!Src0.isReg() && !Src1.isReg() && 4882bdd1243dSDimitry Andric !isInlineConstant(Src0, Desc.operands()[Src0Idx]) && 4883bdd1243dSDimitry Andric !isInlineConstant(Src1, Desc.operands()[Src1Idx]) && 488481ad6265SDimitry Andric !Src0.isIdenticalTo(Src1)) { 48850b57cec5SDimitry Andric ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 48860b57cec5SDimitry Andric return false; 48870b57cec5SDimitry Andric } 48880b57cec5SDimitry Andric } 48890b57cec5SDimitry Andric 48900b57cec5SDimitry Andric if (isSOPK(MI)) { 48910b57cec5SDimitry Andric auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 48920b57cec5SDimitry Andric if (Desc.isBranch()) { 48930b57cec5SDimitry Andric if (!Op->isMBB()) { 48940b57cec5SDimitry Andric ErrInfo = "invalid branch target for SOPK instruction"; 48950b57cec5SDimitry Andric return false; 48960b57cec5SDimitry Andric } 48970b57cec5SDimitry Andric } else { 48980b57cec5SDimitry Andric uint64_t Imm = Op->getImm(); 48990b57cec5SDimitry Andric if (sopkIsZext(MI)) { 49000b57cec5SDimitry Andric if (!isUInt<16>(Imm)) { 49010b57cec5SDimitry Andric ErrInfo = "invalid immediate for SOPK instruction"; 49020b57cec5SDimitry Andric return false; 49030b57cec5SDimitry Andric } 
49040b57cec5SDimitry Andric } else { 49050b57cec5SDimitry Andric if (!isInt<16>(Imm)) { 49060b57cec5SDimitry Andric ErrInfo = "invalid immediate for SOPK instruction"; 49070b57cec5SDimitry Andric return false; 49080b57cec5SDimitry Andric } 49090b57cec5SDimitry Andric } 49100b57cec5SDimitry Andric } 49110b57cec5SDimitry Andric } 49120b57cec5SDimitry Andric 49130b57cec5SDimitry Andric if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 49140b57cec5SDimitry Andric Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 49150b57cec5SDimitry Andric Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 49160b57cec5SDimitry Andric Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 49170b57cec5SDimitry Andric const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 49180b57cec5SDimitry Andric Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 49190b57cec5SDimitry Andric 4920bdd1243dSDimitry Andric const unsigned StaticNumOps = 4921bdd1243dSDimitry Andric Desc.getNumOperands() + Desc.implicit_uses().size(); 49220b57cec5SDimitry Andric const unsigned NumImplicitOps = IsDst ? 2 : 1; 49230b57cec5SDimitry Andric 49240b57cec5SDimitry Andric // Allow additional implicit operands. This allows a fixup done by the post 49250b57cec5SDimitry Andric // RA scheduler where the main implicit operand is killed and implicit-defs 49260b57cec5SDimitry Andric // are added for sub-registers that remain live after this instruction. 49270b57cec5SDimitry Andric if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 49280b57cec5SDimitry Andric ErrInfo = "missing implicit register operands"; 49290b57cec5SDimitry Andric return false; 49300b57cec5SDimitry Andric } 49310b57cec5SDimitry Andric 49320b57cec5SDimitry Andric const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 49330b57cec5SDimitry Andric if (IsDst) { 49340b57cec5SDimitry Andric if (!Dst->isUse()) { 49350b57cec5SDimitry Andric ErrInfo = "v_movreld_b32 vdst should be a use operand"; 49360b57cec5SDimitry Andric return false; 49370b57cec5SDimitry Andric } 49380b57cec5SDimitry Andric 49390b57cec5SDimitry Andric unsigned UseOpIdx; 49400b57cec5SDimitry Andric if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 49410b57cec5SDimitry Andric UseOpIdx != StaticNumOps + 1) { 49420b57cec5SDimitry Andric ErrInfo = "movrel implicit operands should be tied"; 49430b57cec5SDimitry Andric return false; 49440b57cec5SDimitry Andric } 49450b57cec5SDimitry Andric } 49460b57cec5SDimitry Andric 49470b57cec5SDimitry Andric const MachineOperand &Src0 = MI.getOperand(Src0Idx); 49480b57cec5SDimitry Andric const MachineOperand &ImpUse 49490b57cec5SDimitry Andric = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 49500b57cec5SDimitry Andric if (!ImpUse.isReg() || !ImpUse.isUse() || 49510b57cec5SDimitry Andric !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 49520b57cec5SDimitry Andric ErrInfo = "src0 should be subreg of implicit vector use"; 49530b57cec5SDimitry Andric return false; 49540b57cec5SDimitry Andric } 49550b57cec5SDimitry Andric } 49560b57cec5SDimitry Andric 49570b57cec5SDimitry Andric // Make sure we aren't losing exec uses in the td files. This mostly requires 49580b57cec5SDimitry Andric // being careful when using let Uses to try to add other use registers. 
49590b57cec5SDimitry Andric if (shouldReadExec(MI)) { 49600b57cec5SDimitry Andric if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 49610b57cec5SDimitry Andric ErrInfo = "VALU instruction does not implicitly read exec mask"; 49620b57cec5SDimitry Andric return false; 49630b57cec5SDimitry Andric } 49640b57cec5SDimitry Andric } 49650b57cec5SDimitry Andric 49660b57cec5SDimitry Andric if (isSMRD(MI)) { 496781ad6265SDimitry Andric if (MI.mayStore() && 496881ad6265SDimitry Andric ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { 49690b57cec5SDimitry Andric // The register offset form of scalar stores may only use m0 as the 49700b57cec5SDimitry Andric // soffset register. 497181ad6265SDimitry Andric const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); 49720b57cec5SDimitry Andric if (Soff && Soff->getReg() != AMDGPU::M0) { 49730b57cec5SDimitry Andric ErrInfo = "scalar stores must use m0 as offset register"; 49740b57cec5SDimitry Andric return false; 49750b57cec5SDimitry Andric } 49760b57cec5SDimitry Andric } 49770b57cec5SDimitry Andric } 49780b57cec5SDimitry Andric 4979e8d8bef9SDimitry Andric if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { 49800b57cec5SDimitry Andric const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 49810b57cec5SDimitry Andric if (Offset->getImm() != 0) { 49820b57cec5SDimitry Andric ErrInfo = "subtarget does not support offsets in flat instructions"; 49830b57cec5SDimitry Andric return false; 49840b57cec5SDimitry Andric } 49850b57cec5SDimitry Andric } 49860b57cec5SDimitry Andric 4987cb14a3feSDimitry Andric if (isDS(MI) && !ST.hasGDS()) { 4988cb14a3feSDimitry Andric const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds); 4989cb14a3feSDimitry Andric if (GDSOp && GDSOp->getImm() != 0) { 4990cb14a3feSDimitry Andric ErrInfo = "GDS is not supported on this subtarget"; 4991cb14a3feSDimitry Andric return false; 4992cb14a3feSDimitry Andric } 4993cb14a3feSDimitry Andric } 4994cb14a3feSDimitry Andric 49955f757f3fSDimitry Andric if (isImage(MI)) { 49960b57cec5SDimitry Andric const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 49970b57cec5SDimitry Andric if (DimOp) { 49980b57cec5SDimitry Andric int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 49990b57cec5SDimitry Andric AMDGPU::OpName::vaddr0); 50005f757f3fSDimitry Andric int RSrcOpName = 50015f757f3fSDimitry Andric isMIMG(MI) ? 
AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 50025f757f3fSDimitry Andric int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName); 50030b57cec5SDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 50040b57cec5SDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 50050b57cec5SDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 50060b57cec5SDimitry Andric const AMDGPU::MIMGDimInfo *Dim = 50070b57cec5SDimitry Andric AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 50080b57cec5SDimitry Andric 50090b57cec5SDimitry Andric if (!Dim) { 50100b57cec5SDimitry Andric ErrInfo = "dim is out of range"; 50110b57cec5SDimitry Andric return false; 50120b57cec5SDimitry Andric } 50130b57cec5SDimitry Andric 50145ffd83dbSDimitry Andric bool IsA16 = false; 50155ffd83dbSDimitry Andric if (ST.hasR128A16()) { 50165ffd83dbSDimitry Andric const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 50175ffd83dbSDimitry Andric IsA16 = R128A16->getImm() != 0; 5018bdd1243dSDimitry Andric } else if (ST.hasA16()) { 50195ffd83dbSDimitry Andric const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 50205ffd83dbSDimitry Andric IsA16 = A16->getImm() != 0; 50215ffd83dbSDimitry Andric } 50225ffd83dbSDimitry Andric 50235f757f3fSDimitry Andric bool IsNSA = RsrcIdx - VAddr0Idx > 1; 50245ffd83dbSDimitry Andric 5025fe6060f1SDimitry Andric unsigned AddrWords = 5026fe6060f1SDimitry Andric AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); 50270b57cec5SDimitry Andric 50280b57cec5SDimitry Andric unsigned VAddrWords; 50290b57cec5SDimitry Andric if (IsNSA) { 50305f757f3fSDimitry Andric VAddrWords = RsrcIdx - VAddr0Idx; 50315f757f3fSDimitry Andric if (ST.hasPartialNSAEncoding() && 50325f757f3fSDimitry Andric AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) { 50335f757f3fSDimitry Andric unsigned LastVAddrIdx = RsrcIdx - 1; 503406c3fb27SDimitry Andric VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1; 503506c3fb27SDimitry Andric } 50360b57cec5SDimitry Andric } else { 503706c3fb27SDimitry Andric VAddrWords = getOpSize(MI, VAddr0Idx) / 4; 5038bdd1243dSDimitry Andric if (AddrWords > 12) 50390b57cec5SDimitry Andric AddrWords = 16; 50400b57cec5SDimitry Andric } 50410b57cec5SDimitry Andric 50420b57cec5SDimitry Andric if (VAddrWords != AddrWords) { 50435ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords 50445ffd83dbSDimitry Andric << " but got " << VAddrWords << "\n"); 50450b57cec5SDimitry Andric ErrInfo = "bad vaddr size"; 50460b57cec5SDimitry Andric return false; 50470b57cec5SDimitry Andric } 50480b57cec5SDimitry Andric } 50490b57cec5SDimitry Andric } 50500b57cec5SDimitry Andric 50510b57cec5SDimitry Andric const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 50520b57cec5SDimitry Andric if (DppCt) { 50530b57cec5SDimitry Andric using namespace AMDGPU::DPP; 50540b57cec5SDimitry Andric 50550b57cec5SDimitry Andric unsigned DC = DppCt->getImm(); 50560b57cec5SDimitry Andric if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 50570b57cec5SDimitry Andric DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 50580b57cec5SDimitry Andric (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 50590b57cec5SDimitry Andric (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 50600b57cec5SDimitry Andric (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 50610b57cec5SDimitry Andric (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= 
DppCtrl::DPP_UNUSED7_LAST) || 50620b57cec5SDimitry Andric (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 50630b57cec5SDimitry Andric ErrInfo = "Invalid dpp_ctrl value"; 50640b57cec5SDimitry Andric return false; 50650b57cec5SDimitry Andric } 50660b57cec5SDimitry Andric if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && 50670b57cec5SDimitry Andric ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 50680b57cec5SDimitry Andric ErrInfo = "Invalid dpp_ctrl value: " 50690b57cec5SDimitry Andric "wavefront shifts are not supported on GFX10+"; 50700b57cec5SDimitry Andric return false; 50710b57cec5SDimitry Andric } 50720b57cec5SDimitry Andric if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && 50730b57cec5SDimitry Andric ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 50740b57cec5SDimitry Andric ErrInfo = "Invalid dpp_ctrl value: " 50758bcb0991SDimitry Andric "broadcasts are not supported on GFX10+"; 50760b57cec5SDimitry Andric return false; 50770b57cec5SDimitry Andric } 50780b57cec5SDimitry Andric if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && 50790b57cec5SDimitry Andric ST.getGeneration() < AMDGPUSubtarget::GFX10) { 5080fe6060f1SDimitry Andric if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && 5081fe6060f1SDimitry Andric DC <= DppCtrl::ROW_NEWBCAST_LAST && 5082fe6060f1SDimitry Andric !ST.hasGFX90AInsts()) { 5083fe6060f1SDimitry Andric ErrInfo = "Invalid dpp_ctrl value: " 5084fe6060f1SDimitry Andric "row_newbroadcast/row_share is not supported before " 5085fe6060f1SDimitry Andric "GFX90A/GFX10"; 5086fe6060f1SDimitry Andric return false; 5087fe6060f1SDimitry Andric } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { 50880b57cec5SDimitry Andric ErrInfo = "Invalid dpp_ctrl value: " 50890b57cec5SDimitry Andric "row_share and row_xmask are not supported before GFX10"; 50900b57cec5SDimitry Andric return false; 50910b57cec5SDimitry Andric } 50920b57cec5SDimitry Andric } 50930b57cec5SDimitry Andric 5094fe6060f1SDimitry Andric if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && 50955f757f3fSDimitry Andric !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { 5096fe6060f1SDimitry Andric ErrInfo = "Invalid dpp_ctrl value: " 50975f757f3fSDimitry Andric "DP ALU dpp only support row_newbcast"; 5098fe6060f1SDimitry Andric return false; 5099fe6060f1SDimitry Andric } 5100fe6060f1SDimitry Andric } 5101fe6060f1SDimitry Andric 5102fe6060f1SDimitry Andric if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { 5103fe6060f1SDimitry Andric const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 5104fe6060f1SDimitry Andric uint16_t DataNameIdx = isDS(Opcode) ? 
AMDGPU::OpName::data0 5105fe6060f1SDimitry Andric : AMDGPU::OpName::vdata; 5106fe6060f1SDimitry Andric const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); 5107fe6060f1SDimitry Andric const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); 5108fe6060f1SDimitry Andric if (Data && !Data->isReg()) 5109fe6060f1SDimitry Andric Data = nullptr; 5110fe6060f1SDimitry Andric 5111fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) { 5112fe6060f1SDimitry Andric if (Dst && Data && 5113fe6060f1SDimitry Andric (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { 5114fe6060f1SDimitry Andric ErrInfo = "Invalid register class: " 5115fe6060f1SDimitry Andric "vdata and vdst should be both VGPR or AGPR"; 5116fe6060f1SDimitry Andric return false; 5117fe6060f1SDimitry Andric } 5118fe6060f1SDimitry Andric if (Data && Data2 && 5119fe6060f1SDimitry Andric (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { 5120fe6060f1SDimitry Andric ErrInfo = "Invalid register class: " 5121fe6060f1SDimitry Andric "both data operands should be VGPR or AGPR"; 5122fe6060f1SDimitry Andric return false; 5123fe6060f1SDimitry Andric } 5124fe6060f1SDimitry Andric } else { 5125fe6060f1SDimitry Andric if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || 5126fe6060f1SDimitry Andric (Data && RI.isAGPR(MRI, Data->getReg())) || 5127fe6060f1SDimitry Andric (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { 5128fe6060f1SDimitry Andric ErrInfo = "Invalid register class: " 5129fe6060f1SDimitry Andric "agpr loads and stores not supported on this GPU"; 5130fe6060f1SDimitry Andric return false; 5131fe6060f1SDimitry Andric } 5132fe6060f1SDimitry Andric } 5133fe6060f1SDimitry Andric } 5134fe6060f1SDimitry Andric 513581ad6265SDimitry Andric if (ST.needsAlignedVGPRs()) { 513681ad6265SDimitry Andric const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { 513781ad6265SDimitry Andric const MachineOperand *Op = getNamedOperand(MI, OpName); 513881ad6265SDimitry Andric if (!Op) 513981ad6265SDimitry Andric return true; 5140fe6060f1SDimitry Andric Register Reg = Op->getReg(); 514181ad6265SDimitry Andric if (Reg.isPhysical()) 514281ad6265SDimitry Andric return !(RI.getHWRegIndex(Reg) & 1); 5143fe6060f1SDimitry Andric const TargetRegisterClass &RC = *MRI.getRegClass(Reg); 514481ad6265SDimitry Andric return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && 5145fe6060f1SDimitry Andric !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); 514681ad6265SDimitry Andric }; 5147fe6060f1SDimitry Andric 514881ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::DS_GWS_INIT || 514981ad6265SDimitry Andric MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || 515081ad6265SDimitry Andric MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) { 515181ad6265SDimitry Andric 515281ad6265SDimitry Andric if (!isAlignedReg(AMDGPU::OpName::data0)) { 5153fe6060f1SDimitry Andric ErrInfo = "Subtarget requires even aligned vector registers " 5154fe6060f1SDimitry Andric "for DS_GWS instructions"; 5155fe6060f1SDimitry Andric return false; 5156fe6060f1SDimitry Andric } 5157fe6060f1SDimitry Andric } 5158fe6060f1SDimitry Andric 515981ad6265SDimitry Andric if (isMIMG(MI)) { 516081ad6265SDimitry Andric if (!isAlignedReg(AMDGPU::OpName::vaddr)) { 516181ad6265SDimitry Andric ErrInfo = "Subtarget requires even aligned vector registers " 516281ad6265SDimitry Andric "for vaddr operand of image instructions"; 516381ad6265SDimitry Andric return false; 516481ad6265SDimitry Andric } 516581ad6265SDimitry Andric } 516681ad6265SDimitry Andric } 
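  // Pre-GFX90A, v_accvgpr_write_b32 cannot take an SGPR source (only VGPRs or
  // inline constants are accepted), which the next check enforces.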
516781ad6265SDimitry Andric 516881ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && 516981ad6265SDimitry Andric !ST.hasGFX90AInsts()) { 517081ad6265SDimitry Andric const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); 517181ad6265SDimitry Andric if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { 517281ad6265SDimitry Andric ErrInfo = "Invalid register class: " 517381ad6265SDimitry Andric "v_accvgpr_write with an SGPR is not supported on this GPU"; 517481ad6265SDimitry Andric return false; 517581ad6265SDimitry Andric } 517681ad6265SDimitry Andric } 517781ad6265SDimitry Andric 517804eeddc0SDimitry Andric if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { 517904eeddc0SDimitry Andric const MachineOperand &SrcOp = MI.getOperand(1); 518004eeddc0SDimitry Andric if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { 518104eeddc0SDimitry Andric ErrInfo = "pseudo expects only physical SGPRs"; 518204eeddc0SDimitry Andric return false; 518304eeddc0SDimitry Andric } 518404eeddc0SDimitry Andric } 518504eeddc0SDimitry Andric 51860b57cec5SDimitry Andric return true; 51870b57cec5SDimitry Andric } 51880b57cec5SDimitry Andric 51895f757f3fSDimitry Andric // It is more readable to list mapped opcodes on the same line. 51905f757f3fSDimitry Andric // clang-format off 51915f757f3fSDimitry Andric 51920b57cec5SDimitry Andric unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 51930b57cec5SDimitry Andric switch (MI.getOpcode()) { 51940b57cec5SDimitry Andric default: return AMDGPU::INSTRUCTION_LIST_END; 51950b57cec5SDimitry Andric case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 51960b57cec5SDimitry Andric case AMDGPU::COPY: return AMDGPU::COPY; 51970b57cec5SDimitry Andric case AMDGPU::PHI: return AMDGPU::PHI; 51980b57cec5SDimitry Andric case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 51990b57cec5SDimitry Andric case AMDGPU::WQM: return AMDGPU::WQM; 52008bcb0991SDimitry Andric case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; 5201fe6060f1SDimitry Andric case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; 5202fe6060f1SDimitry Andric case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; 52030b57cec5SDimitry Andric case AMDGPU::S_MOV_B32: { 52040b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 52050b57cec5SDimitry Andric return MI.getOperand(1).isReg() || 52060b57cec5SDimitry Andric RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 52070b57cec5SDimitry Andric AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 52080b57cec5SDimitry Andric } 52090b57cec5SDimitry Andric case AMDGPU::S_ADD_I32: 5210e8d8bef9SDimitry Andric return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; 52110b57cec5SDimitry Andric case AMDGPU::S_ADDC_U32: 52120b57cec5SDimitry Andric return AMDGPU::V_ADDC_U32_e32; 52130b57cec5SDimitry Andric case AMDGPU::S_SUB_I32: 5214e8d8bef9SDimitry Andric return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; 52150b57cec5SDimitry Andric // FIXME: These are not consistently handled, and selected when the carry is 52160b57cec5SDimitry Andric // used. 
52170b57cec5SDimitry Andric case AMDGPU::S_ADD_U32: 5218e8d8bef9SDimitry Andric return AMDGPU::V_ADD_CO_U32_e32; 52190b57cec5SDimitry Andric case AMDGPU::S_SUB_U32: 5220e8d8bef9SDimitry Andric return AMDGPU::V_SUB_CO_U32_e32; 52210b57cec5SDimitry Andric case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 5222e8d8bef9SDimitry Andric case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; 5223e8d8bef9SDimitry Andric case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; 5224e8d8bef9SDimitry Andric case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; 52250b57cec5SDimitry Andric case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 52260b57cec5SDimitry Andric case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 52270b57cec5SDimitry Andric case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 52280b57cec5SDimitry Andric case AMDGPU::S_XNOR_B32: 52290b57cec5SDimitry Andric return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 52300b57cec5SDimitry Andric case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 52310b57cec5SDimitry Andric case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 52320b57cec5SDimitry Andric case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 52330b57cec5SDimitry Andric case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 52340b57cec5SDimitry Andric case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 5235e8d8bef9SDimitry Andric case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; 52360b57cec5SDimitry Andric case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 5237e8d8bef9SDimitry Andric case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; 52380b57cec5SDimitry Andric case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 5239e8d8bef9SDimitry Andric case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; 5240e8d8bef9SDimitry Andric case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; 5241e8d8bef9SDimitry Andric case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; 5242e8d8bef9SDimitry Andric case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; 5243e8d8bef9SDimitry Andric case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; 52440b57cec5SDimitry Andric case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 52450b57cec5SDimitry Andric case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 52460b57cec5SDimitry Andric case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 52470b57cec5SDimitry Andric case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 5248349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; 5249349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; 5250349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; 5251349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; 5252349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; 5253349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; 5254349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; 5255349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; 5256349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; 5257349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; 5258349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; 5259349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; 5260349cc55cSDimitry 
Andric case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; 5261349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; 52620b57cec5SDimitry Andric case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 52630b57cec5SDimitry Andric case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 52640b57cec5SDimitry Andric case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 52650b57cec5SDimitry Andric case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 52660b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 52670b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 52685f757f3fSDimitry Andric case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64; 52695f757f3fSDimitry Andric case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64; 52705f757f3fSDimitry Andric case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64; 52715f757f3fSDimitry Andric case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64; 52725f757f3fSDimitry Andric case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; 52735f757f3fSDimitry Andric case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; 52745f757f3fSDimitry Andric case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64; 52755f757f3fSDimitry Andric case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64; 52765f757f3fSDimitry Andric case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64; 52775f757f3fSDimitry Andric case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64; 52785f757f3fSDimitry Andric case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64; 52795f757f3fSDimitry Andric case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64; 52805f757f3fSDimitry Andric case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64; 52815f757f3fSDimitry Andric case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64; 52825f757f3fSDimitry Andric case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64; 52835f757f3fSDimitry Andric case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64; 52845f757f3fSDimitry Andric case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64; 52855f757f3fSDimitry Andric case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64; 52865f757f3fSDimitry Andric case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64; 52875f757f3fSDimitry Andric case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64; 52885f757f3fSDimitry Andric case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64; 52895f757f3fSDimitry Andric case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64; 52905f757f3fSDimitry Andric case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64; 52915f757f3fSDimitry Andric case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64; 52925f757f3fSDimitry Andric case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64; 52935f757f3fSDimitry Andric case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64; 52945f757f3fSDimitry Andric case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64; 52955f757f3fSDimitry Andric case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64; 52965f757f3fSDimitry Andric case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; 52975f757f3fSDimitry Andric case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; 52985f757f3fSDimitry Andric case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; 52995f757f3fSDimitry Andric case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64; 53005f757f3fSDimitry Andric case 
AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; 53015f757f3fSDimitry Andric case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; 53025f757f3fSDimitry Andric case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; 53035f757f3fSDimitry Andric case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64; 53045f757f3fSDimitry Andric case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64; 53055f757f3fSDimitry Andric case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64; 53065f757f3fSDimitry Andric case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64; 53075f757f3fSDimitry Andric case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64; 53085f757f3fSDimitry Andric case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64; 53095f757f3fSDimitry Andric case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64; 53105f757f3fSDimitry Andric case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64; 53115f757f3fSDimitry Andric case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64; 53125f757f3fSDimitry Andric case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64; 53135f757f3fSDimitry Andric case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64; 53145f757f3fSDimitry Andric case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64; 53155f757f3fSDimitry Andric case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64; 53165f757f3fSDimitry Andric case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64; 53175f757f3fSDimitry Andric case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64; 53185f757f3fSDimitry Andric case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64; 53195f757f3fSDimitry Andric case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64; 53205f757f3fSDimitry Andric case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64; 53215f757f3fSDimitry Andric case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64; 53225f757f3fSDimitry Andric case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64; 53235f757f3fSDimitry Andric case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64; 53245f757f3fSDimitry Andric case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64; 53255f757f3fSDimitry Andric case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64; 53265f757f3fSDimitry Andric case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64; 53275f757f3fSDimitry Andric case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64; 53285f757f3fSDimitry Andric case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64; 53295f757f3fSDimitry Andric case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64; 53305f757f3fSDimitry Andric case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64; 53315f757f3fSDimitry Andric case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64; 53325f757f3fSDimitry Andric case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64; 53335f757f3fSDimitry Andric case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64; 53345f757f3fSDimitry Andric case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64; 53355f757f3fSDimitry Andric case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64; 53365f757f3fSDimitry Andric case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64; 53375f757f3fSDimitry Andric case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64; 53385f757f3fSDimitry Andric case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64; 53395f757f3fSDimitry Andric case AMDGPU::V_S_SQRT_F16_e64: return 
AMDGPU::V_SQRT_F16_t16_e64; 53400b57cec5SDimitry Andric } 53410b57cec5SDimitry Andric llvm_unreachable( 53420b57cec5SDimitry Andric "Unexpected scalar opcode without corresponding vector one!"); 53430b57cec5SDimitry Andric } 53440b57cec5SDimitry Andric 53455f757f3fSDimitry Andric // clang-format on 53465f757f3fSDimitry Andric 534706c3fb27SDimitry Andric void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, 534806c3fb27SDimitry Andric MachineBasicBlock &MBB, 534906c3fb27SDimitry Andric MachineBasicBlock::iterator MBBI, 535006c3fb27SDimitry Andric const DebugLoc &DL, Register Reg, 53515f757f3fSDimitry Andric bool IsSCCLive, 53525f757f3fSDimitry Andric SlotIndexes *Indexes) const { 535306c3fb27SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 535406c3fb27SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 535506c3fb27SDimitry Andric bool IsWave32 = ST.isWave32(); 535606c3fb27SDimitry Andric if (IsSCCLive) { 535706c3fb27SDimitry Andric // Insert two move instructions, one to save the original value of EXEC and 535806c3fb27SDimitry Andric // the other to turn on all bits in EXEC. This is required as we can't use 535906c3fb27SDimitry Andric // the single instruction S_OR_SAVEEXEC that clobbers SCC. 536006c3fb27SDimitry Andric unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 536106c3fb27SDimitry Andric MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 53625f757f3fSDimitry Andric auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) 53635f757f3fSDimitry Andric .addReg(Exec, RegState::Kill); 53645f757f3fSDimitry Andric auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); 53655f757f3fSDimitry Andric if (Indexes) { 53665f757f3fSDimitry Andric Indexes->insertMachineInstrInMaps(*StoreExecMI); 53675f757f3fSDimitry Andric Indexes->insertMachineInstrInMaps(*FlipExecMI); 53685f757f3fSDimitry Andric } 536906c3fb27SDimitry Andric } else { 537006c3fb27SDimitry Andric const unsigned OrSaveExec = 537106c3fb27SDimitry Andric IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; 537206c3fb27SDimitry Andric auto SaveExec = 537306c3fb27SDimitry Andric BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); 537406c3fb27SDimitry Andric SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 53755f757f3fSDimitry Andric if (Indexes) 53765f757f3fSDimitry Andric Indexes->insertMachineInstrInMaps(*SaveExec); 537706c3fb27SDimitry Andric } 537806c3fb27SDimitry Andric } 537906c3fb27SDimitry Andric 538006c3fb27SDimitry Andric void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, 538106c3fb27SDimitry Andric MachineBasicBlock::iterator MBBI, 53825f757f3fSDimitry Andric const DebugLoc &DL, Register Reg, 53835f757f3fSDimitry Andric SlotIndexes *Indexes) const { 538406c3fb27SDimitry Andric unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 538506c3fb27SDimitry Andric MCRegister Exec = isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 53865f757f3fSDimitry Andric auto ExecRestoreMI = 538706c3fb27SDimitry Andric BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); 53885f757f3fSDimitry Andric if (Indexes) 53895f757f3fSDimitry Andric Indexes->insertMachineInstrInMaps(*ExecRestoreMI); 539006c3fb27SDimitry Andric } 539106c3fb27SDimitry Andric 539281ad6265SDimitry Andric static const TargetRegisterClass * 539381ad6265SDimitry Andric adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, 5394fe6060f1SDimitry Andric const MachineRegisterInfo &MRI, 539581ad6265SDimitry Andric const MCInstrDesc &TID, unsigned RCID, 5396fe6060f1SDimitry Andric bool IsAllocatable) { 5397fe6060f1SDimitry Andric if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 53980eae32dcSDimitry Andric (((TID.mayLoad() || TID.mayStore()) && 53990eae32dcSDimitry Andric !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || 5400fe6060f1SDimitry Andric (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { 5401fe6060f1SDimitry Andric switch (RCID) { 540281ad6265SDimitry Andric case AMDGPU::AV_32RegClassID: 540381ad6265SDimitry Andric RCID = AMDGPU::VGPR_32RegClassID; 540481ad6265SDimitry Andric break; 540581ad6265SDimitry Andric case AMDGPU::AV_64RegClassID: 540681ad6265SDimitry Andric RCID = AMDGPU::VReg_64RegClassID; 540781ad6265SDimitry Andric break; 540881ad6265SDimitry Andric case AMDGPU::AV_96RegClassID: 540981ad6265SDimitry Andric RCID = AMDGPU::VReg_96RegClassID; 541081ad6265SDimitry Andric break; 541181ad6265SDimitry Andric case AMDGPU::AV_128RegClassID: 541281ad6265SDimitry Andric RCID = AMDGPU::VReg_128RegClassID; 541381ad6265SDimitry Andric break; 541481ad6265SDimitry Andric case AMDGPU::AV_160RegClassID: 541581ad6265SDimitry Andric RCID = AMDGPU::VReg_160RegClassID; 541681ad6265SDimitry Andric break; 541781ad6265SDimitry Andric case AMDGPU::AV_512RegClassID: 541881ad6265SDimitry Andric RCID = AMDGPU::VReg_512RegClassID; 541981ad6265SDimitry Andric break; 5420fe6060f1SDimitry Andric default: 5421fe6060f1SDimitry Andric break; 5422fe6060f1SDimitry Andric } 5423fe6060f1SDimitry Andric } 542481ad6265SDimitry Andric 542581ad6265SDimitry Andric return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); 5426fe6060f1SDimitry Andric } 5427fe6060f1SDimitry Andric 5428fe6060f1SDimitry Andric const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, 5429fe6060f1SDimitry Andric unsigned OpNum, const TargetRegisterInfo *TRI, 5430fe6060f1SDimitry Andric const MachineFunction &MF) 5431fe6060f1SDimitry Andric const { 5432fe6060f1SDimitry Andric if (OpNum >= TID.getNumOperands()) 5433fe6060f1SDimitry Andric return nullptr; 5434bdd1243dSDimitry Andric auto RegClass = TID.operands()[OpNum].RegClass; 5435fe6060f1SDimitry Andric bool IsAllocatable = false; 5436fe6060f1SDimitry Andric if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { 5437fe6060f1SDimitry Andric // vdst and vdata should both be VGPR or AGPR, and the same holds for DS 543881ad6265SDimitry Andric // instructions with two data operands. Request a register class constrained 5439fe6060f1SDimitry Andric // to VGPR only if both operands are present, since Machine Copy Propagation 5440fe6060f1SDimitry Andric // (and possibly other passes) cannot check this constraint. 5441fe6060f1SDimitry Andric // 5442fe6060f1SDimitry Andric // The check is limited to FLAT and DS because atomics in non-flat encoding 5443fe6060f1SDimitry Andric // have their vdst and vdata tied to be the same register.
5444fe6060f1SDimitry Andric const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 5445fe6060f1SDimitry Andric AMDGPU::OpName::vdst); 5446fe6060f1SDimitry Andric const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 5447fe6060f1SDimitry Andric (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 5448fe6060f1SDimitry Andric : AMDGPU::OpName::vdata); 5449fe6060f1SDimitry Andric if (DataIdx != -1) { 5450bdd1243dSDimitry Andric IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand( 5451bdd1243dSDimitry Andric TID.Opcode, AMDGPU::OpName::data1); 5452fe6060f1SDimitry Andric } 5453fe6060f1SDimitry Andric } 545481ad6265SDimitry Andric return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, 5455fe6060f1SDimitry Andric IsAllocatable); 5456fe6060f1SDimitry Andric } 5457fe6060f1SDimitry Andric 54580b57cec5SDimitry Andric const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 54590b57cec5SDimitry Andric unsigned OpNo) const { 54600b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 54610b57cec5SDimitry Andric const MCInstrDesc &Desc = get(MI.getOpcode()); 54620b57cec5SDimitry Andric if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 5463bdd1243dSDimitry Andric Desc.operands()[OpNo].RegClass == -1) { 54648bcb0991SDimitry Andric Register Reg = MI.getOperand(OpNo).getReg(); 54650b57cec5SDimitry Andric 5466e8d8bef9SDimitry Andric if (Reg.isVirtual()) 54670b57cec5SDimitry Andric return MRI.getRegClass(Reg); 5468bdd1243dSDimitry Andric return RI.getPhysRegBaseClass(Reg); 54690b57cec5SDimitry Andric } 54700b57cec5SDimitry Andric 5471bdd1243dSDimitry Andric unsigned RCID = Desc.operands()[OpNo].RegClass; 547281ad6265SDimitry Andric return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); 54730b57cec5SDimitry Andric } 54740b57cec5SDimitry Andric 54750b57cec5SDimitry Andric void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 54760b57cec5SDimitry Andric MachineBasicBlock::iterator I = MI; 54770b57cec5SDimitry Andric MachineBasicBlock *MBB = MI.getParent(); 54780b57cec5SDimitry Andric MachineOperand &MO = MI.getOperand(OpIdx); 54790b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 5480bdd1243dSDimitry Andric unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; 54810b57cec5SDimitry Andric const TargetRegisterClass *RC = RI.getRegClass(RCID); 5482e8d8bef9SDimitry Andric unsigned Size = RI.getRegSizeInBits(*RC); 54830b57cec5SDimitry Andric unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; 54840b57cec5SDimitry Andric if (MO.isReg()) 54850b57cec5SDimitry Andric Opcode = AMDGPU::COPY; 54860b57cec5SDimitry Andric else if (RI.isSGPRClass(RC)) 54870b57cec5SDimitry Andric Opcode = (Size == 64) ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 54880b57cec5SDimitry Andric 54890b57cec5SDimitry Andric const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 54908bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(VRC); 54910b57cec5SDimitry Andric DebugLoc DL = MBB->findDebugLoc(I); 54920b57cec5SDimitry Andric BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 54930b57cec5SDimitry Andric MO.ChangeToRegister(Reg, false); 54940b57cec5SDimitry Andric } 54950b57cec5SDimitry Andric 54965f757f3fSDimitry Andric unsigned SIInstrInfo::buildExtractSubReg( 54975f757f3fSDimitry Andric MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, 54985f757f3fSDimitry Andric const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, 54995f757f3fSDimitry Andric unsigned SubIdx, const TargetRegisterClass *SubRC) const { 55000b57cec5SDimitry Andric MachineBasicBlock *MBB = MI->getParent(); 55010b57cec5SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 55028bcb0991SDimitry Andric Register SubReg = MRI.createVirtualRegister(SubRC); 55030b57cec5SDimitry Andric 55040b57cec5SDimitry Andric if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 55050b57cec5SDimitry Andric BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 55060b57cec5SDimitry Andric .addReg(SuperReg.getReg(), 0, SubIdx); 55070b57cec5SDimitry Andric return SubReg; 55080b57cec5SDimitry Andric } 55090b57cec5SDimitry Andric 55100b57cec5SDimitry Andric // Just in case the super register is itself a sub-register, copy it to a new 55110b57cec5SDimitry Andric // value so we don't need to worry about merging its subreg index with the 55120b57cec5SDimitry Andric // SubIdx passed to this function. The register coalescer should be able to 55130b57cec5SDimitry Andric // eliminate this extra copy. 
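  // (For example, extracting sub0 from an operand that itself uses a sub2_sub3
  // sub-register index would otherwise require composing the two indices.)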
55148bcb0991SDimitry Andric Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 55150b57cec5SDimitry Andric 55160b57cec5SDimitry Andric BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 55170b57cec5SDimitry Andric .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 55180b57cec5SDimitry Andric 55190b57cec5SDimitry Andric BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 55200b57cec5SDimitry Andric .addReg(NewSuperReg, 0, SubIdx); 55210b57cec5SDimitry Andric 55220b57cec5SDimitry Andric return SubReg; 55230b57cec5SDimitry Andric } 55240b57cec5SDimitry Andric 55250b57cec5SDimitry Andric MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 55265f757f3fSDimitry Andric MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI, 55275f757f3fSDimitry Andric const MachineOperand &Op, const TargetRegisterClass *SuperRC, 55285f757f3fSDimitry Andric unsigned SubIdx, const TargetRegisterClass *SubRC) const { 55290b57cec5SDimitry Andric if (Op.isImm()) { 55300b57cec5SDimitry Andric if (SubIdx == AMDGPU::sub0) 55310b57cec5SDimitry Andric return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 55320b57cec5SDimitry Andric if (SubIdx == AMDGPU::sub1) 55330b57cec5SDimitry Andric return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 55340b57cec5SDimitry Andric 55350b57cec5SDimitry Andric llvm_unreachable("Unhandled register index for immediate"); 55360b57cec5SDimitry Andric } 55370b57cec5SDimitry Andric 55380b57cec5SDimitry Andric unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 55390b57cec5SDimitry Andric SubIdx, SubRC); 55400b57cec5SDimitry Andric return MachineOperand::CreateReg(SubReg, false); 55410b57cec5SDimitry Andric } 55420b57cec5SDimitry Andric 55430b57cec5SDimitry Andric // Change the order of operands from (0, 1, 2) to (0, 2, 1) 55440b57cec5SDimitry Andric void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 55450b57cec5SDimitry Andric assert(Inst.getNumExplicitOperands() == 3); 55460b57cec5SDimitry Andric MachineOperand Op1 = Inst.getOperand(1); 554781ad6265SDimitry Andric Inst.removeOperand(1); 55480b57cec5SDimitry Andric Inst.addOperand(Op1); 55490b57cec5SDimitry Andric } 55500b57cec5SDimitry Andric 55510b57cec5SDimitry Andric bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 55520b57cec5SDimitry Andric const MCOperandInfo &OpInfo, 55530b57cec5SDimitry Andric const MachineOperand &MO) const { 55540b57cec5SDimitry Andric if (!MO.isReg()) 55550b57cec5SDimitry Andric return false; 55560b57cec5SDimitry Andric 55578bcb0991SDimitry Andric Register Reg = MO.getReg(); 55580b57cec5SDimitry Andric 5559480093f4SDimitry Andric const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 5560e8d8bef9SDimitry Andric if (Reg.isPhysical()) 5561e8d8bef9SDimitry Andric return DRC->contains(Reg); 5562e8d8bef9SDimitry Andric 5563e8d8bef9SDimitry Andric const TargetRegisterClass *RC = MRI.getRegClass(Reg); 5564e8d8bef9SDimitry Andric 5565480093f4SDimitry Andric if (MO.getSubReg()) { 5566480093f4SDimitry Andric const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 5567480093f4SDimitry Andric const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 5568480093f4SDimitry Andric if (!SuperRC) 5569480093f4SDimitry Andric return false; 55700b57cec5SDimitry Andric 5571480093f4SDimitry Andric DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 5572480093f4SDimitry Andric if (!DRC) 5573480093f4SDimitry Andric return false; 5574480093f4SDimitry Andric } 5575480093f4SDimitry Andric 
return RC->hasSuperClassEq(DRC); 55760b57cec5SDimitry Andric } 55770b57cec5SDimitry Andric 55780b57cec5SDimitry Andric bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 55790b57cec5SDimitry Andric const MCOperandInfo &OpInfo, 55800b57cec5SDimitry Andric const MachineOperand &MO) const { 55810b57cec5SDimitry Andric if (MO.isReg()) 55820b57cec5SDimitry Andric return isLegalRegOperand(MRI, OpInfo, MO); 55830b57cec5SDimitry Andric 55840b57cec5SDimitry Andric // Handle non-register types that are treated like immediates. 55850b57cec5SDimitry Andric assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 55860b57cec5SDimitry Andric return true; 55870b57cec5SDimitry Andric } 55880b57cec5SDimitry Andric 55890b57cec5SDimitry Andric bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 55900b57cec5SDimitry Andric const MachineOperand *MO) const { 55910b57cec5SDimitry Andric const MachineFunction &MF = *MI.getParent()->getParent(); 55920b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 55930b57cec5SDimitry Andric const MCInstrDesc &InstDesc = MI.getDesc(); 5594bdd1243dSDimitry Andric const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; 55950b57cec5SDimitry Andric const TargetRegisterClass *DefinedRC = 55960b57cec5SDimitry Andric OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 55970b57cec5SDimitry Andric if (!MO) 55980b57cec5SDimitry Andric MO = &MI.getOperand(OpIdx); 55990b57cec5SDimitry Andric 56000b57cec5SDimitry Andric int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 560181ad6265SDimitry Andric int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; 56020b57cec5SDimitry Andric if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 5603bdd1243dSDimitry Andric if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) 56040b57cec5SDimitry Andric return false; 56050b57cec5SDimitry Andric 56060b57cec5SDimitry Andric SmallDenseSet<RegSubRegPair> SGPRsUsed; 56070b57cec5SDimitry Andric if (MO->isReg()) 56080b57cec5SDimitry Andric SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); 56090b57cec5SDimitry Andric 56100b57cec5SDimitry Andric for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 56110b57cec5SDimitry Andric if (i == OpIdx) 56120b57cec5SDimitry Andric continue; 56130b57cec5SDimitry Andric const MachineOperand &Op = MI.getOperand(i); 56140b57cec5SDimitry Andric if (Op.isReg()) { 56150b57cec5SDimitry Andric RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); 56160b57cec5SDimitry Andric if (!SGPRsUsed.count(SGPR) && 5617bdd1243dSDimitry Andric // FIXME: This can access off the end of the operands() array. 
5618bdd1243dSDimitry Andric usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) { 56190b57cec5SDimitry Andric if (--ConstantBusLimit <= 0) 56200b57cec5SDimitry Andric return false; 56210b57cec5SDimitry Andric SGPRsUsed.insert(SGPR); 56220b57cec5SDimitry Andric } 56235f757f3fSDimitry Andric } else if (AMDGPU::isSISrcOperand(InstDesc, i) && 56245f757f3fSDimitry Andric !isInlineConstant(Op, InstDesc.operands()[i])) { 562581ad6265SDimitry Andric if (!LiteralLimit--) 56260b57cec5SDimitry Andric return false; 56270b57cec5SDimitry Andric if (--ConstantBusLimit <= 0) 56280b57cec5SDimitry Andric return false; 56290b57cec5SDimitry Andric } 56300b57cec5SDimitry Andric } 56310b57cec5SDimitry Andric } 56320b57cec5SDimitry Andric 56330b57cec5SDimitry Andric if (MO->isReg()) { 5634fcaf7f86SDimitry Andric if (!DefinedRC) 5635fcaf7f86SDimitry Andric return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN; 5636fe6060f1SDimitry Andric if (!isLegalRegOperand(MRI, OpInfo, *MO)) 5637fe6060f1SDimitry Andric return false; 5638fe6060f1SDimitry Andric bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); 5639fe6060f1SDimitry Andric if (IsAGPR && !ST.hasMAIInsts()) 5640fe6060f1SDimitry Andric return false; 5641fe6060f1SDimitry Andric unsigned Opc = MI.getOpcode(); 5642fe6060f1SDimitry Andric if (IsAGPR && 5643fe6060f1SDimitry Andric (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 5644fe6060f1SDimitry Andric (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) 5645fe6060f1SDimitry Andric return false; 5646fe6060f1SDimitry Andric // Atomics should have both vdst and vdata either vgpr or agpr. 5647fe6060f1SDimitry Andric const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 5648fe6060f1SDimitry Andric const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, 5649fe6060f1SDimitry Andric isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); 5650fe6060f1SDimitry Andric if ((int)OpIdx == VDstIdx && DataIdx != -1 && 5651fe6060f1SDimitry Andric MI.getOperand(DataIdx).isReg() && 5652fe6060f1SDimitry Andric RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) 5653fe6060f1SDimitry Andric return false; 5654fe6060f1SDimitry Andric if ((int)OpIdx == DataIdx) { 5655fe6060f1SDimitry Andric if (VDstIdx != -1 && 5656fe6060f1SDimitry Andric RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) 5657fe6060f1SDimitry Andric return false; 5658fe6060f1SDimitry Andric // DS instructions with 2 src operands also must have tied RC. 
5659fe6060f1SDimitry Andric const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, 5660fe6060f1SDimitry Andric AMDGPU::OpName::data1); 5661fe6060f1SDimitry Andric if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && 5662fe6060f1SDimitry Andric RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) 5663fe6060f1SDimitry Andric return false; 5664fe6060f1SDimitry Andric } 566581ad6265SDimitry Andric if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && 5666fe6060f1SDimitry Andric (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && 5667fe6060f1SDimitry Andric RI.isSGPRReg(MRI, MO->getReg())) 5668fe6060f1SDimitry Andric return false; 5669fe6060f1SDimitry Andric return true; 56700b57cec5SDimitry Andric } 56710b57cec5SDimitry Andric 56725f757f3fSDimitry Andric if (MO->isImm()) { 56735f757f3fSDimitry Andric uint64_t Imm = MO->getImm(); 56745f757f3fSDimitry Andric bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64; 56755f757f3fSDimitry Andric bool Is64BitOp = Is64BitFPOp || 56765f757f3fSDimitry Andric OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || 56775f757f3fSDimitry Andric OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 || 56785f757f3fSDimitry Andric OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; 56795f757f3fSDimitry Andric if (Is64BitOp && 56805f757f3fSDimitry Andric !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { 56815f757f3fSDimitry Andric if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) 56825f757f3fSDimitry Andric return false; 56835f757f3fSDimitry Andric 56845f757f3fSDimitry Andric // FIXME: We can use sign extended 64-bit literals, but only for signed 56855f757f3fSDimitry Andric // operands. At the moment we do not know if an operand is signed. 56865f757f3fSDimitry Andric // Such operand will be encoded as its low 32 bits and then either 56875f757f3fSDimitry Andric // correctly sign extended or incorrectly zero extended by HW. 56885f757f3fSDimitry Andric if (!Is64BitFPOp && (int32_t)Imm < 0) 56895f757f3fSDimitry Andric return false; 56905f757f3fSDimitry Andric } 56915f757f3fSDimitry Andric } 56925f757f3fSDimitry Andric 56930b57cec5SDimitry Andric // Handle non-register types that are treated like immediates. 56940b57cec5SDimitry Andric assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); 56950b57cec5SDimitry Andric 56960b57cec5SDimitry Andric if (!DefinedRC) { 56970b57cec5SDimitry Andric // This operand expects an immediate. 
56980b57cec5SDimitry Andric return true; 56990b57cec5SDimitry Andric } 57000b57cec5SDimitry Andric 57010b57cec5SDimitry Andric return isImmOperandLegal(MI, OpIdx, *MO); 57020b57cec5SDimitry Andric } 57030b57cec5SDimitry Andric 57040b57cec5SDimitry Andric void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 57050b57cec5SDimitry Andric MachineInstr &MI) const { 57060b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 57070b57cec5SDimitry Andric const MCInstrDesc &InstrDesc = get(Opc); 57080b57cec5SDimitry Andric 57090b57cec5SDimitry Andric int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 57100b57cec5SDimitry Andric MachineOperand &Src0 = MI.getOperand(Src0Idx); 57110b57cec5SDimitry Andric 57120b57cec5SDimitry Andric int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 57130b57cec5SDimitry Andric MachineOperand &Src1 = MI.getOperand(Src1Idx); 57140b57cec5SDimitry Andric 57150b57cec5SDimitry Andric // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 57160b57cec5SDimitry Andric // we need to only have one constant bus use before GFX10. 5717bdd1243dSDimitry Andric bool HasImplicitSGPR = findImplicitSGPRRead(MI); 5718bdd1243dSDimitry Andric if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() && 5719bdd1243dSDimitry Andric RI.isSGPRReg(MRI, Src0.getReg())) 57200b57cec5SDimitry Andric legalizeOpWithMove(MI, Src0Idx); 57210b57cec5SDimitry Andric 57220b57cec5SDimitry Andric // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 57230b57cec5SDimitry Andric // both the value to write (src0) and lane select (src1). Fix up non-SGPR 57240b57cec5SDimitry Andric // src0/src1 with V_READFIRSTLANE. 57250b57cec5SDimitry Andric if (Opc == AMDGPU::V_WRITELANE_B32) { 57260b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 57270b57cec5SDimitry Andric if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 57288bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 57290b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 57300b57cec5SDimitry Andric .add(Src0); 57310b57cec5SDimitry Andric Src0.ChangeToRegister(Reg, false); 57320b57cec5SDimitry Andric } 57330b57cec5SDimitry Andric if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 57348bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 57350b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 57360b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 57370b57cec5SDimitry Andric .add(Src1); 57380b57cec5SDimitry Andric Src1.ChangeToRegister(Reg, false); 57390b57cec5SDimitry Andric } 57400b57cec5SDimitry Andric return; 57410b57cec5SDimitry Andric } 57420b57cec5SDimitry Andric 57430b57cec5SDimitry Andric // No VOP2 instructions support AGPRs. 57440b57cec5SDimitry Andric if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 57450b57cec5SDimitry Andric legalizeOpWithMove(MI, Src0Idx); 57460b57cec5SDimitry Andric 57470b57cec5SDimitry Andric if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 57480b57cec5SDimitry Andric legalizeOpWithMove(MI, Src1Idx); 57490b57cec5SDimitry Andric 57505f757f3fSDimitry Andric // Special case: V_FMAC_F32 and V_FMAC_F16 have src2. 
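  // In the e32 form src2 is tied to vdst and therefore must be a VGPR; if it
  // is not, it is copied into one below.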
57515f757f3fSDimitry Andric if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) { 57525f757f3fSDimitry Andric int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 57535f757f3fSDimitry Andric if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg())) 57545f757f3fSDimitry Andric legalizeOpWithMove(MI, Src2Idx); 57555f757f3fSDimitry Andric } 57565f757f3fSDimitry Andric 57570b57cec5SDimitry Andric // VOP2 src0 instructions support all operand types, so we don't need to check 57580b57cec5SDimitry Andric // their legality. If src1 is already legal, we don't need to do anything. 5759bdd1243dSDimitry Andric if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1)) 57600b57cec5SDimitry Andric return; 57610b57cec5SDimitry Andric 57620b57cec5SDimitry Andric // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 57630b57cec5SDimitry Andric // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 57640b57cec5SDimitry Andric // select is uniform. 57650b57cec5SDimitry Andric if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 57660b57cec5SDimitry Andric RI.isVGPR(MRI, Src1.getReg())) { 57678bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 57680b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 57690b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 57700b57cec5SDimitry Andric .add(Src1); 57710b57cec5SDimitry Andric Src1.ChangeToRegister(Reg, false); 57720b57cec5SDimitry Andric return; 57730b57cec5SDimitry Andric } 57740b57cec5SDimitry Andric 57750b57cec5SDimitry Andric // We do not use commuteInstruction here because it is too aggressive and will 57760b57cec5SDimitry Andric // commute if it is possible. We only want to commute here if it improves 57770b57cec5SDimitry Andric // legality. This can be called a fairly large number of times so don't waste 57780b57cec5SDimitry Andric // compile time pointlessly swapping and checking legality again. 57790b57cec5SDimitry Andric if (HasImplicitSGPR || !MI.isCommutable()) { 57800b57cec5SDimitry Andric legalizeOpWithMove(MI, Src1Idx); 57810b57cec5SDimitry Andric return; 57820b57cec5SDimitry Andric } 57830b57cec5SDimitry Andric 57840b57cec5SDimitry Andric // If src0 can be used as src1, commuting will make the operands legal. 57850b57cec5SDimitry Andric // Otherwise we have to give up and insert a move. 57860b57cec5SDimitry Andric // 57870b57cec5SDimitry Andric // TODO: Other immediate-like operand kinds could be commuted if there was a 57880b57cec5SDimitry Andric // MachineOperand::ChangeTo* for them. 
57890b57cec5SDimitry Andric if ((!Src1.isImm() && !Src1.isReg()) || 5790bdd1243dSDimitry Andric !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) { 57910b57cec5SDimitry Andric legalizeOpWithMove(MI, Src1Idx); 57920b57cec5SDimitry Andric return; 57930b57cec5SDimitry Andric } 57940b57cec5SDimitry Andric 57950b57cec5SDimitry Andric int CommutedOpc = commuteOpcode(MI); 57960b57cec5SDimitry Andric if (CommutedOpc == -1) { 57970b57cec5SDimitry Andric legalizeOpWithMove(MI, Src1Idx); 57980b57cec5SDimitry Andric return; 57990b57cec5SDimitry Andric } 58000b57cec5SDimitry Andric 58010b57cec5SDimitry Andric MI.setDesc(get(CommutedOpc)); 58020b57cec5SDimitry Andric 58038bcb0991SDimitry Andric Register Src0Reg = Src0.getReg(); 58040b57cec5SDimitry Andric unsigned Src0SubReg = Src0.getSubReg(); 58050b57cec5SDimitry Andric bool Src0Kill = Src0.isKill(); 58060b57cec5SDimitry Andric 58070b57cec5SDimitry Andric if (Src1.isImm()) 58080b57cec5SDimitry Andric Src0.ChangeToImmediate(Src1.getImm()); 58090b57cec5SDimitry Andric else if (Src1.isReg()) { 58100b57cec5SDimitry Andric Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 58110b57cec5SDimitry Andric Src0.setSubReg(Src1.getSubReg()); 58120b57cec5SDimitry Andric } else 58130b57cec5SDimitry Andric llvm_unreachable("Should only have register or immediate operands"); 58140b57cec5SDimitry Andric 58150b57cec5SDimitry Andric Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 58160b57cec5SDimitry Andric Src1.setSubReg(Src0SubReg); 58170b57cec5SDimitry Andric fixImplicitOperands(MI); 58180b57cec5SDimitry Andric } 58190b57cec5SDimitry Andric 58200b57cec5SDimitry Andric // Legalize VOP3 operands. All operand types are supported for any operand 58210b57cec5SDimitry Andric // but only one literal constant and only starting from GFX10. 
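// Before GFX10 the VOP3 encoding cannot carry a literal at all and may read at
// most one SGPR over the constant bus; GFX10+ raises the bus limit and permits
// a single literal, which the bookkeeping below tracks.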
58220b57cec5SDimitry Andric void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 58230b57cec5SDimitry Andric MachineInstr &MI) const { 58240b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 58250b57cec5SDimitry Andric 58260b57cec5SDimitry Andric int VOP3Idx[3] = { 58270b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 58280b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 58290b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 58300b57cec5SDimitry Andric }; 58310b57cec5SDimitry Andric 5832e8d8bef9SDimitry Andric if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || 5833e8d8bef9SDimitry Andric Opc == AMDGPU::V_PERMLANEX16_B32_e64) { 58340b57cec5SDimitry Andric // src1 and src2 must be scalar 58350b57cec5SDimitry Andric MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); 58360b57cec5SDimitry Andric MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); 58370b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 58380b57cec5SDimitry Andric if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { 58398bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 58400b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 58410b57cec5SDimitry Andric .add(Src1); 58420b57cec5SDimitry Andric Src1.ChangeToRegister(Reg, false); 58430b57cec5SDimitry Andric } 58440b57cec5SDimitry Andric if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { 58458bcb0991SDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 58460b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 58470b57cec5SDimitry Andric .add(Src2); 58480b57cec5SDimitry Andric Src2.ChangeToRegister(Reg, false); 58490b57cec5SDimitry Andric } 58500b57cec5SDimitry Andric } 58510b57cec5SDimitry Andric 58520b57cec5SDimitry Andric // Find the one SGPR operand we are allowed to use. 58530b57cec5SDimitry Andric int ConstantBusLimit = ST.getConstantBusLimit(Opc); 58540b57cec5SDimitry Andric int LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; 58550b57cec5SDimitry Andric SmallDenseSet<unsigned> SGPRsUsed; 5856e8d8bef9SDimitry Andric Register SGPRReg = findUsedSGPR(MI, VOP3Idx); 5857bdd1243dSDimitry Andric if (SGPRReg) { 58580b57cec5SDimitry Andric SGPRsUsed.insert(SGPRReg); 58590b57cec5SDimitry Andric --ConstantBusLimit; 58600b57cec5SDimitry Andric } 58610b57cec5SDimitry Andric 58620eae32dcSDimitry Andric for (int Idx : VOP3Idx) { 58630b57cec5SDimitry Andric if (Idx == -1) 58640b57cec5SDimitry Andric break; 58650b57cec5SDimitry Andric MachineOperand &MO = MI.getOperand(Idx); 58660b57cec5SDimitry Andric 58670b57cec5SDimitry Andric if (!MO.isReg()) { 5868bdd1243dSDimitry Andric if (isInlineConstant(MO, get(Opc).operands()[Idx])) 58690b57cec5SDimitry Andric continue; 58700b57cec5SDimitry Andric 58710b57cec5SDimitry Andric if (LiteralLimit > 0 && ConstantBusLimit > 0) { 58720b57cec5SDimitry Andric --LiteralLimit; 58730b57cec5SDimitry Andric --ConstantBusLimit; 58740b57cec5SDimitry Andric continue; 58750b57cec5SDimitry Andric } 58760b57cec5SDimitry Andric 58770b57cec5SDimitry Andric --LiteralLimit; 58780b57cec5SDimitry Andric --ConstantBusLimit; 58790b57cec5SDimitry Andric legalizeOpWithMove(MI, Idx); 58800b57cec5SDimitry Andric continue; 58810b57cec5SDimitry Andric } 58820b57cec5SDimitry Andric 5883349cc55cSDimitry Andric if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && 58840b57cec5SDimitry Andric !isOperandLegal(MI, Idx, &MO)) { 58850b57cec5SDimitry Andric legalizeOpWithMove(MI, Idx); 58860b57cec5SDimitry Andric continue; 58870b57cec5SDimitry Andric } 58880b57cec5SDimitry Andric 5889349cc55cSDimitry Andric if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) 58900b57cec5SDimitry Andric continue; // VGPRs are legal 58910b57cec5SDimitry Andric 58920b57cec5SDimitry Andric // We can use one SGPR in each VOP3 instruction prior to GFX10 58930b57cec5SDimitry Andric // and two starting from GFX10. 58940b57cec5SDimitry Andric if (SGPRsUsed.count(MO.getReg())) 58950b57cec5SDimitry Andric continue; 58960b57cec5SDimitry Andric if (ConstantBusLimit > 0) { 58970b57cec5SDimitry Andric SGPRsUsed.insert(MO.getReg()); 58980b57cec5SDimitry Andric --ConstantBusLimit; 58990b57cec5SDimitry Andric continue; 59000b57cec5SDimitry Andric } 59010b57cec5SDimitry Andric 59020b57cec5SDimitry Andric // If we make it this far, then the operand is not legal and we must 59030b57cec5SDimitry Andric // legalize it. 59040b57cec5SDimitry Andric legalizeOpWithMove(MI, Idx); 59050b57cec5SDimitry Andric } 59065f757f3fSDimitry Andric 59075f757f3fSDimitry Andric // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst. 
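  // vdst is always a VGPR, so a non-VGPR src2 still has to be copied into a
  // VGPR even in the VOP3 form.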
59085f757f3fSDimitry Andric if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && 59095f757f3fSDimitry Andric !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) 59105f757f3fSDimitry Andric legalizeOpWithMove(MI, VOP3Idx[2]); 59110b57cec5SDimitry Andric } 59120b57cec5SDimitry Andric 59135ffd83dbSDimitry Andric Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 59140b57cec5SDimitry Andric MachineRegisterInfo &MRI) const { 59150b57cec5SDimitry Andric const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 59160b57cec5SDimitry Andric const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 59178bcb0991SDimitry Andric Register DstReg = MRI.createVirtualRegister(SRC); 59180b57cec5SDimitry Andric unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 59190b57cec5SDimitry Andric 59200b57cec5SDimitry Andric if (RI.hasAGPRs(VRC)) { 59210b57cec5SDimitry Andric VRC = RI.getEquivalentVGPRClass(VRC); 59228bcb0991SDimitry Andric Register NewSrcReg = MRI.createVirtualRegister(VRC); 59230b57cec5SDimitry Andric BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 59240b57cec5SDimitry Andric get(TargetOpcode::COPY), NewSrcReg) 59250b57cec5SDimitry Andric .addReg(SrcReg); 59260b57cec5SDimitry Andric SrcReg = NewSrcReg; 59270b57cec5SDimitry Andric } 59280b57cec5SDimitry Andric 59290b57cec5SDimitry Andric if (SubRegs == 1) { 59300b57cec5SDimitry Andric BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 59310b57cec5SDimitry Andric get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 59320b57cec5SDimitry Andric .addReg(SrcReg); 59330b57cec5SDimitry Andric return DstReg; 59340b57cec5SDimitry Andric } 59350b57cec5SDimitry Andric 5936bdd1243dSDimitry Andric SmallVector<Register, 8> SRegs; 59370b57cec5SDimitry Andric for (unsigned i = 0; i < SubRegs; ++i) { 59388bcb0991SDimitry Andric Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 59390b57cec5SDimitry Andric BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 59400b57cec5SDimitry Andric get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 59410b57cec5SDimitry Andric .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 59420b57cec5SDimitry Andric SRegs.push_back(SGPR); 59430b57cec5SDimitry Andric } 59440b57cec5SDimitry Andric 59450b57cec5SDimitry Andric MachineInstrBuilder MIB = 59460b57cec5SDimitry Andric BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 59470b57cec5SDimitry Andric get(AMDGPU::REG_SEQUENCE), DstReg); 59480b57cec5SDimitry Andric for (unsigned i = 0; i < SubRegs; ++i) { 59490b57cec5SDimitry Andric MIB.addReg(SRegs[i]); 59500b57cec5SDimitry Andric MIB.addImm(RI.getSubRegFromChannel(i)); 59510b57cec5SDimitry Andric } 59520b57cec5SDimitry Andric return DstReg; 59530b57cec5SDimitry Andric } 59540b57cec5SDimitry Andric 59550b57cec5SDimitry Andric void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 59560b57cec5SDimitry Andric MachineInstr &MI) const { 59570b57cec5SDimitry Andric 59580b57cec5SDimitry Andric // If the pointer is stored in VGPRs, then we need to move it to 59590b57cec5SDimitry Andric // SGPRs using v_readfirstlane. This is safe because we only select 59600b57cec5SDimitry Andric // loads with uniform pointers to SMRD instructions, so we know the 59610b57cec5SDimitry Andric // pointer value is uniform.
59620b57cec5SDimitry Andric MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 59630b57cec5SDimitry Andric if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 5964e8d8bef9SDimitry Andric Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 59650b57cec5SDimitry Andric SBase->setReg(SGPR); 59660b57cec5SDimitry Andric } 596781ad6265SDimitry Andric MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); 59680b57cec5SDimitry Andric if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 5969e8d8bef9SDimitry Andric Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 59700b57cec5SDimitry Andric SOff->setReg(SGPR); 59710b57cec5SDimitry Andric } 59720b57cec5SDimitry Andric } 59730b57cec5SDimitry Andric 5974fe6060f1SDimitry Andric bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { 5975fe6060f1SDimitry Andric unsigned Opc = Inst.getOpcode(); 5976fe6060f1SDimitry Andric int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 5977fe6060f1SDimitry Andric if (OldSAddrIdx < 0) 5978fe6060f1SDimitry Andric return false; 5979fe6060f1SDimitry Andric 5980fe6060f1SDimitry Andric assert(isSegmentSpecificFLAT(Inst)); 5981fe6060f1SDimitry Andric 5982fe6060f1SDimitry Andric int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); 5983fe6060f1SDimitry Andric if (NewOpc < 0) 5984fe6060f1SDimitry Andric NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); 5985fe6060f1SDimitry Andric if (NewOpc < 0) 5986fe6060f1SDimitry Andric return false; 5987fe6060f1SDimitry Andric 5988fe6060f1SDimitry Andric MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); 5989fe6060f1SDimitry Andric MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); 5990fe6060f1SDimitry Andric if (RI.isSGPRReg(MRI, SAddr.getReg())) 5991fe6060f1SDimitry Andric return false; 5992fe6060f1SDimitry Andric 5993fe6060f1SDimitry Andric int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); 5994fe6060f1SDimitry Andric if (NewVAddrIdx < 0) 5995fe6060f1SDimitry Andric return false; 5996fe6060f1SDimitry Andric 5997fe6060f1SDimitry Andric int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 5998fe6060f1SDimitry Andric 5999fe6060f1SDimitry Andric // Check vaddr, it shall be zero or absent. 6000fe6060f1SDimitry Andric MachineInstr *VAddrDef = nullptr; 6001fe6060f1SDimitry Andric if (OldVAddrIdx >= 0) { 6002fe6060f1SDimitry Andric MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); 6003fe6060f1SDimitry Andric VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); 6004fe6060f1SDimitry Andric if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || 6005fe6060f1SDimitry Andric !VAddrDef->getOperand(1).isImm() || 6006fe6060f1SDimitry Andric VAddrDef->getOperand(1).getImm() != 0) 6007fe6060f1SDimitry Andric return false; 6008fe6060f1SDimitry Andric } 6009fe6060f1SDimitry Andric 6010fe6060f1SDimitry Andric const MCInstrDesc &NewDesc = get(NewOpc); 6011fe6060f1SDimitry Andric Inst.setDesc(NewDesc); 6012fe6060f1SDimitry Andric 601381ad6265SDimitry Andric // Callers expect iterator to be valid after this call, so modify the 6014fe6060f1SDimitry Andric // instruction in place. 6015fe6060f1SDimitry Andric if (OldVAddrIdx == NewVAddrIdx) { 6016fe6060f1SDimitry Andric MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); 6017fe6060f1SDimitry Andric // Clear use list from the old vaddr holding a zero register. 
6018fe6060f1SDimitry Andric MRI.removeRegOperandFromUseList(&NewVAddr); 6019fe6060f1SDimitry Andric MRI.moveOperands(&NewVAddr, &SAddr, 1); 602081ad6265SDimitry Andric Inst.removeOperand(OldSAddrIdx); 6021fe6060f1SDimitry Andric // Update the use list with the pointer we have just moved from vaddr to 602281ad6265SDimitry Andric // the saddr position. Otherwise the new vaddr will be missing from the use list. 6023fe6060f1SDimitry Andric MRI.removeRegOperandFromUseList(&NewVAddr); 6024fe6060f1SDimitry Andric MRI.addRegOperandToUseList(&NewVAddr); 6025fe6060f1SDimitry Andric } else { 6026fe6060f1SDimitry Andric assert(OldSAddrIdx == NewVAddrIdx); 6027fe6060f1SDimitry Andric 6028fe6060f1SDimitry Andric if (OldVAddrIdx >= 0) { 6029fe6060f1SDimitry Andric int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, 6030fe6060f1SDimitry Andric AMDGPU::OpName::vdst_in); 6031fe6060f1SDimitry Andric 603281ad6265SDimitry Andric // removeOperand doesn't try to fix up tied operand indexes as it goes, so 6033fe6060f1SDimitry Andric // it asserts. Untie the operands for now and retie them afterwards. 6034fe6060f1SDimitry Andric if (NewVDstIn != -1) { 6035fe6060f1SDimitry Andric int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); 6036fe6060f1SDimitry Andric Inst.untieRegOperand(OldVDstIn); 6037fe6060f1SDimitry Andric } 6038fe6060f1SDimitry Andric 603981ad6265SDimitry Andric Inst.removeOperand(OldVAddrIdx); 6040fe6060f1SDimitry Andric 6041fe6060f1SDimitry Andric if (NewVDstIn != -1) { 6042fe6060f1SDimitry Andric int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 6043fe6060f1SDimitry Andric Inst.tieOperands(NewVDst, NewVDstIn); 6044fe6060f1SDimitry Andric } 6045fe6060f1SDimitry Andric } 6046fe6060f1SDimitry Andric } 6047fe6060f1SDimitry Andric 6048fe6060f1SDimitry Andric if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) 6049fe6060f1SDimitry Andric VAddrDef->eraseFromParent(); 6050fe6060f1SDimitry Andric 6051fe6060f1SDimitry Andric return true; 6052fe6060f1SDimitry Andric } 6053fe6060f1SDimitry Andric 6054e8d8bef9SDimitry Andric // FIXME: Remove this when SelectionDAG is obsoleted. 6055e8d8bef9SDimitry Andric void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, 6056e8d8bef9SDimitry Andric MachineInstr &MI) const { 6057e8d8bef9SDimitry Andric if (!isSegmentSpecificFLAT(MI)) 6058e8d8bef9SDimitry Andric return; 6059e8d8bef9SDimitry Andric 6060e8d8bef9SDimitry Andric // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence 6061e8d8bef9SDimitry Andric // analysis thinks they are uniform, so a readfirstlane should be valid.
6062e8d8bef9SDimitry Andric MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); 6063e8d8bef9SDimitry Andric if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) 6064e8d8bef9SDimitry Andric return; 6065e8d8bef9SDimitry Andric 6066fe6060f1SDimitry Andric if (moveFlatAddrToVGPR(MI)) 6067fe6060f1SDimitry Andric return; 6068fe6060f1SDimitry Andric 6069e8d8bef9SDimitry Andric Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); 6070e8d8bef9SDimitry Andric SAddr->setReg(ToSGPR); 6071e8d8bef9SDimitry Andric } 6072e8d8bef9SDimitry Andric 60730b57cec5SDimitry Andric void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 60740b57cec5SDimitry Andric MachineBasicBlock::iterator I, 60750b57cec5SDimitry Andric const TargetRegisterClass *DstRC, 60760b57cec5SDimitry Andric MachineOperand &Op, 60770b57cec5SDimitry Andric MachineRegisterInfo &MRI, 60780b57cec5SDimitry Andric const DebugLoc &DL) const { 60798bcb0991SDimitry Andric Register OpReg = Op.getReg(); 60800b57cec5SDimitry Andric unsigned OpSubReg = Op.getSubReg(); 60810b57cec5SDimitry Andric 60820b57cec5SDimitry Andric const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 60830b57cec5SDimitry Andric RI.getRegClassForReg(MRI, OpReg), OpSubReg); 60840b57cec5SDimitry Andric 60850b57cec5SDimitry Andric // Check if operand is already the correct register class. 60860b57cec5SDimitry Andric if (DstRC == OpRC) 60870b57cec5SDimitry Andric return; 60880b57cec5SDimitry Andric 60898bcb0991SDimitry Andric Register DstReg = MRI.createVirtualRegister(DstRC); 6090349cc55cSDimitry Andric auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 60910b57cec5SDimitry Andric 60920b57cec5SDimitry Andric Op.setReg(DstReg); 60930b57cec5SDimitry Andric Op.setSubReg(0); 60940b57cec5SDimitry Andric 60950b57cec5SDimitry Andric MachineInstr *Def = MRI.getVRegDef(OpReg); 60960b57cec5SDimitry Andric if (!Def) 60970b57cec5SDimitry Andric return; 60980b57cec5SDimitry Andric 60990b57cec5SDimitry Andric // Try to eliminate the copy if it is copying an immediate value. 61008bcb0991SDimitry Andric if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) 61010b57cec5SDimitry Andric FoldImmediate(*Copy, *Def, OpReg, &MRI); 61028bcb0991SDimitry Andric 61038bcb0991SDimitry Andric bool ImpDef = Def->isImplicitDef(); 61048bcb0991SDimitry Andric while (!ImpDef && Def && Def->isCopy()) { 61058bcb0991SDimitry Andric if (Def->getOperand(1).getReg().isPhysical()) 61068bcb0991SDimitry Andric break; 61078bcb0991SDimitry Andric Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); 61088bcb0991SDimitry Andric ImpDef = Def && Def->isImplicitDef(); 61098bcb0991SDimitry Andric } 61108bcb0991SDimitry Andric if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && 61118bcb0991SDimitry Andric !ImpDef) 6112349cc55cSDimitry Andric Copy.addReg(AMDGPU::EXEC, RegState::Implicit); 61130b57cec5SDimitry Andric } 61140b57cec5SDimitry Andric 61150b57cec5SDimitry Andric // Emit the actual waterfall loop, executing the wrapped instruction for each 611606c3fb27SDimitry Andric // unique value of \p ScalarOps across all lanes. In the best case we execute 1 61170b57cec5SDimitry Andric // iteration, in the worst case we execute 64 (once per lane). 
611806c3fb27SDimitry Andric static void emitLoadScalarOpsFromVGPRLoop( 611906c3fb27SDimitry Andric const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, 612006c3fb27SDimitry Andric MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, 612106c3fb27SDimitry Andric ArrayRef<MachineOperand *> ScalarOps) { 61220b57cec5SDimitry Andric MachineFunction &MF = *OrigBB.getParent(); 61230b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 61240b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 61250b57cec5SDimitry Andric unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 61260b57cec5SDimitry Andric unsigned SaveExecOpc = 61270b57cec5SDimitry Andric ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 61280b57cec5SDimitry Andric unsigned XorTermOpc = 61290b57cec5SDimitry Andric ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 61300b57cec5SDimitry Andric unsigned AndOpc = 61310b57cec5SDimitry Andric ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 61320b57cec5SDimitry Andric const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 61330b57cec5SDimitry Andric 61340b57cec5SDimitry Andric MachineBasicBlock::iterator I = LoopBB.begin(); 61350b57cec5SDimitry Andric 6136e8d8bef9SDimitry Andric SmallVector<Register, 8> ReadlanePieces; 6137bdd1243dSDimitry Andric Register CondReg; 6138e8d8bef9SDimitry Andric 613906c3fb27SDimitry Andric for (MachineOperand *ScalarOp : ScalarOps) { 614006c3fb27SDimitry Andric unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); 6141e8d8bef9SDimitry Andric unsigned NumSubRegs = RegSize / 32; 614206c3fb27SDimitry Andric Register VScalarOp = ScalarOp->getReg(); 614306c3fb27SDimitry Andric 614406c3fb27SDimitry Andric if (NumSubRegs == 1) { 614506c3fb27SDimitry Andric Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 614606c3fb27SDimitry Andric 614706c3fb27SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg) 614806c3fb27SDimitry Andric .addReg(VScalarOp); 614906c3fb27SDimitry Andric 615006c3fb27SDimitry Andric Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 615106c3fb27SDimitry Andric 615206c3fb27SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg) 615306c3fb27SDimitry Andric .addReg(CurReg) 615406c3fb27SDimitry Andric .addReg(VScalarOp); 615506c3fb27SDimitry Andric 615606c3fb27SDimitry Andric // Combine the comparison results with AND. 615706c3fb27SDimitry Andric if (!CondReg) // First. 615806c3fb27SDimitry Andric CondReg = NewCondReg; 615906c3fb27SDimitry Andric else { // If not the first, we create an AND. 616006c3fb27SDimitry Andric Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 616106c3fb27SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 616206c3fb27SDimitry Andric .addReg(CondReg) 616306c3fb27SDimitry Andric .addReg(NewCondReg); 616406c3fb27SDimitry Andric CondReg = AndReg; 616506c3fb27SDimitry Andric } 616606c3fb27SDimitry Andric 616706c3fb27SDimitry Andric // Update ScalarOp operand to use the SGPR ScalarOp. 
616806c3fb27SDimitry Andric ScalarOp->setReg(CurReg); 616906c3fb27SDimitry Andric ScalarOp->setIsKill(); 617006c3fb27SDimitry Andric } else { 617106c3fb27SDimitry Andric unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); 617206c3fb27SDimitry Andric assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && 617306c3fb27SDimitry Andric "Unhandled register size"); 61740b57cec5SDimitry Andric 6175e8d8bef9SDimitry Andric for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { 6176e8d8bef9SDimitry Andric Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 6177e8d8bef9SDimitry Andric Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 6178e8d8bef9SDimitry Andric 6179e8d8bef9SDimitry Andric // Read the next variant <- also loop target. 6180e8d8bef9SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) 618106c3fb27SDimitry Andric .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx)); 6182e8d8bef9SDimitry Andric 6183e8d8bef9SDimitry Andric // Read the next variant <- also loop target. 6184e8d8bef9SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) 618506c3fb27SDimitry Andric .addReg(VScalarOp, VScalarOpUndef, 618606c3fb27SDimitry Andric TRI->getSubRegFromChannel(Idx + 1)); 6187e8d8bef9SDimitry Andric 6188e8d8bef9SDimitry Andric ReadlanePieces.push_back(CurRegLo); 6189e8d8bef9SDimitry Andric ReadlanePieces.push_back(CurRegHi); 6190e8d8bef9SDimitry Andric 6191e8d8bef9SDimitry Andric // Comparison is to be done as 64-bit. 6192e8d8bef9SDimitry Andric Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 6193e8d8bef9SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) 6194e8d8bef9SDimitry Andric .addReg(CurRegLo) 61950b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 6196e8d8bef9SDimitry Andric .addReg(CurRegHi) 6197e8d8bef9SDimitry Andric .addImm(AMDGPU::sub1); 6198e8d8bef9SDimitry Andric 6199e8d8bef9SDimitry Andric Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 620006c3fb27SDimitry Andric auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), 620106c3fb27SDimitry Andric NewCondReg) 6202e8d8bef9SDimitry Andric .addReg(CurReg); 6203e8d8bef9SDimitry Andric if (NumSubRegs <= 2) 620406c3fb27SDimitry Andric Cmp.addReg(VScalarOp); 6205e8d8bef9SDimitry Andric else 620606c3fb27SDimitry Andric Cmp.addReg(VScalarOp, VScalarOpUndef, 620706c3fb27SDimitry Andric TRI->getSubRegFromChannel(Idx, 2)); 6208e8d8bef9SDimitry Andric 620981ad6265SDimitry Andric // Combine the comparison results with AND. 6210bdd1243dSDimitry Andric if (!CondReg) // First. 6211e8d8bef9SDimitry Andric CondReg = NewCondReg; 6212e8d8bef9SDimitry Andric else { // If not the first, we create an AND. 6213e8d8bef9SDimitry Andric Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 6214e8d8bef9SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 6215e8d8bef9SDimitry Andric .addReg(CondReg) 6216e8d8bef9SDimitry Andric .addReg(NewCondReg); 6217e8d8bef9SDimitry Andric CondReg = AndReg; 6218e8d8bef9SDimitry Andric } 6219e8d8bef9SDimitry Andric } // End for loop. 6220e8d8bef9SDimitry Andric 622106c3fb27SDimitry Andric auto SScalarOpRC = 622206c3fb27SDimitry Andric TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp)); 622306c3fb27SDimitry Andric Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC); 6224e8d8bef9SDimitry Andric 622506c3fb27SDimitry Andric // Build scalar ScalarOp. 
622606c3fb27SDimitry Andric auto Merge = 622706c3fb27SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp); 6228e8d8bef9SDimitry Andric unsigned Channel = 0; 6229e8d8bef9SDimitry Andric for (Register Piece : ReadlanePieces) { 623006c3fb27SDimitry Andric Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++)); 6231e8d8bef9SDimitry Andric } 62320b57cec5SDimitry Andric 623306c3fb27SDimitry Andric // Update ScalarOp operand to use the SGPR ScalarOp. 623406c3fb27SDimitry Andric ScalarOp->setReg(SScalarOp); 623506c3fb27SDimitry Andric ScalarOp->setIsKill(); 623606c3fb27SDimitry Andric } 623706c3fb27SDimitry Andric } 62380b57cec5SDimitry Andric 6239e8d8bef9SDimitry Andric Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 6240e8d8bef9SDimitry Andric MRI.setSimpleHint(SaveExec, CondReg); 62410b57cec5SDimitry Andric 62420b57cec5SDimitry Andric // Update EXEC to matching lanes, saving original to SaveExec. 62430b57cec5SDimitry Andric BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) 6244e8d8bef9SDimitry Andric .addReg(CondReg, RegState::Kill); 62450b57cec5SDimitry Andric 62460b57cec5SDimitry Andric // The original instruction is here; we insert the terminators after it. 624781ad6265SDimitry Andric I = BodyBB.end(); 62480b57cec5SDimitry Andric 62490b57cec5SDimitry Andric // Update EXEC, switch all done bits to 0 and all todo bits to 1. 625081ad6265SDimitry Andric BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) 62510b57cec5SDimitry Andric .addReg(Exec) 62520b57cec5SDimitry Andric .addReg(SaveExec); 6253e8d8bef9SDimitry Andric 625481ad6265SDimitry Andric BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); 62550b57cec5SDimitry Andric } 62560b57cec5SDimitry Andric 625706c3fb27SDimitry Andric // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register 62580b57cec5SDimitry Andric // with SGPRs by iterating over all unique values across all lanes. 6259e8d8bef9SDimitry Andric // Returns the loop basic block that now contains \p MI. 6260e8d8bef9SDimitry Andric static MachineBasicBlock * 626106c3fb27SDimitry Andric loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 626206c3fb27SDimitry Andric ArrayRef<MachineOperand *> ScalarOps, 626306c3fb27SDimitry Andric MachineDominatorTree *MDT, 6264e8d8bef9SDimitry Andric MachineBasicBlock::iterator Begin = nullptr, 6265e8d8bef9SDimitry Andric MachineBasicBlock::iterator End = nullptr) { 62660b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 62670b57cec5SDimitry Andric MachineFunction &MF = *MBB.getParent(); 62680b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 62690b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 62700b57cec5SDimitry Andric MachineRegisterInfo &MRI = MF.getRegInfo(); 6271e8d8bef9SDimitry Andric if (!Begin.isValid()) 6272e8d8bef9SDimitry Andric Begin = &MI; 6273e8d8bef9SDimitry Andric if (!End.isValid()) { 6274e8d8bef9SDimitry Andric End = &MI; 6275e8d8bef9SDimitry Andric ++End; 6276e8d8bef9SDimitry Andric } 62770b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 62780b57cec5SDimitry Andric unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 62790b57cec5SDimitry Andric unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 62800b57cec5SDimitry Andric const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 62810b57cec5SDimitry Andric 62825f757f3fSDimitry Andric // Save SCC. 
The waterfall loop may overwrite SCC. 62835f757f3fSDimitry Andric Register SaveSCCReg; 62845f757f3fSDimitry Andric bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) != 62855f757f3fSDimitry Andric MachineBasicBlock::LQR_Dead); 62865f757f3fSDimitry Andric if (SCCNotDead) { 62875f757f3fSDimitry Andric SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 62885f757f3fSDimitry Andric BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg) 62895f757f3fSDimitry Andric .addImm(1) 62905f757f3fSDimitry Andric .addImm(0); 62915f757f3fSDimitry Andric } 62925f757f3fSDimitry Andric 62938bcb0991SDimitry Andric Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 62940b57cec5SDimitry Andric 62950b57cec5SDimitry Andric // Save the EXEC mask 6296e8d8bef9SDimitry Andric BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 62970b57cec5SDimitry Andric 62980b57cec5SDimitry Andric // Killed uses in the instruction we are waterfalling around will be 62990b57cec5SDimitry Andric // incorrect due to the added control-flow. 6300e8d8bef9SDimitry Andric MachineBasicBlock::iterator AfterMI = MI; 6301e8d8bef9SDimitry Andric ++AfterMI; 6302e8d8bef9SDimitry Andric for (auto I = Begin; I != AfterMI; I++) { 630306c3fb27SDimitry Andric for (auto &MO : I->all_uses()) 63040b57cec5SDimitry Andric MRI.clearKillFlags(MO.getReg()); 63050b57cec5SDimitry Andric } 63060b57cec5SDimitry Andric 63070b57cec5SDimitry Andric // To insert the loop we need to split the block. Move everything after this 63080b57cec5SDimitry Andric // point to a new block, and insert a new empty block between the two. 63090b57cec5SDimitry Andric MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 631081ad6265SDimitry Andric MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); 63110b57cec5SDimitry Andric MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 63120b57cec5SDimitry Andric MachineFunction::iterator MBBI(MBB); 63130b57cec5SDimitry Andric ++MBBI; 63140b57cec5SDimitry Andric 63150b57cec5SDimitry Andric MF.insert(MBBI, LoopBB); 631681ad6265SDimitry Andric MF.insert(MBBI, BodyBB); 63170b57cec5SDimitry Andric MF.insert(MBBI, RemainderBB); 63180b57cec5SDimitry Andric 631981ad6265SDimitry Andric LoopBB->addSuccessor(BodyBB); 632081ad6265SDimitry Andric BodyBB->addSuccessor(LoopBB); 632181ad6265SDimitry Andric BodyBB->addSuccessor(RemainderBB); 63220b57cec5SDimitry Andric 632381ad6265SDimitry Andric // Move the range from Begin to MI into the BodyBB, and the remainder of the 6324e8d8bef9SDimitry Andric // block to RemainderBB. 63250b57cec5SDimitry Andric RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 6326e8d8bef9SDimitry Andric RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); 632781ad6265SDimitry Andric BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); 63280b57cec5SDimitry Andric 63290b57cec5SDimitry Andric MBB.addSuccessor(LoopBB); 63300b57cec5SDimitry Andric 63310b57cec5SDimitry Andric // Update dominators. We know that MBB immediately dominates LoopBB, that 633281ad6265SDimitry Andric // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates 633381ad6265SDimitry Andric // RemainderBB. RemainderBB immediately dominates all of the successors 633481ad6265SDimitry Andric // transferred to it from MBB that MBB used to properly dominate.
63350b57cec5SDimitry Andric if (MDT) { 63360b57cec5SDimitry Andric MDT->addNewBlock(LoopBB, &MBB); 633781ad6265SDimitry Andric MDT->addNewBlock(BodyBB, LoopBB); 633881ad6265SDimitry Andric MDT->addNewBlock(RemainderBB, BodyBB); 63390b57cec5SDimitry Andric for (auto &Succ : RemainderBB->successors()) { 6340480093f4SDimitry Andric if (MDT->properlyDominates(&MBB, Succ)) { 63410b57cec5SDimitry Andric MDT->changeImmediateDominator(Succ, RemainderBB); 63420b57cec5SDimitry Andric } 63430b57cec5SDimitry Andric } 63440b57cec5SDimitry Andric } 63450b57cec5SDimitry Andric 634606c3fb27SDimitry Andric emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); 63470b57cec5SDimitry Andric 63480b57cec5SDimitry Andric MachineBasicBlock::iterator First = RemainderBB->begin(); 63495f757f3fSDimitry Andric // Restore SCC 63505f757f3fSDimitry Andric if (SCCNotDead) { 63515f757f3fSDimitry Andric BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32)) 63525f757f3fSDimitry Andric .addReg(SaveSCCReg, RegState::Kill) 63535f757f3fSDimitry Andric .addImm(0); 63545f757f3fSDimitry Andric } 63555f757f3fSDimitry Andric 63565f757f3fSDimitry Andric // Restore the EXEC mask 63570b57cec5SDimitry Andric BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 635881ad6265SDimitry Andric return BodyBB; 63590b57cec5SDimitry Andric } 63600b57cec5SDimitry Andric 63610b57cec5SDimitry Andric // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 63620b57cec5SDimitry Andric static std::tuple<unsigned, unsigned> 63630b57cec5SDimitry Andric extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 63640b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 63650b57cec5SDimitry Andric MachineFunction &MF = *MBB.getParent(); 63660b57cec5SDimitry Andric MachineRegisterInfo &MRI = MF.getRegInfo(); 63670b57cec5SDimitry Andric 63680b57cec5SDimitry Andric // Extract the ptr from the resource descriptor. 
63690b57cec5SDimitry Andric unsigned RsrcPtr = 63700b57cec5SDimitry Andric TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 63710b57cec5SDimitry Andric AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 63720b57cec5SDimitry Andric 63730b57cec5SDimitry Andric // Create an empty resource descriptor 63748bcb0991SDimitry Andric Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 63758bcb0991SDimitry Andric Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 63768bcb0991SDimitry Andric Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 63778bcb0991SDimitry Andric Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 63780b57cec5SDimitry Andric uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 63790b57cec5SDimitry Andric 63800b57cec5SDimitry Andric // Zero64 = 0 63810b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 63820b57cec5SDimitry Andric .addImm(0); 63830b57cec5SDimitry Andric 63840b57cec5SDimitry Andric // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 63850b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 63860b57cec5SDimitry Andric .addImm(RsrcDataFormat & 0xFFFFFFFF); 63870b57cec5SDimitry Andric 63880b57cec5SDimitry Andric // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 63890b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 63900b57cec5SDimitry Andric .addImm(RsrcDataFormat >> 32); 63910b57cec5SDimitry Andric 63920b57cec5SDimitry Andric // NewSRsrc = {Zero64, SRsrcFormat} 63930b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) 63940b57cec5SDimitry Andric .addReg(Zero64) 63950b57cec5SDimitry Andric .addImm(AMDGPU::sub0_sub1) 63960b57cec5SDimitry Andric .addReg(SRsrcFormatLo) 63970b57cec5SDimitry Andric .addImm(AMDGPU::sub2) 63980b57cec5SDimitry Andric .addReg(SRsrcFormatHi) 63990b57cec5SDimitry Andric .addImm(AMDGPU::sub3); 64000b57cec5SDimitry Andric 6401bdd1243dSDimitry Andric return std::tuple(RsrcPtr, NewSRsrc); 64020b57cec5SDimitry Andric } 64030b57cec5SDimitry Andric 6404e8d8bef9SDimitry Andric MachineBasicBlock * 6405e8d8bef9SDimitry Andric SIInstrInfo::legalizeOperands(MachineInstr &MI, 64060b57cec5SDimitry Andric MachineDominatorTree *MDT) const { 64070b57cec5SDimitry Andric MachineFunction &MF = *MI.getParent()->getParent(); 64080b57cec5SDimitry Andric MachineRegisterInfo &MRI = MF.getRegInfo(); 6409e8d8bef9SDimitry Andric MachineBasicBlock *CreatedBB = nullptr; 64100b57cec5SDimitry Andric 64110b57cec5SDimitry Andric // Legalize VOP2 64120b57cec5SDimitry Andric if (isVOP2(MI) || isVOPC(MI)) { 64130b57cec5SDimitry Andric legalizeOperandsVOP2(MRI, MI); 6414e8d8bef9SDimitry Andric return CreatedBB; 64150b57cec5SDimitry Andric } 64160b57cec5SDimitry Andric 64170b57cec5SDimitry Andric // Legalize VOP3 64180b57cec5SDimitry Andric if (isVOP3(MI)) { 64190b57cec5SDimitry Andric legalizeOperandsVOP3(MRI, MI); 6420e8d8bef9SDimitry Andric return CreatedBB; 64210b57cec5SDimitry Andric } 64220b57cec5SDimitry Andric 64230b57cec5SDimitry Andric // Legalize SMRD 64240b57cec5SDimitry Andric if (isSMRD(MI)) { 64250b57cec5SDimitry Andric legalizeOperandsSMRD(MRI, MI); 6426e8d8bef9SDimitry Andric return CreatedBB; 6427e8d8bef9SDimitry Andric } 6428e8d8bef9SDimitry Andric 6429e8d8bef9SDimitry Andric // Legalize FLAT 6430e8d8bef9SDimitry Andric if (isFLAT(MI)) { 6431e8d8bef9SDimitry Andric 
legalizeOperandsFLAT(MRI, MI); 6432e8d8bef9SDimitry Andric return CreatedBB; 64330b57cec5SDimitry Andric } 64340b57cec5SDimitry Andric 64350b57cec5SDimitry Andric // Legalize REG_SEQUENCE and PHI 64360b57cec5SDimitry Andric // The register class of the operands must be the same type as the register 64370b57cec5SDimitry Andric // class of the output. 64380b57cec5SDimitry Andric if (MI.getOpcode() == AMDGPU::PHI) { 64390b57cec5SDimitry Andric const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 64400b57cec5SDimitry Andric for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 6441e8d8bef9SDimitry Andric if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) 64420b57cec5SDimitry Andric continue; 64430b57cec5SDimitry Andric const TargetRegisterClass *OpRC = 64440b57cec5SDimitry Andric MRI.getRegClass(MI.getOperand(i).getReg()); 64450b57cec5SDimitry Andric if (RI.hasVectorRegisters(OpRC)) { 64460b57cec5SDimitry Andric VRC = OpRC; 64470b57cec5SDimitry Andric } else { 64480b57cec5SDimitry Andric SRC = OpRC; 64490b57cec5SDimitry Andric } 64500b57cec5SDimitry Andric } 64510b57cec5SDimitry Andric 64520b57cec5SDimitry Andric // If any of the operands are VGPR registers, then they all must be 64530b57cec5SDimitry Andric // VGPRs, otherwise we will create illegal VGPR->SGPR copies when legalizing 64540b57cec5SDimitry Andric // them. 64550b57cec5SDimitry Andric if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 64560b57cec5SDimitry Andric if (!VRC) { 64570b57cec5SDimitry Andric assert(SRC); 64588bcb0991SDimitry Andric if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { 64598bcb0991SDimitry Andric VRC = &AMDGPU::VReg_1RegClass; 64608bcb0991SDimitry Andric } else 64614824e7fdSDimitry Andric VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 64628bcb0991SDimitry Andric ? RI.getEquivalentAGPRClass(SRC) 64630b57cec5SDimitry Andric : RI.getEquivalentVGPRClass(SRC); 64648bcb0991SDimitry Andric } else { 64654824e7fdSDimitry Andric VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 64668bcb0991SDimitry Andric ? RI.getEquivalentAGPRClass(VRC) 64678bcb0991SDimitry Andric : RI.getEquivalentVGPRClass(VRC); 64680b57cec5SDimitry Andric } 64690b57cec5SDimitry Andric RC = VRC; 64700b57cec5SDimitry Andric } else { 64710b57cec5SDimitry Andric RC = SRC; 64720b57cec5SDimitry Andric } 64730b57cec5SDimitry Andric 64740b57cec5SDimitry Andric // Update all the operands so they have the same type. 64750b57cec5SDimitry Andric for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 64760b57cec5SDimitry Andric MachineOperand &Op = MI.getOperand(I); 6477e8d8bef9SDimitry Andric if (!Op.isReg() || !Op.getReg().isVirtual()) 64780b57cec5SDimitry Andric continue; 64790b57cec5SDimitry Andric 64800b57cec5SDimitry Andric // MI is a PHI instruction. 64810b57cec5SDimitry Andric MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 64820b57cec5SDimitry Andric MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 64830b57cec5SDimitry Andric 64840b57cec5SDimitry Andric // Avoid creating no-op copies with the same src and dst reg class. These 64850b57cec5SDimitry Andric // confuse some of the machine passes.
64860b57cec5SDimitry Andric legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 64870b57cec5SDimitry Andric } 64880b57cec5SDimitry Andric } 64890b57cec5SDimitry Andric 64900b57cec5SDimitry Andric // REG_SEQUENCE doesn't really require operand legalization, but if one has a 64910b57cec5SDimitry Andric // VGPR dest type and SGPR sources, insert copies so all operands are 64920b57cec5SDimitry Andric // VGPRs. This seems to help operand folding / the register coalescer. 64930b57cec5SDimitry Andric if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 64940b57cec5SDimitry Andric MachineBasicBlock *MBB = MI.getParent(); 64950b57cec5SDimitry Andric const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 64960b57cec5SDimitry Andric if (RI.hasVGPRs(DstRC)) { 64970b57cec5SDimitry Andric // Update all the operands so they are VGPR register classes. These may 64980b57cec5SDimitry Andric // not be the same register class because REG_SEQUENCE supports mixing 64990b57cec5SDimitry Andric // subregister index types e.g. sub0_sub1 + sub2 + sub3 65000b57cec5SDimitry Andric for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 65010b57cec5SDimitry Andric MachineOperand &Op = MI.getOperand(I); 6502e8d8bef9SDimitry Andric if (!Op.isReg() || !Op.getReg().isVirtual()) 65030b57cec5SDimitry Andric continue; 65040b57cec5SDimitry Andric 65050b57cec5SDimitry Andric const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 65060b57cec5SDimitry Andric const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 65070b57cec5SDimitry Andric if (VRC == OpRC) 65080b57cec5SDimitry Andric continue; 65090b57cec5SDimitry Andric 65100b57cec5SDimitry Andric legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 65110b57cec5SDimitry Andric Op.setIsKill(); 65120b57cec5SDimitry Andric } 65130b57cec5SDimitry Andric } 65140b57cec5SDimitry Andric 6515e8d8bef9SDimitry Andric return CreatedBB; 65160b57cec5SDimitry Andric } 65170b57cec5SDimitry Andric 65180b57cec5SDimitry Andric // Legalize INSERT_SUBREG 65190b57cec5SDimitry Andric // src0 must have the same register class as dst 65200b57cec5SDimitry Andric if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 65218bcb0991SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 65228bcb0991SDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 65230b57cec5SDimitry Andric const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 65240b57cec5SDimitry Andric const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 65250b57cec5SDimitry Andric if (DstRC != Src0RC) { 65260b57cec5SDimitry Andric MachineBasicBlock *MBB = MI.getParent(); 65270b57cec5SDimitry Andric MachineOperand &Op = MI.getOperand(1); 65280b57cec5SDimitry Andric legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 65290b57cec5SDimitry Andric } 6530e8d8bef9SDimitry Andric return CreatedBB; 65310b57cec5SDimitry Andric } 65320b57cec5SDimitry Andric 65330b57cec5SDimitry Andric // Legalize SI_INIT_M0 65340b57cec5SDimitry Andric if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 65350b57cec5SDimitry Andric MachineOperand &Src = MI.getOperand(0); 65360b57cec5SDimitry Andric if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 65370b57cec5SDimitry Andric Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 6538e8d8bef9SDimitry Andric return CreatedBB; 65390b57cec5SDimitry Andric } 65400b57cec5SDimitry Andric 65415f757f3fSDimitry Andric // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM 65425f757f3fSDimitry Andric if (MI.getOpcode() == 
AMDGPU::S_BITREPLICATE_B64_B32 || 65435f757f3fSDimitry Andric MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || 65445f757f3fSDimitry Andric MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || 65455f757f3fSDimitry Andric MI.getOpcode() == AMDGPU::S_WQM_B32 || 65465f757f3fSDimitry Andric MI.getOpcode() == AMDGPU::S_WQM_B64) { 65475f757f3fSDimitry Andric MachineOperand &Src = MI.getOperand(1); 65485f757f3fSDimitry Andric if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 65495f757f3fSDimitry Andric Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 65505f757f3fSDimitry Andric return CreatedBB; 65515f757f3fSDimitry Andric } 65525f757f3fSDimitry Andric 65535f757f3fSDimitry Andric // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders. 65540b57cec5SDimitry Andric // 65550b57cec5SDimitry Andric // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 65560b57cec5SDimitry Andric // scratch memory access. In both cases, the legalization never involves 65570b57cec5SDimitry Andric // conversion to the addr64 form. 65585f757f3fSDimitry Andric if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && 65590b57cec5SDimitry Andric (isMUBUF(MI) || isMTBUF(MI)))) { 65605f757f3fSDimitry Andric int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc 65615f757f3fSDimitry Andric : AMDGPU::OpName::srsrc; 65625f757f3fSDimitry Andric MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName); 6563e8d8bef9SDimitry Andric if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) 656406c3fb27SDimitry Andric CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); 65650b57cec5SDimitry Andric 65665f757f3fSDimitry Andric int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; 65675f757f3fSDimitry Andric MachineOperand *SSamp = getNamedOperand(MI, SampOpName); 6568e8d8bef9SDimitry Andric if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) 656906c3fb27SDimitry Andric CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); 6570e8d8bef9SDimitry Andric 6571e8d8bef9SDimitry Andric return CreatedBB; 65720b57cec5SDimitry Andric } 6573e8d8bef9SDimitry Andric 6574e8d8bef9SDimitry Andric // Legalize SI_CALL 6575e8d8bef9SDimitry Andric if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { 6576e8d8bef9SDimitry Andric MachineOperand *Dest = &MI.getOperand(0); 6577e8d8bef9SDimitry Andric if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { 6578e8d8bef9SDimitry Andric // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as 6579e8d8bef9SDimitry Andric // the following copies, into the loop block; we also need to move copies 6580e8d8bef9SDimitry Andric // from and to physical registers.
6581e8d8bef9SDimitry Andric unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); 6582e8d8bef9SDimitry Andric unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); 6583e8d8bef9SDimitry Andric 6584e8d8bef9SDimitry Andric // Also move the copies to physical registers into the loop block 6585e8d8bef9SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 6586e8d8bef9SDimitry Andric MachineBasicBlock::iterator Start(&MI); 6587e8d8bef9SDimitry Andric while (Start->getOpcode() != FrameSetupOpcode) 6588e8d8bef9SDimitry Andric --Start; 6589e8d8bef9SDimitry Andric MachineBasicBlock::iterator End(&MI); 6590e8d8bef9SDimitry Andric while (End->getOpcode() != FrameDestroyOpcode) 6591e8d8bef9SDimitry Andric ++End; 6592e8d8bef9SDimitry Andric // Also include following copies of the return value 6593e8d8bef9SDimitry Andric ++End; 6594e8d8bef9SDimitry Andric while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && 6595e8d8bef9SDimitry Andric MI.definesRegister(End->getOperand(1).getReg())) 6596e8d8bef9SDimitry Andric ++End; 659706c3fb27SDimitry Andric CreatedBB = 659806c3fb27SDimitry Andric loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); 6599e8d8bef9SDimitry Andric } 66000b57cec5SDimitry Andric } 66010b57cec5SDimitry Andric 66025f757f3fSDimitry Andric // Legalize s_sleep_var. 66035f757f3fSDimitry Andric if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) { 66045f757f3fSDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 66055f757f3fSDimitry Andric Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 66065f757f3fSDimitry Andric int Src0Idx = 66075f757f3fSDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 66085f757f3fSDimitry Andric MachineOperand &Src0 = MI.getOperand(Src0Idx); 66095f757f3fSDimitry Andric BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 66105f757f3fSDimitry Andric .add(Src0); 66115f757f3fSDimitry Andric Src0.ChangeToRegister(Reg, false); 66125f757f3fSDimitry Andric return nullptr; 66135f757f3fSDimitry Andric } 66145f757f3fSDimitry Andric 661506c3fb27SDimitry Andric // Legalize MUBUF instructions. 661606c3fb27SDimitry Andric bool isSoffsetLegal = true; 661706c3fb27SDimitry Andric int SoffsetIdx = 661806c3fb27SDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset); 661906c3fb27SDimitry Andric if (SoffsetIdx != -1) { 662006c3fb27SDimitry Andric MachineOperand *Soffset = &MI.getOperand(SoffsetIdx); 66215f757f3fSDimitry Andric if (Soffset->isReg() && Soffset->getReg().isVirtual() && 662206c3fb27SDimitry Andric !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) { 662306c3fb27SDimitry Andric isSoffsetLegal = false; 662406c3fb27SDimitry Andric } 662506c3fb27SDimitry Andric } 662606c3fb27SDimitry Andric 662706c3fb27SDimitry Andric bool isRsrcLegal = true; 66280b57cec5SDimitry Andric int RsrcIdx = 66290b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 66300b57cec5SDimitry Andric if (RsrcIdx != -1) { 66310b57cec5SDimitry Andric MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 663206c3fb27SDimitry Andric if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) { 663306c3fb27SDimitry Andric isRsrcLegal = false; 663406c3fb27SDimitry Andric } 66350b57cec5SDimitry Andric } 66360b57cec5SDimitry Andric 663706c3fb27SDimitry Andric // The operands are legal. 
663806c3fb27SDimitry Andric if (isRsrcLegal && isSoffsetLegal) 663906c3fb27SDimitry Andric return CreatedBB; 664006c3fb27SDimitry Andric 664106c3fb27SDimitry Andric if (!isRsrcLegal) { 664206c3fb27SDimitry Andric // Legalize a VGPR Rsrc 66430b57cec5SDimitry Andric // 66440b57cec5SDimitry Andric // If the instruction is _ADDR64, we can avoid a waterfall by extracting 66450b57cec5SDimitry Andric // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using 66460b57cec5SDimitry Andric // a zero-value SRsrc. 66470b57cec5SDimitry Andric // 66480b57cec5SDimitry Andric // If the instruction is _OFFSET (both idxen and offen disabled), and we 66490b57cec5SDimitry Andric // support ADDR64 instructions, we can convert to ADDR64 and do the same as 66500b57cec5SDimitry Andric // above. 66510b57cec5SDimitry Andric // 66520b57cec5SDimitry Andric // Otherwise we are on non-ADDR64 hardware, and/or we have 66530b57cec5SDimitry Andric // idxen/offen/bothen and we fall back to a waterfall loop. 66540b57cec5SDimitry Andric 665506c3fb27SDimitry Andric MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 66560b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 66570b57cec5SDimitry Andric 66580b57cec5SDimitry Andric MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 66590b57cec5SDimitry Andric if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { 66600b57cec5SDimitry Andric // This is already an ADDR64 instruction so we need to add the pointer 66610b57cec5SDimitry Andric // extracted from the resource descriptor to the current value of VAddr. 66628bcb0991SDimitry Andric Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 66638bcb0991SDimitry Andric Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 66648bcb0991SDimitry Andric Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 66650b57cec5SDimitry Andric 66660b57cec5SDimitry Andric const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 66678bcb0991SDimitry Andric Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 66688bcb0991SDimitry Andric Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 66690b57cec5SDimitry Andric 66700b57cec5SDimitry Andric unsigned RsrcPtr, NewSRsrc; 66710b57cec5SDimitry Andric std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 66720b57cec5SDimitry Andric 66730b57cec5SDimitry Andric // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 66740b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 6675e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) 66760b57cec5SDimitry Andric .addDef(CondReg0) 66770b57cec5SDimitry Andric .addReg(RsrcPtr, 0, AMDGPU::sub0) 66780b57cec5SDimitry Andric .addReg(VAddr->getReg(), 0, AMDGPU::sub0) 66790b57cec5SDimitry Andric .addImm(0); 66800b57cec5SDimitry Andric 66810b57cec5SDimitry Andric // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 66820b57cec5SDimitry Andric BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) 66830b57cec5SDimitry Andric .addDef(CondReg1, RegState::Dead) 66840b57cec5SDimitry Andric .addReg(RsrcPtr, 0, AMDGPU::sub1) 66850b57cec5SDimitry Andric .addReg(VAddr->getReg(), 0, AMDGPU::sub1) 66860b57cec5SDimitry Andric .addReg(CondReg0, RegState::Kill) 66870b57cec5SDimitry Andric .addImm(0); 66880b57cec5SDimitry Andric 66890b57cec5SDimitry Andric // NewVaddr = {NewVaddrHi, NewVaddrLo} 66900b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 66910b57cec5SDimitry 
Andric .addReg(NewVAddrLo) 66920b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 66930b57cec5SDimitry Andric .addReg(NewVAddrHi) 66940b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 66950b57cec5SDimitry Andric 66960b57cec5SDimitry Andric VAddr->setReg(NewVAddr); 66970b57cec5SDimitry Andric Rsrc->setReg(NewSRsrc); 66980b57cec5SDimitry Andric } else if (!VAddr && ST.hasAddr64()) { 66990b57cec5SDimitry Andric // This instruction is the _OFFSET variant, so we need to convert it to 67000b57cec5SDimitry Andric // ADDR64. 6701e8d8bef9SDimitry Andric assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && 67020b57cec5SDimitry Andric "FIXME: Need to emit flat atomics here"); 67030b57cec5SDimitry Andric 67040b57cec5SDimitry Andric unsigned RsrcPtr, NewSRsrc; 67050b57cec5SDimitry Andric std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 67060b57cec5SDimitry Andric 67078bcb0991SDimitry Andric Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 67080b57cec5SDimitry Andric MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); 67090b57cec5SDimitry Andric MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 67100b57cec5SDimitry Andric MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 67110b57cec5SDimitry Andric unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); 67120b57cec5SDimitry Andric 671381ad6265SDimitry Andric // Atomics with return have an additional tied operand and are 67140b57cec5SDimitry Andric // missing some of the special bits. 67150b57cec5SDimitry Andric MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); 67160b57cec5SDimitry Andric MachineInstr *Addr64; 67170b57cec5SDimitry Andric 67180b57cec5SDimitry Andric if (!VDataIn) { 67190b57cec5SDimitry Andric // Regular buffer load / store. 67200b57cec5SDimitry Andric MachineInstrBuilder MIB = 67210b57cec5SDimitry Andric BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 67220b57cec5SDimitry Andric .add(*VData) 67230b57cec5SDimitry Andric .addReg(NewVAddr) 67240b57cec5SDimitry Andric .addReg(NewSRsrc) 67250b57cec5SDimitry Andric .add(*SOffset) 67260b57cec5SDimitry Andric .add(*Offset); 67270b57cec5SDimitry Andric 6728fe6060f1SDimitry Andric if (const MachineOperand *CPol = 6729fe6060f1SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::cpol)) { 6730fe6060f1SDimitry Andric MIB.addImm(CPol->getImm()); 67310b57cec5SDimitry Andric } 67320b57cec5SDimitry Andric 67330b57cec5SDimitry Andric if (const MachineOperand *TFE = 67340b57cec5SDimitry Andric getNamedOperand(MI, AMDGPU::OpName::tfe)) { 67350b57cec5SDimitry Andric MIB.addImm(TFE->getImm()); 67360b57cec5SDimitry Andric } 67370b57cec5SDimitry Andric 67388bcb0991SDimitry Andric MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); 67398bcb0991SDimitry Andric 67400b57cec5SDimitry Andric MIB.cloneMemRefs(MI); 67410b57cec5SDimitry Andric Addr64 = MIB; 67420b57cec5SDimitry Andric } else { 67430b57cec5SDimitry Andric // Atomics with return.
67440b57cec5SDimitry Andric Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 67450b57cec5SDimitry Andric .add(*VData) 67460b57cec5SDimitry Andric .add(*VDataIn) 67470b57cec5SDimitry Andric .addReg(NewVAddr) 67480b57cec5SDimitry Andric .addReg(NewSRsrc) 67490b57cec5SDimitry Andric .add(*SOffset) 67500b57cec5SDimitry Andric .add(*Offset) 6751fe6060f1SDimitry Andric .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) 67520b57cec5SDimitry Andric .cloneMemRefs(MI); 67530b57cec5SDimitry Andric } 67540b57cec5SDimitry Andric 67550b57cec5SDimitry Andric MI.removeFromParent(); 67560b57cec5SDimitry Andric 67570b57cec5SDimitry Andric // NewVaddr = {NewVaddrHi, NewVaddrLo} 67580b57cec5SDimitry Andric BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 67590b57cec5SDimitry Andric NewVAddr) 67600b57cec5SDimitry Andric .addReg(RsrcPtr, 0, AMDGPU::sub0) 67610b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 67620b57cec5SDimitry Andric .addReg(RsrcPtr, 0, AMDGPU::sub1) 67630b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 67640b57cec5SDimitry Andric } else { 676506c3fb27SDimitry Andric // Legalize a VGPR Rsrc and soffset together. 676606c3fb27SDimitry Andric if (!isSoffsetLegal) { 676706c3fb27SDimitry Andric MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 676806c3fb27SDimitry Andric CreatedBB = 676906c3fb27SDimitry Andric loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT); 6770e8d8bef9SDimitry Andric return CreatedBB; 67710b57cec5SDimitry Andric } 677206c3fb27SDimitry Andric CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); 677306c3fb27SDimitry Andric return CreatedBB; 677406c3fb27SDimitry Andric } 677506c3fb27SDimitry Andric } 677606c3fb27SDimitry Andric 677706c3fb27SDimitry Andric // Legalize a VGPR soffset. 677806c3fb27SDimitry Andric if (!isSoffsetLegal) { 677906c3fb27SDimitry Andric MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 678006c3fb27SDimitry Andric CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT); 678106c3fb27SDimitry Andric return CreatedBB; 67820b57cec5SDimitry Andric } 6783e8d8bef9SDimitry Andric return CreatedBB; 67840b57cec5SDimitry Andric } 67850b57cec5SDimitry Andric 678606c3fb27SDimitry Andric void SIInstrWorklist::insert(MachineInstr *MI) { 678706c3fb27SDimitry Andric InstrList.insert(MI); 678806c3fb27SDimitry Andric // Add MBUF instructions to the deferred list. 678906c3fb27SDimitry Andric int RsrcIdx = 679006c3fb27SDimitry Andric AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); 679106c3fb27SDimitry Andric if (RsrcIdx != -1) { 679206c3fb27SDimitry Andric DeferredList.insert(MI); 679306c3fb27SDimitry Andric } 679406c3fb27SDimitry Andric } 679506c3fb27SDimitry Andric 679606c3fb27SDimitry Andric bool SIInstrWorklist::isDeferred(MachineInstr *MI) { 679706c3fb27SDimitry Andric return DeferredList.contains(MI); 679806c3fb27SDimitry Andric } 679906c3fb27SDimitry Andric 680006c3fb27SDimitry Andric void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, 68010b57cec5SDimitry Andric MachineDominatorTree *MDT) const { 68020b57cec5SDimitry Andric 68030b57cec5SDimitry Andric while (!Worklist.empty()) { 680406c3fb27SDimitry Andric MachineInstr &Inst = *Worklist.top(); 680506c3fb27SDimitry Andric Worklist.erase_top(); 680606c3fb27SDimitry Andric // Skip MachineInstrs in the deferred list.
680706c3fb27SDimitry Andric if (Worklist.isDeferred(&Inst)) 680806c3fb27SDimitry Andric continue; 680906c3fb27SDimitry Andric moveToVALUImpl(Worklist, MDT, Inst); 681006c3fb27SDimitry Andric } 68110b57cec5SDimitry Andric 681206c3fb27SDimitry Andric // The deferred list of instructions will be processed once 681306c3fb27SDimitry Andric // all the MachineInstrs in the worklist are done. 681406c3fb27SDimitry Andric for (MachineInstr *Inst : Worklist.getDeferredList()) { 681506c3fb27SDimitry Andric moveToVALUImpl(Worklist, MDT, *Inst); 681606c3fb27SDimitry Andric assert(Worklist.empty() && 681706c3fb27SDimitry Andric "Deferred MachineInstr are not supposed to re-populate worklist"); 681806c3fb27SDimitry Andric } 681906c3fb27SDimitry Andric } 682006c3fb27SDimitry Andric 682106c3fb27SDimitry Andric void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, 682206c3fb27SDimitry Andric MachineDominatorTree *MDT, 682306c3fb27SDimitry Andric MachineInstr &Inst) const { 682406c3fb27SDimitry Andric 682506c3fb27SDimitry Andric MachineBasicBlock *MBB = Inst.getParent(); 682606c3fb27SDimitry Andric if (!MBB) 682706c3fb27SDimitry Andric return; 682806c3fb27SDimitry Andric MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 68290b57cec5SDimitry Andric unsigned Opcode = Inst.getOpcode(); 68300b57cec5SDimitry Andric unsigned NewOpcode = getVALUOp(Inst); 68310b57cec5SDimitry Andric // Handle some special cases 68320b57cec5SDimitry Andric switch (Opcode) { 68330b57cec5SDimitry Andric default: 68340b57cec5SDimitry Andric break; 68350b57cec5SDimitry Andric case AMDGPU::S_ADD_U64_PSEUDO: 68365f757f3fSDimitry Andric NewOpcode = AMDGPU::V_ADD_U64_PSEUDO; 68375f757f3fSDimitry Andric break; 68380b57cec5SDimitry Andric case AMDGPU::S_SUB_U64_PSEUDO: 68395f757f3fSDimitry Andric NewOpcode = AMDGPU::V_SUB_U64_PSEUDO; 68405f757f3fSDimitry Andric break; 68410b57cec5SDimitry Andric case AMDGPU::S_ADD_I32: 6842e8d8bef9SDimitry Andric case AMDGPU::S_SUB_I32: { 68430b57cec5SDimitry Andric // FIXME: The u32 versions currently selected use the carry. 6844e8d8bef9SDimitry Andric bool Changed; 684506c3fb27SDimitry Andric MachineBasicBlock *CreatedBBTmp = nullptr; 6846e8d8bef9SDimitry Andric std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); 6847e8d8bef9SDimitry Andric if (Changed) 684806c3fb27SDimitry Andric return; 68490b57cec5SDimitry Andric 68500b57cec5SDimitry Andric // Default handling 68510b57cec5SDimitry Andric break; 6852e8d8bef9SDimitry Andric } 6853*1db9f3b2SDimitry Andric 6854*1db9f3b2SDimitry Andric case AMDGPU::S_MUL_U64: 6855*1db9f3b2SDimitry Andric // Split s_mul_u64 into 32-bit vector multiplications. 6856*1db9f3b2SDimitry Andric splitScalarSMulU64(Worklist, Inst, MDT); 6857*1db9f3b2SDimitry Andric Inst.eraseFromParent(); 6858*1db9f3b2SDimitry Andric return; 6859*1db9f3b2SDimitry Andric 6860*1db9f3b2SDimitry Andric case AMDGPU::S_MUL_U64_U32_PSEUDO: 6861*1db9f3b2SDimitry Andric case AMDGPU::S_MUL_I64_I32_PSEUDO: 6862*1db9f3b2SDimitry Andric // This is a special case of s_mul_u64 where all the operands are either 6863*1db9f3b2SDimitry Andric // zero extended or sign extended.
6864*1db9f3b2SDimitry Andric splitScalarSMulPseudo(Worklist, Inst, MDT); 6865*1db9f3b2SDimitry Andric Inst.eraseFromParent(); 6866*1db9f3b2SDimitry Andric return; 6867*1db9f3b2SDimitry Andric 68680b57cec5SDimitry Andric case AMDGPU::S_AND_B64: 68690b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 68700b57cec5SDimitry Andric Inst.eraseFromParent(); 687106c3fb27SDimitry Andric return; 68720b57cec5SDimitry Andric 68730b57cec5SDimitry Andric case AMDGPU::S_OR_B64: 68740b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 68750b57cec5SDimitry Andric Inst.eraseFromParent(); 687606c3fb27SDimitry Andric return; 68770b57cec5SDimitry Andric 68780b57cec5SDimitry Andric case AMDGPU::S_XOR_B64: 68790b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 68800b57cec5SDimitry Andric Inst.eraseFromParent(); 688106c3fb27SDimitry Andric return; 68820b57cec5SDimitry Andric 68830b57cec5SDimitry Andric case AMDGPU::S_NAND_B64: 68840b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 68850b57cec5SDimitry Andric Inst.eraseFromParent(); 688606c3fb27SDimitry Andric return; 68870b57cec5SDimitry Andric 68880b57cec5SDimitry Andric case AMDGPU::S_NOR_B64: 68890b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 68900b57cec5SDimitry Andric Inst.eraseFromParent(); 689106c3fb27SDimitry Andric return; 68920b57cec5SDimitry Andric 68930b57cec5SDimitry Andric case AMDGPU::S_XNOR_B64: 68940b57cec5SDimitry Andric if (ST.hasDLInsts()) 68950b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 68960b57cec5SDimitry Andric else 68970b57cec5SDimitry Andric splitScalar64BitXnor(Worklist, Inst, MDT); 68980b57cec5SDimitry Andric Inst.eraseFromParent(); 689906c3fb27SDimitry Andric return; 69000b57cec5SDimitry Andric 69010b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B64: 69020b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 69030b57cec5SDimitry Andric Inst.eraseFromParent(); 690406c3fb27SDimitry Andric return; 69050b57cec5SDimitry Andric 69060b57cec5SDimitry Andric case AMDGPU::S_ORN2_B64: 69070b57cec5SDimitry Andric splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 69080b57cec5SDimitry Andric Inst.eraseFromParent(); 690906c3fb27SDimitry Andric return; 69100b57cec5SDimitry Andric 6911fe6060f1SDimitry Andric case AMDGPU::S_BREV_B64: 6912fe6060f1SDimitry Andric splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); 6913fe6060f1SDimitry Andric Inst.eraseFromParent(); 691406c3fb27SDimitry Andric return; 6915fe6060f1SDimitry Andric 69160b57cec5SDimitry Andric case AMDGPU::S_NOT_B64: 69170b57cec5SDimitry Andric splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 69180b57cec5SDimitry Andric Inst.eraseFromParent(); 691906c3fb27SDimitry Andric return; 69200b57cec5SDimitry Andric 69210b57cec5SDimitry Andric case AMDGPU::S_BCNT1_I32_B64: 69220b57cec5SDimitry Andric splitScalar64BitBCNT(Worklist, Inst); 69230b57cec5SDimitry Andric Inst.eraseFromParent(); 692406c3fb27SDimitry Andric return; 69250b57cec5SDimitry Andric 69260b57cec5SDimitry Andric case AMDGPU::S_BFE_I64: 69270b57cec5SDimitry Andric splitScalar64BitBFE(Worklist, Inst); 69280b57cec5SDimitry Andric Inst.eraseFromParent(); 692906c3fb27SDimitry Andric return; 69300b57cec5SDimitry Andric 6931cb14a3feSDimitry Andric case AMDGPU::S_FLBIT_I32_B64: 6932cb14a3feSDimitry Andric 
splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32); 6933cb14a3feSDimitry Andric Inst.eraseFromParent(); 6934cb14a3feSDimitry Andric return; 6935cb14a3feSDimitry Andric case AMDGPU::S_FF1_I32_B64: 6936cb14a3feSDimitry Andric splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32); 6937cb14a3feSDimitry Andric Inst.eraseFromParent(); 6938cb14a3feSDimitry Andric return; 6939cb14a3feSDimitry Andric 69400b57cec5SDimitry Andric case AMDGPU::S_LSHL_B32: 69410b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 69420b57cec5SDimitry Andric NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 69430b57cec5SDimitry Andric swapOperands(Inst); 69440b57cec5SDimitry Andric } 69450b57cec5SDimitry Andric break; 69460b57cec5SDimitry Andric case AMDGPU::S_ASHR_I32: 69470b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 69480b57cec5SDimitry Andric NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 69490b57cec5SDimitry Andric swapOperands(Inst); 69500b57cec5SDimitry Andric } 69510b57cec5SDimitry Andric break; 69520b57cec5SDimitry Andric case AMDGPU::S_LSHR_B32: 69530b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 69540b57cec5SDimitry Andric NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 69550b57cec5SDimitry Andric swapOperands(Inst); 69560b57cec5SDimitry Andric } 69570b57cec5SDimitry Andric break; 69580b57cec5SDimitry Andric case AMDGPU::S_LSHL_B64: 69590b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 69605f757f3fSDimitry Andric NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12 69615f757f3fSDimitry Andric ? AMDGPU::V_LSHLREV_B64_pseudo_e64 69625f757f3fSDimitry Andric : AMDGPU::V_LSHLREV_B64_e64; 69630b57cec5SDimitry Andric swapOperands(Inst); 69640b57cec5SDimitry Andric } 69650b57cec5SDimitry Andric break; 69660b57cec5SDimitry Andric case AMDGPU::S_ASHR_I64: 69670b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 6968e8d8bef9SDimitry Andric NewOpcode = AMDGPU::V_ASHRREV_I64_e64; 69690b57cec5SDimitry Andric swapOperands(Inst); 69700b57cec5SDimitry Andric } 69710b57cec5SDimitry Andric break; 69720b57cec5SDimitry Andric case AMDGPU::S_LSHR_B64: 69730b57cec5SDimitry Andric if (ST.hasOnlyRevVALUShifts()) { 6974e8d8bef9SDimitry Andric NewOpcode = AMDGPU::V_LSHRREV_B64_e64; 69750b57cec5SDimitry Andric swapOperands(Inst); 69760b57cec5SDimitry Andric } 69770b57cec5SDimitry Andric break; 69780b57cec5SDimitry Andric 69790b57cec5SDimitry Andric case AMDGPU::S_ABS_I32: 69800b57cec5SDimitry Andric lowerScalarAbs(Worklist, Inst); 69810b57cec5SDimitry Andric Inst.eraseFromParent(); 698206c3fb27SDimitry Andric return; 69830b57cec5SDimitry Andric 69840b57cec5SDimitry Andric case AMDGPU::S_CBRANCH_SCC0: 6985349cc55cSDimitry Andric case AMDGPU::S_CBRANCH_SCC1: { 69860b57cec5SDimitry Andric // Clear unused bits of vcc 6987349cc55cSDimitry Andric Register CondReg = Inst.getOperand(1).getReg(); 6988349cc55cSDimitry Andric bool IsSCC = CondReg == AMDGPU::SCC; 6989349cc55cSDimitry Andric Register VCC = RI.getVCC(); 6990349cc55cSDimitry Andric Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 6991349cc55cSDimitry Andric unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 6992349cc55cSDimitry Andric BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) 6993349cc55cSDimitry Andric .addReg(EXEC) 6994349cc55cSDimitry Andric .addReg(IsSCC ? 
VCC : CondReg); 699581ad6265SDimitry Andric Inst.removeOperand(1); 699606c3fb27SDimitry Andric } break; 69970b57cec5SDimitry Andric 69980b57cec5SDimitry Andric case AMDGPU::S_BFE_U64: 69990b57cec5SDimitry Andric case AMDGPU::S_BFM_B64: 70000b57cec5SDimitry Andric llvm_unreachable("Moving this op to VALU not implemented"); 70010b57cec5SDimitry Andric 70020b57cec5SDimitry Andric case AMDGPU::S_PACK_LL_B32_B16: 70030b57cec5SDimitry Andric case AMDGPU::S_PACK_LH_B32_B16: 700481ad6265SDimitry Andric case AMDGPU::S_PACK_HL_B32_B16: 70050b57cec5SDimitry Andric case AMDGPU::S_PACK_HH_B32_B16: 70060b57cec5SDimitry Andric movePackToVALU(Worklist, MRI, Inst); 70070b57cec5SDimitry Andric Inst.eraseFromParent(); 700806c3fb27SDimitry Andric return; 70090b57cec5SDimitry Andric 70100b57cec5SDimitry Andric case AMDGPU::S_XNOR_B32: 70110b57cec5SDimitry Andric lowerScalarXnor(Worklist, Inst); 70120b57cec5SDimitry Andric Inst.eraseFromParent(); 701306c3fb27SDimitry Andric return; 70140b57cec5SDimitry Andric 70150b57cec5SDimitry Andric case AMDGPU::S_NAND_B32: 70160b57cec5SDimitry Andric splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 70170b57cec5SDimitry Andric Inst.eraseFromParent(); 701806c3fb27SDimitry Andric return; 70190b57cec5SDimitry Andric 70200b57cec5SDimitry Andric case AMDGPU::S_NOR_B32: 70210b57cec5SDimitry Andric splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 70220b57cec5SDimitry Andric Inst.eraseFromParent(); 702306c3fb27SDimitry Andric return; 70240b57cec5SDimitry Andric 70250b57cec5SDimitry Andric case AMDGPU::S_ANDN2_B32: 70260b57cec5SDimitry Andric splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 70270b57cec5SDimitry Andric Inst.eraseFromParent(); 702806c3fb27SDimitry Andric return; 70290b57cec5SDimitry Andric 70300b57cec5SDimitry Andric case AMDGPU::S_ORN2_B32: 70310b57cec5SDimitry Andric splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 70320b57cec5SDimitry Andric Inst.eraseFromParent(); 703306c3fb27SDimitry Andric return; 70345ffd83dbSDimitry Andric 70355ffd83dbSDimitry Andric // TODO: remove as soon as everything is ready 70365ffd83dbSDimitry Andric // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 70375ffd83dbSDimitry Andric // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 70385ffd83dbSDimitry Andric // can only be selected from the uniform SDNode. 70395ffd83dbSDimitry Andric case AMDGPU::S_ADD_CO_PSEUDO: 70405ffd83dbSDimitry Andric case AMDGPU::S_SUB_CO_PSEUDO: { 70415ffd83dbSDimitry Andric unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 70425ffd83dbSDimitry Andric ? 
AMDGPU::V_ADDC_U32_e64 70435ffd83dbSDimitry Andric : AMDGPU::V_SUBB_U32_e64; 70445ffd83dbSDimitry Andric const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 70455ffd83dbSDimitry Andric 70465ffd83dbSDimitry Andric Register CarryInReg = Inst.getOperand(4).getReg(); 70475ffd83dbSDimitry Andric if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 70485ffd83dbSDimitry Andric Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 704906c3fb27SDimitry Andric BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 70505ffd83dbSDimitry Andric .addReg(CarryInReg); 70515ffd83dbSDimitry Andric } 70525ffd83dbSDimitry Andric 70535ffd83dbSDimitry Andric Register CarryOutReg = Inst.getOperand(1).getReg(); 70545ffd83dbSDimitry Andric 70555ffd83dbSDimitry Andric Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 70565ffd83dbSDimitry Andric MRI.getRegClass(Inst.getOperand(0).getReg()))); 70575ffd83dbSDimitry Andric MachineInstr *CarryOp = 70585ffd83dbSDimitry Andric BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 70595ffd83dbSDimitry Andric .addReg(CarryOutReg, RegState::Define) 70605ffd83dbSDimitry Andric .add(Inst.getOperand(2)) 70615ffd83dbSDimitry Andric .add(Inst.getOperand(3)) 70625ffd83dbSDimitry Andric .addReg(CarryInReg) 70635ffd83dbSDimitry Andric .addImm(0); 706406c3fb27SDimitry Andric legalizeOperands(*CarryOp); 70655ffd83dbSDimitry Andric MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 70665ffd83dbSDimitry Andric addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 70675ffd83dbSDimitry Andric Inst.eraseFromParent(); 70685ffd83dbSDimitry Andric } 706906c3fb27SDimitry Andric return; 70705ffd83dbSDimitry Andric case AMDGPU::S_UADDO_PSEUDO: 70715ffd83dbSDimitry Andric case AMDGPU::S_USUBO_PSEUDO: { 70725ffd83dbSDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 70735ffd83dbSDimitry Andric MachineOperand &Dest0 = Inst.getOperand(0); 70745ffd83dbSDimitry Andric MachineOperand &Dest1 = Inst.getOperand(1); 70755ffd83dbSDimitry Andric MachineOperand &Src0 = Inst.getOperand(2); 70765ffd83dbSDimitry Andric MachineOperand &Src1 = Inst.getOperand(3); 70775ffd83dbSDimitry Andric 70785ffd83dbSDimitry Andric unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 7079e8d8bef9SDimitry Andric ? 
AMDGPU::V_ADD_CO_U32_e64 7080e8d8bef9SDimitry Andric : AMDGPU::V_SUB_CO_U32_e64; 70815ffd83dbSDimitry Andric const TargetRegisterClass *NewRC = 70825ffd83dbSDimitry Andric RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 70835ffd83dbSDimitry Andric Register DestReg = MRI.createVirtualRegister(NewRC); 70845ffd83dbSDimitry Andric MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 70855ffd83dbSDimitry Andric .addReg(Dest1.getReg(), RegState::Define) 70865ffd83dbSDimitry Andric .add(Src0) 70875ffd83dbSDimitry Andric .add(Src1) 70885ffd83dbSDimitry Andric .addImm(0); // clamp bit 70895ffd83dbSDimitry Andric 709006c3fb27SDimitry Andric legalizeOperands(*NewInstr, MDT); 70915ffd83dbSDimitry Andric MRI.replaceRegWith(Dest0.getReg(), DestReg); 70925ffd83dbSDimitry Andric addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 70935ffd83dbSDimitry Andric Worklist); 70945ffd83dbSDimitry Andric Inst.eraseFromParent(); 70955ffd83dbSDimitry Andric } 709606c3fb27SDimitry Andric return; 70975ffd83dbSDimitry Andric 70985ffd83dbSDimitry Andric case AMDGPU::S_CSELECT_B32: 7099349cc55cSDimitry Andric case AMDGPU::S_CSELECT_B64: 710004eeddc0SDimitry Andric lowerSelect(Worklist, Inst, MDT); 7101349cc55cSDimitry Andric Inst.eraseFromParent(); 710206c3fb27SDimitry Andric return; 7103349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_I32: 7104349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_I32: 7105349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_I32: 7106349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_I32: 7107349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_I32: 7108349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_I32: 7109349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U32: 7110349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U32: 7111349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_U32: 7112349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_U32: 7113349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_U32: 7114349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_U32: 7115349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U64: 71165f757f3fSDimitry Andric case AMDGPU::S_CMP_LG_U64: 71175f757f3fSDimitry Andric case AMDGPU::S_CMP_LT_F32: 71185f757f3fSDimitry Andric case AMDGPU::S_CMP_EQ_F32: 71195f757f3fSDimitry Andric case AMDGPU::S_CMP_LE_F32: 71205f757f3fSDimitry Andric case AMDGPU::S_CMP_GT_F32: 71215f757f3fSDimitry Andric case AMDGPU::S_CMP_LG_F32: 71225f757f3fSDimitry Andric case AMDGPU::S_CMP_GE_F32: 71235f757f3fSDimitry Andric case AMDGPU::S_CMP_O_F32: 71245f757f3fSDimitry Andric case AMDGPU::S_CMP_U_F32: 71255f757f3fSDimitry Andric case AMDGPU::S_CMP_NGE_F32: 71265f757f3fSDimitry Andric case AMDGPU::S_CMP_NLG_F32: 71275f757f3fSDimitry Andric case AMDGPU::S_CMP_NGT_F32: 71285f757f3fSDimitry Andric case AMDGPU::S_CMP_NLE_F32: 71295f757f3fSDimitry Andric case AMDGPU::S_CMP_NEQ_F32: 71305f757f3fSDimitry Andric case AMDGPU::S_CMP_NLT_F32: 71315f757f3fSDimitry Andric case AMDGPU::S_CMP_LT_F16: 71325f757f3fSDimitry Andric case AMDGPU::S_CMP_EQ_F16: 71335f757f3fSDimitry Andric case AMDGPU::S_CMP_LE_F16: 71345f757f3fSDimitry Andric case AMDGPU::S_CMP_GT_F16: 71355f757f3fSDimitry Andric case AMDGPU::S_CMP_LG_F16: 71365f757f3fSDimitry Andric case AMDGPU::S_CMP_GE_F16: 71375f757f3fSDimitry Andric case AMDGPU::S_CMP_O_F16: 71385f757f3fSDimitry Andric case AMDGPU::S_CMP_U_F16: 71395f757f3fSDimitry Andric case AMDGPU::S_CMP_NGE_F16: 71405f757f3fSDimitry Andric case AMDGPU::S_CMP_NLG_F16: 71415f757f3fSDimitry Andric case AMDGPU::S_CMP_NGT_F16: 71425f757f3fSDimitry Andric case AMDGPU::S_CMP_NLE_F16: 71435f757f3fSDimitry Andric 
case AMDGPU::S_CMP_NEQ_F16: 71445f757f3fSDimitry Andric case AMDGPU::S_CMP_NLT_F16: { 7145349cc55cSDimitry Andric Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); 71465f757f3fSDimitry Andric auto NewInstr = 71475f757f3fSDimitry Andric BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg) 71485f757f3fSDimitry Andric .setMIFlags(Inst.getFlags()); 71495f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, 71505f757f3fSDimitry Andric AMDGPU::OpName::src0_modifiers) >= 0) { 71515f757f3fSDimitry Andric NewInstr 71525f757f3fSDimitry Andric .addImm(0) // src0_modifiers 71535f757f3fSDimitry Andric .add(Inst.getOperand(0)) // src0 71545f757f3fSDimitry Andric .addImm(0) // src1_modifiers 71555f757f3fSDimitry Andric .add(Inst.getOperand(1)) // src1 71565f757f3fSDimitry Andric .addImm(0); // clamp 71575f757f3fSDimitry Andric } else { 71585f757f3fSDimitry Andric NewInstr 7159349cc55cSDimitry Andric .add(Inst.getOperand(0)) 7160349cc55cSDimitry Andric .add(Inst.getOperand(1)); 71615f757f3fSDimitry Andric } 7162349cc55cSDimitry Andric legalizeOperands(*NewInstr, MDT); 7163349cc55cSDimitry Andric int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); 7164349cc55cSDimitry Andric MachineOperand SCCOp = Inst.getOperand(SCCIdx); 7165349cc55cSDimitry Andric addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); 7166349cc55cSDimitry Andric Inst.eraseFromParent(); 716706c3fb27SDimitry Andric return; 7168349cc55cSDimitry Andric } 71695f757f3fSDimitry Andric case AMDGPU::S_CVT_HI_F32_F16: { 71705f757f3fSDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 71715f757f3fSDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 71725f757f3fSDimitry Andric Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 71735f757f3fSDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 71745f757f3fSDimitry Andric .addImm(16) 71755f757f3fSDimitry Andric .add(Inst.getOperand(1)); 71765f757f3fSDimitry Andric BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) 71775f757f3fSDimitry Andric .addImm(0) // src0_modifiers 71785f757f3fSDimitry Andric .addReg(TmpReg) 71795f757f3fSDimitry Andric .addImm(0) // clamp 71805f757f3fSDimitry Andric .addImm(0); // omod 71815f757f3fSDimitry Andric 71825f757f3fSDimitry Andric MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); 71835f757f3fSDimitry Andric addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); 71845f757f3fSDimitry Andric Inst.eraseFromParent(); 71855f757f3fSDimitry Andric return; 71865f757f3fSDimitry Andric } 71875f757f3fSDimitry Andric case AMDGPU::S_MINIMUM_F32: 71885f757f3fSDimitry Andric case AMDGPU::S_MAXIMUM_F32: 71895f757f3fSDimitry Andric case AMDGPU::S_MINIMUM_F16: 71905f757f3fSDimitry Andric case AMDGPU::S_MAXIMUM_F16: { 71915f757f3fSDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 71925f757f3fSDimitry Andric Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 71935f757f3fSDimitry Andric MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) 71945f757f3fSDimitry Andric .addImm(0) // src0_modifiers 71955f757f3fSDimitry Andric .add(Inst.getOperand(1)) 71965f757f3fSDimitry Andric .addImm(0) // src1_modifiers 71975f757f3fSDimitry Andric .add(Inst.getOperand(2)) 71985f757f3fSDimitry Andric .addImm(0) // clamp 71995f757f3fSDimitry Andric .addImm(0); // omod 72005f757f3fSDimitry Andric MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); 72015f757f3fSDimitry Andric 72025f757f3fSDimitry Andric 
legalizeOperands(*NewInstr, MDT); 72035f757f3fSDimitry Andric addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); 72045f757f3fSDimitry Andric Inst.eraseFromParent(); 72055f757f3fSDimitry Andric return; 72065f757f3fSDimitry Andric } 72075f757f3fSDimitry Andric } 7208349cc55cSDimitry Andric 72090b57cec5SDimitry Andric if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 72100b57cec5SDimitry Andric // We cannot move this instruction to the VALU, so we should try to 72110b57cec5SDimitry Andric // legalize its operands instead. 721206c3fb27SDimitry Andric legalizeOperands(Inst, MDT); 721306c3fb27SDimitry Andric return; 72140b57cec5SDimitry Andric } 7215bdd1243dSDimitry Andric // Handle converting generic instructions like COPY-to-SGPR into 7216bdd1243dSDimitry Andric // COPY-to-VGPR. 7217bdd1243dSDimitry Andric if (NewOpcode == Opcode) { 72188bcb0991SDimitry Andric Register DstReg = Inst.getOperand(0).getReg(); 72190b57cec5SDimitry Andric const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 72200b57cec5SDimitry Andric 7221647cbc5dSDimitry Andric // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and 7222647cbc5dSDimitry Andric // hope for the best. 7223647cbc5dSDimitry Andric if (Inst.isCopy() && DstReg.isPhysical() && 7224647cbc5dSDimitry Andric RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { 7225647cbc5dSDimitry Andric // TODO: Only works for 32 bit registers. 7226647cbc5dSDimitry Andric BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), 7227647cbc5dSDimitry Andric get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg()) 7228647cbc5dSDimitry Andric .add(Inst.getOperand(1)); 7229647cbc5dSDimitry Andric Inst.eraseFromParent(); 7230647cbc5dSDimitry Andric return; 7231647cbc5dSDimitry Andric } 7232647cbc5dSDimitry Andric 7233e8d8bef9SDimitry Andric if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && 72340b57cec5SDimitry Andric NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 72350b57cec5SDimitry Andric // Instead of creating a copy where src and dst are the same register 72360b57cec5SDimitry Andric // class, we just replace all uses of dst with src. These kinds of 72370b57cec5SDimitry Andric // copies interfere with the heuristics MachineSink uses to decide 72380b57cec5SDimitry Andric // whether or not to split a critical edge. Since the pass assumes 72390b57cec5SDimitry Andric // that copies will end up as machine instructions and not be 72400b57cec5SDimitry Andric // eliminated. 72410b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 72420b57cec5SDimitry Andric MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 72430b57cec5SDimitry Andric MRI.clearKillFlags(Inst.getOperand(1).getReg()); 72440b57cec5SDimitry Andric Inst.getOperand(0).setReg(DstReg); 72450b57cec5SDimitry Andric // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 72460b57cec5SDimitry Andric // these are deleted later, but at -O0 it would leave a suspicious 72470b57cec5SDimitry Andric // looking illegal copy of an undef register. 
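// Strip every operand except the destination and retag the instruction as an
// IMPLICIT_DEF of that register.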
72480b57cec5SDimitry Andric for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 724981ad6265SDimitry Andric Inst.removeOperand(I); 72500b57cec5SDimitry Andric Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 725106c3fb27SDimitry Andric return; 72520b57cec5SDimitry Andric } 7253bdd1243dSDimitry Andric Register NewDstReg = MRI.createVirtualRegister(NewDstRC); 7254bdd1243dSDimitry Andric MRI.replaceRegWith(DstReg, NewDstReg); 7255bdd1243dSDimitry Andric legalizeOperands(Inst, MDT); 7256bdd1243dSDimitry Andric addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 725706c3fb27SDimitry Andric return; 7258bdd1243dSDimitry Andric } 7259bdd1243dSDimitry Andric 7260bdd1243dSDimitry Andric // Use the new VALU Opcode. 7261bdd1243dSDimitry Andric auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) 7262bdd1243dSDimitry Andric .setMIFlags(Inst.getFlags()); 72635f757f3fSDimitry Andric if (isVOP3(NewOpcode) && !isVOP3(Opcode)) { 72645f757f3fSDimitry Andric // Intersperse VOP3 modifiers among the SALU operands. 72655f757f3fSDimitry Andric NewInstr->addOperand(Inst.getOperand(0)); 72665f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, 72675f757f3fSDimitry Andric AMDGPU::OpName::src0_modifiers) >= 0) 72685f757f3fSDimitry Andric NewInstr.addImm(0); 72695f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0) 72705f757f3fSDimitry Andric NewInstr->addOperand(Inst.getOperand(1)); 72715f757f3fSDimitry Andric 72725f757f3fSDimitry Andric if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 72735f757f3fSDimitry Andric // We are converting these to a BFE, so we need to add the missing 72745f757f3fSDimitry Andric // operands for the size and offset. 72755f757f3fSDimitry Andric unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 72765f757f3fSDimitry Andric NewInstr.addImm(0); 72775f757f3fSDimitry Andric NewInstr.addImm(Size); 72785f757f3fSDimitry Andric } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 72795f757f3fSDimitry Andric // The VALU version adds the second operand to the result, so insert an 72805f757f3fSDimitry Andric // extra 0 operand. 72815f757f3fSDimitry Andric NewInstr.addImm(0); 72825f757f3fSDimitry Andric } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 72835f757f3fSDimitry Andric const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 72845f757f3fSDimitry Andric // If we need to move this to VGPRs, we need to unpack the second 72855f757f3fSDimitry Andric // operand back into the 2 separate ones for bit offset and width. 72865f757f3fSDimitry Andric assert(OffsetWidthOp.isImm() && 72875f757f3fSDimitry Andric "Scalar BFE is only implemented for constant width and offset"); 72885f757f3fSDimitry Andric uint32_t Imm = OffsetWidthOp.getImm(); 72895f757f3fSDimitry Andric 72905f757f3fSDimitry Andric uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 72915f757f3fSDimitry Andric uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 
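// For example, an S_BFE immediate of 0x100008 encodes offset = 8 and
// width = 16; the VALU BFE form below takes these as two separate operands.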
72925f757f3fSDimitry Andric NewInstr.addImm(Offset); 72935f757f3fSDimitry Andric NewInstr.addImm(BitWidth); 72945f757f3fSDimitry Andric } else { 72955f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, 72965f757f3fSDimitry Andric AMDGPU::OpName::src1_modifiers) >= 0) 72975f757f3fSDimitry Andric NewInstr.addImm(0); 72985f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0) 72995f757f3fSDimitry Andric NewInstr->addOperand(Inst.getOperand(2)); 73005f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, 73015f757f3fSDimitry Andric AMDGPU::OpName::src2_modifiers) >= 0) 73025f757f3fSDimitry Andric NewInstr.addImm(0); 73035f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0) 73045f757f3fSDimitry Andric NewInstr->addOperand(Inst.getOperand(3)); 73055f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0) 73065f757f3fSDimitry Andric NewInstr.addImm(0); 73075f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0) 73085f757f3fSDimitry Andric NewInstr.addImm(0); 73095f757f3fSDimitry Andric if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0) 73105f757f3fSDimitry Andric NewInstr.addImm(0); 73115f757f3fSDimitry Andric } 73125f757f3fSDimitry Andric } else { 73135f757f3fSDimitry Andric // Just copy the SALU operands. 7314bdd1243dSDimitry Andric for (const MachineOperand &Op : Inst.explicit_operands()) 7315bdd1243dSDimitry Andric NewInstr->addOperand(Op); 73165f757f3fSDimitry Andric } 73175f757f3fSDimitry Andric 7318bdd1243dSDimitry Andric // Remove any references to SCC. Vector instructions can't read from it, and 7319bdd1243dSDimitry Andric // We're just about to add the implicit use / defs of VCC, and we don't want 7320bdd1243dSDimitry Andric // both. 7321bdd1243dSDimitry Andric for (MachineOperand &Op : Inst.implicit_operands()) { 7322bdd1243dSDimitry Andric if (Op.getReg() == AMDGPU::SCC) { 7323bdd1243dSDimitry Andric // Only propagate through live-def of SCC. 7324bdd1243dSDimitry Andric if (Op.isDef() && !Op.isDead()) 7325bdd1243dSDimitry Andric addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); 7326bdd1243dSDimitry Andric if (Op.isUse()) 7327bdd1243dSDimitry Andric addSCCDefsToVALUWorklist(NewInstr, Worklist); 7328bdd1243dSDimitry Andric } 7329bdd1243dSDimitry Andric } 7330bdd1243dSDimitry Andric Inst.eraseFromParent(); 7331bdd1243dSDimitry Andric Register NewDstReg; 7332bdd1243dSDimitry Andric if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { 7333bdd1243dSDimitry Andric Register DstReg = NewInstr->getOperand(0).getReg(); 7334bdd1243dSDimitry Andric assert(DstReg.isVirtual()); 7335bdd1243dSDimitry Andric // Update the destination register class. 
733606c3fb27SDimitry Andric const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr); 7337bdd1243dSDimitry Andric assert(NewDstRC); 73380b57cec5SDimitry Andric NewDstReg = MRI.createVirtualRegister(NewDstRC); 73390b57cec5SDimitry Andric MRI.replaceRegWith(DstReg, NewDstReg); 73400b57cec5SDimitry Andric } 7341bdd1243dSDimitry Andric fixImplicitOperands(*NewInstr); 73420b57cec5SDimitry Andric // Legalize the operands 734306c3fb27SDimitry Andric legalizeOperands(*NewInstr, MDT); 7344bdd1243dSDimitry Andric if (NewDstReg) 73450b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 73460b57cec5SDimitry Andric } 73470b57cec5SDimitry Andric 73480b57cec5SDimitry Andric // Add/sub require special handling to deal with carry outs. 7349e8d8bef9SDimitry Andric std::pair<bool, MachineBasicBlock *> 735006c3fb27SDimitry Andric SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, 73510b57cec5SDimitry Andric MachineDominatorTree *MDT) const { 73520b57cec5SDimitry Andric if (ST.hasAddNoCarry()) { 73530b57cec5SDimitry Andric // Assume there is no user of scc since we don't select this in that case. 73540b57cec5SDimitry Andric // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 73550b57cec5SDimitry Andric // is used. 73560b57cec5SDimitry Andric 73570b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 73580b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 73590b57cec5SDimitry Andric 73608bcb0991SDimitry Andric Register OldDstReg = Inst.getOperand(0).getReg(); 73618bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 73620b57cec5SDimitry Andric 73630b57cec5SDimitry Andric unsigned Opc = Inst.getOpcode(); 73640b57cec5SDimitry Andric assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 73650b57cec5SDimitry Andric 73660b57cec5SDimitry Andric unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 
73670b57cec5SDimitry Andric AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 73680b57cec5SDimitry Andric 73690b57cec5SDimitry Andric assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 737081ad6265SDimitry Andric Inst.removeOperand(3); 73710b57cec5SDimitry Andric 73720b57cec5SDimitry Andric Inst.setDesc(get(NewOpc)); 73730b57cec5SDimitry Andric Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 73740b57cec5SDimitry Andric Inst.addImplicitDefUseOperands(*MBB.getParent()); 73750b57cec5SDimitry Andric MRI.replaceRegWith(OldDstReg, ResultReg); 7376e8d8bef9SDimitry Andric MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); 73770b57cec5SDimitry Andric 73780b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7379bdd1243dSDimitry Andric return std::pair(true, NewBB); 73800b57cec5SDimitry Andric } 73810b57cec5SDimitry Andric 7382bdd1243dSDimitry Andric return std::pair(false, nullptr); 73830b57cec5SDimitry Andric } 73840b57cec5SDimitry Andric 738506c3fb27SDimitry Andric void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, 73865ffd83dbSDimitry Andric MachineDominatorTree *MDT) const { 73875ffd83dbSDimitry Andric 73885ffd83dbSDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 73895ffd83dbSDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 73905ffd83dbSDimitry Andric MachineBasicBlock::iterator MII = Inst; 73915ffd83dbSDimitry Andric DebugLoc DL = Inst.getDebugLoc(); 73925ffd83dbSDimitry Andric 73935ffd83dbSDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 73945ffd83dbSDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 73955ffd83dbSDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 73965ffd83dbSDimitry Andric MachineOperand &Cond = Inst.getOperand(3); 73975ffd83dbSDimitry Andric 73985f757f3fSDimitry Andric Register CondReg = Cond.getReg(); 73995f757f3fSDimitry Andric bool IsSCC = (CondReg == AMDGPU::SCC); 7400349cc55cSDimitry Andric 7401349cc55cSDimitry Andric // If this is a trivial select where the condition is effectively not SCC 74025f757f3fSDimitry Andric // (CondReg is a source of copy to SCC), then the select is semantically 74035f757f3fSDimitry Andric // equivalent to copying CondReg. Hence, there is no need to create 7404349cc55cSDimitry Andric // V_CNDMASK, we can just use that and bail out. 
7405349cc55cSDimitry Andric if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && 7406349cc55cSDimitry Andric (Src1.getImm() == 0)) { 74075f757f3fSDimitry Andric MRI.replaceRegWith(Dest.getReg(), CondReg); 7408349cc55cSDimitry Andric return; 7409349cc55cSDimitry Andric } 7410349cc55cSDimitry Andric 74115f757f3fSDimitry Andric Register NewCondReg = CondReg; 74125f757f3fSDimitry Andric if (IsSCC) { 7413349cc55cSDimitry Andric const TargetRegisterClass *TC = 7414349cc55cSDimitry Andric RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 74155f757f3fSDimitry Andric NewCondReg = MRI.createVirtualRegister(TC); 7416349cc55cSDimitry Andric 7417349cc55cSDimitry Andric // Now look for the closest SCC def if it is a copy 74185f757f3fSDimitry Andric // replacing the CondReg with the COPY source register 7419349cc55cSDimitry Andric bool CopyFound = false; 74205ffd83dbSDimitry Andric for (MachineInstr &CandI : 74215ffd83dbSDimitry Andric make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), 74225ffd83dbSDimitry Andric Inst.getParent()->rend())) { 74235ffd83dbSDimitry Andric if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != 74245ffd83dbSDimitry Andric -1) { 74255ffd83dbSDimitry Andric if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { 74265f757f3fSDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg) 7427349cc55cSDimitry Andric .addReg(CandI.getOperand(1).getReg()); 7428349cc55cSDimitry Andric CopyFound = true; 74295ffd83dbSDimitry Andric } 74305ffd83dbSDimitry Andric break; 74315ffd83dbSDimitry Andric } 74325ffd83dbSDimitry Andric } 7433349cc55cSDimitry Andric if (!CopyFound) { 7434349cc55cSDimitry Andric // SCC def is not a copy 74355ffd83dbSDimitry Andric // Insert a trivial select instead of creating a copy, because a copy from 74365ffd83dbSDimitry Andric // SCC would semantically mean just copying a single bit, but we may need 74375ffd83dbSDimitry Andric // the result to be a vector condition mask that needs preserving. 74385ffd83dbSDimitry Andric unsigned Opcode = (ST.getWavefrontSize() == 64) ? 
AMDGPU::S_CSELECT_B64 74395ffd83dbSDimitry Andric : AMDGPU::S_CSELECT_B32; 74405ffd83dbSDimitry Andric auto NewSelect = 74415f757f3fSDimitry Andric BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0); 74425ffd83dbSDimitry Andric NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); 7443349cc55cSDimitry Andric } 74445ffd83dbSDimitry Andric } 74455ffd83dbSDimitry Andric 74465f757f3fSDimitry Andric Register NewDestReg = MRI.createVirtualRegister( 74475f757f3fSDimitry Andric RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()))); 74485f757f3fSDimitry Andric MachineInstr *NewInst; 74495f757f3fSDimitry Andric if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) { 74505f757f3fSDimitry Andric NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg) 74515ffd83dbSDimitry Andric .addImm(0) 74525ffd83dbSDimitry Andric .add(Src1) // False 74535ffd83dbSDimitry Andric .addImm(0) 74545ffd83dbSDimitry Andric .add(Src0) // True 74555f757f3fSDimitry Andric .addReg(NewCondReg); 74565f757f3fSDimitry Andric } else { 74575f757f3fSDimitry Andric NewInst = 74585f757f3fSDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg) 74595f757f3fSDimitry Andric .add(Src1) // False 74605f757f3fSDimitry Andric .add(Src0) // True 74615f757f3fSDimitry Andric .addReg(NewCondReg); 74625f757f3fSDimitry Andric } 74635f757f3fSDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDestReg); 74645f757f3fSDimitry Andric legalizeOperands(*NewInst, MDT); 74655f757f3fSDimitry Andric addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist); 74665ffd83dbSDimitry Andric } 74675ffd83dbSDimitry Andric 746806c3fb27SDimitry Andric void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, 74690b57cec5SDimitry Andric MachineInstr &Inst) const { 74700b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 74710b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 74720b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 74730b57cec5SDimitry Andric DebugLoc DL = Inst.getDebugLoc(); 74740b57cec5SDimitry Andric 74750b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 74760b57cec5SDimitry Andric MachineOperand &Src = Inst.getOperand(1); 74778bcb0991SDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 74788bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 74790b57cec5SDimitry Andric 74800b57cec5SDimitry Andric unsigned SubOp = ST.hasAddNoCarry() ? 
7481e8d8bef9SDimitry Andric AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; 74820b57cec5SDimitry Andric 74830b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(SubOp), TmpReg) 74840b57cec5SDimitry Andric .addImm(0) 74850b57cec5SDimitry Andric .addReg(Src.getReg()); 74860b57cec5SDimitry Andric 74870b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 74880b57cec5SDimitry Andric .addReg(Src.getReg()) 74890b57cec5SDimitry Andric .addReg(TmpReg); 74900b57cec5SDimitry Andric 74910b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), ResultReg); 74920b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 74930b57cec5SDimitry Andric } 74940b57cec5SDimitry Andric 749506c3fb27SDimitry Andric void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, 74960b57cec5SDimitry Andric MachineInstr &Inst) const { 74970b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 74980b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 74990b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 75000b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 75010b57cec5SDimitry Andric 75020b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 75030b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 75040b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 75050b57cec5SDimitry Andric 75060b57cec5SDimitry Andric if (ST.hasDLInsts()) { 75078bcb0991SDimitry Andric Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 75080b57cec5SDimitry Andric legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); 75090b57cec5SDimitry Andric legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); 75100b57cec5SDimitry Andric 75110b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) 75120b57cec5SDimitry Andric .add(Src0) 75130b57cec5SDimitry Andric .add(Src1); 75140b57cec5SDimitry Andric 75150b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDest); 75160b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 75170b57cec5SDimitry Andric } else { 75180b57cec5SDimitry Andric // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can 75190b57cec5SDimitry Andric // invert either source and then perform the XOR. If either source is a 75200b57cec5SDimitry Andric // scalar register, then we can leave the inversion on the scalar unit to 752181ad6265SDimitry Andric // achieve a better distribution of scalar and vector instructions. 75220b57cec5SDimitry Andric bool Src0IsSGPR = Src0.isReg() && 75230b57cec5SDimitry Andric RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); 75240b57cec5SDimitry Andric bool Src1IsSGPR = Src1.isReg() && 75250b57cec5SDimitry Andric RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); 75260b57cec5SDimitry Andric MachineInstr *Xor; 75278bcb0991SDimitry Andric Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 75288bcb0991SDimitry Andric Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 75290b57cec5SDimitry Andric 75300b57cec5SDimitry Andric // Build a pair of scalar instructions and add them to the work list. 75310b57cec5SDimitry Andric // The next iteration over the work list will lower these to the vector 75320b57cec5SDimitry Andric // unit as necessary. 
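// Illustrating the identity above: with Src0 = 0b1100 and Src1 = 0b1010,
// Src0 ^ Src1 = 0b0110 and ~(Src0 ^ Src1) ends in ...1001; computing
// (~Src0) ^ Src1 = (...0011) ^ 0b1010 also ends in ...1001, so either
// operand may be inverted first.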
75330b57cec5SDimitry Andric if (Src0IsSGPR) { 75340b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); 75350b57cec5SDimitry Andric Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 75360b57cec5SDimitry Andric .addReg(Temp) 75370b57cec5SDimitry Andric .add(Src1); 75380b57cec5SDimitry Andric } else if (Src1IsSGPR) { 75390b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); 75400b57cec5SDimitry Andric Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 75410b57cec5SDimitry Andric .add(Src0) 75420b57cec5SDimitry Andric .addReg(Temp); 75430b57cec5SDimitry Andric } else { 75440b57cec5SDimitry Andric Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 75450b57cec5SDimitry Andric .add(Src0) 75460b57cec5SDimitry Andric .add(Src1); 75470b57cec5SDimitry Andric MachineInstr *Not = 75480b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); 75490b57cec5SDimitry Andric Worklist.insert(Not); 75500b57cec5SDimitry Andric } 75510b57cec5SDimitry Andric 75520b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDest); 75530b57cec5SDimitry Andric 75540b57cec5SDimitry Andric Worklist.insert(Xor); 75550b57cec5SDimitry Andric 75560b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 75570b57cec5SDimitry Andric } 75580b57cec5SDimitry Andric } 75590b57cec5SDimitry Andric 756006c3fb27SDimitry Andric void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist, 75610b57cec5SDimitry Andric MachineInstr &Inst, 75620b57cec5SDimitry Andric unsigned Opcode) const { 75630b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 75640b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 75650b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 75660b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 75670b57cec5SDimitry Andric 75680b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 75690b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 75700b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 75710b57cec5SDimitry Andric 75728bcb0991SDimitry Andric Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 75738bcb0991SDimitry Andric Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 75740b57cec5SDimitry Andric 75750b57cec5SDimitry Andric MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 75760b57cec5SDimitry Andric .add(Src0) 75770b57cec5SDimitry Andric .add(Src1); 75780b57cec5SDimitry Andric 75790b57cec5SDimitry Andric MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 75800b57cec5SDimitry Andric .addReg(Interm); 75810b57cec5SDimitry Andric 75820b57cec5SDimitry Andric Worklist.insert(&Op); 75830b57cec5SDimitry Andric Worklist.insert(&Not); 75840b57cec5SDimitry Andric 75850b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDest); 75860b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 75870b57cec5SDimitry Andric } 75880b57cec5SDimitry Andric 758906c3fb27SDimitry Andric void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist, 75900b57cec5SDimitry Andric MachineInstr &Inst, 75910b57cec5SDimitry Andric unsigned Opcode) const { 75920b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 75930b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 75940b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 
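// This expands S_ANDN2_B32 / S_ORN2_B32: first compute ~Src1 into Interm
// with S_NOT_B32, then apply the base AND/OR opcode to Src0 and Interm.
// Both new scalar instructions are queued on the worklist so they can be
// moved to the VALU later if their operands require it.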
75950b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 75960b57cec5SDimitry Andric 75970b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 75980b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 75990b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 76000b57cec5SDimitry Andric 76018bcb0991SDimitry Andric Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 76028bcb0991SDimitry Andric Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 76030b57cec5SDimitry Andric 76040b57cec5SDimitry Andric MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 76050b57cec5SDimitry Andric .add(Src1); 76060b57cec5SDimitry Andric 76070b57cec5SDimitry Andric MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 76080b57cec5SDimitry Andric .add(Src0) 76090b57cec5SDimitry Andric .addReg(Interm); 76100b57cec5SDimitry Andric 76110b57cec5SDimitry Andric Worklist.insert(&Not); 76120b57cec5SDimitry Andric Worklist.insert(&Op); 76130b57cec5SDimitry Andric 76140b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDest); 76150b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 76160b57cec5SDimitry Andric } 76170b57cec5SDimitry Andric 761806c3fb27SDimitry Andric void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, 761906c3fb27SDimitry Andric MachineInstr &Inst, unsigned Opcode, 762006c3fb27SDimitry Andric bool Swap) const { 76210b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 76220b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 76230b57cec5SDimitry Andric 76240b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 76250b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 76260b57cec5SDimitry Andric DebugLoc DL = Inst.getDebugLoc(); 76270b57cec5SDimitry Andric 76280b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 76290b57cec5SDimitry Andric 76300b57cec5SDimitry Andric const MCInstrDesc &InstDesc = get(Opcode); 76310b57cec5SDimitry Andric const TargetRegisterClass *Src0RC = Src0.isReg() ? 
76320b57cec5SDimitry Andric MRI.getRegClass(Src0.getReg()) : 76330b57cec5SDimitry Andric &AMDGPU::SGPR_32RegClass; 76340b57cec5SDimitry Andric 7635bdd1243dSDimitry Andric const TargetRegisterClass *Src0SubRC = 7636bdd1243dSDimitry Andric RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 76370b57cec5SDimitry Andric 76380b57cec5SDimitry Andric MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 76390b57cec5SDimitry Andric AMDGPU::sub0, Src0SubRC); 76400b57cec5SDimitry Andric 76410b57cec5SDimitry Andric const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 76420b57cec5SDimitry Andric const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 7643bdd1243dSDimitry Andric const TargetRegisterClass *NewDestSubRC = 7644bdd1243dSDimitry Andric RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0); 76450b57cec5SDimitry Andric 76468bcb0991SDimitry Andric Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 76470b57cec5SDimitry Andric MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 76480b57cec5SDimitry Andric 76490b57cec5SDimitry Andric MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 76500b57cec5SDimitry Andric AMDGPU::sub1, Src0SubRC); 76510b57cec5SDimitry Andric 76528bcb0991SDimitry Andric Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 76530b57cec5SDimitry Andric MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 76540b57cec5SDimitry Andric 7655fe6060f1SDimitry Andric if (Swap) 7656fe6060f1SDimitry Andric std::swap(DestSub0, DestSub1); 7657fe6060f1SDimitry Andric 76588bcb0991SDimitry Andric Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 76590b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 76600b57cec5SDimitry Andric .addReg(DestSub0) 76610b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 76620b57cec5SDimitry Andric .addReg(DestSub1) 76630b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 76640b57cec5SDimitry Andric 76650b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), FullDestReg); 76660b57cec5SDimitry Andric 76670b57cec5SDimitry Andric Worklist.insert(&LoHalf); 76680b57cec5SDimitry Andric Worklist.insert(&HiHalf); 76690b57cec5SDimitry Andric 76700b57cec5SDimitry Andric // We don't need to legalizeOperands here because for a single operand, src0 76710b57cec5SDimitry Andric // will support any kind of input. 76720b57cec5SDimitry Andric 76730b57cec5SDimitry Andric // Move all users of this moved value. 76740b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 76750b57cec5SDimitry Andric } 76760b57cec5SDimitry Andric 7677*1db9f3b2SDimitry Andric // There is not a vector equivalent of s_mul_u64. For this reason, we need to 7678*1db9f3b2SDimitry Andric // split the s_mul_u64 in 32-bit vector multiplications. 
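// Worked example: for Op0 = {Op0H = 1, Op0L = 3} and Op1 = {Op1H = 2, Op1L = 5},
// the low half of the result is Op1L*Op0L = 15 and the high half is
// Op1H*Op0L + Op1L*Op0H + carry = 2*3 + 5*1 + 0 = 11, i.e. the truncated
// 64-bit product 11 * 2^32 + 15.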
7679*1db9f3b2SDimitry Andric void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7680*1db9f3b2SDimitry Andric MachineInstr &Inst,
7681*1db9f3b2SDimitry Andric MachineDominatorTree *MDT) const {
7682*1db9f3b2SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent();
7683*1db9f3b2SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7684*1db9f3b2SDimitry Andric 
7685*1db9f3b2SDimitry Andric Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7686*1db9f3b2SDimitry Andric Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7687*1db9f3b2SDimitry Andric Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7688*1db9f3b2SDimitry Andric 
7689*1db9f3b2SDimitry Andric MachineOperand &Dest = Inst.getOperand(0);
7690*1db9f3b2SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1);
7691*1db9f3b2SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2);
7692*1db9f3b2SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc();
7693*1db9f3b2SDimitry Andric MachineBasicBlock::iterator MII = Inst;
7694*1db9f3b2SDimitry Andric 
7695*1db9f3b2SDimitry Andric const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7696*1db9f3b2SDimitry Andric const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7697*1db9f3b2SDimitry Andric const TargetRegisterClass *Src0SubRC =
7698*1db9f3b2SDimitry Andric RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7699*1db9f3b2SDimitry Andric if (RI.isSGPRClass(Src0SubRC))
7700*1db9f3b2SDimitry Andric Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7701*1db9f3b2SDimitry Andric const TargetRegisterClass *Src1SubRC =
7702*1db9f3b2SDimitry Andric RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7703*1db9f3b2SDimitry Andric if (RI.isSGPRClass(Src1SubRC))
7704*1db9f3b2SDimitry Andric Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7705*1db9f3b2SDimitry Andric 
7706*1db9f3b2SDimitry Andric // First, we extract the low 32-bit and high 32-bit values from each of the
7707*1db9f3b2SDimitry Andric // operands.
7708*1db9f3b2SDimitry Andric MachineOperand Op0L =
7709*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7710*1db9f3b2SDimitry Andric MachineOperand Op1L =
7711*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7712*1db9f3b2SDimitry Andric MachineOperand Op0H =
7713*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7714*1db9f3b2SDimitry Andric MachineOperand Op1H =
7715*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7716*1db9f3b2SDimitry Andric 
7717*1db9f3b2SDimitry Andric // The multiplication is done as follows:
7718*1db9f3b2SDimitry Andric //
7719*1db9f3b2SDimitry Andric // Op1H Op1L
7720*1db9f3b2SDimitry Andric // * Op0H Op0L
7721*1db9f3b2SDimitry Andric // --------------------
7722*1db9f3b2SDimitry Andric // Op1H*Op0L Op1L*Op0L
7723*1db9f3b2SDimitry Andric // + Op1H*Op0H Op1L*Op0H
7724*1db9f3b2SDimitry Andric // -----------------------------------------
7725*1db9f3b2SDimitry Andric // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7726*1db9f3b2SDimitry Andric //
7727*1db9f3b2SDimitry Andric // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7728*1db9f3b2SDimitry Andric // value and that would overflow.
7729*1db9f3b2SDimitry Andric // The low 32-bit value is Op1L*Op0L.
7730*1db9f3b2SDimitry Andric // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L). 7731*1db9f3b2SDimitry Andric 7732*1db9f3b2SDimitry Andric Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7733*1db9f3b2SDimitry Andric MachineInstr *Op1L_Op0H = 7734*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg) 7735*1db9f3b2SDimitry Andric .add(Op1L) 7736*1db9f3b2SDimitry Andric .add(Op0H); 7737*1db9f3b2SDimitry Andric 7738*1db9f3b2SDimitry Andric Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7739*1db9f3b2SDimitry Andric MachineInstr *Op1H_Op0L = 7740*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg) 7741*1db9f3b2SDimitry Andric .add(Op1H) 7742*1db9f3b2SDimitry Andric .add(Op0L); 7743*1db9f3b2SDimitry Andric 7744*1db9f3b2SDimitry Andric Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7745*1db9f3b2SDimitry Andric MachineInstr *Carry = 7746*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg) 7747*1db9f3b2SDimitry Andric .add(Op1L) 7748*1db9f3b2SDimitry Andric .add(Op0L); 7749*1db9f3b2SDimitry Andric 7750*1db9f3b2SDimitry Andric MachineInstr *LoHalf = 7751*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) 7752*1db9f3b2SDimitry Andric .add(Op1L) 7753*1db9f3b2SDimitry Andric .add(Op0L); 7754*1db9f3b2SDimitry Andric 7755*1db9f3b2SDimitry Andric Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7756*1db9f3b2SDimitry Andric MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg) 7757*1db9f3b2SDimitry Andric .addReg(Op1L_Op0H_Reg) 7758*1db9f3b2SDimitry Andric .addReg(Op1H_Op0L_Reg); 7759*1db9f3b2SDimitry Andric 7760*1db9f3b2SDimitry Andric MachineInstr *HiHalf = 7761*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1) 7762*1db9f3b2SDimitry Andric .addReg(AddReg) 7763*1db9f3b2SDimitry Andric .addReg(CarryReg); 7764*1db9f3b2SDimitry Andric 7765*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 7766*1db9f3b2SDimitry Andric .addReg(DestSub0) 7767*1db9f3b2SDimitry Andric .addImm(AMDGPU::sub0) 7768*1db9f3b2SDimitry Andric .addReg(DestSub1) 7769*1db9f3b2SDimitry Andric .addImm(AMDGPU::sub1); 7770*1db9f3b2SDimitry Andric 7771*1db9f3b2SDimitry Andric MRI.replaceRegWith(Dest.getReg(), FullDestReg); 7772*1db9f3b2SDimitry Andric 7773*1db9f3b2SDimitry Andric // Try to legalize the operands in case we need to swap the order to keep it 7774*1db9f3b2SDimitry Andric // valid. 7775*1db9f3b2SDimitry Andric legalizeOperands(*Op1L_Op0H, MDT); 7776*1db9f3b2SDimitry Andric legalizeOperands(*Op1H_Op0L, MDT); 7777*1db9f3b2SDimitry Andric legalizeOperands(*Carry, MDT); 7778*1db9f3b2SDimitry Andric legalizeOperands(*LoHalf, MDT); 7779*1db9f3b2SDimitry Andric legalizeOperands(*Add, MDT); 7780*1db9f3b2SDimitry Andric legalizeOperands(*HiHalf, MDT); 7781*1db9f3b2SDimitry Andric 7782*1db9f3b2SDimitry Andric // Move all users of this moved value. 7783*1db9f3b2SDimitry Andric addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 7784*1db9f3b2SDimitry Andric } 7785*1db9f3b2SDimitry Andric 7786*1db9f3b2SDimitry Andric // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector 7787*1db9f3b2SDimitry Andric // multiplications. 
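// Because both operands are known to be zero- or sign-extended from 32 bits,
// the full 64-bit product is just {mul_hi(Op1L, Op0L), mul_lo(Op1L, Op0L)} of
// the low halves; no cross terms are needed.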
7788*1db9f3b2SDimitry Andric void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist, 7789*1db9f3b2SDimitry Andric MachineInstr &Inst, 7790*1db9f3b2SDimitry Andric MachineDominatorTree *MDT) const { 7791*1db9f3b2SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 7792*1db9f3b2SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7793*1db9f3b2SDimitry Andric 7794*1db9f3b2SDimitry Andric Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 7795*1db9f3b2SDimitry Andric Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7796*1db9f3b2SDimitry Andric Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7797*1db9f3b2SDimitry Andric 7798*1db9f3b2SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 7799*1db9f3b2SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 7800*1db9f3b2SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 7801*1db9f3b2SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 7802*1db9f3b2SDimitry Andric MachineBasicBlock::iterator MII = Inst; 7803*1db9f3b2SDimitry Andric 7804*1db9f3b2SDimitry Andric const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 7805*1db9f3b2SDimitry Andric const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 7806*1db9f3b2SDimitry Andric const TargetRegisterClass *Src0SubRC = 7807*1db9f3b2SDimitry Andric RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 7808*1db9f3b2SDimitry Andric if (RI.isSGPRClass(Src0SubRC)) 7809*1db9f3b2SDimitry Andric Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); 7810*1db9f3b2SDimitry Andric const TargetRegisterClass *Src1SubRC = 7811*1db9f3b2SDimitry Andric RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); 7812*1db9f3b2SDimitry Andric if (RI.isSGPRClass(Src1SubRC)) 7813*1db9f3b2SDimitry Andric Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); 7814*1db9f3b2SDimitry Andric 7815*1db9f3b2SDimitry Andric // First, we extract the low 32-bit and high 32-bit values from each of the 7816*1db9f3b2SDimitry Andric // operands. 7817*1db9f3b2SDimitry Andric MachineOperand Op0L = 7818*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 7819*1db9f3b2SDimitry Andric MachineOperand Op1L = 7820*1db9f3b2SDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 7821*1db9f3b2SDimitry Andric 7822*1db9f3b2SDimitry Andric unsigned Opc = Inst.getOpcode(); 7823*1db9f3b2SDimitry Andric unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO 7824*1db9f3b2SDimitry Andric ? 
AMDGPU::V_MUL_HI_U32_e64 7825*1db9f3b2SDimitry Andric : AMDGPU::V_MUL_HI_I32_e64; 7826*1db9f3b2SDimitry Andric MachineInstr *HiHalf = 7827*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L); 7828*1db9f3b2SDimitry Andric 7829*1db9f3b2SDimitry Andric MachineInstr *LoHalf = 7830*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) 7831*1db9f3b2SDimitry Andric .add(Op1L) 7832*1db9f3b2SDimitry Andric .add(Op0L); 7833*1db9f3b2SDimitry Andric 7834*1db9f3b2SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 7835*1db9f3b2SDimitry Andric .addReg(DestSub0) 7836*1db9f3b2SDimitry Andric .addImm(AMDGPU::sub0) 7837*1db9f3b2SDimitry Andric .addReg(DestSub1) 7838*1db9f3b2SDimitry Andric .addImm(AMDGPU::sub1); 7839*1db9f3b2SDimitry Andric 7840*1db9f3b2SDimitry Andric MRI.replaceRegWith(Dest.getReg(), FullDestReg); 7841*1db9f3b2SDimitry Andric 7842*1db9f3b2SDimitry Andric // Try to legalize the operands in case we need to swap the order to keep it 7843*1db9f3b2SDimitry Andric // valid. 7844*1db9f3b2SDimitry Andric legalizeOperands(*HiHalf, MDT); 7845*1db9f3b2SDimitry Andric legalizeOperands(*LoHalf, MDT); 7846*1db9f3b2SDimitry Andric 7847*1db9f3b2SDimitry Andric // Move all users of this moved value. 7848*1db9f3b2SDimitry Andric addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 7849*1db9f3b2SDimitry Andric } 7850*1db9f3b2SDimitry Andric 785106c3fb27SDimitry Andric void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, 78520b57cec5SDimitry Andric MachineInstr &Inst, unsigned Opcode, 78530b57cec5SDimitry Andric MachineDominatorTree *MDT) const { 78540b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 78550b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 78560b57cec5SDimitry Andric 78570b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 78580b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 78590b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 78600b57cec5SDimitry Andric DebugLoc DL = Inst.getDebugLoc(); 78610b57cec5SDimitry Andric 78620b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 78630b57cec5SDimitry Andric 78640b57cec5SDimitry Andric const MCInstrDesc &InstDesc = get(Opcode); 78650b57cec5SDimitry Andric const TargetRegisterClass *Src0RC = Src0.isReg() ? 78660b57cec5SDimitry Andric MRI.getRegClass(Src0.getReg()) : 78670b57cec5SDimitry Andric &AMDGPU::SGPR_32RegClass; 78680b57cec5SDimitry Andric 7869bdd1243dSDimitry Andric const TargetRegisterClass *Src0SubRC = 7870bdd1243dSDimitry Andric RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 78710b57cec5SDimitry Andric const TargetRegisterClass *Src1RC = Src1.isReg() ? 
78720b57cec5SDimitry Andric MRI.getRegClass(Src1.getReg()) : 78730b57cec5SDimitry Andric &AMDGPU::SGPR_32RegClass; 78740b57cec5SDimitry Andric 7875bdd1243dSDimitry Andric const TargetRegisterClass *Src1SubRC = 7876bdd1243dSDimitry Andric RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); 78770b57cec5SDimitry Andric 78780b57cec5SDimitry Andric MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 78790b57cec5SDimitry Andric AMDGPU::sub0, Src0SubRC); 78800b57cec5SDimitry Andric MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 78810b57cec5SDimitry Andric AMDGPU::sub0, Src1SubRC); 78820b57cec5SDimitry Andric MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 78830b57cec5SDimitry Andric AMDGPU::sub1, Src0SubRC); 78840b57cec5SDimitry Andric MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 78850b57cec5SDimitry Andric AMDGPU::sub1, Src1SubRC); 78860b57cec5SDimitry Andric 78870b57cec5SDimitry Andric const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 78880b57cec5SDimitry Andric const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 7889bdd1243dSDimitry Andric const TargetRegisterClass *NewDestSubRC = 7890bdd1243dSDimitry Andric RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0); 78910b57cec5SDimitry Andric 78928bcb0991SDimitry Andric Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 78930b57cec5SDimitry Andric MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 78940b57cec5SDimitry Andric .add(SrcReg0Sub0) 78950b57cec5SDimitry Andric .add(SrcReg1Sub0); 78960b57cec5SDimitry Andric 78978bcb0991SDimitry Andric Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 78980b57cec5SDimitry Andric MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 78990b57cec5SDimitry Andric .add(SrcReg0Sub1) 79000b57cec5SDimitry Andric .add(SrcReg1Sub1); 79010b57cec5SDimitry Andric 79028bcb0991SDimitry Andric Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 79030b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 79040b57cec5SDimitry Andric .addReg(DestSub0) 79050b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 79060b57cec5SDimitry Andric .addReg(DestSub1) 79070b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 79080b57cec5SDimitry Andric 79090b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), FullDestReg); 79100b57cec5SDimitry Andric 79110b57cec5SDimitry Andric Worklist.insert(&LoHalf); 79120b57cec5SDimitry Andric Worklist.insert(&HiHalf); 79130b57cec5SDimitry Andric 791481ad6265SDimitry Andric // Move all users of this moved value. 
79150b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 79160b57cec5SDimitry Andric } 79170b57cec5SDimitry Andric 791806c3fb27SDimitry Andric void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist, 79190b57cec5SDimitry Andric MachineInstr &Inst, 79200b57cec5SDimitry Andric MachineDominatorTree *MDT) const { 79210b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 79220b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 79230b57cec5SDimitry Andric 79240b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 79250b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 79260b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 79270b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 79280b57cec5SDimitry Andric 79290b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 79300b57cec5SDimitry Andric 79310b57cec5SDimitry Andric const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 79320b57cec5SDimitry Andric 79338bcb0991SDimitry Andric Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 79340b57cec5SDimitry Andric 79350b57cec5SDimitry Andric MachineOperand* Op0; 79360b57cec5SDimitry Andric MachineOperand* Op1; 79370b57cec5SDimitry Andric 79380b57cec5SDimitry Andric if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 79390b57cec5SDimitry Andric Op0 = &Src0; 79400b57cec5SDimitry Andric Op1 = &Src1; 79410b57cec5SDimitry Andric } else { 79420b57cec5SDimitry Andric Op0 = &Src1; 79430b57cec5SDimitry Andric Op1 = &Src0; 79440b57cec5SDimitry Andric } 79450b57cec5SDimitry Andric 79460b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 79470b57cec5SDimitry Andric .add(*Op0); 79480b57cec5SDimitry Andric 79498bcb0991SDimitry Andric Register NewDest = MRI.createVirtualRegister(DestRC); 79500b57cec5SDimitry Andric 79510b57cec5SDimitry Andric MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 79520b57cec5SDimitry Andric .addReg(Interm) 79530b57cec5SDimitry Andric .add(*Op1); 79540b57cec5SDimitry Andric 79550b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), NewDest); 79560b57cec5SDimitry Andric 79570b57cec5SDimitry Andric Worklist.insert(&Xor); 79580b57cec5SDimitry Andric } 79590b57cec5SDimitry Andric 796006c3fb27SDimitry Andric void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist, 796106c3fb27SDimitry Andric MachineInstr &Inst) const { 79620b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 79630b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 79640b57cec5SDimitry Andric 79650b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 79660b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 79670b57cec5SDimitry Andric 79680b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 79690b57cec5SDimitry Andric MachineOperand &Src = Inst.getOperand(1); 79700b57cec5SDimitry Andric 79710b57cec5SDimitry Andric const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 79720b57cec5SDimitry Andric const TargetRegisterClass *SrcRC = Src.isReg() ? 
79730b57cec5SDimitry Andric MRI.getRegClass(Src.getReg()) : 79740b57cec5SDimitry Andric &AMDGPU::SGPR_32RegClass; 79750b57cec5SDimitry Andric 79768bcb0991SDimitry Andric Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 79778bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 79780b57cec5SDimitry Andric 7979bdd1243dSDimitry Andric const TargetRegisterClass *SrcSubRC = 7980bdd1243dSDimitry Andric RI.getSubRegisterClass(SrcRC, AMDGPU::sub0); 79810b57cec5SDimitry Andric 79820b57cec5SDimitry Andric MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 79830b57cec5SDimitry Andric AMDGPU::sub0, SrcSubRC); 79840b57cec5SDimitry Andric MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 79850b57cec5SDimitry Andric AMDGPU::sub1, SrcSubRC); 79860b57cec5SDimitry Andric 79870b57cec5SDimitry Andric BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 79880b57cec5SDimitry Andric 79890b57cec5SDimitry Andric BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 79900b57cec5SDimitry Andric 79910b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), ResultReg); 79920b57cec5SDimitry Andric 799381ad6265SDimitry Andric // We don't need to legalize operands here. src0 for either instruction can be 79940b57cec5SDimitry Andric // an SGPR, and the second input is unused or determined here. 79950b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 79960b57cec5SDimitry Andric } 79970b57cec5SDimitry Andric 799806c3fb27SDimitry Andric void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, 79990b57cec5SDimitry Andric MachineInstr &Inst) const { 80000b57cec5SDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 80010b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 80020b57cec5SDimitry Andric MachineBasicBlock::iterator MII = Inst; 80030b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 80040b57cec5SDimitry Andric 80050b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 80060b57cec5SDimitry Andric uint32_t Imm = Inst.getOperand(2).getImm(); 80070b57cec5SDimitry Andric uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 80080b57cec5SDimitry Andric uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 80090b57cec5SDimitry Andric 80100b57cec5SDimitry Andric (void) Offset; 80110b57cec5SDimitry Andric 80120b57cec5SDimitry Andric // Only sext_inreg cases handled. 
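// As a hypothetical example of such a case, a 16-bit sext_inreg of a 64-bit
// value arrives here as S_BFE_I64 with Imm == (16 << 16), i.e. Offset == 0 and
// BitWidth == 16 under the field encoding decoded above; it is expanded below
// with V_BFE_I32 on the low half and an arithmetic shift to produce the sign
// bits of the high half.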
80130b57cec5SDimitry Andric assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 80140b57cec5SDimitry Andric Offset == 0 && "Not implemented"); 80150b57cec5SDimitry Andric 80160b57cec5SDimitry Andric if (BitWidth < 32) { 80178bcb0991SDimitry Andric Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 80188bcb0991SDimitry Andric Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 80198bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 80200b57cec5SDimitry Andric 8021e8d8bef9SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) 80220b57cec5SDimitry Andric .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 80230b57cec5SDimitry Andric .addImm(0) 80240b57cec5SDimitry Andric .addImm(BitWidth); 80250b57cec5SDimitry Andric 80260b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 80270b57cec5SDimitry Andric .addImm(31) 80280b57cec5SDimitry Andric .addReg(MidRegLo); 80290b57cec5SDimitry Andric 80300b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 80310b57cec5SDimitry Andric .addReg(MidRegLo) 80320b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 80330b57cec5SDimitry Andric .addReg(MidRegHi) 80340b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 80350b57cec5SDimitry Andric 80360b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), ResultReg); 80370b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 80380b57cec5SDimitry Andric return; 80390b57cec5SDimitry Andric } 80400b57cec5SDimitry Andric 80410b57cec5SDimitry Andric MachineOperand &Src = Inst.getOperand(1); 80428bcb0991SDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 80438bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 80440b57cec5SDimitry Andric 80450b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 80460b57cec5SDimitry Andric .addImm(31) 80470b57cec5SDimitry Andric .addReg(Src.getReg(), 0, AMDGPU::sub0); 80480b57cec5SDimitry Andric 80490b57cec5SDimitry Andric BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 80500b57cec5SDimitry Andric .addReg(Src.getReg(), 0, AMDGPU::sub0) 80510b57cec5SDimitry Andric .addImm(AMDGPU::sub0) 80520b57cec5SDimitry Andric .addReg(TmpReg) 80530b57cec5SDimitry Andric .addImm(AMDGPU::sub1); 80540b57cec5SDimitry Andric 80550b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), ResultReg); 80560b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 80570b57cec5SDimitry Andric } 80580b57cec5SDimitry Andric 8059cb14a3feSDimitry Andric void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, 8060cb14a3feSDimitry Andric MachineInstr &Inst, unsigned Opcode, 8061cb14a3feSDimitry Andric MachineDominatorTree *MDT) const { 8062cb14a3feSDimitry Andric // (S_FLBIT_I32_B64 hi:lo) -> 8063cb14a3feSDimitry Andric // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32)) 8064cb14a3feSDimitry Andric // (S_FF1_I32_B64 hi:lo) -> 8065cb14a3feSDimitry Andric // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo)) 8066cb14a3feSDimitry Andric 8067cb14a3feSDimitry Andric MachineBasicBlock &MBB = *Inst.getParent(); 8068cb14a3feSDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 8069cb14a3feSDimitry Andric MachineBasicBlock::iterator MII = Inst; 8070cb14a3feSDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 
8071cb14a3feSDimitry Andric 8072cb14a3feSDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 8073cb14a3feSDimitry Andric MachineOperand &Src = Inst.getOperand(1); 8074cb14a3feSDimitry Andric 8075cb14a3feSDimitry Andric const MCInstrDesc &InstDesc = get(Opcode); 8076cb14a3feSDimitry Andric 8077cb14a3feSDimitry Andric bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32; 8078cb14a3feSDimitry Andric unsigned OpcodeAdd = 8079cb14a3feSDimitry Andric ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; 8080cb14a3feSDimitry Andric 8081cb14a3feSDimitry Andric const TargetRegisterClass *SrcRC = 8082cb14a3feSDimitry Andric Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; 8083cb14a3feSDimitry Andric const TargetRegisterClass *SrcSubRC = 8084cb14a3feSDimitry Andric RI.getSubRegisterClass(SrcRC, AMDGPU::sub0); 8085cb14a3feSDimitry Andric 8086cb14a3feSDimitry Andric MachineOperand SrcRegSub0 = 8087cb14a3feSDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC); 8088cb14a3feSDimitry Andric MachineOperand SrcRegSub1 = 8089cb14a3feSDimitry Andric buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); 8090cb14a3feSDimitry Andric 8091cb14a3feSDimitry Andric Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 8092cb14a3feSDimitry Andric Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 8093cb14a3feSDimitry Andric Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 8094cb14a3feSDimitry Andric Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 8095cb14a3feSDimitry Andric 8096cb14a3feSDimitry Andric BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0); 8097cb14a3feSDimitry Andric 8098cb14a3feSDimitry Andric BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1); 8099cb14a3feSDimitry Andric 8100cb14a3feSDimitry Andric BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3) 8101cb14a3feSDimitry Andric .addReg(IsCtlz ? MidReg1 : MidReg2) 8102cb14a3feSDimitry Andric .addImm(32) 8103cb14a3feSDimitry Andric .addImm(1); // enable clamp 8104cb14a3feSDimitry Andric 8105cb14a3feSDimitry Andric BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4) 8106cb14a3feSDimitry Andric .addReg(MidReg3) 8107cb14a3feSDimitry Andric .addReg(IsCtlz ? 
MidReg2 : MidReg1); 8108cb14a3feSDimitry Andric 8109cb14a3feSDimitry Andric MRI.replaceRegWith(Dest.getReg(), MidReg4); 8110cb14a3feSDimitry Andric 8111cb14a3feSDimitry Andric addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist); 8112cb14a3feSDimitry Andric } 8113cb14a3feSDimitry Andric 81140b57cec5SDimitry Andric void SIInstrInfo::addUsersToMoveToVALUWorklist( 811506c3fb27SDimitry Andric Register DstReg, MachineRegisterInfo &MRI, 811606c3fb27SDimitry Andric SIInstrWorklist &Worklist) const { 81170b57cec5SDimitry Andric for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 81180b57cec5SDimitry Andric E = MRI.use_end(); I != E;) { 81190b57cec5SDimitry Andric MachineInstr &UseMI = *I->getParent(); 81200b57cec5SDimitry Andric 81210b57cec5SDimitry Andric unsigned OpNo = 0; 81220b57cec5SDimitry Andric 81230b57cec5SDimitry Andric switch (UseMI.getOpcode()) { 81240b57cec5SDimitry Andric case AMDGPU::COPY: 81250b57cec5SDimitry Andric case AMDGPU::WQM: 81268bcb0991SDimitry Andric case AMDGPU::SOFT_WQM: 8127fe6060f1SDimitry Andric case AMDGPU::STRICT_WWM: 8128fe6060f1SDimitry Andric case AMDGPU::STRICT_WQM: 81290b57cec5SDimitry Andric case AMDGPU::REG_SEQUENCE: 81300b57cec5SDimitry Andric case AMDGPU::PHI: 81310b57cec5SDimitry Andric case AMDGPU::INSERT_SUBREG: 81320b57cec5SDimitry Andric break; 81330b57cec5SDimitry Andric default: 81340b57cec5SDimitry Andric OpNo = I.getOperandNo(); 81350b57cec5SDimitry Andric break; 81360b57cec5SDimitry Andric } 81370b57cec5SDimitry Andric 81380b57cec5SDimitry Andric if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 81390b57cec5SDimitry Andric Worklist.insert(&UseMI); 81400b57cec5SDimitry Andric 81410b57cec5SDimitry Andric do { 81420b57cec5SDimitry Andric ++I; 81430b57cec5SDimitry Andric } while (I != E && I->getParent() == &UseMI); 81440b57cec5SDimitry Andric } else { 81450b57cec5SDimitry Andric ++I; 81460b57cec5SDimitry Andric } 81470b57cec5SDimitry Andric } 81480b57cec5SDimitry Andric } 81490b57cec5SDimitry Andric 815006c3fb27SDimitry Andric void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, 81510b57cec5SDimitry Andric MachineRegisterInfo &MRI, 81520b57cec5SDimitry Andric MachineInstr &Inst) const { 81538bcb0991SDimitry Andric Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 81540b57cec5SDimitry Andric MachineBasicBlock *MBB = Inst.getParent(); 81550b57cec5SDimitry Andric MachineOperand &Src0 = Inst.getOperand(1); 81560b57cec5SDimitry Andric MachineOperand &Src1 = Inst.getOperand(2); 81570b57cec5SDimitry Andric const DebugLoc &DL = Inst.getDebugLoc(); 81580b57cec5SDimitry Andric 81590b57cec5SDimitry Andric switch (Inst.getOpcode()) { 81600b57cec5SDimitry Andric case AMDGPU::S_PACK_LL_B32_B16: { 81618bcb0991SDimitry Andric Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 81628bcb0991SDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 81630b57cec5SDimitry Andric 81640b57cec5SDimitry Andric // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 81650b57cec5SDimitry Andric // 0. 
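// The sequence built below is, roughly:
//   v_mov_b32     imm, 0xffff
//   v_and_b32     tmp, imm, src0        ; keep the low 16 bits of src0
//   v_lshl_or_b32 dst, src1, 16, tmp    ; dst = (src1 << 16) | tmp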
81660b57cec5SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 81670b57cec5SDimitry Andric .addImm(0xffff); 81680b57cec5SDimitry Andric 81690b57cec5SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 81700b57cec5SDimitry Andric .addReg(ImmReg, RegState::Kill) 81710b57cec5SDimitry Andric .add(Src0); 81720b57cec5SDimitry Andric 8173e8d8bef9SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 81740b57cec5SDimitry Andric .add(Src1) 81750b57cec5SDimitry Andric .addImm(16) 81760b57cec5SDimitry Andric .addReg(TmpReg, RegState::Kill); 81770b57cec5SDimitry Andric break; 81780b57cec5SDimitry Andric } 81790b57cec5SDimitry Andric case AMDGPU::S_PACK_LH_B32_B16: { 81808bcb0991SDimitry Andric Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 81810b57cec5SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 81820b57cec5SDimitry Andric .addImm(0xffff); 8183e8d8bef9SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) 81840b57cec5SDimitry Andric .addReg(ImmReg, RegState::Kill) 81850b57cec5SDimitry Andric .add(Src0) 81860b57cec5SDimitry Andric .add(Src1); 81870b57cec5SDimitry Andric break; 81880b57cec5SDimitry Andric } 818981ad6265SDimitry Andric case AMDGPU::S_PACK_HL_B32_B16: { 819081ad6265SDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 819181ad6265SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 819281ad6265SDimitry Andric .addImm(16) 819381ad6265SDimitry Andric .add(Src0); 819481ad6265SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 819581ad6265SDimitry Andric .add(Src1) 819681ad6265SDimitry Andric .addImm(16) 819781ad6265SDimitry Andric .addReg(TmpReg, RegState::Kill); 819881ad6265SDimitry Andric break; 819981ad6265SDimitry Andric } 82000b57cec5SDimitry Andric case AMDGPU::S_PACK_HH_B32_B16: { 82018bcb0991SDimitry Andric Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 82028bcb0991SDimitry Andric Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 82030b57cec5SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 82040b57cec5SDimitry Andric .addImm(16) 82050b57cec5SDimitry Andric .add(Src0); 82060b57cec5SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 82070b57cec5SDimitry Andric .addImm(0xffff0000); 8208e8d8bef9SDimitry Andric BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) 82090b57cec5SDimitry Andric .add(Src1) 82100b57cec5SDimitry Andric .addReg(ImmReg, RegState::Kill) 82110b57cec5SDimitry Andric .addReg(TmpReg, RegState::Kill); 82120b57cec5SDimitry Andric break; 82130b57cec5SDimitry Andric } 82140b57cec5SDimitry Andric default: 82150b57cec5SDimitry Andric llvm_unreachable("unhandled s_pack_* instruction"); 82160b57cec5SDimitry Andric } 82170b57cec5SDimitry Andric 82180b57cec5SDimitry Andric MachineOperand &Dest = Inst.getOperand(0); 82190b57cec5SDimitry Andric MRI.replaceRegWith(Dest.getReg(), ResultReg); 82200b57cec5SDimitry Andric addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 82210b57cec5SDimitry Andric } 82220b57cec5SDimitry Andric 82230b57cec5SDimitry Andric void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 82240b57cec5SDimitry Andric MachineInstr &SCCDefInst, 822506c3fb27SDimitry Andric SIInstrWorklist &Worklist, 8226349cc55cSDimitry Andric Register NewCond) const { 82275ffd83dbSDimitry Andric 
82280b57cec5SDimitry Andric // Ensure that def inst defines SCC, which is still live. 82290b57cec5SDimitry Andric assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 82300b57cec5SDimitry Andric !Op.isDead() && Op.getParent() == &SCCDefInst); 82315ffd83dbSDimitry Andric SmallVector<MachineInstr *, 4> CopyToDelete; 82320b57cec5SDimitry Andric // This assumes that all the users of SCC are in the same block 82330b57cec5SDimitry Andric // as the SCC def. 82340b57cec5SDimitry Andric for (MachineInstr &MI : // Skip the def inst itself. 82350b57cec5SDimitry Andric make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 82360b57cec5SDimitry Andric SCCDefInst.getParent()->end())) { 82370b57cec5SDimitry Andric // Check if SCC is used first. 8238349cc55cSDimitry Andric int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); 8239349cc55cSDimitry Andric if (SCCIdx != -1) { 82405ffd83dbSDimitry Andric if (MI.isCopy()) { 82415ffd83dbSDimitry Andric MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 8242e8d8bef9SDimitry Andric Register DestReg = MI.getOperand(0).getReg(); 82435ffd83dbSDimitry Andric 8244349cc55cSDimitry Andric MRI.replaceRegWith(DestReg, NewCond); 82455ffd83dbSDimitry Andric CopyToDelete.push_back(&MI); 82465ffd83dbSDimitry Andric } else { 8247349cc55cSDimitry Andric 8248349cc55cSDimitry Andric if (NewCond.isValid()) 8249349cc55cSDimitry Andric MI.getOperand(SCCIdx).setReg(NewCond); 82505ffd83dbSDimitry Andric 82510b57cec5SDimitry Andric Worklist.insert(&MI); 82525ffd83dbSDimitry Andric } 82535ffd83dbSDimitry Andric } 82540b57cec5SDimitry Andric // Exit if we find another SCC def. 82550b57cec5SDimitry Andric if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 82565ffd83dbSDimitry Andric break; 82575ffd83dbSDimitry Andric } 82585ffd83dbSDimitry Andric for (auto &Copy : CopyToDelete) 82595ffd83dbSDimitry Andric Copy->eraseFromParent(); 82600b57cec5SDimitry Andric } 82610b57cec5SDimitry Andric 8262fe6060f1SDimitry Andric // Instructions that use SCC may be converted to VALU instructions. When that 8263fe6060f1SDimitry Andric // happens, the SCC register is changed to VCC_LO. The instruction that defines 8264fe6060f1SDimitry Andric // SCC must be changed to an instruction that defines VCC. This function makes 8265fe6060f1SDimitry Andric // sure that the instruction that defines SCC is added to the moveToVALU 8266fe6060f1SDimitry Andric // worklist. 8267bdd1243dSDimitry Andric void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, 826806c3fb27SDimitry Andric SIInstrWorklist &Worklist) const { 826981ad6265SDimitry Andric // Look for a preceding instruction that either defines VCC or SCC. If VCC 8270fe6060f1SDimitry Andric // then there is nothing to do because the defining instruction has been 8271fe6060f1SDimitry Andric // converted to a VALU already. If SCC then that instruction needs to be 8272fe6060f1SDimitry Andric // converted to a VALU. 
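// A hypothetical example (not taken from a real test):
//   s_cmp_lt_i32  s0, s1       ; defines SCC
//   s_cselect_b32 s2, s3, s4   ; SCC user that is being moved to the VALU
// The s_cmp must also be rewritten as a VALU compare that defines VCC so the
// select still has its condition; the scan below finds that defining
// instruction and queues it.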
8273fe6060f1SDimitry Andric for (MachineInstr &MI : 8274fe6060f1SDimitry Andric make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), 8275fe6060f1SDimitry Andric SCCUseInst->getParent()->rend())) { 8276fe6060f1SDimitry Andric if (MI.modifiesRegister(AMDGPU::VCC, &RI)) 8277fe6060f1SDimitry Andric break; 8278fe6060f1SDimitry Andric if (MI.definesRegister(AMDGPU::SCC, &RI)) { 8279fe6060f1SDimitry Andric Worklist.insert(&MI); 8280fe6060f1SDimitry Andric break; 8281fe6060f1SDimitry Andric } 8282fe6060f1SDimitry Andric } 8283fe6060f1SDimitry Andric } 8284fe6060f1SDimitry Andric 82850b57cec5SDimitry Andric const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 82860b57cec5SDimitry Andric const MachineInstr &Inst) const { 82870b57cec5SDimitry Andric const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 82880b57cec5SDimitry Andric 82890b57cec5SDimitry Andric switch (Inst.getOpcode()) { 82900b57cec5SDimitry Andric // For target instructions, getOpRegClass just returns the virtual register 82910b57cec5SDimitry Andric // class associated with the operand, so we need to find an equivalent VGPR 82920b57cec5SDimitry Andric // register class in order to move the instruction to the VALU. 82930b57cec5SDimitry Andric case AMDGPU::COPY: 82940b57cec5SDimitry Andric case AMDGPU::PHI: 82950b57cec5SDimitry Andric case AMDGPU::REG_SEQUENCE: 82960b57cec5SDimitry Andric case AMDGPU::INSERT_SUBREG: 82970b57cec5SDimitry Andric case AMDGPU::WQM: 82988bcb0991SDimitry Andric case AMDGPU::SOFT_WQM: 8299fe6060f1SDimitry Andric case AMDGPU::STRICT_WWM: 8300fe6060f1SDimitry Andric case AMDGPU::STRICT_WQM: { 83010b57cec5SDimitry Andric const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 83024824e7fdSDimitry Andric if (RI.isAGPRClass(SrcRC)) { 83034824e7fdSDimitry Andric if (RI.isAGPRClass(NewDstRC)) 83040b57cec5SDimitry Andric return nullptr; 83050b57cec5SDimitry Andric 83068bcb0991SDimitry Andric switch (Inst.getOpcode()) { 83078bcb0991SDimitry Andric case AMDGPU::PHI: 83088bcb0991SDimitry Andric case AMDGPU::REG_SEQUENCE: 83098bcb0991SDimitry Andric case AMDGPU::INSERT_SUBREG: 83100b57cec5SDimitry Andric NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 83118bcb0991SDimitry Andric break; 83128bcb0991SDimitry Andric default: 83138bcb0991SDimitry Andric NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 83148bcb0991SDimitry Andric } 83158bcb0991SDimitry Andric 83160b57cec5SDimitry Andric if (!NewDstRC) 83170b57cec5SDimitry Andric return nullptr; 83180b57cec5SDimitry Andric } else { 83194824e7fdSDimitry Andric if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 83200b57cec5SDimitry Andric return nullptr; 83210b57cec5SDimitry Andric 83220b57cec5SDimitry Andric NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 83230b57cec5SDimitry Andric if (!NewDstRC) 83240b57cec5SDimitry Andric return nullptr; 83250b57cec5SDimitry Andric } 83260b57cec5SDimitry Andric 83270b57cec5SDimitry Andric return NewDstRC; 83280b57cec5SDimitry Andric } 83290b57cec5SDimitry Andric default: 83300b57cec5SDimitry Andric return NewDstRC; 83310b57cec5SDimitry Andric } 83320b57cec5SDimitry Andric } 83330b57cec5SDimitry Andric 83340b57cec5SDimitry Andric // Find the one SGPR operand we are allowed to use. 
83355ffd83dbSDimitry Andric Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 83360b57cec5SDimitry Andric int OpIndices[3]) const { 83370b57cec5SDimitry Andric const MCInstrDesc &Desc = MI.getDesc(); 83380b57cec5SDimitry Andric 83390b57cec5SDimitry Andric // Find the one SGPR operand we are allowed to use. 83400b57cec5SDimitry Andric // 83410b57cec5SDimitry Andric // First we need to consider the instruction's operand requirements before 83420b57cec5SDimitry Andric // legalizing. Some operands are required to be SGPRs, such as implicit uses 83430b57cec5SDimitry Andric // of VCC, but we are still bound by the constant bus requirement to only use 83440b57cec5SDimitry Andric // one. 83450b57cec5SDimitry Andric // 83460b57cec5SDimitry Andric // If the operand's class is an SGPR, we can never move it. 83470b57cec5SDimitry Andric 83485ffd83dbSDimitry Andric Register SGPRReg = findImplicitSGPRRead(MI); 8349bdd1243dSDimitry Andric if (SGPRReg) 83500b57cec5SDimitry Andric return SGPRReg; 83510b57cec5SDimitry Andric 8352bdd1243dSDimitry Andric Register UsedSGPRs[3] = {Register()}; 83530b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 83540b57cec5SDimitry Andric 83550b57cec5SDimitry Andric for (unsigned i = 0; i < 3; ++i) { 83560b57cec5SDimitry Andric int Idx = OpIndices[i]; 83570b57cec5SDimitry Andric if (Idx == -1) 83580b57cec5SDimitry Andric break; 83590b57cec5SDimitry Andric 83600b57cec5SDimitry Andric const MachineOperand &MO = MI.getOperand(Idx); 83610b57cec5SDimitry Andric if (!MO.isReg()) 83620b57cec5SDimitry Andric continue; 83630b57cec5SDimitry Andric 83640b57cec5SDimitry Andric // Is this operand statically required to be an SGPR based on the operand 83650b57cec5SDimitry Andric // constraints? 8366bdd1243dSDimitry Andric const TargetRegisterClass *OpRC = 8367bdd1243dSDimitry Andric RI.getRegClass(Desc.operands()[Idx].RegClass); 83680b57cec5SDimitry Andric bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 83690b57cec5SDimitry Andric if (IsRequiredSGPR) 83700b57cec5SDimitry Andric return MO.getReg(); 83710b57cec5SDimitry Andric 83720b57cec5SDimitry Andric // If this could be a VGPR or an SGPR, Check the dynamic register class. 83738bcb0991SDimitry Andric Register Reg = MO.getReg(); 83740b57cec5SDimitry Andric const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 83750b57cec5SDimitry Andric if (RI.isSGPRClass(RegRC)) 83760b57cec5SDimitry Andric UsedSGPRs[i] = Reg; 83770b57cec5SDimitry Andric } 83780b57cec5SDimitry Andric 83790b57cec5SDimitry Andric // We don't have a required SGPR operand, so we have a bit more freedom in 83800b57cec5SDimitry Andric // selecting operands to move. 83810b57cec5SDimitry Andric 83820b57cec5SDimitry Andric // Try to select the most used SGPR. If an SGPR is equal to one of the 83830b57cec5SDimitry Andric // others, we choose that. 83840b57cec5SDimitry Andric // 83850b57cec5SDimitry Andric // e.g. 83860b57cec5SDimitry Andric // V_FMA_F32 v0, s0, s0, s0 -> No moves 83870b57cec5SDimitry Andric // V_FMA_F32 v0, s0, s1, s0 -> Move s1 83880b57cec5SDimitry Andric 83890b57cec5SDimitry Andric // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 83900b57cec5SDimitry Andric // prefer those. 
83910b57cec5SDimitry Andric 8392bdd1243dSDimitry Andric if (UsedSGPRs[0]) { 83930b57cec5SDimitry Andric if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 83940b57cec5SDimitry Andric SGPRReg = UsedSGPRs[0]; 83950b57cec5SDimitry Andric } 83960b57cec5SDimitry Andric 8397bdd1243dSDimitry Andric if (!SGPRReg && UsedSGPRs[1]) { 83980b57cec5SDimitry Andric if (UsedSGPRs[1] == UsedSGPRs[2]) 83990b57cec5SDimitry Andric SGPRReg = UsedSGPRs[1]; 84000b57cec5SDimitry Andric } 84010b57cec5SDimitry Andric 84020b57cec5SDimitry Andric return SGPRReg; 84030b57cec5SDimitry Andric } 84040b57cec5SDimitry Andric 84050b57cec5SDimitry Andric MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 84060b57cec5SDimitry Andric unsigned OperandName) const { 84070b57cec5SDimitry Andric int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 84080b57cec5SDimitry Andric if (Idx == -1) 84090b57cec5SDimitry Andric return nullptr; 84100b57cec5SDimitry Andric 84110b57cec5SDimitry Andric return &MI.getOperand(Idx); 84120b57cec5SDimitry Andric } 84130b57cec5SDimitry Andric 84140b57cec5SDimitry Andric uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 84150b57cec5SDimitry Andric if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 8416bdd1243dSDimitry Andric int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11 8417bdd1243dSDimitry Andric ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT 8418bdd1243dSDimitry Andric : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT; 841981ad6265SDimitry Andric return (Format << 44) | 84200b57cec5SDimitry Andric (1ULL << 56) | // RESOURCE_LEVEL = 1 84210b57cec5SDimitry Andric (3ULL << 60); // OOB_SELECT = 3 84220b57cec5SDimitry Andric } 84230b57cec5SDimitry Andric 84240b57cec5SDimitry Andric uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 84250b57cec5SDimitry Andric if (ST.isAmdHsaOS()) { 84260b57cec5SDimitry Andric // Set ATC = 1. GFX9 doesn't have this bit. 84270b57cec5SDimitry Andric if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) 84280b57cec5SDimitry Andric RsrcDataFormat |= (1ULL << 56); 84290b57cec5SDimitry Andric 84300b57cec5SDimitry Andric // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. 84310b57cec5SDimitry Andric // BTW, it disables TC L2 and therefore decreases performance. 84320b57cec5SDimitry Andric if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) 84330b57cec5SDimitry Andric RsrcDataFormat |= (2ULL << 59); 84340b57cec5SDimitry Andric } 84350b57cec5SDimitry Andric 84360b57cec5SDimitry Andric return RsrcDataFormat; 84370b57cec5SDimitry Andric } 84380b57cec5SDimitry Andric 84390b57cec5SDimitry Andric uint64_t SIInstrInfo::getScratchRsrcWords23() const { 84400b57cec5SDimitry Andric uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 84410b57cec5SDimitry Andric AMDGPU::RSRC_TID_ENABLE | 84420b57cec5SDimitry Andric 0xffffffff; // Size; 84430b57cec5SDimitry Andric 84440b57cec5SDimitry Andric // GFX9 doesn't have ELEMENT_SIZE. 84450b57cec5SDimitry Andric if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 8446e8d8bef9SDimitry Andric uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; 84470b57cec5SDimitry Andric Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; 84480b57cec5SDimitry Andric } 84490b57cec5SDimitry Andric 84500b57cec5SDimitry Andric // IndexStride = 64 / 32. 84510b57cec5SDimitry Andric uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 
3 : 2; 84520b57cec5SDimitry Andric Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 84530b57cec5SDimitry Andric 84540b57cec5SDimitry Andric // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 84550b57cec5SDimitry Andric // Clear them unless we want a huge stride. 84560b57cec5SDimitry Andric if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 84570b57cec5SDimitry Andric ST.getGeneration() <= AMDGPUSubtarget::GFX9) 84580b57cec5SDimitry Andric Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 84590b57cec5SDimitry Andric 84600b57cec5SDimitry Andric return Rsrc23; 84610b57cec5SDimitry Andric } 84620b57cec5SDimitry Andric 84630b57cec5SDimitry Andric bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 84640b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 84650b57cec5SDimitry Andric 84660b57cec5SDimitry Andric return isSMRD(Opc); 84670b57cec5SDimitry Andric } 84680b57cec5SDimitry Andric 84695ffd83dbSDimitry Andric bool SIInstrInfo::isHighLatencyDef(int Opc) const { 84705ffd83dbSDimitry Andric return get(Opc).mayLoad() && 84715ffd83dbSDimitry Andric (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); 84720b57cec5SDimitry Andric } 84730b57cec5SDimitry Andric 84740b57cec5SDimitry Andric unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 84750b57cec5SDimitry Andric int &FrameIndex) const { 84760b57cec5SDimitry Andric const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 84770b57cec5SDimitry Andric if (!Addr || !Addr->isFI()) 8478bdd1243dSDimitry Andric return Register(); 84790b57cec5SDimitry Andric 84800b57cec5SDimitry Andric assert(!MI.memoperands_empty() && 84810b57cec5SDimitry Andric (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 84820b57cec5SDimitry Andric 84830b57cec5SDimitry Andric FrameIndex = Addr->getIndex(); 84840b57cec5SDimitry Andric return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 84850b57cec5SDimitry Andric } 84860b57cec5SDimitry Andric 84870b57cec5SDimitry Andric unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 84880b57cec5SDimitry Andric int &FrameIndex) const { 84890b57cec5SDimitry Andric const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 84900b57cec5SDimitry Andric assert(Addr && Addr->isFI()); 84910b57cec5SDimitry Andric FrameIndex = Addr->getIndex(); 84920b57cec5SDimitry Andric return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 84930b57cec5SDimitry Andric } 84940b57cec5SDimitry Andric 84950b57cec5SDimitry Andric unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 84960b57cec5SDimitry Andric int &FrameIndex) const { 84970b57cec5SDimitry Andric if (!MI.mayLoad()) 8498bdd1243dSDimitry Andric return Register(); 84990b57cec5SDimitry Andric 85000b57cec5SDimitry Andric if (isMUBUF(MI) || isVGPRSpill(MI)) 85010b57cec5SDimitry Andric return isStackAccess(MI, FrameIndex); 85020b57cec5SDimitry Andric 85030b57cec5SDimitry Andric if (isSGPRSpill(MI)) 85040b57cec5SDimitry Andric return isSGPRStackAccess(MI, FrameIndex); 85050b57cec5SDimitry Andric 8506bdd1243dSDimitry Andric return Register(); 85070b57cec5SDimitry Andric } 85080b57cec5SDimitry Andric 85090b57cec5SDimitry Andric unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 85100b57cec5SDimitry Andric int &FrameIndex) const { 85110b57cec5SDimitry Andric if (!MI.mayStore()) 8512bdd1243dSDimitry Andric return Register(); 85130b57cec5SDimitry Andric 85140b57cec5SDimitry Andric if (isMUBUF(MI) || isVGPRSpill(MI)) 85150b57cec5SDimitry Andric return 
isStackAccess(MI, FrameIndex); 85160b57cec5SDimitry Andric 85170b57cec5SDimitry Andric if (isSGPRSpill(MI)) 85180b57cec5SDimitry Andric return isSGPRStackAccess(MI, FrameIndex); 85190b57cec5SDimitry Andric 8520bdd1243dSDimitry Andric return Register(); 85210b57cec5SDimitry Andric } 85220b57cec5SDimitry Andric 85230b57cec5SDimitry Andric unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 85240b57cec5SDimitry Andric unsigned Size = 0; 85250b57cec5SDimitry Andric MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 85260b57cec5SDimitry Andric MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 85270b57cec5SDimitry Andric while (++I != E && I->isInsideBundle()) { 85280b57cec5SDimitry Andric assert(!I->isBundle() && "No nested bundle!"); 85290b57cec5SDimitry Andric Size += getInstSizeInBytes(*I); 85300b57cec5SDimitry Andric } 85310b57cec5SDimitry Andric 85320b57cec5SDimitry Andric return Size; 85330b57cec5SDimitry Andric } 85340b57cec5SDimitry Andric 85350b57cec5SDimitry Andric unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 85360b57cec5SDimitry Andric unsigned Opc = MI.getOpcode(); 85370b57cec5SDimitry Andric const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 85380b57cec5SDimitry Andric unsigned DescSize = Desc.getSize(); 85390b57cec5SDimitry Andric 85400b57cec5SDimitry Andric // If we have a definitive size, we can use it. Otherwise we need to inspect 85410b57cec5SDimitry Andric // the operands to know the size. 8542e8d8bef9SDimitry Andric if (isFixedSize(MI)) { 8543e8d8bef9SDimitry Andric unsigned Size = DescSize; 8544e8d8bef9SDimitry Andric 8545e8d8bef9SDimitry Andric // If we hit the buggy offset, an extra nop will be inserted in MC so 8546e8d8bef9SDimitry Andric // estimate the worst case. 8547e8d8bef9SDimitry Andric if (MI.isBranch() && ST.hasOffset3fBug()) 8548e8d8bef9SDimitry Andric Size += 4; 8549e8d8bef9SDimitry Andric 8550e8d8bef9SDimitry Andric return Size; 8551e8d8bef9SDimitry Andric } 85520b57cec5SDimitry Andric 8553349cc55cSDimitry Andric // Instructions may have a 32-bit literal encoded after them. Check 8554349cc55cSDimitry Andric // operands that could ever be literals. 85550b57cec5SDimitry Andric if (isVALU(MI) || isSALU(MI)) { 8556349cc55cSDimitry Andric if (isDPP(MI)) 85570b57cec5SDimitry Andric return DescSize; 8558349cc55cSDimitry Andric bool HasLiteral = false; 8559349cc55cSDimitry Andric for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { 856081ad6265SDimitry Andric const MachineOperand &Op = MI.getOperand(I); 8561bdd1243dSDimitry Andric const MCOperandInfo &OpInfo = Desc.operands()[I]; 8562bdd1243dSDimitry Andric if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) { 8563349cc55cSDimitry Andric HasLiteral = true; 8564349cc55cSDimitry Andric break; 8565349cc55cSDimitry Andric } 8566349cc55cSDimitry Andric } 8567349cc55cSDimitry Andric return HasLiteral ? DescSize + 4 : DescSize; 85680b57cec5SDimitry Andric } 85690b57cec5SDimitry Andric 85700b57cec5SDimitry Andric // Check whether we have extra NSA words. 
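// Worked example of the formula below: if there are five vector address
// operands (RSrcIdx - VAddr0Idx == 5), the size is 8 + 4 * ((5 + 2) / 4) == 12
// bytes, i.e. the 8-byte base encoding plus one extra NSA dword.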
85710b57cec5SDimitry Andric if (isMIMG(MI)) { 85720b57cec5SDimitry Andric int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 85730b57cec5SDimitry Andric if (VAddr0Idx < 0) 85740b57cec5SDimitry Andric return 8; 85750b57cec5SDimitry Andric 85760b57cec5SDimitry Andric int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 85770b57cec5SDimitry Andric return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 85780b57cec5SDimitry Andric } 85790b57cec5SDimitry Andric 85800b57cec5SDimitry Andric switch (Opc) { 85810b57cec5SDimitry Andric case TargetOpcode::BUNDLE: 85820b57cec5SDimitry Andric return getInstBundleSize(MI); 85830b57cec5SDimitry Andric case TargetOpcode::INLINEASM: 85840b57cec5SDimitry Andric case TargetOpcode::INLINEASM_BR: { 85850b57cec5SDimitry Andric const MachineFunction *MF = MI.getParent()->getParent(); 85860b57cec5SDimitry Andric const char *AsmStr = MI.getOperand(0).getSymbolName(); 8587e8d8bef9SDimitry Andric return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); 85880b57cec5SDimitry Andric } 85890b57cec5SDimitry Andric default: 8590fe6060f1SDimitry Andric if (MI.isMetaInstruction()) 8591fe6060f1SDimitry Andric return 0; 85920b57cec5SDimitry Andric return DescSize; 85930b57cec5SDimitry Andric } 85940b57cec5SDimitry Andric } 85950b57cec5SDimitry Andric 85960b57cec5SDimitry Andric bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 85970b57cec5SDimitry Andric if (!isFLAT(MI)) 85980b57cec5SDimitry Andric return false; 85990b57cec5SDimitry Andric 86000b57cec5SDimitry Andric if (MI.memoperands_empty()) 86010b57cec5SDimitry Andric return true; 86020b57cec5SDimitry Andric 86030b57cec5SDimitry Andric for (const MachineMemOperand *MMO : MI.memoperands()) { 86040b57cec5SDimitry Andric if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 86050b57cec5SDimitry Andric return true; 86060b57cec5SDimitry Andric } 86070b57cec5SDimitry Andric return false; 86080b57cec5SDimitry Andric } 86090b57cec5SDimitry Andric 86100b57cec5SDimitry Andric bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 86110b57cec5SDimitry Andric return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 86120b57cec5SDimitry Andric } 86130b57cec5SDimitry Andric 86140b57cec5SDimitry Andric void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 86150b57cec5SDimitry Andric MachineBasicBlock *IfEnd) const { 86160b57cec5SDimitry Andric MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 86170b57cec5SDimitry Andric assert(TI != IfEntry->end()); 86180b57cec5SDimitry Andric 86190b57cec5SDimitry Andric MachineInstr *Branch = &(*TI); 86200b57cec5SDimitry Andric MachineFunction *MF = IfEntry->getParent(); 86210b57cec5SDimitry Andric MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 86220b57cec5SDimitry Andric 86230b57cec5SDimitry Andric if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 86248bcb0991SDimitry Andric Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 86250b57cec5SDimitry Andric MachineInstr *SIIF = 86260b57cec5SDimitry Andric BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 86270b57cec5SDimitry Andric .add(Branch->getOperand(0)) 86280b57cec5SDimitry Andric .add(Branch->getOperand(1)); 86290b57cec5SDimitry Andric MachineInstr *SIEND = 86300b57cec5SDimitry Andric BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 86310b57cec5SDimitry Andric .addReg(DstReg); 86320b57cec5SDimitry Andric 86330b57cec5SDimitry Andric IfEntry->erase(TI); 
86340b57cec5SDimitry Andric IfEntry->insert(IfEntry->end(), SIIF); 86350b57cec5SDimitry Andric IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 86360b57cec5SDimitry Andric } 86370b57cec5SDimitry Andric } 86380b57cec5SDimitry Andric 86390b57cec5SDimitry Andric void SIInstrInfo::convertNonUniformLoopRegion( 86400b57cec5SDimitry Andric MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 86410b57cec5SDimitry Andric MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 86420b57cec5SDimitry Andric // We expect 2 terminators, one conditional and one unconditional. 86430b57cec5SDimitry Andric assert(TI != LoopEnd->end()); 86440b57cec5SDimitry Andric 86450b57cec5SDimitry Andric MachineInstr *Branch = &(*TI); 86460b57cec5SDimitry Andric MachineFunction *MF = LoopEnd->getParent(); 86470b57cec5SDimitry Andric MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 86480b57cec5SDimitry Andric 86490b57cec5SDimitry Andric if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 86500b57cec5SDimitry Andric 86518bcb0991SDimitry Andric Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 86528bcb0991SDimitry Andric Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 86530b57cec5SDimitry Andric MachineInstrBuilder HeaderPHIBuilder = 86540b57cec5SDimitry Andric BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 8655349cc55cSDimitry Andric for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) { 8656349cc55cSDimitry Andric if (PMBB == LoopEnd) { 86570b57cec5SDimitry Andric HeaderPHIBuilder.addReg(BackEdgeReg); 86580b57cec5SDimitry Andric } else { 86598bcb0991SDimitry Andric Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 86600b57cec5SDimitry Andric materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 86610b57cec5SDimitry Andric ZeroReg, 0); 86620b57cec5SDimitry Andric HeaderPHIBuilder.addReg(ZeroReg); 86630b57cec5SDimitry Andric } 8664349cc55cSDimitry Andric HeaderPHIBuilder.addMBB(PMBB); 86650b57cec5SDimitry Andric } 86660b57cec5SDimitry Andric MachineInstr *HeaderPhi = HeaderPHIBuilder; 86670b57cec5SDimitry Andric MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 86680b57cec5SDimitry Andric get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 86690b57cec5SDimitry Andric .addReg(DstReg) 86700b57cec5SDimitry Andric .add(Branch->getOperand(0)); 86710b57cec5SDimitry Andric MachineInstr *SILOOP = 86720b57cec5SDimitry Andric BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 86730b57cec5SDimitry Andric .addReg(BackEdgeReg) 86740b57cec5SDimitry Andric .addMBB(LoopEntry); 86750b57cec5SDimitry Andric 86760b57cec5SDimitry Andric LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 86770b57cec5SDimitry Andric LoopEnd->erase(TI); 86780b57cec5SDimitry Andric LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 86790b57cec5SDimitry Andric LoopEnd->insert(LoopEnd->end(), SILOOP); 86800b57cec5SDimitry Andric } 86810b57cec5SDimitry Andric } 86820b57cec5SDimitry Andric 86830b57cec5SDimitry Andric ArrayRef<std::pair<int, const char *>> 86840b57cec5SDimitry Andric SIInstrInfo::getSerializableTargetIndices() const { 86850b57cec5SDimitry Andric static const std::pair<int, const char *> TargetIndices[] = { 86860b57cec5SDimitry Andric {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 86870b57cec5SDimitry Andric {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 86880b57cec5SDimitry Andric {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 86890b57cec5SDimitry Andric 
{AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
86900b57cec5SDimitry Andric {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8691bdd1243dSDimitry Andric return ArrayRef(TargetIndices);
86920b57cec5SDimitry Andric }
86930b57cec5SDimitry Andric 
86940b57cec5SDimitry Andric /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
86950b57cec5SDimitry Andric /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
86960b57cec5SDimitry Andric ScheduleHazardRecognizer *
86970b57cec5SDimitry Andric SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
86980b57cec5SDimitry Andric const ScheduleDAG *DAG) const {
86990b57cec5SDimitry Andric return new GCNHazardRecognizer(DAG->MF);
87000b57cec5SDimitry Andric }
87010b57cec5SDimitry Andric 
87020b57cec5SDimitry Andric /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
87030b57cec5SDimitry Andric /// pass.
87040b57cec5SDimitry Andric ScheduleHazardRecognizer *
87050b57cec5SDimitry Andric SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
87060b57cec5SDimitry Andric return new GCNHazardRecognizer(MF);
87070b57cec5SDimitry Andric }
87080b57cec5SDimitry Andric 
8709349cc55cSDimitry Andric // Called during:
8710349cc55cSDimitry Andric // - pre-RA scheduling and post-RA scheduling
8711349cc55cSDimitry Andric ScheduleHazardRecognizer *
8712349cc55cSDimitry Andric SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
8713349cc55cSDimitry Andric const ScheduleDAGMI *DAG) const {
8714349cc55cSDimitry Andric // Borrowed from the ARM target:
8715349cc55cSDimitry Andric // We would like to restrict this hazard recognizer to only
8716349cc55cSDimitry Andric // post-RA scheduling; we can tell that we're post-RA because we don't
8717349cc55cSDimitry Andric // track VRegLiveness.
8718349cc55cSDimitry Andric if (!DAG->hasVRegLiveness()) 8719349cc55cSDimitry Andric return new GCNHazardRecognizer(DAG->MF); 8720349cc55cSDimitry Andric return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); 8721349cc55cSDimitry Andric } 8722349cc55cSDimitry Andric 87230b57cec5SDimitry Andric std::pair<unsigned, unsigned> 87240b57cec5SDimitry Andric SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 8725bdd1243dSDimitry Andric return std::pair(TF & MO_MASK, TF & ~MO_MASK); 87260b57cec5SDimitry Andric } 87270b57cec5SDimitry Andric 87280b57cec5SDimitry Andric ArrayRef<std::pair<unsigned, const char *>> 87290b57cec5SDimitry Andric SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 87300b57cec5SDimitry Andric static const std::pair<unsigned, const char *> TargetFlags[] = { 87310b57cec5SDimitry Andric { MO_GOTPCREL, "amdgpu-gotprel" }, 87320b57cec5SDimitry Andric { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 87330b57cec5SDimitry Andric { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 87340b57cec5SDimitry Andric { MO_REL32_LO, "amdgpu-rel32-lo" }, 87350b57cec5SDimitry Andric { MO_REL32_HI, "amdgpu-rel32-hi" }, 87360b57cec5SDimitry Andric { MO_ABS32_LO, "amdgpu-abs32-lo" }, 87370b57cec5SDimitry Andric { MO_ABS32_HI, "amdgpu-abs32-hi" }, 87380b57cec5SDimitry Andric }; 87390b57cec5SDimitry Andric 8740bdd1243dSDimitry Andric return ArrayRef(TargetFlags); 87410b57cec5SDimitry Andric } 87420b57cec5SDimitry Andric 874381ad6265SDimitry Andric ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 874481ad6265SDimitry Andric SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { 874581ad6265SDimitry Andric static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 874681ad6265SDimitry Andric { 874781ad6265SDimitry Andric {MONoClobber, "amdgpu-noclobber"}, 874881ad6265SDimitry Andric }; 874981ad6265SDimitry Andric 8750bdd1243dSDimitry Andric return ArrayRef(TargetFlags); 875181ad6265SDimitry Andric } 875281ad6265SDimitry Andric 87535f757f3fSDimitry Andric unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, 87545f757f3fSDimitry Andric const MachineFunction &MF) const { 87555f757f3fSDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 87565f757f3fSDimitry Andric assert(SrcReg.isVirtual()); 87575f757f3fSDimitry Andric if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) 87585f757f3fSDimitry Andric return AMDGPU::WWM_COPY; 87595f757f3fSDimitry Andric 87605f757f3fSDimitry Andric return AMDGPU::COPY; 87615f757f3fSDimitry Andric } 87625f757f3fSDimitry Andric 87635f757f3fSDimitry Andric bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, 87645f757f3fSDimitry Andric Register Reg) const { 87655f757f3fSDimitry Andric // We need to handle instructions which may be inserted during register 87665f757f3fSDimitry Andric // allocation to handle the prolog. The initial prolog instruction may have 87675f757f3fSDimitry Andric // been separated from the start of the block by spills and copies inserted 87685f757f3fSDimitry Andric // needed by the prolog. However, the insertions for scalar registers can 87695f757f3fSDimitry Andric // always be placed at the BB top as they are independent of the exec mask 87705f757f3fSDimitry Andric // value. 
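// A hypothetical shape of such a prologue (illustrative only):
//   SI_SPILL_V32_RESTORE %vgpr, %stack.0, ...   ; reload inserted by RA
//   $exec = S_MOV_B64 ...                       ; exec-mask setup for the block
// The reload precedes the exec-mask write yet still belongs to the prologue,
// which is why spill opcodes are accepted in the check below.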
87715f757f3fSDimitry Andric bool IsNullOrVectorRegister = true; 87725f757f3fSDimitry Andric if (Reg) { 87735f757f3fSDimitry Andric const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 87745f757f3fSDimitry Andric IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); 87755f757f3fSDimitry Andric } 87765f757f3fSDimitry Andric 87775f757f3fSDimitry Andric uint16_t Opc = MI.getOpcode(); 87785f757f3fSDimitry Andric // FIXME: Copies inserted in the block prolog for live-range split should also 87795f757f3fSDimitry Andric // be included. 87805f757f3fSDimitry Andric return IsNullOrVectorRegister && 87815f757f3fSDimitry Andric (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY && 87825f757f3fSDimitry Andric MI.modifiesRegister(AMDGPU::EXEC, &RI))); 87830b57cec5SDimitry Andric } 87840b57cec5SDimitry Andric 87850b57cec5SDimitry Andric MachineInstrBuilder 87860b57cec5SDimitry Andric SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 87870b57cec5SDimitry Andric MachineBasicBlock::iterator I, 87880b57cec5SDimitry Andric const DebugLoc &DL, 87895ffd83dbSDimitry Andric Register DestReg) const { 87900b57cec5SDimitry Andric if (ST.hasAddNoCarry()) 87910b57cec5SDimitry Andric return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 87920b57cec5SDimitry Andric 87930b57cec5SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 87948bcb0991SDimitry Andric Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 87950b57cec5SDimitry Andric MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 87960b57cec5SDimitry Andric 8797e8d8bef9SDimitry Andric return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 87980b57cec5SDimitry Andric .addReg(UnusedCarry, RegState::Define | RegState::Dead); 87990b57cec5SDimitry Andric } 88000b57cec5SDimitry Andric 88018bcb0991SDimitry Andric MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 88028bcb0991SDimitry Andric MachineBasicBlock::iterator I, 88038bcb0991SDimitry Andric const DebugLoc &DL, 88048bcb0991SDimitry Andric Register DestReg, 88058bcb0991SDimitry Andric RegScavenger &RS) const { 88068bcb0991SDimitry Andric if (ST.hasAddNoCarry()) 88078bcb0991SDimitry Andric return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); 88088bcb0991SDimitry Andric 8809480093f4SDimitry Andric // If available, prefer to use vcc. 8810480093f4SDimitry Andric Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 8811480093f4SDimitry Andric ? Register(RI.getVCC()) 881206c3fb27SDimitry Andric : RS.scavengeRegisterBackwards( 881306c3fb27SDimitry Andric *RI.getBoolRC(), I, /* RestoreAfter */ false, 881406c3fb27SDimitry Andric 0, /* AllowSpill */ false); 8815480093f4SDimitry Andric 88168bcb0991SDimitry Andric // TODO: Users need to deal with this. 
88178bcb0991SDimitry Andric if (!UnusedCarry.isValid()) 88188bcb0991SDimitry Andric return MachineInstrBuilder(); 88198bcb0991SDimitry Andric 8820e8d8bef9SDimitry Andric return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 88218bcb0991SDimitry Andric .addReg(UnusedCarry, RegState::Define | RegState::Dead); 88228bcb0991SDimitry Andric } 88238bcb0991SDimitry Andric 88240b57cec5SDimitry Andric bool SIInstrInfo::isKillTerminator(unsigned Opcode) { 88250b57cec5SDimitry Andric switch (Opcode) { 88260b57cec5SDimitry Andric case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 88270b57cec5SDimitry Andric case AMDGPU::SI_KILL_I1_TERMINATOR: 88280b57cec5SDimitry Andric return true; 88290b57cec5SDimitry Andric default: 88300b57cec5SDimitry Andric return false; 88310b57cec5SDimitry Andric } 88320b57cec5SDimitry Andric } 88330b57cec5SDimitry Andric 88340b57cec5SDimitry Andric const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { 88350b57cec5SDimitry Andric switch (Opcode) { 88360b57cec5SDimitry Andric case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 88370b57cec5SDimitry Andric return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); 88380b57cec5SDimitry Andric case AMDGPU::SI_KILL_I1_PSEUDO: 88390b57cec5SDimitry Andric return get(AMDGPU::SI_KILL_I1_TERMINATOR); 88400b57cec5SDimitry Andric default: 88410b57cec5SDimitry Andric llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); 88420b57cec5SDimitry Andric } 88430b57cec5SDimitry Andric } 88440b57cec5SDimitry Andric 88455f757f3fSDimitry Andric bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const { 88465f757f3fSDimitry Andric return Imm <= getMaxMUBUFImmOffset(ST); 88475f757f3fSDimitry Andric } 88485f757f3fSDimitry Andric 88495f757f3fSDimitry Andric unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) { 88505f757f3fSDimitry Andric // GFX12 field is non-negative 24-bit signed byte offset. 88515f757f3fSDimitry Andric const unsigned OffsetBits = 88525f757f3fSDimitry Andric ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12; 88535f757f3fSDimitry Andric return (1 << OffsetBits) - 1; 88545f757f3fSDimitry Andric } 885506c3fb27SDimitry Andric 88560b57cec5SDimitry Andric void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { 88570b57cec5SDimitry Andric if (!ST.isWave32()) 88580b57cec5SDimitry Andric return; 88590b57cec5SDimitry Andric 886006c3fb27SDimitry Andric if (MI.isInlineAsm()) 886106c3fb27SDimitry Andric return; 886206c3fb27SDimitry Andric 88630b57cec5SDimitry Andric for (auto &Op : MI.implicit_operands()) { 88640b57cec5SDimitry Andric if (Op.isReg() && Op.getReg() == AMDGPU::VCC) 88650b57cec5SDimitry Andric Op.setReg(AMDGPU::VCC_LO); 88660b57cec5SDimitry Andric } 88670b57cec5SDimitry Andric } 88680b57cec5SDimitry Andric 88690b57cec5SDimitry Andric bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { 88700b57cec5SDimitry Andric if (!isSMRD(MI)) 88710b57cec5SDimitry Andric return false; 88720b57cec5SDimitry Andric 88730b57cec5SDimitry Andric // Check that it is using a buffer resource. 88740b57cec5SDimitry Andric int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); 88750b57cec5SDimitry Andric if (Idx == -1) // e.g. 
s_memtime 88760b57cec5SDimitry Andric return false; 88770b57cec5SDimitry Andric 8878bdd1243dSDimitry Andric const auto RCID = MI.getDesc().operands()[Idx].RegClass; 88798bcb0991SDimitry Andric return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); 88808bcb0991SDimitry Andric } 88818bcb0991SDimitry Andric 888206c3fb27SDimitry Andric // Given Imm, split it into the values to put into the SOffset and ImmOffset 888306c3fb27SDimitry Andric // fields in an MUBUF instruction. Return false if it is not possible (due to a 888406c3fb27SDimitry Andric // hardware bug needing a workaround). 888506c3fb27SDimitry Andric // 888606c3fb27SDimitry Andric // The required alignment ensures that individual address components remain 888706c3fb27SDimitry Andric // aligned if they are aligned to begin with. It also ensures that additional 888806c3fb27SDimitry Andric // offsets within the given alignment can be added to the resulting ImmOffset. 888906c3fb27SDimitry Andric bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, 889006c3fb27SDimitry Andric uint32_t &ImmOffset, Align Alignment) const { 88915f757f3fSDimitry Andric const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST); 889206c3fb27SDimitry Andric const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value()); 889306c3fb27SDimitry Andric uint32_t Overflow = 0; 889406c3fb27SDimitry Andric 889506c3fb27SDimitry Andric if (Imm > MaxImm) { 889606c3fb27SDimitry Andric if (Imm <= MaxImm + 64) { 889706c3fb27SDimitry Andric // Use an SOffset inline constant for 4..64 889806c3fb27SDimitry Andric Overflow = Imm - MaxImm; 889906c3fb27SDimitry Andric Imm = MaxImm; 890006c3fb27SDimitry Andric } else { 890106c3fb27SDimitry Andric // Try to keep the same value in SOffset for adjacent loads, so that 890206c3fb27SDimitry Andric // the corresponding register contents can be re-used. 890306c3fb27SDimitry Andric // 890406c3fb27SDimitry Andric // Load values with all low-bits (except for alignment bits) set into 890506c3fb27SDimitry Andric // SOffset, so that a larger range of values can be covered using 890606c3fb27SDimitry Andric // s_movk_i32. 890706c3fb27SDimitry Andric // 890806c3fb27SDimitry Andric // Atomic operations fail to work correctly when individual address 890906c3fb27SDimitry Andric // components are unaligned, even if their sum is aligned. 891006c3fb27SDimitry Andric uint32_t High = (Imm + Alignment.value()) & ~MaxOffset; 891106c3fb27SDimitry Andric uint32_t Low = (Imm + Alignment.value()) & MaxOffset; 891206c3fb27SDimitry Andric Imm = Low; 891306c3fb27SDimitry Andric Overflow = High - Alignment.value(); 891406c3fb27SDimitry Andric } 891506c3fb27SDimitry Andric } 891606c3fb27SDimitry Andric 89175f757f3fSDimitry Andric if (Overflow > 0) { 891806c3fb27SDimitry Andric // There is a hardware bug in SI and CI which prevents address clamping in 891906c3fb27SDimitry Andric // MUBUF instructions from working correctly with SOffsets. The immediate 892006c3fb27SDimitry Andric // offset is unaffected. 89215f757f3fSDimitry Andric if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) 892206c3fb27SDimitry Andric return false; 892306c3fb27SDimitry Andric 89245f757f3fSDimitry Andric // It is not possible to set immediate in SOffset field on some targets. 
89255f757f3fSDimitry Andric if (ST.hasRestrictedSOffset()) 89265f757f3fSDimitry Andric return false; 89275f757f3fSDimitry Andric } 89285f757f3fSDimitry Andric 892906c3fb27SDimitry Andric ImmOffset = Imm; 893006c3fb27SDimitry Andric SOffset = Overflow; 893106c3fb27SDimitry Andric return true; 893206c3fb27SDimitry Andric } 893306c3fb27SDimitry Andric 8934fe6060f1SDimitry Andric // Depending on the used address space and instructions, some immediate offsets 8935fe6060f1SDimitry Andric // are allowed and some are not. 8936fe6060f1SDimitry Andric // In general, flat instruction offsets can only be non-negative, global and 8937fe6060f1SDimitry Andric // scratch instruction offsets can also be negative. 8938fe6060f1SDimitry Andric // 8939fe6060f1SDimitry Andric // There are several bugs related to these offsets: 8940fe6060f1SDimitry Andric // On gfx10.1, flat instructions that go into the global address space cannot 8941fe6060f1SDimitry Andric // use an offset. 8942fe6060f1SDimitry Andric // 8943fe6060f1SDimitry Andric // For scratch instructions, the address can be either an SGPR or a VGPR. 8944fe6060f1SDimitry Andric // The following offsets can be used, depending on the architecture (x means 8945fe6060f1SDimitry Andric // cannot be used): 8946fe6060f1SDimitry Andric // +----------------------------+------+------+ 8947fe6060f1SDimitry Andric // | Address-Mode | SGPR | VGPR | 8948fe6060f1SDimitry Andric // +----------------------------+------+------+ 8949fe6060f1SDimitry Andric // | gfx9 | | | 8950fe6060f1SDimitry Andric // | negative, 4-aligned offset | x | ok | 8951fe6060f1SDimitry Andric // | negative, unaligned offset | x | ok | 8952fe6060f1SDimitry Andric // +----------------------------+------+------+ 8953fe6060f1SDimitry Andric // | gfx10 | | | 8954fe6060f1SDimitry Andric // | negative, 4-aligned offset | ok | ok | 8955fe6060f1SDimitry Andric // | negative, unaligned offset | ok | x | 8956fe6060f1SDimitry Andric // +----------------------------+------+------+ 8957fe6060f1SDimitry Andric // | gfx10.3 | | | 8958fe6060f1SDimitry Andric // | negative, 4-aligned offset | ok | ok | 8959fe6060f1SDimitry Andric // | negative, unaligned offset | ok | ok | 8960fe6060f1SDimitry Andric // +----------------------------+------+------+ 8961fe6060f1SDimitry Andric // 8962fe6060f1SDimitry Andric // This function ignores the addressing mode, so if an offset cannot be used in 8963fe6060f1SDimitry Andric // one addressing mode, it is considered illegal. 89640b57cec5SDimitry Andric bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, 8965fe6060f1SDimitry Andric uint64_t FlatVariant) const { 89660b57cec5SDimitry Andric // TODO: Should 0 be special cased? 
89670b57cec5SDimitry Andric if (!ST.hasFlatInstOffsets()) 89680b57cec5SDimitry Andric return false; 89690b57cec5SDimitry Andric 8970fe6060f1SDimitry Andric if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && 8971fe6060f1SDimitry Andric (AddrSpace == AMDGPUAS::FLAT_ADDRESS || 8972fe6060f1SDimitry Andric AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) 89730b57cec5SDimitry Andric return false; 89740b57cec5SDimitry Andric 8975fe6060f1SDimitry Andric if (ST.hasNegativeUnalignedScratchOffsetBug() && 8976fe6060f1SDimitry Andric FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && 8977fe6060f1SDimitry Andric (Offset % 4) != 0) { 8978fe6060f1SDimitry Andric return false; 8979fe6060f1SDimitry Andric } 8980fe6060f1SDimitry Andric 89815f757f3fSDimitry Andric bool AllowNegative = allowNegativeFlatOffset(FlatVariant); 8982bdd1243dSDimitry Andric unsigned N = AMDGPU::getNumFlatOffsetBits(ST); 8983bdd1243dSDimitry Andric return isIntN(N, Offset) && (AllowNegative || Offset >= 0); 89840b57cec5SDimitry Andric } 89850b57cec5SDimitry Andric 8986fe6060f1SDimitry Andric // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. 8987fe6060f1SDimitry Andric std::pair<int64_t, int64_t> 8988fe6060f1SDimitry Andric SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, 8989fe6060f1SDimitry Andric uint64_t FlatVariant) const { 8990e8d8bef9SDimitry Andric int64_t RemainderOffset = COffsetVal; 8991e8d8bef9SDimitry Andric int64_t ImmField = 0; 8992fe6060f1SDimitry Andric 89935f757f3fSDimitry Andric bool AllowNegative = allowNegativeFlatOffset(FlatVariant); 8994bdd1243dSDimitry Andric const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1; 89955f757f3fSDimitry Andric 8996bdd1243dSDimitry Andric if (AllowNegative) { 8997e8d8bef9SDimitry Andric // Use signed division by a power of two to truncate towards 0. 
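    // Illustrative example (assuming a 13-bit signed immediate field, i.e.
    // NumBits == 12 and D == 4096): COffsetVal == -5000 gives
    // RemainderOffset == -4096 and ImmField == -904, so the two parts sum
    // back to COffsetVal and ImmField fits in the signed immediate field.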
8998bdd1243dSDimitry Andric int64_t D = 1LL << NumBits; 8999e8d8bef9SDimitry Andric RemainderOffset = (COffsetVal / D) * D; 9000e8d8bef9SDimitry Andric ImmField = COffsetVal - RemainderOffset; 9001fe6060f1SDimitry Andric 9002fe6060f1SDimitry Andric if (ST.hasNegativeUnalignedScratchOffsetBug() && 9003fe6060f1SDimitry Andric FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && 9004fe6060f1SDimitry Andric (ImmField % 4) != 0) { 9005fe6060f1SDimitry Andric // Make ImmField a multiple of 4 9006fe6060f1SDimitry Andric RemainderOffset += ImmField % 4; 9007fe6060f1SDimitry Andric ImmField -= ImmField % 4; 9008fe6060f1SDimitry Andric } 9009e8d8bef9SDimitry Andric } else if (COffsetVal >= 0) { 9010e8d8bef9SDimitry Andric ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); 9011e8d8bef9SDimitry Andric RemainderOffset = COffsetVal - ImmField; 90120b57cec5SDimitry Andric } 90130b57cec5SDimitry Andric 9014fe6060f1SDimitry Andric assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); 9015e8d8bef9SDimitry Andric assert(RemainderOffset + ImmField == COffsetVal); 9016e8d8bef9SDimitry Andric return {ImmField, RemainderOffset}; 9017e8d8bef9SDimitry Andric } 90180b57cec5SDimitry Andric 90195f757f3fSDimitry Andric bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const { 90205f757f3fSDimitry Andric if (ST.hasNegativeScratchOffsetBug() && 90215f757f3fSDimitry Andric FlatVariant == SIInstrFlags::FlatScratch) 90225f757f3fSDimitry Andric return false; 90235f757f3fSDimitry Andric 90245f757f3fSDimitry Andric return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST); 90255f757f3fSDimitry Andric } 90265f757f3fSDimitry Andric 902706c3fb27SDimitry Andric static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { 90280b57cec5SDimitry Andric switch (ST.getGeneration()) { 90290b57cec5SDimitry Andric default: 90300b57cec5SDimitry Andric break; 90310b57cec5SDimitry Andric case AMDGPUSubtarget::SOUTHERN_ISLANDS: 90320b57cec5SDimitry Andric case AMDGPUSubtarget::SEA_ISLANDS: 90330b57cec5SDimitry Andric return SIEncodingFamily::SI; 90340b57cec5SDimitry Andric case AMDGPUSubtarget::VOLCANIC_ISLANDS: 90350b57cec5SDimitry Andric case AMDGPUSubtarget::GFX9: 90360b57cec5SDimitry Andric return SIEncodingFamily::VI; 90370b57cec5SDimitry Andric case AMDGPUSubtarget::GFX10: 90380b57cec5SDimitry Andric return SIEncodingFamily::GFX10; 903981ad6265SDimitry Andric case AMDGPUSubtarget::GFX11: 904081ad6265SDimitry Andric return SIEncodingFamily::GFX11; 90415f757f3fSDimitry Andric case AMDGPUSubtarget::GFX12: 90425f757f3fSDimitry Andric return SIEncodingFamily::GFX12; 90430b57cec5SDimitry Andric } 90440b57cec5SDimitry Andric llvm_unreachable("Unknown subtarget generation!"); 90450b57cec5SDimitry Andric } 90460b57cec5SDimitry Andric 9047480093f4SDimitry Andric bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { 9048480093f4SDimitry Andric switch(MCOp) { 9049480093f4SDimitry Andric // These opcodes use indirect register addressing so 9050480093f4SDimitry Andric // they need special handling by codegen (currently missing). 9051480093f4SDimitry Andric // Therefore it is too risky to allow these opcodes 9052480093f4SDimitry Andric // to be selected by dpp combiner or sdwa peepholer. 
9053480093f4SDimitry Andric case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 9054480093f4SDimitry Andric case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 9055480093f4SDimitry Andric case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 9056480093f4SDimitry Andric case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 9057480093f4SDimitry Andric case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 9058480093f4SDimitry Andric case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 9059480093f4SDimitry Andric case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 9060480093f4SDimitry Andric case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 9061480093f4SDimitry Andric return true; 9062480093f4SDimitry Andric default: 9063480093f4SDimitry Andric return false; 9064480093f4SDimitry Andric } 9065480093f4SDimitry Andric } 9066480093f4SDimitry Andric 90670b57cec5SDimitry Andric int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 90685f757f3fSDimitry Andric if (SIInstrInfo::isSoftWaitcnt(Opcode)) 90695f757f3fSDimitry Andric Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); 90705f757f3fSDimitry Andric 907106c3fb27SDimitry Andric unsigned Gen = subtargetEncodingFamily(ST); 90720b57cec5SDimitry Andric 90730b57cec5SDimitry Andric if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 90740b57cec5SDimitry Andric ST.getGeneration() == AMDGPUSubtarget::GFX9) 90750b57cec5SDimitry Andric Gen = SIEncodingFamily::GFX9; 90760b57cec5SDimitry Andric 90770b57cec5SDimitry Andric // Adjust the encoding family to GFX80 for D16 buffer instructions when the 90780b57cec5SDimitry Andric // subtarget has UnpackedD16VMem feature. 90790b57cec5SDimitry Andric // TODO: remove this when we discard GFX80 encoding. 90800b57cec5SDimitry Andric if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 90810b57cec5SDimitry Andric Gen = SIEncodingFamily::GFX80; 90820b57cec5SDimitry Andric 90830b57cec5SDimitry Andric if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 90840b57cec5SDimitry Andric switch (ST.getGeneration()) { 90850b57cec5SDimitry Andric default: 90860b57cec5SDimitry Andric Gen = SIEncodingFamily::SDWA; 90870b57cec5SDimitry Andric break; 90880b57cec5SDimitry Andric case AMDGPUSubtarget::GFX9: 90890b57cec5SDimitry Andric Gen = SIEncodingFamily::SDWA9; 90900b57cec5SDimitry Andric break; 90910b57cec5SDimitry Andric case AMDGPUSubtarget::GFX10: 90920b57cec5SDimitry Andric Gen = SIEncodingFamily::SDWA10; 90930b57cec5SDimitry Andric break; 90940b57cec5SDimitry Andric } 90950b57cec5SDimitry Andric } 90960b57cec5SDimitry Andric 909704eeddc0SDimitry Andric if (isMAI(Opcode)) { 909804eeddc0SDimitry Andric int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); 909904eeddc0SDimitry Andric if (MFMAOp != -1) 910004eeddc0SDimitry Andric Opcode = MFMAOp; 910104eeddc0SDimitry Andric } 910204eeddc0SDimitry Andric 91030b57cec5SDimitry Andric int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 91040b57cec5SDimitry Andric 91055f757f3fSDimitry Andric // TODO-GFX12: Remove this. 91065f757f3fSDimitry Andric // Hack to allow some GFX12 codegen tests to run before all the encodings are 91075f757f3fSDimitry Andric // implemented. 91085f757f3fSDimitry Andric if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12) 91095f757f3fSDimitry Andric MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11); 91105f757f3fSDimitry Andric 91110b57cec5SDimitry Andric // -1 means that Opcode is already a native instruction. 
91120b57cec5SDimitry Andric if (MCOp == -1)
91130b57cec5SDimitry Andric return Opcode;
91140b57cec5SDimitry Andric
9115fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) {
9116fe6060f1SDimitry Andric uint16_t NMCOp = (uint16_t)-1;
911781ad6265SDimitry Andric if (ST.hasGFX940Insts())
911881ad6265SDimitry Andric NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
911981ad6265SDimitry Andric if (NMCOp == (uint16_t)-1)
9120fe6060f1SDimitry Andric NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
9121fe6060f1SDimitry Andric if (NMCOp == (uint16_t)-1)
9122fe6060f1SDimitry Andric NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
9123fe6060f1SDimitry Andric if (NMCOp != (uint16_t)-1)
9124fe6060f1SDimitry Andric MCOp = NMCOp;
9125fe6060f1SDimitry Andric }
9126fe6060f1SDimitry Andric
91270b57cec5SDimitry Andric // (uint16_t)-1 means that Opcode is a pseudo instruction that has
91280b57cec5SDimitry Andric // no encoding in the given subtarget generation.
91290b57cec5SDimitry Andric if (MCOp == (uint16_t)-1)
91300b57cec5SDimitry Andric return -1;
91310b57cec5SDimitry Andric
9132480093f4SDimitry Andric if (isAsmOnlyOpcode(MCOp))
9133480093f4SDimitry Andric return -1;
9134480093f4SDimitry Andric
91350b57cec5SDimitry Andric return MCOp;
91360b57cec5SDimitry Andric }
91370b57cec5SDimitry Andric
91380b57cec5SDimitry Andric static
91390b57cec5SDimitry Andric TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
91400b57cec5SDimitry Andric assert(RegOpnd.isReg());
91410b57cec5SDimitry Andric return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
91420b57cec5SDimitry Andric getRegSubRegPair(RegOpnd);
91430b57cec5SDimitry Andric }
91440b57cec5SDimitry Andric
91450b57cec5SDimitry Andric TargetInstrInfo::RegSubRegPair
91460b57cec5SDimitry Andric llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
91470b57cec5SDimitry Andric assert(MI.isRegSequence());
91480b57cec5SDimitry Andric for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
91490b57cec5SDimitry Andric if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
91500b57cec5SDimitry Andric auto &RegOp = MI.getOperand(1 + 2 * I);
91510b57cec5SDimitry Andric return getRegOrUndef(RegOp);
91520b57cec5SDimitry Andric }
91530b57cec5SDimitry Andric return TargetInstrInfo::RegSubRegPair();
91540b57cec5SDimitry Andric }
91550b57cec5SDimitry Andric
91560b57cec5SDimitry Andric // Try to find the definition of reg:subreg in subreg-manipulation pseudos
91570b57cec5SDimitry Andric // Following a subreg of reg:subreg isn't supported
91580b57cec5SDimitry Andric static bool followSubRegDef(MachineInstr &MI,
91590b57cec5SDimitry Andric TargetInstrInfo::RegSubRegPair &RSR) {
91600b57cec5SDimitry Andric if (!RSR.SubReg)
91610b57cec5SDimitry Andric return false;
91620b57cec5SDimitry Andric switch (MI.getOpcode()) {
91630b57cec5SDimitry Andric default: break;
91640b57cec5SDimitry Andric case AMDGPU::REG_SEQUENCE:
91650b57cec5SDimitry Andric RSR = getRegSequenceSubReg(MI, RSR.SubReg);
91660b57cec5SDimitry Andric return true;
91670b57cec5SDimitry Andric // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
91680b57cec5SDimitry Andric case AMDGPU::INSERT_SUBREG:
91690b57cec5SDimitry Andric if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
91700b57cec5SDimitry Andric // inserted the subreg we're looking for
91710b57cec5SDimitry Andric RSR = getRegOrUndef(MI.getOperand(2));
91720b57cec5SDimitry Andric else { // the subreg in the rest of the reg
91730b57cec5SDimitry Andric
auto R1 = getRegOrUndef(MI.getOperand(1)); 91740b57cec5SDimitry Andric if (R1.SubReg) // subreg of subreg isn't supported 91750b57cec5SDimitry Andric return false; 91760b57cec5SDimitry Andric RSR.Reg = R1.Reg; 91770b57cec5SDimitry Andric } 91780b57cec5SDimitry Andric return true; 91790b57cec5SDimitry Andric } 91800b57cec5SDimitry Andric return false; 91810b57cec5SDimitry Andric } 91820b57cec5SDimitry Andric 91830b57cec5SDimitry Andric MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 91840b57cec5SDimitry Andric MachineRegisterInfo &MRI) { 91850b57cec5SDimitry Andric assert(MRI.isSSA()); 9186e8d8bef9SDimitry Andric if (!P.Reg.isVirtual()) 91870b57cec5SDimitry Andric return nullptr; 91880b57cec5SDimitry Andric 91890b57cec5SDimitry Andric auto RSR = P; 91900b57cec5SDimitry Andric auto *DefInst = MRI.getVRegDef(RSR.Reg); 91910b57cec5SDimitry Andric while (auto *MI = DefInst) { 91920b57cec5SDimitry Andric DefInst = nullptr; 91930b57cec5SDimitry Andric switch (MI->getOpcode()) { 91940b57cec5SDimitry Andric case AMDGPU::COPY: 91950b57cec5SDimitry Andric case AMDGPU::V_MOV_B32_e32: { 91960b57cec5SDimitry Andric auto &Op1 = MI->getOperand(1); 9197e8d8bef9SDimitry Andric if (Op1.isReg() && Op1.getReg().isVirtual()) { 91980b57cec5SDimitry Andric if (Op1.isUndef()) 91990b57cec5SDimitry Andric return nullptr; 92000b57cec5SDimitry Andric RSR = getRegSubRegPair(Op1); 92010b57cec5SDimitry Andric DefInst = MRI.getVRegDef(RSR.Reg); 92020b57cec5SDimitry Andric } 92030b57cec5SDimitry Andric break; 92040b57cec5SDimitry Andric } 92050b57cec5SDimitry Andric default: 92060b57cec5SDimitry Andric if (followSubRegDef(*MI, RSR)) { 92070b57cec5SDimitry Andric if (!RSR.Reg) 92080b57cec5SDimitry Andric return nullptr; 92090b57cec5SDimitry Andric DefInst = MRI.getVRegDef(RSR.Reg); 92100b57cec5SDimitry Andric } 92110b57cec5SDimitry Andric } 92120b57cec5SDimitry Andric if (!DefInst) 92130b57cec5SDimitry Andric return MI; 92140b57cec5SDimitry Andric } 92150b57cec5SDimitry Andric return nullptr; 92160b57cec5SDimitry Andric } 92170b57cec5SDimitry Andric 92180b57cec5SDimitry Andric bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, 92190b57cec5SDimitry Andric Register VReg, 92200b57cec5SDimitry Andric const MachineInstr &DefMI, 92210b57cec5SDimitry Andric const MachineInstr &UseMI) { 92220b57cec5SDimitry Andric assert(MRI.isSSA() && "Must be run on SSA"); 92230b57cec5SDimitry Andric 92240b57cec5SDimitry Andric auto *TRI = MRI.getTargetRegisterInfo(); 92250b57cec5SDimitry Andric auto *DefBB = DefMI.getParent(); 92260b57cec5SDimitry Andric 92270b57cec5SDimitry Andric // Don't bother searching between blocks, although it is possible this block 92280b57cec5SDimitry Andric // doesn't modify exec. 92290b57cec5SDimitry Andric if (UseMI.getParent() != DefBB) 92300b57cec5SDimitry Andric return true; 92310b57cec5SDimitry Andric 92320b57cec5SDimitry Andric const int MaxInstScan = 20; 92330b57cec5SDimitry Andric int NumInst = 0; 92340b57cec5SDimitry Andric 92350b57cec5SDimitry Andric // Stop scan at the use. 
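  // The walk below is bounded by MaxInstScan; once the limit is exceeded we
  // conservatively assume EXEC may be modified.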
92360b57cec5SDimitry Andric auto E = UseMI.getIterator(); 92370b57cec5SDimitry Andric for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 92380b57cec5SDimitry Andric if (I->isDebugInstr()) 92390b57cec5SDimitry Andric continue; 92400b57cec5SDimitry Andric 92410b57cec5SDimitry Andric if (++NumInst > MaxInstScan) 92420b57cec5SDimitry Andric return true; 92430b57cec5SDimitry Andric 92440b57cec5SDimitry Andric if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 92450b57cec5SDimitry Andric return true; 92460b57cec5SDimitry Andric } 92470b57cec5SDimitry Andric 92480b57cec5SDimitry Andric return false; 92490b57cec5SDimitry Andric } 92500b57cec5SDimitry Andric 92510b57cec5SDimitry Andric bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 92520b57cec5SDimitry Andric Register VReg, 92530b57cec5SDimitry Andric const MachineInstr &DefMI) { 92540b57cec5SDimitry Andric assert(MRI.isSSA() && "Must be run on SSA"); 92550b57cec5SDimitry Andric 92560b57cec5SDimitry Andric auto *TRI = MRI.getTargetRegisterInfo(); 92570b57cec5SDimitry Andric auto *DefBB = DefMI.getParent(); 92580b57cec5SDimitry Andric 9259e8d8bef9SDimitry Andric const int MaxUseScan = 10; 9260e8d8bef9SDimitry Andric int NumUse = 0; 92610b57cec5SDimitry Andric 9262e8d8bef9SDimitry Andric for (auto &Use : MRI.use_nodbg_operands(VReg)) { 9263e8d8bef9SDimitry Andric auto &UseInst = *Use.getParent(); 92640b57cec5SDimitry Andric // Don't bother searching between blocks, although it is possible this block 92650b57cec5SDimitry Andric // doesn't modify exec. 926681ad6265SDimitry Andric if (UseInst.getParent() != DefBB || UseInst.isPHI()) 92670b57cec5SDimitry Andric return true; 92680b57cec5SDimitry Andric 9269e8d8bef9SDimitry Andric if (++NumUse > MaxUseScan) 92700b57cec5SDimitry Andric return true; 92710b57cec5SDimitry Andric } 92720b57cec5SDimitry Andric 9273e8d8bef9SDimitry Andric if (NumUse == 0) 9274e8d8bef9SDimitry Andric return false; 9275e8d8bef9SDimitry Andric 92760b57cec5SDimitry Andric const int MaxInstScan = 20; 92770b57cec5SDimitry Andric int NumInst = 0; 92780b57cec5SDimitry Andric 92790b57cec5SDimitry Andric // Stop scan when we have seen all the uses. 92800b57cec5SDimitry Andric for (auto I = std::next(DefMI.getIterator()); ; ++I) { 9281e8d8bef9SDimitry Andric assert(I != DefBB->end()); 9282e8d8bef9SDimitry Andric 92830b57cec5SDimitry Andric if (I->isDebugInstr()) 92840b57cec5SDimitry Andric continue; 92850b57cec5SDimitry Andric 92860b57cec5SDimitry Andric if (++NumInst > MaxInstScan) 92870b57cec5SDimitry Andric return true; 92880b57cec5SDimitry Andric 9289e8d8bef9SDimitry Andric for (const MachineOperand &Op : I->operands()) { 9290e8d8bef9SDimitry Andric // We don't check reg masks here as they're used only on calls: 9291e8d8bef9SDimitry Andric // 1. EXEC is only considered const within one BB 9292e8d8bef9SDimitry Andric // 2. 
Call should be a terminator instruction if present in a BB 92930b57cec5SDimitry Andric 9294e8d8bef9SDimitry Andric if (!Op.isReg()) 9295e8d8bef9SDimitry Andric continue; 9296e8d8bef9SDimitry Andric 9297e8d8bef9SDimitry Andric Register Reg = Op.getReg(); 9298e8d8bef9SDimitry Andric if (Op.isUse()) { 9299e8d8bef9SDimitry Andric if (Reg == VReg && --NumUse == 0) 9300e8d8bef9SDimitry Andric return false; 9301e8d8bef9SDimitry Andric } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) 93020b57cec5SDimitry Andric return true; 93030b57cec5SDimitry Andric } 93040b57cec5SDimitry Andric } 9305e8d8bef9SDimitry Andric } 93068bcb0991SDimitry Andric 93078bcb0991SDimitry Andric MachineInstr *SIInstrInfo::createPHIDestinationCopy( 93088bcb0991SDimitry Andric MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 93098bcb0991SDimitry Andric const DebugLoc &DL, Register Src, Register Dst) const { 93108bcb0991SDimitry Andric auto Cur = MBB.begin(); 93118bcb0991SDimitry Andric if (Cur != MBB.end()) 93128bcb0991SDimitry Andric do { 93138bcb0991SDimitry Andric if (!Cur->isPHI() && Cur->readsRegister(Dst)) 93148bcb0991SDimitry Andric return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 93158bcb0991SDimitry Andric ++Cur; 93168bcb0991SDimitry Andric } while (Cur != MBB.end() && Cur != LastPHIIt); 93178bcb0991SDimitry Andric 93188bcb0991SDimitry Andric return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 93198bcb0991SDimitry Andric Dst); 93208bcb0991SDimitry Andric } 93218bcb0991SDimitry Andric 93228bcb0991SDimitry Andric MachineInstr *SIInstrInfo::createPHISourceCopy( 93238bcb0991SDimitry Andric MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 9324480093f4SDimitry Andric const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 93258bcb0991SDimitry Andric if (InsPt != MBB.end() && 93268bcb0991SDimitry Andric (InsPt->getOpcode() == AMDGPU::SI_IF || 93278bcb0991SDimitry Andric InsPt->getOpcode() == AMDGPU::SI_ELSE || 93288bcb0991SDimitry Andric InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 93298bcb0991SDimitry Andric InsPt->definesRegister(Src)) { 93308bcb0991SDimitry Andric InsPt++; 9331480093f4SDimitry Andric return BuildMI(MBB, InsPt, DL, 93328bcb0991SDimitry Andric get(ST.isWave32() ? AMDGPU::S_MOV_B32_term 93338bcb0991SDimitry Andric : AMDGPU::S_MOV_B64_term), 93348bcb0991SDimitry Andric Dst) 93358bcb0991SDimitry Andric .addReg(Src, 0, SrcSubReg) 93368bcb0991SDimitry Andric .addReg(AMDGPU::EXEC, RegState::Implicit); 93378bcb0991SDimitry Andric } 93388bcb0991SDimitry Andric return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 93398bcb0991SDimitry Andric Dst); 93408bcb0991SDimitry Andric } 93418bcb0991SDimitry Andric 93428bcb0991SDimitry Andric bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 9343480093f4SDimitry Andric 9344480093f4SDimitry Andric MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 9345480093f4SDimitry Andric MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 9346480093f4SDimitry Andric MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 9347480093f4SDimitry Andric VirtRegMap *VRM) const { 9348480093f4SDimitry Andric // This is a bit of a hack (copied from AArch64). 
Consider this instruction: 9349480093f4SDimitry Andric // 9350480093f4SDimitry Andric // %0:sreg_32 = COPY $m0 9351480093f4SDimitry Andric // 9352480093f4SDimitry Andric // We explicitly chose SReg_32 for the virtual register so such a copy might 9353480093f4SDimitry Andric // be eliminated by RegisterCoalescer. However, that may not be possible, and 9354480093f4SDimitry Andric // %0 may even spill. We can't spill $m0 normally (it would require copying to 9355480093f4SDimitry Andric // a numbered SGPR anyway), and since it is in the SReg_32 register class, 9356480093f4SDimitry Andric // TargetInstrInfo::foldMemoryOperand() is going to try. 93575ffd83dbSDimitry Andric // A similar issue also exists with spilling and reloading $exec registers. 9358480093f4SDimitry Andric // 9359480093f4SDimitry Andric // To prevent that, constrain the %0 register class here. 93605f757f3fSDimitry Andric if (isFullCopyInstr(MI)) { 9361480093f4SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 9362480093f4SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 93635ffd83dbSDimitry Andric if ((DstReg.isVirtual() || SrcReg.isVirtual()) && 93645ffd83dbSDimitry Andric (DstReg.isVirtual() != SrcReg.isVirtual())) { 93655ffd83dbSDimitry Andric MachineRegisterInfo &MRI = MF.getRegInfo(); 93665ffd83dbSDimitry Andric Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; 93675ffd83dbSDimitry Andric const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); 93685ffd83dbSDimitry Andric if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { 93695ffd83dbSDimitry Andric MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 93705ffd83dbSDimitry Andric return nullptr; 93715ffd83dbSDimitry Andric } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { 93725ffd83dbSDimitry Andric MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); 9373480093f4SDimitry Andric return nullptr; 9374480093f4SDimitry Andric } 9375480093f4SDimitry Andric } 9376480093f4SDimitry Andric } 9377480093f4SDimitry Andric 9378480093f4SDimitry Andric return nullptr; 9379480093f4SDimitry Andric } 9380480093f4SDimitry Andric 9381480093f4SDimitry Andric unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 9382480093f4SDimitry Andric const MachineInstr &MI, 9383480093f4SDimitry Andric unsigned *PredCost) const { 9384480093f4SDimitry Andric if (MI.isBundle()) { 9385480093f4SDimitry Andric MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 9386480093f4SDimitry Andric MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 9387480093f4SDimitry Andric unsigned Lat = 0, Count = 0; 9388480093f4SDimitry Andric for (++I; I != E && I->isBundledWithPred(); ++I) { 9389480093f4SDimitry Andric ++Count; 9390480093f4SDimitry Andric Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 9391480093f4SDimitry Andric } 9392480093f4SDimitry Andric return Lat + Count - 1; 9393480093f4SDimitry Andric } 9394480093f4SDimitry Andric 9395480093f4SDimitry Andric return SchedModel.computeInstrLatency(&MI); 9396480093f4SDimitry Andric } 9397e8d8bef9SDimitry Andric 9398bdd1243dSDimitry Andric InstructionUniformity 9399bdd1243dSDimitry Andric SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { 9400bdd1243dSDimitry Andric unsigned opcode = MI.getOpcode(); 94015f757f3fSDimitry Andric if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { 94025f757f3fSDimitry Andric auto IID = GI->getIntrinsicID(); 940306c3fb27SDimitry Andric if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) 940406c3fb27SDimitry 
Andric return InstructionUniformity::NeverUniform; 940506c3fb27SDimitry Andric if (AMDGPU::isIntrinsicAlwaysUniform(IID)) 940606c3fb27SDimitry Andric return InstructionUniformity::AlwaysUniform; 940706c3fb27SDimitry Andric 940806c3fb27SDimitry Andric switch (IID) { 940906c3fb27SDimitry Andric case Intrinsic::amdgcn_if: 941006c3fb27SDimitry Andric case Intrinsic::amdgcn_else: 941106c3fb27SDimitry Andric // FIXME: Uniform if second result 941206c3fb27SDimitry Andric break; 941306c3fb27SDimitry Andric } 941406c3fb27SDimitry Andric 941506c3fb27SDimitry Andric return InstructionUniformity::Default; 9416bdd1243dSDimitry Andric } 9417bdd1243dSDimitry Andric 9418bdd1243dSDimitry Andric // Loads from the private and flat address spaces are divergent, because 9419bdd1243dSDimitry Andric // threads can execute the load instruction with the same inputs and get 9420bdd1243dSDimitry Andric // different results. 9421bdd1243dSDimitry Andric // 9422bdd1243dSDimitry Andric // All other loads are not divergent, because if threads issue loads with the 9423bdd1243dSDimitry Andric // same arguments, they will always get the same result. 9424bdd1243dSDimitry Andric if (opcode == AMDGPU::G_LOAD) { 9425bdd1243dSDimitry Andric if (MI.memoperands_empty()) 9426bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; // conservative assumption 9427bdd1243dSDimitry Andric 9428bdd1243dSDimitry Andric if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) { 9429bdd1243dSDimitry Andric return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS || 9430bdd1243dSDimitry Andric mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; 9431bdd1243dSDimitry Andric })) { 9432bdd1243dSDimitry Andric // At least one MMO in a non-global address space. 9433bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; 9434bdd1243dSDimitry Andric } 9435bdd1243dSDimitry Andric return InstructionUniformity::Default; 9436bdd1243dSDimitry Andric } 9437bdd1243dSDimitry Andric 9438bdd1243dSDimitry Andric if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) || 9439bdd1243dSDimitry Andric opcode == AMDGPU::G_ATOMIC_CMPXCHG || 94405f757f3fSDimitry Andric opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS || 94415f757f3fSDimitry Andric AMDGPU::isGenericAtomic(opcode)) { 9442bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; 9443bdd1243dSDimitry Andric } 9444bdd1243dSDimitry Andric return InstructionUniformity::Default; 9445bdd1243dSDimitry Andric } 9446bdd1243dSDimitry Andric 9447bdd1243dSDimitry Andric InstructionUniformity 9448bdd1243dSDimitry Andric SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { 944906c3fb27SDimitry Andric 945006c3fb27SDimitry Andric if (isNeverUniform(MI)) 945106c3fb27SDimitry Andric return InstructionUniformity::NeverUniform; 945206c3fb27SDimitry Andric 945306c3fb27SDimitry Andric unsigned opcode = MI.getOpcode(); 94545f757f3fSDimitry Andric if (opcode == AMDGPU::V_READLANE_B32 || 94555f757f3fSDimitry Andric opcode == AMDGPU::V_READFIRSTLANE_B32 || 94565f757f3fSDimitry Andric opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR) 945706c3fb27SDimitry Andric return InstructionUniformity::AlwaysUniform; 945806c3fb27SDimitry Andric 94595f757f3fSDimitry Andric if (isCopyInstr(MI)) { 946006c3fb27SDimitry Andric const MachineOperand &srcOp = MI.getOperand(1); 946106c3fb27SDimitry Andric if (srcOp.isReg() && srcOp.getReg().isPhysical()) { 946206c3fb27SDimitry Andric const TargetRegisterClass *regClass = 946306c3fb27SDimitry Andric RI.getPhysRegBaseClass(srcOp.getReg()); 946406c3fb27SDimitry 
Andric return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform 946506c3fb27SDimitry Andric : InstructionUniformity::NeverUniform; 946606c3fb27SDimitry Andric } 946706c3fb27SDimitry Andric return InstructionUniformity::Default; 946806c3fb27SDimitry Andric } 946906c3fb27SDimitry Andric 947006c3fb27SDimitry Andric // GMIR handling 947106c3fb27SDimitry Andric if (MI.isPreISelOpcode()) 947206c3fb27SDimitry Andric return SIInstrInfo::getGenericInstructionUniformity(MI); 947306c3fb27SDimitry Andric 9474bdd1243dSDimitry Andric // Atomics are divergent because they are executed sequentially: when an 9475bdd1243dSDimitry Andric // atomic operation refers to the same address in each thread, then each 9476bdd1243dSDimitry Andric // thread after the first sees the value written by the previous thread as 9477bdd1243dSDimitry Andric // original value. 9478bdd1243dSDimitry Andric 9479bdd1243dSDimitry Andric if (isAtomic(MI)) 9480bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; 9481bdd1243dSDimitry Andric 9482bdd1243dSDimitry Andric // Loads from the private and flat address spaces are divergent, because 9483bdd1243dSDimitry Andric // threads can execute the load instruction with the same inputs and get 9484bdd1243dSDimitry Andric // different results. 9485bdd1243dSDimitry Andric if (isFLAT(MI) && MI.mayLoad()) { 9486bdd1243dSDimitry Andric if (MI.memoperands_empty()) 9487bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; // conservative assumption 9488bdd1243dSDimitry Andric 9489bdd1243dSDimitry Andric if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) { 9490bdd1243dSDimitry Andric return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS || 9491bdd1243dSDimitry Andric mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; 9492bdd1243dSDimitry Andric })) { 9493bdd1243dSDimitry Andric // At least one MMO in a non-global address space. 9494bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform; 9495bdd1243dSDimitry Andric } 9496bdd1243dSDimitry Andric 9497bdd1243dSDimitry Andric return InstructionUniformity::Default; 9498bdd1243dSDimitry Andric } 9499bdd1243dSDimitry Andric 9500bdd1243dSDimitry Andric const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 950106c3fb27SDimitry Andric const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); 950206c3fb27SDimitry Andric 950306c3fb27SDimitry Andric // FIXME: It's conceptually broken to report this for an instruction, and not 950406c3fb27SDimitry Andric // a specific def operand. For inline asm in particular, there could be mixed 950506c3fb27SDimitry Andric // uniform and divergent results. 950606c3fb27SDimitry Andric for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 950706c3fb27SDimitry Andric const MachineOperand &SrcOp = MI.getOperand(I); 950806c3fb27SDimitry Andric if (!SrcOp.isReg()) 9509bdd1243dSDimitry Andric continue; 9510bdd1243dSDimitry Andric 951106c3fb27SDimitry Andric Register Reg = SrcOp.getReg(); 951206c3fb27SDimitry Andric if (!Reg || !SrcOp.readsReg()) 951306c3fb27SDimitry Andric continue; 9514bdd1243dSDimitry Andric 951506c3fb27SDimitry Andric // If RegBank is null, this is unassigned or an unallocatable special 951606c3fb27SDimitry Andric // register, which are all scalars. 
951706c3fb27SDimitry Andric const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
951806c3fb27SDimitry Andric if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9519bdd1243dSDimitry Andric return InstructionUniformity::NeverUniform;
9520bdd1243dSDimitry Andric }
9521bdd1243dSDimitry Andric
9522bdd1243dSDimitry Andric // TODO: Uniformity check conditions above can be rearranged for more
9523bdd1243dSDimitry Andric // readability
9524bdd1243dSDimitry Andric
9525bdd1243dSDimitry Andric // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9526bdd1243dSDimitry Andric // currently turned into no-op COPYs by SelectionDAG ISel and are
9527bdd1243dSDimitry Andric // therefore no longer recognizable.
9528bdd1243dSDimitry Andric
9529bdd1243dSDimitry Andric return InstructionUniformity::Default;
9530bdd1243dSDimitry Andric }
9531bdd1243dSDimitry Andric
9532e8d8bef9SDimitry Andric unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
9533e8d8bef9SDimitry Andric switch (MF.getFunction().getCallingConv()) {
9534e8d8bef9SDimitry Andric case CallingConv::AMDGPU_PS:
9535e8d8bef9SDimitry Andric return 1;
9536e8d8bef9SDimitry Andric case CallingConv::AMDGPU_VS:
9537e8d8bef9SDimitry Andric return 2;
9538e8d8bef9SDimitry Andric case CallingConv::AMDGPU_GS:
9539e8d8bef9SDimitry Andric return 3;
9540e8d8bef9SDimitry Andric case CallingConv::AMDGPU_HS:
9541e8d8bef9SDimitry Andric case CallingConv::AMDGPU_LS:
9542e8d8bef9SDimitry Andric case CallingConv::AMDGPU_ES:
9543e8d8bef9SDimitry Andric report_fatal_error("ds_ordered_count unsupported for this calling conv");
9544e8d8bef9SDimitry Andric case CallingConv::AMDGPU_CS:
9545e8d8bef9SDimitry Andric case CallingConv::AMDGPU_KERNEL:
9546e8d8bef9SDimitry Andric case CallingConv::C:
9547e8d8bef9SDimitry Andric case CallingConv::Fast:
9548e8d8bef9SDimitry Andric default:
9549e8d8bef9SDimitry Andric // Assume other calling conventions are various compute callable functions
9550e8d8bef9SDimitry Andric return 0;
9551e8d8bef9SDimitry Andric }
9552e8d8bef9SDimitry Andric }
9553349cc55cSDimitry Andric
9554349cc55cSDimitry Andric bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
9555349cc55cSDimitry Andric Register &SrcReg2, int64_t &CmpMask,
9556349cc55cSDimitry Andric int64_t &CmpValue) const {
9557349cc55cSDimitry Andric if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9558349cc55cSDimitry Andric return false;
9559349cc55cSDimitry Andric
9560349cc55cSDimitry Andric switch (MI.getOpcode()) {
9561349cc55cSDimitry Andric default:
9562349cc55cSDimitry Andric break;
9563349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U32:
9564349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_I32:
9565349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U32:
9566349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_I32:
9567349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_U32:
9568349cc55cSDimitry Andric case AMDGPU::S_CMP_LT_I32:
9569349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_U32:
9570349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_I32:
9571349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_U32:
9572349cc55cSDimitry Andric case AMDGPU::S_CMP_LE_I32:
9573349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_U32:
9574349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_I32:
9575349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U64:
9576349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U64:
9577349cc55cSDimitry Andric SrcReg = MI.getOperand(0).getReg();
9578349cc55cSDimitry Andric if (MI.getOperand(1).isReg()) {
9579349cc55cSDimitry Andric if
(MI.getOperand(1).getSubReg()) 9580349cc55cSDimitry Andric return false; 9581349cc55cSDimitry Andric SrcReg2 = MI.getOperand(1).getReg(); 9582349cc55cSDimitry Andric CmpValue = 0; 9583349cc55cSDimitry Andric } else if (MI.getOperand(1).isImm()) { 9584349cc55cSDimitry Andric SrcReg2 = Register(); 9585349cc55cSDimitry Andric CmpValue = MI.getOperand(1).getImm(); 9586349cc55cSDimitry Andric } else { 9587349cc55cSDimitry Andric return false; 9588349cc55cSDimitry Andric } 9589349cc55cSDimitry Andric CmpMask = ~0; 9590349cc55cSDimitry Andric return true; 9591349cc55cSDimitry Andric case AMDGPU::S_CMPK_EQ_U32: 9592349cc55cSDimitry Andric case AMDGPU::S_CMPK_EQ_I32: 9593349cc55cSDimitry Andric case AMDGPU::S_CMPK_LG_U32: 9594349cc55cSDimitry Andric case AMDGPU::S_CMPK_LG_I32: 9595349cc55cSDimitry Andric case AMDGPU::S_CMPK_LT_U32: 9596349cc55cSDimitry Andric case AMDGPU::S_CMPK_LT_I32: 9597349cc55cSDimitry Andric case AMDGPU::S_CMPK_GT_U32: 9598349cc55cSDimitry Andric case AMDGPU::S_CMPK_GT_I32: 9599349cc55cSDimitry Andric case AMDGPU::S_CMPK_LE_U32: 9600349cc55cSDimitry Andric case AMDGPU::S_CMPK_LE_I32: 9601349cc55cSDimitry Andric case AMDGPU::S_CMPK_GE_U32: 9602349cc55cSDimitry Andric case AMDGPU::S_CMPK_GE_I32: 9603349cc55cSDimitry Andric SrcReg = MI.getOperand(0).getReg(); 9604349cc55cSDimitry Andric SrcReg2 = Register(); 9605349cc55cSDimitry Andric CmpValue = MI.getOperand(1).getImm(); 9606349cc55cSDimitry Andric CmpMask = ~0; 9607349cc55cSDimitry Andric return true; 9608349cc55cSDimitry Andric } 9609349cc55cSDimitry Andric 9610349cc55cSDimitry Andric return false; 9611349cc55cSDimitry Andric } 9612349cc55cSDimitry Andric 9613349cc55cSDimitry Andric bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, 9614349cc55cSDimitry Andric Register SrcReg2, int64_t CmpMask, 9615349cc55cSDimitry Andric int64_t CmpValue, 9616349cc55cSDimitry Andric const MachineRegisterInfo *MRI) const { 9617349cc55cSDimitry Andric if (!SrcReg || SrcReg.isPhysical()) 9618349cc55cSDimitry Andric return false; 9619349cc55cSDimitry Andric 9620349cc55cSDimitry Andric if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) 9621349cc55cSDimitry Andric return false; 9622349cc55cSDimitry Andric 9623349cc55cSDimitry Andric const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, 9624349cc55cSDimitry Andric this](int64_t ExpectedValue, unsigned SrcSize, 962581ad6265SDimitry Andric bool IsReversible, bool IsSigned) -> bool { 9626349cc55cSDimitry Andric // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 9627349cc55cSDimitry Andric // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 9628349cc55cSDimitry Andric // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 9629349cc55cSDimitry Andric // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 9630349cc55cSDimitry Andric // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n 9631349cc55cSDimitry Andric // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 9632349cc55cSDimitry Andric // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 9633349cc55cSDimitry Andric // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 9634349cc55cSDimitry Andric // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 9635349cc55cSDimitry Andric // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n 9636349cc55cSDimitry Andric // 9637349cc55cSDimitry Andric // Signed ge/gt are not used 
for the sign bit. 9638349cc55cSDimitry Andric // 9639349cc55cSDimitry Andric // If result of the AND is unused except in the compare: 9640349cc55cSDimitry Andric // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n 9641349cc55cSDimitry Andric // 9642349cc55cSDimitry Andric // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n 9643349cc55cSDimitry Andric // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n 9644349cc55cSDimitry Andric // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n 9645349cc55cSDimitry Andric // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n 9646349cc55cSDimitry Andric // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n 9647349cc55cSDimitry Andric // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n 9648349cc55cSDimitry Andric 9649349cc55cSDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); 9650349cc55cSDimitry Andric if (!Def || Def->getParent() != CmpInstr.getParent()) 9651349cc55cSDimitry Andric return false; 9652349cc55cSDimitry Andric 9653349cc55cSDimitry Andric if (Def->getOpcode() != AMDGPU::S_AND_B32 && 9654349cc55cSDimitry Andric Def->getOpcode() != AMDGPU::S_AND_B64) 9655349cc55cSDimitry Andric return false; 9656349cc55cSDimitry Andric 9657349cc55cSDimitry Andric int64_t Mask; 9658349cc55cSDimitry Andric const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool { 9659349cc55cSDimitry Andric if (MO->isImm()) 9660349cc55cSDimitry Andric Mask = MO->getImm(); 9661349cc55cSDimitry Andric else if (!getFoldableImm(MO, Mask)) 9662349cc55cSDimitry Andric return false; 9663349cc55cSDimitry Andric Mask &= maxUIntN(SrcSize); 9664349cc55cSDimitry Andric return isPowerOf2_64(Mask); 9665349cc55cSDimitry Andric }; 9666349cc55cSDimitry Andric 9667349cc55cSDimitry Andric MachineOperand *SrcOp = &Def->getOperand(1); 9668349cc55cSDimitry Andric if (isMask(SrcOp)) 9669349cc55cSDimitry Andric SrcOp = &Def->getOperand(2); 9670349cc55cSDimitry Andric else if (isMask(&Def->getOperand(2))) 9671349cc55cSDimitry Andric SrcOp = &Def->getOperand(1); 9672349cc55cSDimitry Andric else 9673349cc55cSDimitry Andric return false; 9674349cc55cSDimitry Andric 967506c3fb27SDimitry Andric unsigned BitNo = llvm::countr_zero((uint64_t)Mask); 9676349cc55cSDimitry Andric if (IsSigned && BitNo == SrcSize - 1) 9677349cc55cSDimitry Andric return false; 9678349cc55cSDimitry Andric 9679349cc55cSDimitry Andric ExpectedValue <<= BitNo; 9680349cc55cSDimitry Andric 9681349cc55cSDimitry Andric bool IsReversedCC = false; 9682349cc55cSDimitry Andric if (CmpValue != ExpectedValue) { 968381ad6265SDimitry Andric if (!IsReversible) 9684349cc55cSDimitry Andric return false; 9685349cc55cSDimitry Andric IsReversedCC = CmpValue == (ExpectedValue ^ Mask); 9686349cc55cSDimitry Andric if (!IsReversedCC) 9687349cc55cSDimitry Andric return false; 9688349cc55cSDimitry Andric } 9689349cc55cSDimitry Andric 9690349cc55cSDimitry Andric Register DefReg = Def->getOperand(0).getReg(); 9691349cc55cSDimitry Andric if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) 9692349cc55cSDimitry Andric return false; 9693349cc55cSDimitry Andric 9694349cc55cSDimitry Andric for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); 9695349cc55cSDimitry Andric I != E; ++I) { 9696349cc55cSDimitry Andric if (I->modifiesRegister(AMDGPU::SCC, &RI) || 9697349cc55cSDimitry Andric I->killsRegister(AMDGPU::SCC, &RI)) 9698349cc55cSDimitry Andric return false; 9699349cc55cSDimitry Andric } 
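    // SCC is not clobbered or killed between the AND and the compare, so the
    // compare is redundant: mark the AND's SCC def as live and erase the
    // compare below.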
9700349cc55cSDimitry Andric 9701349cc55cSDimitry Andric MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC); 9702349cc55cSDimitry Andric SccDef->setIsDead(false); 9703349cc55cSDimitry Andric CmpInstr.eraseFromParent(); 9704349cc55cSDimitry Andric 9705349cc55cSDimitry Andric if (!MRI->use_nodbg_empty(DefReg)) { 9706349cc55cSDimitry Andric assert(!IsReversedCC); 9707349cc55cSDimitry Andric return true; 9708349cc55cSDimitry Andric } 9709349cc55cSDimitry Andric 9710349cc55cSDimitry Andric // Replace AND with unused result with a S_BITCMP. 9711349cc55cSDimitry Andric MachineBasicBlock *MBB = Def->getParent(); 9712349cc55cSDimitry Andric 9713349cc55cSDimitry Andric unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32 9714349cc55cSDimitry Andric : AMDGPU::S_BITCMP1_B32 9715349cc55cSDimitry Andric : IsReversedCC ? AMDGPU::S_BITCMP0_B64 9716349cc55cSDimitry Andric : AMDGPU::S_BITCMP1_B64; 9717349cc55cSDimitry Andric 9718349cc55cSDimitry Andric BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc)) 9719349cc55cSDimitry Andric .add(*SrcOp) 9720349cc55cSDimitry Andric .addImm(BitNo); 9721349cc55cSDimitry Andric Def->eraseFromParent(); 9722349cc55cSDimitry Andric 9723349cc55cSDimitry Andric return true; 9724349cc55cSDimitry Andric }; 9725349cc55cSDimitry Andric 9726349cc55cSDimitry Andric switch (CmpInstr.getOpcode()) { 9727349cc55cSDimitry Andric default: 9728349cc55cSDimitry Andric break; 9729349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U32: 9730349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_I32: 9731349cc55cSDimitry Andric case AMDGPU::S_CMPK_EQ_U32: 9732349cc55cSDimitry Andric case AMDGPU::S_CMPK_EQ_I32: 9733349cc55cSDimitry Andric return optimizeCmpAnd(1, 32, true, false); 9734349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_U32: 9735349cc55cSDimitry Andric case AMDGPU::S_CMPK_GE_U32: 9736349cc55cSDimitry Andric return optimizeCmpAnd(1, 32, false, false); 9737349cc55cSDimitry Andric case AMDGPU::S_CMP_GE_I32: 9738349cc55cSDimitry Andric case AMDGPU::S_CMPK_GE_I32: 9739349cc55cSDimitry Andric return optimizeCmpAnd(1, 32, false, true); 9740349cc55cSDimitry Andric case AMDGPU::S_CMP_EQ_U64: 9741349cc55cSDimitry Andric return optimizeCmpAnd(1, 64, true, false); 9742349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U32: 9743349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_I32: 9744349cc55cSDimitry Andric case AMDGPU::S_CMPK_LG_U32: 9745349cc55cSDimitry Andric case AMDGPU::S_CMPK_LG_I32: 9746349cc55cSDimitry Andric return optimizeCmpAnd(0, 32, true, false); 9747349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_U32: 9748349cc55cSDimitry Andric case AMDGPU::S_CMPK_GT_U32: 9749349cc55cSDimitry Andric return optimizeCmpAnd(0, 32, false, false); 9750349cc55cSDimitry Andric case AMDGPU::S_CMP_GT_I32: 9751349cc55cSDimitry Andric case AMDGPU::S_CMPK_GT_I32: 9752349cc55cSDimitry Andric return optimizeCmpAnd(0, 32, false, true); 9753349cc55cSDimitry Andric case AMDGPU::S_CMP_LG_U64: 9754349cc55cSDimitry Andric return optimizeCmpAnd(0, 64, true, false); 9755349cc55cSDimitry Andric } 9756349cc55cSDimitry Andric 9757349cc55cSDimitry Andric return false; 9758349cc55cSDimitry Andric } 975981ad6265SDimitry Andric 976081ad6265SDimitry Andric void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, 976181ad6265SDimitry Andric unsigned OpName) const { 976281ad6265SDimitry Andric if (!ST.needsAlignedVGPRs()) 976381ad6265SDimitry Andric return; 976481ad6265SDimitry Andric 976581ad6265SDimitry Andric int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 976681ad6265SDimitry Andric if (OpNo < 0) 
976781ad6265SDimitry Andric return; 976881ad6265SDimitry Andric MachineOperand &Op = MI.getOperand(OpNo); 976981ad6265SDimitry Andric if (getOpSize(MI, OpNo) > 4) 977081ad6265SDimitry Andric return; 977181ad6265SDimitry Andric 977281ad6265SDimitry Andric // Add implicit aligned super-reg to force alignment on the data operand. 977381ad6265SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 977481ad6265SDimitry Andric MachineBasicBlock *BB = MI.getParent(); 977581ad6265SDimitry Andric MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 977681ad6265SDimitry Andric Register DataReg = Op.getReg(); 977781ad6265SDimitry Andric bool IsAGPR = RI.isAGPR(MRI, DataReg); 977881ad6265SDimitry Andric Register Undef = MRI.createVirtualRegister( 977981ad6265SDimitry Andric IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); 978081ad6265SDimitry Andric BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); 978181ad6265SDimitry Andric Register NewVR = 978281ad6265SDimitry Andric MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass 978381ad6265SDimitry Andric : &AMDGPU::VReg_64_Align2RegClass); 978481ad6265SDimitry Andric BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) 978581ad6265SDimitry Andric .addReg(DataReg, 0, Op.getSubReg()) 978681ad6265SDimitry Andric .addImm(AMDGPU::sub0) 978781ad6265SDimitry Andric .addReg(Undef) 978881ad6265SDimitry Andric .addImm(AMDGPU::sub1); 978981ad6265SDimitry Andric Op.setReg(NewVR); 979081ad6265SDimitry Andric Op.setSubReg(AMDGPU::sub0); 979181ad6265SDimitry Andric MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); 979281ad6265SDimitry Andric } 9793
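// Illustrative sketch of the rewrite performed by enforceOperandRCAlignment
// above (register names are made up): a 32-bit data operand %data:vgpr_32 is
// wrapped as
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %newvr:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
// the operand is rewritten to %newvr.sub0, and %newvr is also added to MI as
// an implicit use to force an aligned register assignment.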