10b57cec5SDimitry Andric //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This pass tries to fuse DS instructions with close by immediate offsets.
100b57cec5SDimitry Andric // This will fuse operations such as
110b57cec5SDimitry Andric // ds_read_b32 v0, v2 offset:16
120b57cec5SDimitry Andric // ds_read_b32 v1, v2 offset:32
130b57cec5SDimitry Andric // ==>
140b57cec5SDimitry Andric // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
150b57cec5SDimitry Andric //
160b57cec5SDimitry Andric // The same is done for certain SMEM and VMEM opcodes, e.g.:
170b57cec5SDimitry Andric // s_buffer_load_dword s4, s[0:3], 4
180b57cec5SDimitry Andric // s_buffer_load_dword s5, s[0:3], 8
190b57cec5SDimitry Andric // ==>
200b57cec5SDimitry Andric // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
210b57cec5SDimitry Andric //
220b57cec5SDimitry Andric // This pass also tries to promote constant offset to the immediate by
230b57cec5SDimitry Andric // adjusting the base. It tries to use a base from the nearby instructions that
240b57cec5SDimitry Andric // allows it to have a 13bit constant offset and then promotes the 13bit offset
250b57cec5SDimitry Andric // to the immediate.
260b57cec5SDimitry Andric // E.g.
270b57cec5SDimitry Andric // s_movk_i32 s0, 0x1800
280b57cec5SDimitry Andric // v_add_co_u32_e32 v0, vcc, s0, v2
290b57cec5SDimitry Andric // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
300b57cec5SDimitry Andric //
310b57cec5SDimitry Andric // s_movk_i32 s0, 0x1000
320b57cec5SDimitry Andric // v_add_co_u32_e32 v5, vcc, s0, v2
330b57cec5SDimitry Andric // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
340b57cec5SDimitry Andric // global_load_dwordx2 v[5:6], v[5:6], off
350b57cec5SDimitry Andric // global_load_dwordx2 v[0:1], v[0:1], off
360b57cec5SDimitry Andric // =>
370b57cec5SDimitry Andric // s_movk_i32 s0, 0x1000
380b57cec5SDimitry Andric // v_add_co_u32_e32 v5, vcc, s0, v2
390b57cec5SDimitry Andric // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
400b57cec5SDimitry Andric // global_load_dwordx2 v[5:6], v[5:6], off
410b57cec5SDimitry Andric // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
420b57cec5SDimitry Andric //
430b57cec5SDimitry Andric // Future improvements:
440b57cec5SDimitry Andric //
458bcb0991SDimitry Andric // - This is currently missing stores of constants because loading
460b57cec5SDimitry Andric // the constant into the data register is placed between the stores, although
470b57cec5SDimitry Andric // this is arguably a scheduling problem.
480b57cec5SDimitry Andric //
490b57cec5SDimitry Andric // - Live interval recomputing seems inefficient. This currently only matches
500b57cec5SDimitry Andric // one pair, and recomputes live intervals and moves on to the next pair. It
510b57cec5SDimitry Andric // would be better to compute a list of all merges that need to occur.
520b57cec5SDimitry Andric //
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
// offset field, but lie close enough together, we can add a common amount
// to the base pointer and use the new, reduced offsets.
570b57cec5SDimitry Andric //
580b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
590b57cec5SDimitry Andric
600b57cec5SDimitry Andric #include "AMDGPU.h"
61e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
620b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
630b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
640b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
65480093f4SDimitry Andric #include "llvm/InitializePasses.h"
660b57cec5SDimitry Andric
670b57cec5SDimitry Andric using namespace llvm;
680b57cec5SDimitry Andric
690b57cec5SDimitry Andric #define DEBUG_TYPE "si-load-store-opt"
700b57cec5SDimitry Andric
710b57cec5SDimitry Andric namespace {
// Classifies the memory instructions this pass knows how to merge. Two
// instructions are only candidates for pairing when they map to a compatible
// class (see getCommonInstClass).
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};
920b57cec5SDimitry Andric
// Describes which address operands a given instruction class carries. Each
// flag corresponds to one named machine operand that participates in the
// base-address comparison when deciding whether two instructions can merge.
struct AddressRegs {
  unsigned char NumVAddrs = 0; // Number of vaddr operands (MIMG can have many).
  bool SBase = false;          // Scalar base (SMEM).
  bool SRsrc = false;          // Resource descriptor (MUBUF/MTBUF/MIMG).
  bool SOffset = false;        // Scalar offset.
  bool SAddr = false;          // Scalar address (GLOBAL_*_SADDR).
  bool VAddr = false;          // Vector address.
  bool Addr = false;           // Single address operand (DS/FLAT).
  bool SSamp = false;          // Sampler (image sample instructions).
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
1065ffd83dbSDimitry Andric
class SILoadStoreOptimizer : public MachineFunctionPass {
  // Per-instruction bookkeeping for a merge candidate: where it is, what it
  // accesses, and the address operands that must match its pair.
  struct CombineInfo {
    MachineBasicBlock::iterator I; // The candidate instruction itself.
    unsigned EltSize;              // Element size in bytes (DS offset unit).
    unsigned Offset;               // Immediate offset, in EltSize units.
    unsigned Width;                // Access width in dwords / dmask channels.
    unsigned Format;               // MTBUF format encoding.
    unsigned BaseOff;              // Offset folded into the base register.
    unsigned DMask;                // MIMG dmask.
    InstClassEnum InstClass;
    unsigned CPol = 0;             // Cache-policy bits that must match.
    bool IsAGPR;                   // Data operand lives in AGPRs.
    bool UseST64;                  // Use the *2ST64 DS form (64-element stride).
    int AddrIdx[MaxAddressRegs];   // Operand indices of the address operands.
    const MachineOperand *AddrReg[MaxAddressRegs]; // The operands themselves.
    unsigned NumAddresses;         // How many of the arrays above are valid.
    unsigned Order;                // Program order within the block.

    // Returns true if CI accesses memory through exactly the same base
    // address operands as this instruction (same immediates, or same
    // register + subregister for every address component).
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        // Immediates must be identical, and an immediate can never match a
        // register operand.
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    // Returns true if this instruction's address operands could, in
    // principle, be shared with another instruction (i.e. it is worth
    // searching for a merge partner at all).
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    // Populates all fields above from MI (defined out of line).
    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Sort key for merge lists: image instructions order by dmask, everything
    // else by immediate offset.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  // A 64-bit base address split into its low/high 32-bit register halves.
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  // A base-plus-constant-offset decomposition of a memory address.
  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain; // Set when a merge may expose further merges.

  // Legality check: can instruction A be moved across B (no def/use or
  // memory-aliasing conflict)?
  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  // Checks all merge preconditions for CI/Paired; returns the CombineInfo to
  // insert the merged instruction at, or nullptr if the pair cannot merge.
  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  // Helpers that split a merged load's wide result into the two original
  // destination registers, and gather two stores' sources into one wide
  // register, respectively.
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  // One merge routine per instruction class; each returns an iterator to the
  // newly created merged instruction.
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  // Support for promoting constant offsets into the immediate field by
  // rewriting an instruction to use a nearby instruction's base register.
  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const;

  // Scans [Begin, End) and buckets mergeable instructions into per-base-address
  // lists; returns where to resume scanning and whether anything was modified.
  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // This pass runs on virtual registers and relies on SSA form.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};
3250b57cec5SDimitry Andric
// Returns the access width of MI in dwords (for image instructions, the
// number of enabled dmask channels), or 0 if the opcode is not recognized.
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    // Width of an image access is the number of set bits in the dmask.
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  // DS widths are counted in EltSize units: B32 accesses one element, B64 two.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}
4058bcb0991SDimitry Andric
4068bcb0991SDimitry Andric /// Maps instruction opcode to enum InstClassEnum.
/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // Opcodes without a dedicated case are classified by instruction family:
    // MUBUF and MTBUF only for the specific dword/format-x variants the pass
    // can widen; anything else is UNKNOWN (not mergeable).
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    // Plain global loads are classified together with flat loads; the SADDR
    // forms get their own class because their address operands differ.
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
5598bcb0991SDimitry Andric
5608bcb0991SDimitry Andric /// Determines instruction subclass from opcode. Only instructions
56181ad6265SDimitry Andric /// of the same subclass can be merged together. The merged instruction may have
56281ad6265SDimitry Andric /// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // MUBUF/MTBUF/image instructions are keyed on their base opcode, which
    // strips the data-width / addressing-mode encoding variants.
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  // DS instructions only merge with the exact same opcode, so each opcode is
  // its own subclass.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  // For the scalar and flat load/store families below, every width variant of
  // a family maps to the family's single-dword opcode.
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  // GLOBAL and FLAT share a subclass because GLOBAL may be promoted to FLAT
  // when merged with a non segment specific FLAT access.
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
6378bcb0991SDimitry Andric
63881ad6265SDimitry Andric // GLOBAL loads and stores are classified as FLAT initially. If both combined
63981ad6265SDimitry Andric // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
64081ad6265SDimitry Andric // If either or both instructions are non segment specific FLAT the resulting
64181ad6265SDimitry Andric // combined operation will be FLAT, potentially promoting one of the GLOBAL
64281ad6265SDimitry Andric // operations to FLAT.
64381ad6265SDimitry Andric // For other instructions return the original unmodified class.
64481ad6265SDimitry Andric InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)64581ad6265SDimitry Andric SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
64681ad6265SDimitry Andric const CombineInfo &Paired) {
64781ad6265SDimitry Andric assert(CI.InstClass == Paired.InstClass);
64881ad6265SDimitry Andric
64981ad6265SDimitry Andric if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
65081ad6265SDimitry Andric SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
65181ad6265SDimitry Andric return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
65281ad6265SDimitry Andric
65381ad6265SDimitry Andric return CI.InstClass;
65481ad6265SDimitry Andric }
65581ad6265SDimitry Andric
// Determine which named address operands (vaddr, srsrc, soffset, ...) are
// present for opcode \p Opc. The merge logic compares these operand sets
// between candidate instructions.
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // Every operand between vaddr0 and the resource register is an address
      // component. VIMAGE/VSAMPLE name the resource operand "rsrc" instead of
      // "srsrc".
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]]; // The SGPR_IMM forms additionally have an sbase operand.
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]]; // The SADDR forms also carry a vaddr operand.
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
7648bcb0991SDimitry Andric
// Initialize this CombineInfo from \p MI: classify the instruction and cache
// the fields (element size, immediate offset, buffer format, access width,
// cache policy and address operands) that the merge logic compares between
// candidate instructions.
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  // Instructions this pass cannot merge need no further analysis.
  if (InstClass == UNKNOWN)
    return;

  // Remember whether the data operand lives in AGPRs.
  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    // DS offsets are expressed in units of the element size (4 or 8 bytes).
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // SMRD offset units depend on the subtarget; convert from 4-byte units.
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    // Only the low 16 bits are a valid DS offset.
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  // Collect the operand indices of all address components in a fixed order so
  // two CombineInfos can be compared address operand by address operand.
  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
8508bcb0991SDimitry Andric
8510b57cec5SDimitry Andric } // end anonymous namespace.
8520b57cec5SDimitry Andric
// Register the pass with the legacy PassManager. The pass depends on alias
// analysis results (used below when checking whether two memory operations
// may alias).
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

// Unique pass identity; the address of this variable identifies the pass.
char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
8620b57cec5SDimitry Andric
/// Factory used by the pass pipeline to create a fresh pass instance.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
8660b57cec5SDimitry Andric
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)8670b57cec5SDimitry Andric static void addDefsUsesToList(const MachineInstr &MI,
8685ffd83dbSDimitry Andric DenseSet<Register> &RegDefs,
86981ad6265SDimitry Andric DenseSet<Register> &RegUses) {
87081ad6265SDimitry Andric for (const auto &Op : MI.operands()) {
87181ad6265SDimitry Andric if (!Op.isReg())
87281ad6265SDimitry Andric continue;
8730b57cec5SDimitry Andric if (Op.isDef())
8740b57cec5SDimitry Andric RegDefs.insert(Op.getReg());
87581ad6265SDimitry Andric if (Op.readsReg())
87681ad6265SDimitry Andric RegUses.insert(Op.getReg());
8770b57cec5SDimitry Andric }
8780b57cec5SDimitry Andric }
8790b57cec5SDimitry Andric
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const88081ad6265SDimitry Andric bool SILoadStoreOptimizer::canSwapInstructions(
88181ad6265SDimitry Andric const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
88281ad6265SDimitry Andric const MachineInstr &A, const MachineInstr &B) const {
88381ad6265SDimitry Andric if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
88481ad6265SDimitry Andric (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
8850b57cec5SDimitry Andric return false;
88681ad6265SDimitry Andric for (const auto &BOp : B.operands()) {
88781ad6265SDimitry Andric if (!BOp.isReg())
8880b57cec5SDimitry Andric continue;
88981ad6265SDimitry Andric if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
89081ad6265SDimitry Andric return false;
89181ad6265SDimitry Andric if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
8920b57cec5SDimitry Andric return false;
8930b57cec5SDimitry Andric }
8940b57cec5SDimitry Andric return true;
8950b57cec5SDimitry Andric }
8960b57cec5SDimitry Andric
89781ad6265SDimitry Andric // Given that \p CI and \p Paired are adjacent memory operations produce a new
89881ad6265SDimitry Andric // MMO for the combined operation with a new access size.
89981ad6265SDimitry Andric MachineMemOperand *
combineKnownAdjacentMMOs(const CombineInfo & CI,const CombineInfo & Paired)90081ad6265SDimitry Andric SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
90181ad6265SDimitry Andric const CombineInfo &Paired) {
90281ad6265SDimitry Andric const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
90381ad6265SDimitry Andric const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
90481ad6265SDimitry Andric
905*0fca6ea1SDimitry Andric unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
90681ad6265SDimitry Andric
90781ad6265SDimitry Andric // A base pointer for the combined operation is the same as the leading
90881ad6265SDimitry Andric // operation's pointer.
90981ad6265SDimitry Andric if (Paired < CI)
91081ad6265SDimitry Andric std::swap(MMOa, MMOb);
91181ad6265SDimitry Andric
91281ad6265SDimitry Andric MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
91381ad6265SDimitry Andric // If merging FLAT and GLOBAL set address space to FLAT.
91481ad6265SDimitry Andric if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
91581ad6265SDimitry Andric PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
91681ad6265SDimitry Andric
91781ad6265SDimitry Andric MachineFunction *MF = CI.I->getMF();
91881ad6265SDimitry Andric return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
9198bcb0991SDimitry Andric }
9208bcb0991SDimitry Andric
// Return true if the dmask operands of the two MIMG instructions allow them
// to be merged: no tfe/lwe, all other optional immediates equal, and the two
// channel masks disjoint.
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    // Both instructions must have the operand at the same index (or both
    // lack it) ...
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    // ... and, when present, with identical immediate values.
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  // Every bit of the smaller mask must lie strictly below the lowest set bit
  // of the larger mask, i.e. the two channel sets must not overlap.
  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}
9608bcb0991SDimitry Andric
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)961480093f4SDimitry Andric static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962480093f4SDimitry Andric unsigned ComponentCount,
9635ffd83dbSDimitry Andric const GCNSubtarget &STI) {
964480093f4SDimitry Andric if (ComponentCount > 4)
965480093f4SDimitry Andric return 0;
966480093f4SDimitry Andric
967480093f4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968480093f4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969480093f4SDimitry Andric if (!OldFormatInfo)
970480093f4SDimitry Andric return 0;
971480093f4SDimitry Andric
972480093f4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973480093f4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974480093f4SDimitry Andric ComponentCount,
975480093f4SDimitry Andric OldFormatInfo->NumFormat, STI);
976480093f4SDimitry Andric
977480093f4SDimitry Andric if (!NewFormatInfo)
978480093f4SDimitry Andric return 0;
979480093f4SDimitry Andric
980480093f4SDimitry Andric assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981480093f4SDimitry Andric NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982480093f4SDimitry Andric
983480093f4SDimitry Andric return NewFormatInfo->Format;
984480093f4SDimitry Andric }
985480093f4SDimitry Andric
986fe6060f1SDimitry Andric // Return the value in the inclusive range [Lo,Hi] that is aligned to the
987fe6060f1SDimitry Andric // highest power of two. Note that the result is well defined for all inputs
988fe6060f1SDimitry Andric // including corner cases like:
989fe6060f1SDimitry Andric // - if Lo == Hi, return that value
990fe6060f1SDimitry Andric // - if Lo == 0, return 0 (even though the "- 1" below underflows
991fe6060f1SDimitry Andric // - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  // Count the leading bits that (Lo - 1) and Hi share. The unsigned wrap of
  // (Lo - 1) for Lo == 0 is intentional and gives the documented result.
  const uint32_t Diff = (Lo - 1) ^ Hi;
  unsigned CommonLeadingBits = 0;
  for (uint32_t Bit = uint32_t(1) << 31; Bit != 0 && (Diff & Bit) == 0;
       Bit >>= 1)
    ++CommonLeadingBits;
  // Keep the shared leading bits of Hi plus one more and clear everything
  // below: this is the value in [Lo, Hi] aligned to the highest power of two
  // (equivalent to Hi & maskLeadingOnes(countl_zero(Diff) + 1)).
  const unsigned KeepBits = CommonLeadingBits + 1;
  const uint32_t Mask =
      KeepBits >= 32 ? ~uint32_t(0) : ~(~uint32_t(0) >> KeepBits);
  return Hi & Mask;
}
995fe6060f1SDimitry Andric
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)996480093f4SDimitry Andric bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
9975ffd83dbSDimitry Andric const GCNSubtarget &STI,
9985ffd83dbSDimitry Andric CombineInfo &Paired,
9995ffd83dbSDimitry Andric bool Modify) {
10008bcb0991SDimitry Andric assert(CI.InstClass != MIMG);
10018bcb0991SDimitry Andric
10020b57cec5SDimitry Andric // XXX - Would the same offset be OK? Is there any reason this would happen or
10030b57cec5SDimitry Andric // be useful?
1004480093f4SDimitry Andric if (CI.Offset == Paired.Offset)
10050b57cec5SDimitry Andric return false;
10060b57cec5SDimitry Andric
10070b57cec5SDimitry Andric // This won't be valid if the offset isn't aligned.
1008480093f4SDimitry Andric if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
10090b57cec5SDimitry Andric return false;
10100b57cec5SDimitry Andric
1011480093f4SDimitry Andric if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1012480093f4SDimitry Andric
1013480093f4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1014480093f4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1015480093f4SDimitry Andric if (!Info0)
1016480093f4SDimitry Andric return false;
1017480093f4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1018480093f4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1019480093f4SDimitry Andric if (!Info1)
1020480093f4SDimitry Andric return false;
1021480093f4SDimitry Andric
1022480093f4SDimitry Andric if (Info0->BitsPerComp != Info1->BitsPerComp ||
1023480093f4SDimitry Andric Info0->NumFormat != Info1->NumFormat)
1024480093f4SDimitry Andric return false;
1025480093f4SDimitry Andric
1026480093f4SDimitry Andric // TODO: Should be possible to support more formats, but if format loads
1027480093f4SDimitry Andric // are not dword-aligned, the merged load might not be valid.
1028480093f4SDimitry Andric if (Info0->BitsPerComp != 32)
1029480093f4SDimitry Andric return false;
1030480093f4SDimitry Andric
1031480093f4SDimitry Andric if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1032480093f4SDimitry Andric return false;
1033480093f4SDimitry Andric }
1034480093f4SDimitry Andric
1035fe6060f1SDimitry Andric uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1036fe6060f1SDimitry Andric uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
10370b57cec5SDimitry Andric CI.UseST64 = false;
10380b57cec5SDimitry Andric CI.BaseOff = 0;
10390b57cec5SDimitry Andric
1040fe6060f1SDimitry Andric // Handle all non-DS instructions.
10410b57cec5SDimitry Andric if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
104206c3fb27SDimitry Andric if (EltOffset0 + CI.Width != EltOffset1 &&
104306c3fb27SDimitry Andric EltOffset1 + Paired.Width != EltOffset0)
104406c3fb27SDimitry Andric return false;
104506c3fb27SDimitry Andric if (CI.CPol != Paired.CPol)
104606c3fb27SDimitry Andric return false;
10475f757f3fSDimitry Andric if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
10485f757f3fSDimitry Andric CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
10495f757f3fSDimitry Andric // Reject cases like:
10505f757f3fSDimitry Andric // dword + dwordx2 -> dwordx3
10515f757f3fSDimitry Andric // dword + dwordx3 -> dwordx4
10525f757f3fSDimitry Andric // If we tried to combine these cases, we would fail to extract a subreg
10535f757f3fSDimitry Andric // for the result of the second load due to SGPR alignment requirements.
10545f757f3fSDimitry Andric if (CI.Width != Paired.Width &&
10555f757f3fSDimitry Andric (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
10565f757f3fSDimitry Andric return false;
10575f757f3fSDimitry Andric }
105806c3fb27SDimitry Andric return true;
10590b57cec5SDimitry Andric }
10600b57cec5SDimitry Andric
10610b57cec5SDimitry Andric // If the offset in elements doesn't fit in 8-bits, we might be able to use
10620b57cec5SDimitry Andric // the stride 64 versions.
10630b57cec5SDimitry Andric if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
10640b57cec5SDimitry Andric isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
10655ffd83dbSDimitry Andric if (Modify) {
1066480093f4SDimitry Andric CI.Offset = EltOffset0 / 64;
1067480093f4SDimitry Andric Paired.Offset = EltOffset1 / 64;
10680b57cec5SDimitry Andric CI.UseST64 = true;
10695ffd83dbSDimitry Andric }
10700b57cec5SDimitry Andric return true;
10710b57cec5SDimitry Andric }
10720b57cec5SDimitry Andric
10730b57cec5SDimitry Andric // Check if the new offsets fit in the reduced 8-bit range.
10740b57cec5SDimitry Andric if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
10755ffd83dbSDimitry Andric if (Modify) {
1076480093f4SDimitry Andric CI.Offset = EltOffset0;
1077480093f4SDimitry Andric Paired.Offset = EltOffset1;
10785ffd83dbSDimitry Andric }
10790b57cec5SDimitry Andric return true;
10800b57cec5SDimitry Andric }
10810b57cec5SDimitry Andric
10820b57cec5SDimitry Andric // Try to shift base address to decrease offsets.
1083fe6060f1SDimitry Andric uint32_t Min = std::min(EltOffset0, EltOffset1);
1084fe6060f1SDimitry Andric uint32_t Max = std::max(EltOffset0, EltOffset1);
10850b57cec5SDimitry Andric
1086fe6060f1SDimitry Andric const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1087fe6060f1SDimitry Andric if (((Max - Min) & ~Mask) == 0) {
10885ffd83dbSDimitry Andric if (Modify) {
1089fe6060f1SDimitry Andric // From the range of values we could use for BaseOff, choose the one that
1090fe6060f1SDimitry Andric // is aligned to the highest power of two, to maximise the chance that
1091fe6060f1SDimitry Andric // the same offset can be reused for other load/store pairs.
1092fe6060f1SDimitry Andric uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1093fe6060f1SDimitry Andric // Copy the low bits of the offsets, so that when we adjust them by
1094fe6060f1SDimitry Andric // subtracting BaseOff they will be multiples of 64.
1095fe6060f1SDimitry Andric BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1096fe6060f1SDimitry Andric CI.BaseOff = BaseOff * CI.EltSize;
1097fe6060f1SDimitry Andric CI.Offset = (EltOffset0 - BaseOff) / 64;
1098fe6060f1SDimitry Andric Paired.Offset = (EltOffset1 - BaseOff) / 64;
10990b57cec5SDimitry Andric CI.UseST64 = true;
11005ffd83dbSDimitry Andric }
11010b57cec5SDimitry Andric return true;
11020b57cec5SDimitry Andric }
11030b57cec5SDimitry Andric
1104fe6060f1SDimitry Andric if (isUInt<8>(Max - Min)) {
11055ffd83dbSDimitry Andric if (Modify) {
1106fe6060f1SDimitry Andric // From the range of values we could use for BaseOff, choose the one that
1107fe6060f1SDimitry Andric // is aligned to the highest power of two, to maximise the chance that
1108fe6060f1SDimitry Andric // the same offset can be reused for other load/store pairs.
1109fe6060f1SDimitry Andric uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1110fe6060f1SDimitry Andric CI.BaseOff = BaseOff * CI.EltSize;
1111fe6060f1SDimitry Andric CI.Offset = EltOffset0 - BaseOff;
1112fe6060f1SDimitry Andric Paired.Offset = EltOffset1 - BaseOff;
11135ffd83dbSDimitry Andric }
11140b57cec5SDimitry Andric return true;
11150b57cec5SDimitry Andric }
11160b57cec5SDimitry Andric
11170b57cec5SDimitry Andric return false;
11180b57cec5SDimitry Andric }
11190b57cec5SDimitry Andric
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)11200b57cec5SDimitry Andric bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121480093f4SDimitry Andric const CombineInfo &CI,
1122480093f4SDimitry Andric const CombineInfo &Paired) {
1123480093f4SDimitry Andric const unsigned Width = (CI.Width + Paired.Width);
11240b57cec5SDimitry Andric switch (CI.InstClass) {
11250b57cec5SDimitry Andric default:
11260b57cec5SDimitry Andric return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
11270b57cec5SDimitry Andric case S_BUFFER_LOAD_IMM:
1128bdd1243dSDimitry Andric case S_BUFFER_LOAD_SGPR_IMM:
1129bdd1243dSDimitry Andric case S_LOAD_IMM:
11300b57cec5SDimitry Andric switch (Width) {
11310b57cec5SDimitry Andric default:
11320b57cec5SDimitry Andric return false;
11330b57cec5SDimitry Andric case 2:
11340b57cec5SDimitry Andric case 4:
1135349cc55cSDimitry Andric case 8:
11360b57cec5SDimitry Andric return true;
11375f757f3fSDimitry Andric case 3:
11385f757f3fSDimitry Andric return STM.hasScalarDwordx3Loads();
11390b57cec5SDimitry Andric }
11400b57cec5SDimitry Andric }
11410b57cec5SDimitry Andric }
11420b57cec5SDimitry Andric
1143fe6060f1SDimitry Andric const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1144fe6060f1SDimitry Andric SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145fe6060f1SDimitry Andric if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146fe6060f1SDimitry Andric return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147fe6060f1SDimitry Andric }
1148fe6060f1SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149fe6060f1SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1150fe6060f1SDimitry Andric }
1151fe6060f1SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152fe6060f1SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1153fe6060f1SDimitry Andric }
1154fe6060f1SDimitry Andric if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155fe6060f1SDimitry Andric return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156fe6060f1SDimitry Andric }
1157fe6060f1SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158fe6060f1SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1159fe6060f1SDimitry Andric }
1160fe6060f1SDimitry Andric return nullptr;
1161fe6060f1SDimitry Andric }
1162fe6060f1SDimitry Andric
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
///
/// Performs the final legality checks (instruction subclass match, combinable
/// offsets/dmasks, legal merged width) and verifies that all instructions
/// between CI and Paired can be safely crossed by the instruction being moved.
/// For DS instructions it also rewrites the offsets in CI/Paired so they are
/// valid for the merged read2/write2 form.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Same class is not enough: the opcodes must belong to the same mergeable
  // subclass (e.g. same flavor of buffer load).
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    // Every instruction strictly between CI and Paired must be swappable with
    // Paired (no def/use or memory conflicts). Note the loop pre-decrements,
    // so CI.I and Paired.I themselves are never tested against each other.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    // Symmetric walk forward from CI, checking the intervening instructions.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
12170b57cec5SDimitry Andric
1218*0fca6ea1SDimitry Andric // Copy the merged load result from DestReg to the original dest regs of CI and
1219*0fca6ea1SDimitry Andric // Paired.
copyToDestRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName,Register DestReg) const1220*0fca6ea1SDimitry Andric void SILoadStoreOptimizer::copyToDestRegs(
1221*0fca6ea1SDimitry Andric CombineInfo &CI, CombineInfo &Paired,
1222*0fca6ea1SDimitry Andric MachineBasicBlock::iterator InsertBefore, int OpName,
1223*0fca6ea1SDimitry Andric Register DestReg) const {
1224*0fca6ea1SDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1225*0fca6ea1SDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1226*0fca6ea1SDimitry Andric
1227*0fca6ea1SDimitry Andric auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1228*0fca6ea1SDimitry Andric
1229*0fca6ea1SDimitry Andric // Copy to the old destination registers.
1230*0fca6ea1SDimitry Andric const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1231*0fca6ea1SDimitry Andric auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232*0fca6ea1SDimitry Andric auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1233*0fca6ea1SDimitry Andric
1234*0fca6ea1SDimitry Andric // The constrained sload instructions in S_LOAD_IMM class will have
1235*0fca6ea1SDimitry Andric // `early-clobber` flag in the dst operand. Remove the flag before using the
1236*0fca6ea1SDimitry Andric // MOs in copies.
1237*0fca6ea1SDimitry Andric Dest0->setIsEarlyClobber(false);
1238*0fca6ea1SDimitry Andric Dest1->setIsEarlyClobber(false);
1239*0fca6ea1SDimitry Andric
1240*0fca6ea1SDimitry Andric BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1241*0fca6ea1SDimitry Andric .add(*Dest0) // Copy to same destination including flags and sub reg.
1242*0fca6ea1SDimitry Andric .addReg(DestReg, 0, SubRegIdx0);
1243*0fca6ea1SDimitry Andric BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1244*0fca6ea1SDimitry Andric .add(*Dest1)
1245*0fca6ea1SDimitry Andric .addReg(DestReg, RegState::Kill, SubRegIdx1);
1246*0fca6ea1SDimitry Andric }
1247*0fca6ea1SDimitry Andric
1248*0fca6ea1SDimitry Andric // Return a register for the source of the merged store after copying the
1249*0fca6ea1SDimitry Andric // original source regs of CI and Paired into it.
1250*0fca6ea1SDimitry Andric Register
copyFromSrcRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName) const1251*0fca6ea1SDimitry Andric SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252*0fca6ea1SDimitry Andric MachineBasicBlock::iterator InsertBefore,
1253*0fca6ea1SDimitry Andric int OpName) const {
1254*0fca6ea1SDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1255*0fca6ea1SDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1256*0fca6ea1SDimitry Andric
1257*0fca6ea1SDimitry Andric auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258*0fca6ea1SDimitry Andric
1259*0fca6ea1SDimitry Andric // Copy to the new source register.
1260*0fca6ea1SDimitry Andric const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261*0fca6ea1SDimitry Andric Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262*0fca6ea1SDimitry Andric
1263*0fca6ea1SDimitry Andric const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264*0fca6ea1SDimitry Andric const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265*0fca6ea1SDimitry Andric
1266*0fca6ea1SDimitry Andric BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267*0fca6ea1SDimitry Andric .add(*Src0)
1268*0fca6ea1SDimitry Andric .addImm(SubRegIdx0)
1269*0fca6ea1SDimitry Andric .add(*Src1)
1270*0fca6ea1SDimitry Andric .addImm(SubRegIdx1);
1271*0fca6ea1SDimitry Andric
1272*0fca6ea1SDimitry Andric return SrcReg;
1273*0fca6ea1SDimitry Andric }
1274*0fca6ea1SDimitry Andric
read2Opcode(unsigned EltSize) const12750b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
12760b57cec5SDimitry Andric if (STM->ldsRequiresM0Init())
12770b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
12780b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric
read2ST64Opcode(unsigned EltSize) const12810b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
12820b57cec5SDimitry Andric if (STM->ldsRequiresM0Init())
12830b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
12840b57cec5SDimitry Andric
12850b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
12860b57cec5SDimitry Andric : AMDGPU::DS_READ2ST64_B64_gfx9;
12870b57cec5SDimitry Andric }
12880b57cec5SDimitry Andric
// Merge a pair of DS reads into a single ds_read2 (or ds_read2st64) inserted
// at InsertBefore. CI/Paired offsets must already have been rewritten by
// offsetsCanBeCombined(..., Modify=true). The two results are copied out of
// the wide destination register, both original loads are erased, and an
// iterator to the new read2 is returned.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  // Canonicalize so the smaller offset is offset0.
  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A common base offset was factored out of the two offsets: materialize
    // it in an SGPR and add it to the address so the (now small) offsets are
    // relative to the rebased address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  // Forward each half of the wide result to the original destinations.
  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
13470b57cec5SDimitry Andric
write2Opcode(unsigned EltSize) const13480b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
13490b57cec5SDimitry Andric if (STM->ldsRequiresM0Init())
13500b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
13510b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
13520b57cec5SDimitry Andric : AMDGPU::DS_WRITE2_B64_gfx9;
13530b57cec5SDimitry Andric }
13540b57cec5SDimitry Andric
write2ST64Opcode(unsigned EltSize) const13550b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
13560b57cec5SDimitry Andric if (STM->ldsRequiresM0Init())
13570b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
13580b57cec5SDimitry Andric : AMDGPU::DS_WRITE2ST64_B64;
13590b57cec5SDimitry Andric
13600b57cec5SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
13610b57cec5SDimitry Andric : AMDGPU::DS_WRITE2ST64_B64_gfx9;
13620b57cec5SDimitry Andric }
13630b57cec5SDimitry Andric
// Merge a pair of DS writes into a single ds_write2 (or ds_write2st64)
// inserted at InsertBefore. CI/Paired offsets must already have been
// rewritten by offsetsCanBeCombined(..., Modify=true). Both original stores
// are erased; an iterator to the new write2 is returned.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    // The data operands must be swapped together with the offsets so each
    // value is still stored to its original address.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A common base offset was factored out of the two offsets: materialize
    // it in an SGPR and add it to the address so the (now small) offsets are
    // relative to the rebased address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
14290b57cec5SDimitry Andric
// Merge a pair of MIMG instructions by OR-ing their dmasks into a single
// image op with a wider destination, inserted at InsertBefore. All other
// operands are taken from CI. The two results are copied out of the merged
// destination register and both originals are erased.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  // The merged instruction reads the union of the two channel masks.
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone CI's operands (skipping operand 0, the old vdata def), substituting
  // the merged dmask at its operand index.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14658bcb0991SDimitry Andric
// Merge a pair of scalar memory loads (S_LOAD_IMM / S_BUFFER_LOAD_IMM /
// S_BUFFER_LOAD_SGPR_IMM) into a single wider load inserted at InsertBefore,
// reading from the lower of the two offsets. Results are copied out of the
// merged sdst register and both originals are erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR+IMM form carries an soffset register operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14978bcb0991SDimitry Andric
// Merge a pair of buffer loads into a single wider buffer load inserted at
// InsertBefore, reading from the lower of the two offsets. Results are copied
// out of the merged vdata register and both originals are erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the wide destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Not every buffer opcode has a vaddr operand; add it only when present.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
15380b57cec5SDimitry Andric
// Merge a pair of typed-buffer (tbuffer) loads into a single wider load
// inserted at InsertBefore, reading from the lower of the two offsets and
// using a format that covers the combined component count. Results are copied
// out of the merged vdata register and both originals are erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the wide destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Not every tbuffer opcode has a vaddr operand; add it only when present.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // Widen the format to the combined component count; offsetsCanBeCombined
  // already verified this format exists (non-zero) for these widths.
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1583480093f4SDimitry Andric
// Merge the two tbuffer stores described by CI and Paired into one wider
// tbuffer store emitted at InsertBefore. The merged source register is built
// from the two original data operands (copyFromSrcRegs); both original
// instructions are erased. Returns an iterator to the new instruction.
mergeTBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)15845ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
15855ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
158681ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1587480093f4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1588480093f4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1589480093f4SDimitry Andric
1590480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1591480093f4SDimitry Andric
// Gather both stores' vdata operands into one super-register.
1592*0fca6ea1SDimitry Andric   Register SrcReg =
1593*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1594480093f4SDimitry Andric
159581ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1596480093f4SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
1597480093f4SDimitry Andric
15985ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1599480093f4SDimitry Andric
16005ffd83dbSDimitry Andric   if (Regs.VAddr)
1601480093f4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1602480093f4SDimitry Andric
// Recompute the buffer format for the combined component count.
1603480093f4SDimitry Andric   unsigned JoinedFormat =
16045ffd83dbSDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1605480093f4SDimitry Andric
1606480093f4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1607480093f4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1608480093f4SDimitry Andric   // will return true if this is the case.
1609480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1610480093f4SDimitry Andric
1611480093f4SDimitry Andric   MachineInstr *New =
1612480093f4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613480093f4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614480093f4SDimitry Andric           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1615480093f4SDimitry Andric           .addImm(JoinedFormat)                     // format
1616fe6060f1SDimitry Andric           .addImm(CI.CPol)                          // cpol
1617480093f4SDimitry Andric           .addImm(0)                                // swz
161881ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1619480093f4SDimitry Andric
162081ad6265SDimitry Andric   CI.I->eraseFromParent();
162181ad6265SDimitry Andric   Paired.I->eraseFromParent();
162281ad6265SDimitry Andric   return New;
162381ad6265SDimitry Andric }
162481ad6265SDimitry Andric
// Merge the two FLAT/GLOBAL loads described by CI and Paired into one wider
// load at InsertBefore. The optional saddr operand is forwarded when present
// (GLOBAL_*_SADDR forms); the original destinations are filled via
// copyToDestRegs and both originals are erased.
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)162581ad6265SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
162681ad6265SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
162781ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
162881ad6265SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
162981ad6265SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
163081ad6265SDimitry Andric
163181ad6265SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
163281ad6265SDimitry Andric
163381ad6265SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
163481ad6265SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
163581ad6265SDimitry Andric
163681ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
163781ad6265SDimitry Andric
// saddr is only present on the SADDR variants; add it first when it exists.
163881ad6265SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
163981ad6265SDimitry Andric     MIB.add(*SAddr);
164081ad6265SDimitry Andric
164181ad6265SDimitry Andric   MachineInstr *New =
164281ad6265SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
164381ad6265SDimitry Andric           .addImm(std::min(CI.Offset, Paired.Offset))
164481ad6265SDimitry Andric           .addImm(CI.CPol)
164581ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
164681ad6265SDimitry Andric
1647*0fca6ea1SDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
164881ad6265SDimitry Andric
164981ad6265SDimitry Andric   CI.I->eraseFromParent();
165081ad6265SDimitry Andric   Paired.I->eraseFromParent();
165181ad6265SDimitry Andric   return New;
165281ad6265SDimitry Andric }
165381ad6265SDimitry Andric
// Merge the two FLAT/GLOBAL stores described by CI and Paired into one wider
// store at InsertBefore. The data operands are gathered into one
// super-register (copyFromSrcRegs); the optional saddr operand is forwarded
// when present. Both original instructions are erased.
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)165481ad6265SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
165581ad6265SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
165681ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
165781ad6265SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
165881ad6265SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
165981ad6265SDimitry Andric
166081ad6265SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
166181ad6265SDimitry Andric
1662*0fca6ea1SDimitry Andric   Register SrcReg =
1663*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
166481ad6265SDimitry Andric
166581ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
166681ad6265SDimitry Andric                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
166781ad6265SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
166881ad6265SDimitry Andric
// saddr is only present on the SADDR variants.
166981ad6265SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
167081ad6265SDimitry Andric     MIB.add(*SAddr);
167181ad6265SDimitry Andric
167281ad6265SDimitry Andric   MachineInstr *New =
167381ad6265SDimitry Andric       MIB.addImm(std::min(CI.Offset, Paired.Offset))
167481ad6265SDimitry Andric           .addImm(CI.CPol)
167581ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676480093f4SDimitry Andric
1677480093f4SDimitry Andric   CI.I->eraseFromParent();
1678480093f4SDimitry Andric   Paired.I->eraseFromParent();
1679480093f4SDimitry Andric   return New;
1680480093f4SDimitry Andric }
1681480093f4SDimitry Andric
// Map the common instruction class of CI/Paired plus the combined width
// (CI.Width + Paired.Width, in dwords) to the opcode of the merged
// instruction. Returns 0 for width combinations that have no wider opcode,
// which callers must treat as "cannot merge".
getNewOpcode(const CombineInfo & CI,const CombineInfo & Paired)1682480093f4SDimitry Andric unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683480093f4SDimitry Andric                                             const CombineInfo &Paired) {
1684480093f4SDimitry Andric   const unsigned Width = CI.Width + Paired.Width;
16850b57cec5SDimitry Andric
168681ad6265SDimitry Andric   switch (getCommonInstClass(CI, Paired)) {
16870b57cec5SDimitry Andric   default:
// MUBUF loads/stores are the fall-through case; the opcode is derived from
// the base opcode plus the new width via the generated opcode tables.
16888bcb0991SDimitry Andric     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
16898bcb0991SDimitry Andric     // FIXME: Handle d16 correctly
16908bcb0991SDimitry Andric     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
16918bcb0991SDimitry Andric                                   Width);
1692480093f4SDimitry Andric   case TBUFFER_LOAD:
1693480093f4SDimitry Andric   case TBUFFER_STORE:
1694480093f4SDimitry Andric     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1695480093f4SDimitry Andric                                   Width);
1696480093f4SDimitry Andric
16970b57cec5SDimitry Andric   case UNKNOWN:
16980b57cec5SDimitry Andric     llvm_unreachable("Unknown instruction class");
16990b57cec5SDimitry Andric   case S_BUFFER_LOAD_IMM:
17000b57cec5SDimitry Andric     switch (Width) {
17010b57cec5SDimitry Andric     default:
17020b57cec5SDimitry Andric       return 0;
17030b57cec5SDimitry Andric     case 2:
17040b57cec5SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
17055f757f3fSDimitry Andric     case 3:
17065f757f3fSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
17070b57cec5SDimitry Andric     case 4:
17080b57cec5SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709349cc55cSDimitry Andric     case 8:
1710349cc55cSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
17110b57cec5SDimitry Andric     }
1712bdd1243dSDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1713bdd1243dSDimitry Andric     switch (Width) {
1714bdd1243dSDimitry Andric     default:
1715bdd1243dSDimitry Andric       return 0;
1716bdd1243dSDimitry Andric     case 2:
171706c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
17185f757f3fSDimitry Andric     case 3:
17195f757f3fSDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720bdd1243dSDimitry Andric     case 4:
172106c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722bdd1243dSDimitry Andric     case 8:
172306c3fb27SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724bdd1243dSDimitry Andric     }
1725*0fca6ea1SDimitry Andric   case S_LOAD_IMM: {
1726*0fca6ea1SDimitry Andric     // If XNACK is enabled, use the constrained opcodes when the first load is
1727*0fca6ea1SDimitry Andric     // under-aligned.
1728*0fca6ea1SDimitry Andric     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729*0fca6ea1SDimitry Andric     bool NeedsConstrainedOpc =
1730*0fca6ea1SDimitry Andric         STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1731bdd1243dSDimitry Andric     switch (Width) {
1732bdd1243dSDimitry Andric     default:
1733bdd1243dSDimitry Andric       return 0;
1734bdd1243dSDimitry Andric     case 2:
1735*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
17375f757f3fSDimitry Andric     case 3:
1738*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1740bdd1243dSDimitry Andric     case 4:
1741*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1743bdd1243dSDimitry Andric     case 8:
1744*0fca6ea1SDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745*0fca6ea1SDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1746*0fca6ea1SDimitry Andric     }
1747bdd1243dSDimitry Andric   }
174881ad6265SDimitry Andric   case GLOBAL_LOAD:
174981ad6265SDimitry Andric     switch (Width) {
175081ad6265SDimitry Andric     default:
175181ad6265SDimitry Andric       return 0;
175281ad6265SDimitry Andric     case 2:
175381ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2;
175481ad6265SDimitry Andric     case 3:
175581ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3;
175681ad6265SDimitry Andric     case 4:
175781ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4;
175881ad6265SDimitry Andric     }
175981ad6265SDimitry Andric   case GLOBAL_LOAD_SADDR:
176081ad6265SDimitry Andric     switch (Width) {
176181ad6265SDimitry Andric     default:
176281ad6265SDimitry Andric       return 0;
176381ad6265SDimitry Andric     case 2:
176481ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
176581ad6265SDimitry Andric     case 3:
176681ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
176781ad6265SDimitry Andric     case 4:
176881ad6265SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
176981ad6265SDimitry Andric     }
177081ad6265SDimitry Andric   case GLOBAL_STORE:
177181ad6265SDimitry Andric     switch (Width) {
177281ad6265SDimitry Andric     default:
177381ad6265SDimitry Andric       return 0;
177481ad6265SDimitry Andric     case 2:
177581ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2;
177681ad6265SDimitry Andric     case 3:
177781ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3;
177881ad6265SDimitry Andric     case 4:
177981ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4;
178081ad6265SDimitry Andric     }
178181ad6265SDimitry Andric   case GLOBAL_STORE_SADDR:
178281ad6265SDimitry Andric     switch (Width) {
178381ad6265SDimitry Andric     default:
178481ad6265SDimitry Andric       return 0;
178581ad6265SDimitry Andric     case 2:
178681ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
178781ad6265SDimitry Andric     case 3:
178881ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
178981ad6265SDimitry Andric     case 4:
179081ad6265SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
179181ad6265SDimitry Andric     }
179281ad6265SDimitry Andric   case FLAT_LOAD:
179381ad6265SDimitry Andric     switch (Width) {
179481ad6265SDimitry Andric     default:
179581ad6265SDimitry Andric       return 0;
179681ad6265SDimitry Andric     case 2:
179781ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX2;
179881ad6265SDimitry Andric     case 3:
179981ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX3;
180081ad6265SDimitry Andric     case 4:
180181ad6265SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX4;
180281ad6265SDimitry Andric     }
180381ad6265SDimitry Andric   case FLAT_STORE:
180481ad6265SDimitry Andric     switch (Width) {
180581ad6265SDimitry Andric     default:
180681ad6265SDimitry Andric       return 0;
180781ad6265SDimitry Andric     case 2:
180881ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX2;
180981ad6265SDimitry Andric     case 3:
181081ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX3;
181181ad6265SDimitry Andric     case 4:
181281ad6265SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX4;
181381ad6265SDimitry Andric     }
18148bcb0991SDimitry Andric   case MIMG:
// For image instructions the merged width must equal the union of the two
// dmasks (no overlapping components), checked here.
1815bdd1243dSDimitry Andric     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1816349cc55cSDimitry Andric            "No overlaps");
18178bcb0991SDimitry Andric     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
18180b57cec5SDimitry Andric   }
18190b57cec5SDimitry Andric }
18200b57cec5SDimitry Andric
18210b57cec5SDimitry Andric std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1822349cc55cSDimitry Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823349cc55cSDimitry Andric const CombineInfo &Paired) {
1824bdd1243dSDimitry Andric assert((CI.InstClass != MIMG ||
1825bdd1243dSDimitry Andric ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
182681ad6265SDimitry Andric CI.Width + Paired.Width)) &&
18278bcb0991SDimitry Andric "No overlaps");
18288bcb0991SDimitry Andric
1829349cc55cSDimitry Andric unsigned Idx0;
1830349cc55cSDimitry Andric unsigned Idx1;
1831349cc55cSDimitry Andric
183204eeddc0SDimitry Andric static const unsigned Idxs[5][4] = {
18338bcb0991SDimitry Andric {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
183404eeddc0SDimitry Andric {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
183504eeddc0SDimitry Andric {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
183604eeddc0SDimitry Andric {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
183704eeddc0SDimitry Andric {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
18388bcb0991SDimitry Andric };
18398bcb0991SDimitry Andric
184004eeddc0SDimitry Andric assert(CI.Width >= 1 && CI.Width <= 4);
184104eeddc0SDimitry Andric assert(Paired.Width >= 1 && Paired.Width <= 4);
18428bcb0991SDimitry Andric
184381ad6265SDimitry Andric if (Paired < CI) {
1844480093f4SDimitry Andric Idx1 = Idxs[0][Paired.Width - 1];
1845480093f4SDimitry Andric Idx0 = Idxs[Paired.Width][CI.Width - 1];
18460b57cec5SDimitry Andric } else {
1847480093f4SDimitry Andric Idx0 = Idxs[0][CI.Width - 1];
1848480093f4SDimitry Andric Idx1 = Idxs[CI.Width][Paired.Width - 1];
18490b57cec5SDimitry Andric }
18508bcb0991SDimitry Andric
1851*0fca6ea1SDimitry Andric return {Idx0, Idx1};
18520b57cec5SDimitry Andric }
18530b57cec5SDimitry Andric
18540b57cec5SDimitry Andric const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired) const1855480093f4SDimitry Andric SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856*0fca6ea1SDimitry Andric const CombineInfo &Paired) const {
1857bdd1243dSDimitry Andric if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858bdd1243dSDimitry Andric CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859480093f4SDimitry Andric switch (CI.Width + Paired.Width) {
18600b57cec5SDimitry Andric default:
18610b57cec5SDimitry Andric return nullptr;
18620b57cec5SDimitry Andric case 2:
18630b57cec5SDimitry Andric return &AMDGPU::SReg_64_XEXECRegClass;
18645f757f3fSDimitry Andric case 3:
18655f757f3fSDimitry Andric return &AMDGPU::SGPR_96RegClass;
18660b57cec5SDimitry Andric case 4:
18678bcb0991SDimitry Andric return &AMDGPU::SGPR_128RegClass;
18680b57cec5SDimitry Andric case 8:
18695ffd83dbSDimitry Andric return &AMDGPU::SGPR_256RegClass;
18700b57cec5SDimitry Andric case 16:
18715ffd83dbSDimitry Andric return &AMDGPU::SGPR_512RegClass;
18720b57cec5SDimitry Andric }
18730b57cec5SDimitry Andric }
1874fe6060f1SDimitry Andric
1875fe6060f1SDimitry Andric unsigned BitWidth = 32 * (CI.Width + Paired.Width);
18764824e7fdSDimitry Andric return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877fe6060f1SDimitry Andric ? TRI->getAGPRClassForBitWidth(BitWidth)
1878fe6060f1SDimitry Andric : TRI->getVGPRClassForBitWidth(BitWidth);
18790b57cec5SDimitry Andric }
18800b57cec5SDimitry Andric
// Merge the two MUBUF buffer stores described by CI and Paired into one wider
// buffer store at InsertBefore. Mirrors mergeTBufferStorePair but without a
// format operand. Both original instructions are erased.
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)18815ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
18825ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
188381ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
18840b57cec5SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
18850b57cec5SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
18860b57cec5SDimitry Andric
1887480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
18880b57cec5SDimitry Andric
1889*0fca6ea1SDimitry Andric   Register SrcReg =
1890*0fca6ea1SDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
18910b57cec5SDimitry Andric
189281ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
18930b57cec5SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
18940b57cec5SDimitry Andric
18955ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
18960b57cec5SDimitry Andric
18975ffd83dbSDimitry Andric   if (Regs.VAddr)
18980b57cec5SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
18990b57cec5SDimitry Andric
19008bcb0991SDimitry Andric
19018bcb0991SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
19028bcb0991SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
19038bcb0991SDimitry Andric   // will return true if this is the case.
1904480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
19058bcb0991SDimitry Andric
19068bcb0991SDimitry Andric   MachineInstr *New =
19070b57cec5SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
19080b57cec5SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909480093f4SDimitry Andric           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1910fe6060f1SDimitry Andric           .addImm(CI.CPol)      // cpol
19118bcb0991SDimitry Andric           .addImm(0)            // swz
191281ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
19130b57cec5SDimitry Andric
19140b57cec5SDimitry Andric   CI.I->eraseFromParent();
1915480093f4SDimitry Andric   Paired.I->eraseFromParent();
19168bcb0991SDimitry Andric   return New;
19170b57cec5SDimitry Andric }
19180b57cec5SDimitry Andric
19190b57cec5SDimitry Andric MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const19208bcb0991SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
19210b57cec5SDimitry Andric APInt V(32, Val, true);
19220b57cec5SDimitry Andric if (TII->isInlineConstant(V))
19230b57cec5SDimitry Andric return MachineOperand::CreateImm(Val);
19240b57cec5SDimitry Andric
19258bcb0991SDimitry Andric Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
19260b57cec5SDimitry Andric MachineInstr *Mov =
19270b57cec5SDimitry Andric BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
19280b57cec5SDimitry Andric TII->get(AMDGPU::S_MOV_B32), Reg)
19290b57cec5SDimitry Andric .addImm(Val);
19300b57cec5SDimitry Andric (void)Mov;
19310b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; Mov->dump());
19320b57cec5SDimitry Andric return MachineOperand::CreateReg(Reg, false);
19330b57cec5SDimitry Andric }
19340b57cec5SDimitry Andric
19350b57cec5SDimitry Andric // Compute base address using Addr and return the final register.
computeBase(MachineInstr & MI,const MemAddress & Addr) const19365ffd83dbSDimitry Andric Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
19378bcb0991SDimitry Andric const MemAddress &Addr) const {
19380b57cec5SDimitry Andric MachineBasicBlock *MBB = MI.getParent();
19390b57cec5SDimitry Andric MachineBasicBlock::iterator MBBI = MI.getIterator();
19400b57cec5SDimitry Andric DebugLoc DL = MI.getDebugLoc();
19410b57cec5SDimitry Andric
19420b57cec5SDimitry Andric assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
19430b57cec5SDimitry Andric Addr.Base.LoSubReg) &&
19440b57cec5SDimitry Andric "Expected 32-bit Base-Register-Low!!");
19450b57cec5SDimitry Andric
19460b57cec5SDimitry Andric assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
19470b57cec5SDimitry Andric Addr.Base.HiSubReg) &&
19480b57cec5SDimitry Andric "Expected 32-bit Base-Register-Hi!!");
19490b57cec5SDimitry Andric
19500b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
19510b57cec5SDimitry Andric MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
19520b57cec5SDimitry Andric MachineOperand OffsetHi =
19530b57cec5SDimitry Andric createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
19540b57cec5SDimitry Andric
19550b57cec5SDimitry Andric const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
19568bcb0991SDimitry Andric Register CarryReg = MRI->createVirtualRegister(CarryRC);
19578bcb0991SDimitry Andric Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
19580b57cec5SDimitry Andric
19598bcb0991SDimitry Andric Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19608bcb0991SDimitry Andric Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19610b57cec5SDimitry Andric MachineInstr *LoHalf =
1962e8d8bef9SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
19630b57cec5SDimitry Andric .addReg(CarryReg, RegState::Define)
19640b57cec5SDimitry Andric .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
19650b57cec5SDimitry Andric .add(OffsetLo)
19660b57cec5SDimitry Andric .addImm(0); // clamp bit
19670b57cec5SDimitry Andric (void)LoHalf;
19680b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
19690b57cec5SDimitry Andric
19700b57cec5SDimitry Andric MachineInstr *HiHalf =
19710b57cec5SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
19720b57cec5SDimitry Andric .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
19730b57cec5SDimitry Andric .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
19740b57cec5SDimitry Andric .add(OffsetHi)
19750b57cec5SDimitry Andric .addReg(CarryReg, RegState::Kill)
19760b57cec5SDimitry Andric .addImm(0); // clamp bit
19770b57cec5SDimitry Andric (void)HiHalf;
19780b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
19790b57cec5SDimitry Andric
1980fe6060f1SDimitry Andric Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
19810b57cec5SDimitry Andric MachineInstr *FullBase =
19820b57cec5SDimitry Andric BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
19830b57cec5SDimitry Andric .addReg(DestSub0)
19840b57cec5SDimitry Andric .addImm(AMDGPU::sub0)
19850b57cec5SDimitry Andric .addReg(DestSub1)
19860b57cec5SDimitry Andric .addImm(AMDGPU::sub1);
19870b57cec5SDimitry Andric (void)FullBase;
19880b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
19890b57cec5SDimitry Andric
19900b57cec5SDimitry Andric return FullDestReg;
19910b57cec5SDimitry Andric }
19920b57cec5SDimitry Andric
19930b57cec5SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const19940b57cec5SDimitry Andric void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
19955ffd83dbSDimitry Andric Register NewBase,
19968bcb0991SDimitry Andric int32_t NewOffset) const {
1997480093f4SDimitry Andric auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998480093f4SDimitry Andric Base->setReg(NewBase);
1999480093f4SDimitry Andric Base->setIsKill(false);
20000b57cec5SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
20010b57cec5SDimitry Andric }
20020b57cec5SDimitry Andric
2003bdd1243dSDimitry Andric std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const20048bcb0991SDimitry Andric SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
20050b57cec5SDimitry Andric if (Op.isImm())
20060b57cec5SDimitry Andric return Op.getImm();
20070b57cec5SDimitry Andric
20080b57cec5SDimitry Andric if (!Op.isReg())
2009bdd1243dSDimitry Andric return std::nullopt;
20100b57cec5SDimitry Andric
20110b57cec5SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
20120b57cec5SDimitry Andric if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
20130b57cec5SDimitry Andric !Def->getOperand(1).isImm())
2014bdd1243dSDimitry Andric return std::nullopt;
20150b57cec5SDimitry Andric
20160b57cec5SDimitry Andric return Def->getOperand(1).getImm();
20170b57cec5SDimitry Andric }
20180b57cec5SDimitry Andric
20190b57cec5SDimitry Andric // Analyze Base and extracts:
20200b57cec5SDimitry Andric // - 32bit base registers, subregisters
20210b57cec5SDimitry Andric // - 64bit constant offset
20220b57cec5SDimitry Andric // Expecting base computation as:
20230b57cec5SDimitry Andric // %OFFSET0:sgpr_32 = S_MOV_B32 8000
20240b57cec5SDimitry Andric // %LO:vgpr_32, %c:sreg_64_xexec =
2025e8d8bef9SDimitry Andric // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
20260b57cec5SDimitry Andric // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
20270b57cec5SDimitry Andric // %Base:vreg_64 =
20280b57cec5SDimitry Andric // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
processBaseWithConstOffset(const MachineOperand & Base,MemAddress & Addr) const20290b57cec5SDimitry Andric void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
20308bcb0991SDimitry Andric MemAddress &Addr) const {
20310b57cec5SDimitry Andric if (!Base.isReg())
20320b57cec5SDimitry Andric return;
20330b57cec5SDimitry Andric
20340b57cec5SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
20350b57cec5SDimitry Andric if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
20360b57cec5SDimitry Andric || Def->getNumOperands() != 5)
20370b57cec5SDimitry Andric return;
20380b57cec5SDimitry Andric
20390b57cec5SDimitry Andric MachineOperand BaseLo = Def->getOperand(1);
20400b57cec5SDimitry Andric MachineOperand BaseHi = Def->getOperand(3);
20410b57cec5SDimitry Andric if (!BaseLo.isReg() || !BaseHi.isReg())
20420b57cec5SDimitry Andric return;
20430b57cec5SDimitry Andric
20440b57cec5SDimitry Andric MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
20450b57cec5SDimitry Andric MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
20460b57cec5SDimitry Andric
2047e8d8bef9SDimitry Andric if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
20480b57cec5SDimitry Andric !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
20490b57cec5SDimitry Andric return;
20500b57cec5SDimitry Andric
20510b57cec5SDimitry Andric const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
20520b57cec5SDimitry Andric const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
20530b57cec5SDimitry Andric
20540b57cec5SDimitry Andric auto Offset0P = extractConstOffset(*Src0);
20550b57cec5SDimitry Andric if (Offset0P)
20560b57cec5SDimitry Andric BaseLo = *Src1;
20570b57cec5SDimitry Andric else {
20580b57cec5SDimitry Andric if (!(Offset0P = extractConstOffset(*Src1)))
20590b57cec5SDimitry Andric return;
20600b57cec5SDimitry Andric BaseLo = *Src0;
20610b57cec5SDimitry Andric }
20620b57cec5SDimitry Andric
20630b57cec5SDimitry Andric Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
20640b57cec5SDimitry Andric Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
20650b57cec5SDimitry Andric
20660b57cec5SDimitry Andric if (Src0->isImm())
20670b57cec5SDimitry Andric std::swap(Src0, Src1);
20680b57cec5SDimitry Andric
2069*0fca6ea1SDimitry Andric if (!Src1->isImm() || Src0->isImm())
20700b57cec5SDimitry Andric return;
20710b57cec5SDimitry Andric
20720b57cec5SDimitry Andric uint64_t Offset1 = Src1->getImm();
20730b57cec5SDimitry Andric BaseHi = *Src0;
20740b57cec5SDimitry Andric
20750b57cec5SDimitry Andric Addr.Base.LoReg = BaseLo.getReg();
20760b57cec5SDimitry Andric Addr.Base.HiReg = BaseHi.getReg();
20770b57cec5SDimitry Andric Addr.Base.LoSubReg = BaseLo.getSubReg();
20780b57cec5SDimitry Andric Addr.Base.HiSubReg = BaseHi.getSubReg();
20790b57cec5SDimitry Andric Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
20800b57cec5SDimitry Andric }
20810b57cec5SDimitry Andric
// Try to fold a constant component of MI's 64-bit FLAT/global address into
// the instruction's immediate offset field, by re-basing MI (and any other
// same-opcode instructions with the same base registers) on an "anchor"
// address found later in the block.
//
// \param MI         the FLAT load/store candidate (must currently have a
//                   zero immediate offset).
// \param Visited    cache of base/offset decompositions per instruction so
//                   repeated scans of the block do not recompute them.
// \param AnchorList instructions already used as anchors; they are skipped
//                   so their re-computed bases are not disturbed again.
// \returns true if MI (and possibly other instructions) were rewritten.
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const20820b57cec5SDimitry Andric bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
20830b57cec5SDimitry Andric     MachineInstr &MI,
20840b57cec5SDimitry Andric     MemInfoMap &Visited,
20858bcb0991SDimitry Andric     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
20860b57cec5SDimitry Andric
  // Only FLAT-family instructions on subtargets with instruction offsets
  // can absorb an immediate offset here.
2087*0fca6ea1SDimitry Andric   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
20880b57cec5SDimitry Andric     return false;
20890b57cec5SDimitry Andric
2090*0fca6ea1SDimitry Andric   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091*0fca6ea1SDimitry Andric   if (SIInstrInfo::isFLATScratch(MI))
20928bcb0991SDimitry Andric     return false;
20938bcb0991SDimitry Andric
  // The address space determines the legal immediate-offset range queried
  // via isLegalFlatAddressingMode below.
2094*0fca6ea1SDimitry Andric   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095*0fca6ea1SDimitry Andric                                               : AMDGPUAS::FLAT_ADDRESS;
20960b57cec5SDimitry Andric
20970b57cec5SDimitry Andric   if (AnchorList.count(&MI))
20980b57cec5SDimitry Andric     return false;
20990b57cec5SDimitry Andric
21000b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
21010b57cec5SDimitry Andric
  // A non-zero immediate means a promotion has already happened (or the
  // offset was there to begin with); nothing more to do.
21020b57cec5SDimitry Andric   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
21030b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
21040b57cec5SDimitry Andric     return false;
21050b57cec5SDimitry Andric   }
21060b57cec5SDimitry Andric
21070b57cec5SDimitry Andric   // Step1: Find the base-registers and a 64bit constant offset.
21080b57cec5SDimitry Andric   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
21090b57cec5SDimitry Andric   MemAddress MAddr;
211006c3fb27SDimitry Andric   if (!Visited.contains(&MI)) {
21110b57cec5SDimitry Andric     processBaseWithConstOffset(Base, MAddr);
21120b57cec5SDimitry Andric     Visited[&MI] = MAddr;
21130b57cec5SDimitry Andric   } else
21140b57cec5SDimitry Andric     MAddr = Visited[&MI];
21150b57cec5SDimitry Andric
21160b57cec5SDimitry Andric   if (MAddr.Offset == 0) {
21170b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
21180b57cec5SDimitry Andric                          " constant offsets that can be promoted.\n";);
21190b57cec5SDimitry Andric     return false;
21200b57cec5SDimitry Andric   }
21210b57cec5SDimitry Andric
21220b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
21230b57cec5SDimitry Andric              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
21240b57cec5SDimitry Andric
21250b57cec5SDimitry Andric   // Step2: Traverse through MI's basic block and find an anchor(that has the
21260b57cec5SDimitry Andric   // same base-registers) with the highest 13bit distance from MI's offset.
21270b57cec5SDimitry Andric   // E.g. (64bit loads)
21280b57cec5SDimitry Andric   // bb:
21290b57cec5SDimitry Andric   // addr1 = &a + 4096;   load1 = load(addr1,  0)
21300b57cec5SDimitry Andric   // addr2 = &a + 6144;   load2 = load(addr2,  0)
21310b57cec5SDimitry Andric   // addr3 = &a + 8192;   load3 = load(addr3,  0)
21320b57cec5SDimitry Andric   // addr4 = &a + 10240;  load4 = load(addr4,  0)
21330b57cec5SDimitry Andric   // addr5 = &a + 12288;  load5 = load(addr5,  0)
21340b57cec5SDimitry Andric   //
21350b57cec5SDimitry Andric   // Starting from the first load, the optimization will try to find a new base
21360b57cec5SDimitry Andric   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
21370b57cec5SDimitry Andric   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
21380b57cec5SDimitry Andric   // as the new-base(anchor) because of the maximum distance which can
213981ad6265SDimitry Andric   // accommodate more intermediate bases presumably.
21400b57cec5SDimitry Andric   //
21410b57cec5SDimitry Andric   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
21420b57cec5SDimitry Andric   // (&a + 8192) for load1, load2, load4.
21430b57cec5SDimitry Andric   //   addr = &a + 8192
21440b57cec5SDimitry Andric   //   load1 = load(addr,   -4096)
21450b57cec5SDimitry Andric   //   load2 = load(addr,   -2048)
21460b57cec5SDimitry Andric   //   load3 = load(addr,   0)
21470b57cec5SDimitry Andric   //   load4 = load(addr,   2048)
21480b57cec5SDimitry Andric   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
21490b57cec5SDimitry Andric   //
21500b57cec5SDimitry Andric   MachineInstr *AnchorInst = nullptr;
21510b57cec5SDimitry Andric   MemAddress AnchorAddr;
  // NOTE: numeric_limits<uint32_t>::min() is 0, so any legal non-zero
  // distance beats the initial value.
21520b57cec5SDimitry Andric   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  // All same-opcode instructions sharing MI's base registers; revisited in
  // Step3 to promote their offsets against the chosen anchor as well.
21530b57cec5SDimitry Andric   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
21540b57cec5SDimitry Andric
21550b57cec5SDimitry Andric   MachineBasicBlock *MBB = MI.getParent();
21560b57cec5SDimitry Andric   MachineBasicBlock::iterator E = MBB->end();
21570b57cec5SDimitry Andric   MachineBasicBlock::iterator MBBI = MI.getIterator();
21580b57cec5SDimitry Andric   ++MBBI;
21590b57cec5SDimitry Andric   const SITargetLowering *TLI =
21600b57cec5SDimitry Andric     static_cast<const SITargetLowering *>(STM->getTargetLowering());
21610b57cec5SDimitry Andric
21620b57cec5SDimitry Andric   for ( ; MBBI != E; ++MBBI) {
21630b57cec5SDimitry Andric     MachineInstr &MINext = *MBBI;
21640b57cec5SDimitry Andric     // TODO: Support finding an anchor(with same base) from store addresses or
21650b57cec5SDimitry Andric     // any other load addresses where the opcodes are different.
21660b57cec5SDimitry Andric     if (MINext.getOpcode() != MI.getOpcode() ||
21670b57cec5SDimitry Andric         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
21680b57cec5SDimitry Andric       continue;
21690b57cec5SDimitry Andric
21700b57cec5SDimitry Andric     const MachineOperand &BaseNext =
21710b57cec5SDimitry Andric       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
21720b57cec5SDimitry Andric     MemAddress MAddrNext;
217306c3fb27SDimitry Andric     if (!Visited.contains(&MINext)) {
21740b57cec5SDimitry Andric       processBaseWithConstOffset(BaseNext, MAddrNext);
21750b57cec5SDimitry Andric       Visited[&MINext] = MAddrNext;
21760b57cec5SDimitry Andric     } else
21770b57cec5SDimitry Andric       MAddrNext = Visited[&MINext];
21780b57cec5SDimitry Andric
    // Both registers and both subregister indices must match for the two
    // addresses to share a base.
21790b57cec5SDimitry Andric     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
21800b57cec5SDimitry Andric         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
21810b57cec5SDimitry Andric         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
21820b57cec5SDimitry Andric         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
21830b57cec5SDimitry Andric       continue;
21840b57cec5SDimitry Andric
2185*0fca6ea1SDimitry Andric     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
21860b57cec5SDimitry Andric
    // Candidate anchor: keep the one with the largest distance from MI's
    // offset that is still encodable as an immediate in this address space.
21870b57cec5SDimitry Andric     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
21880b57cec5SDimitry Andric     TargetLoweringBase::AddrMode AM;
21890b57cec5SDimitry Andric     AM.HasBaseReg = true;
21900b57cec5SDimitry Andric     AM.BaseOffs = Dist;
2191*0fca6ea1SDimitry Andric     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
21920b57cec5SDimitry Andric         (uint32_t)std::abs(Dist) > MaxDist) {
21930b57cec5SDimitry Andric       MaxDist = std::abs(Dist);
21940b57cec5SDimitry Andric
21950b57cec5SDimitry Andric       AnchorAddr = MAddrNext;
21960b57cec5SDimitry Andric       AnchorInst = &MINext;
21970b57cec5SDimitry Andric     }
21980b57cec5SDimitry Andric   }
21990b57cec5SDimitry Andric
22000b57cec5SDimitry Andric   if (AnchorInst) {
22010b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
22020b57cec5SDimitry Andric                AnchorInst->dump());
22030b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
22040b57cec5SDimitry Andric                <<  AnchorAddr.Offset << "\n\n");
22050b57cec5SDimitry Andric
22060b57cec5SDimitry Andric     // Instead of moving up, just re-compute anchor-instruction's base address.
22075ffd83dbSDimitry Andric     Register Base = computeBase(MI, AnchorAddr);
22080b57cec5SDimitry Andric
22090b57cec5SDimitry Andric     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
22100b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
22110b57cec5SDimitry Andric
    // Re-base every other instruction that shared the base, provided its
    // offset relative to the anchor is also a legal immediate.
2212*0fca6ea1SDimitry Andric     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
22130b57cec5SDimitry Andric       TargetLoweringBase::AddrMode AM;
22140b57cec5SDimitry Andric       AM.HasBaseReg = true;
2215*0fca6ea1SDimitry Andric       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
22160b57cec5SDimitry Andric
2217*0fca6ea1SDimitry Andric       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218*0fca6ea1SDimitry Andric         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2219*0fca6ea1SDimitry Andric                    OtherMI->dump());
2220*0fca6ea1SDimitry Andric         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221*0fca6ea1SDimitry Andric         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
22220b57cec5SDimitry Andric       }
22230b57cec5SDimitry Andric     }
22240b57cec5SDimitry Andric     AnchorList.insert(AnchorInst);
22250b57cec5SDimitry Andric     return true;
22260b57cec5SDimitry Andric   }
22270b57cec5SDimitry Andric
22280b57cec5SDimitry Andric   return false;
22290b57cec5SDimitry Andric }
22300b57cec5SDimitry Andric
// Place CI into the list of candidates that share its instruction class,
// AGPR-ness, and base address; start a new singleton list when no existing
// list matches. Lists therefore group instructions that could legally be
// merged with one another.
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const22318bcb0991SDimitry Andric void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
22328bcb0991SDimitry Andric                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
22338bcb0991SDimitry Andric   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    // All entries in a list share these properties, so comparing against
    // the front element is sufficient.
2234480093f4SDimitry Andric     if (AddrList.front().InstClass == CI.InstClass &&
223504eeddc0SDimitry Andric         AddrList.front().IsAGPR == CI.IsAGPR &&
2236bdd1243dSDimitry Andric         AddrList.front().hasSameBaseAddress(CI)) {
22378bcb0991SDimitry Andric       AddrList.emplace_back(CI);
22388bcb0991SDimitry Andric       return;
22398bcb0991SDimitry Andric     }
22408bcb0991SDimitry Andric   }
22410b57cec5SDimitry Andric
22428bcb0991SDimitry Andric   // Base address not found, so add a new list.
22438bcb0991SDimitry Andric   MergeableInsts.emplace_back(1, CI);
22448bcb0991SDimitry Andric }
22458bcb0991SDimitry Andric
// Scan [Begin, End) collecting merge candidates into per-base-address lists,
// running constant-offset promotion on the way. Stops early at the first
// memory barrier so the caller can restart a fresh merge section after it.
//
// \returns {iterator just past the scanned section, whether promotion
//          modified any instruction}.
22465ffd83dbSDimitry Andric std::pair<MachineBasicBlock::iterator, bool>
collectMergeableInsts(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList,std::list<std::list<CombineInfo>> & MergeableInsts) const22475ffd83dbSDimitry Andric SILoadStoreOptimizer::collectMergeableInsts(
22485ffd83dbSDimitry Andric     MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
22495ffd83dbSDimitry Andric     MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
22508bcb0991SDimitry Andric     std::list<std::list<CombineInfo>> &MergeableInsts) const {
22518bcb0991SDimitry Andric   bool Modified = false;
22520b57cec5SDimitry Andric
22538bcb0991SDimitry Andric   // Sort potential mergeable instructions into lists.  One list per base address.
22545ffd83dbSDimitry Andric   unsigned Order = 0;
22555ffd83dbSDimitry Andric   MachineBasicBlock::iterator BlockI = Begin;
22565ffd83dbSDimitry Andric   for (; BlockI != End; ++BlockI) {
22575ffd83dbSDimitry Andric     MachineInstr &MI = *BlockI;
22585ffd83dbSDimitry Andric
22598bcb0991SDimitry Andric     // We run this before checking if an address is mergeable, because it can produce
22608bcb0991SDimitry Andric     // better code even if the instructions aren't mergeable.
22610b57cec5SDimitry Andric     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
22620b57cec5SDimitry Andric       Modified = true;
22630b57cec5SDimitry Andric
22641fd87a68SDimitry Andric     // Treat volatile accesses, ordered accesses and unmodeled side effects as
22651fd87a68SDimitry Andric     // barriers. We can look after this barrier for separate merges.
22661fd87a68SDimitry Andric     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
22671fd87a68SDimitry Andric       LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
22685ffd83dbSDimitry Andric
22695ffd83dbSDimitry Andric       // Search will resume after this instruction in a separate merge list.
22705ffd83dbSDimitry Andric       ++BlockI;
22715ffd83dbSDimitry Andric       break;
22725ffd83dbSDimitry Andric     }
22735ffd83dbSDimitry Andric
22748bcb0991SDimitry Andric     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
22758bcb0991SDimitry Andric     if (InstClass == UNKNOWN)
22768bcb0991SDimitry Andric       continue;
22778bcb0991SDimitry Andric
227804eeddc0SDimitry Andric     // Do not merge VMEM buffer instructions with "swizzled" bit set.
227904eeddc0SDimitry Andric     int Swizzled =
228004eeddc0SDimitry Andric         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
228104eeddc0SDimitry Andric     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
228204eeddc0SDimitry Andric       continue;
228304eeddc0SDimitry Andric
22848bcb0991SDimitry Andric     CombineInfo CI;
228504eeddc0SDimitry Andric     CI.setMI(MI, *this);
    // Order records original program position; used later to decide which
    // of a merged pair comes first.
22865ffd83dbSDimitry Andric     CI.Order = Order++;
22878bcb0991SDimitry Andric
22888bcb0991SDimitry Andric     if (!CI.hasMergeableAddress(*MRI))
22898bcb0991SDimitry Andric       continue;
22908bcb0991SDimitry Andric
229104eeddc0SDimitry Andric     if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
229204eeddc0SDimitry Andric       // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
229304eeddc0SDimitry Andric       //        operands. However we are reporting that ds_write2 shall have
229404eeddc0SDimitry Andric       //        only VGPR data so that machine copy propagation does not
229504eeddc0SDimitry Andric       //        create an illegal instruction with a VGPR and AGPR sources.
229604eeddc0SDimitry Andric       //        Consequenctially if we create such instruction the verifier
229704eeddc0SDimitry Andric       //        will complain.
229804eeddc0SDimitry Andric       continue;
229904eeddc0SDimitry Andric     }
230004eeddc0SDimitry Andric
23015ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
23025ffd83dbSDimitry Andric
23038bcb0991SDimitry Andric     addInstToMergeableList(CI, MergeableInsts);
23048bcb0991SDimitry Andric   }
23055ffd83dbSDimitry Andric
23065ffd83dbSDimitry Andric   // At this point we have lists of Mergeable instructions.
23075ffd83dbSDimitry Andric   //
23085ffd83dbSDimitry Andric   // Part 2: Sort lists by offset and then for each CombineInfo object in the
23095ffd83dbSDimitry Andric   // list try to find an instruction that can be merged with I.  If an instruction
23105ffd83dbSDimitry Andric   // is found, it is stored in the Paired field.  If no instructions are found, then
23115ffd83dbSDimitry Andric   // the CombineInfo object is deleted from the list.
23125ffd83dbSDimitry Andric
23135ffd83dbSDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
23145ffd83dbSDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
23155ffd83dbSDimitry Andric
23165ffd83dbSDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23175ffd83dbSDimitry Andric     if (MergeList.size() <= 1) {
23185ffd83dbSDimitry Andric       // This means we have found only one instruction with a given address
23195ffd83dbSDimitry Andric       // that can be merged, and we need at least 2 instructions to do a merge,
23205ffd83dbSDimitry Andric       // so this list can be discarded.
23215ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23225ffd83dbSDimitry Andric       continue;
23235ffd83dbSDimitry Andric     }
23245ffd83dbSDimitry Andric
23255ffd83dbSDimitry Andric     // Sort the lists by offsets, this way mergeable instructions will be
23265ffd83dbSDimitry Andric     // adjacent to each other in the list, which will make it easier to find
23275ffd83dbSDimitry Andric     // matches.
23285ffd83dbSDimitry Andric     MergeList.sort(
2329349cc55cSDimitry Andric         [] (const CombineInfo &A, const CombineInfo &B) {
23305ffd83dbSDimitry Andric           return A.Offset < B.Offset;
23315ffd83dbSDimitry Andric         });
23325ffd83dbSDimitry Andric     ++I;
23335ffd83dbSDimitry Andric   }
23345ffd83dbSDimitry Andric
2335*0fca6ea1SDimitry Andric   return {BlockI, Modified};
23368bcb0991SDimitry Andric }
23378bcb0991SDimitry Andric
23388bcb0991SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from
23398bcb0991SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of
23408bcb0991SDimitry Andric // clustering nearby loads, and assume these are all adjacent.
// Run one merge pass over every per-base-address list, pruning lists that
// yield no change or no further opportunity; sets OptimizeAgain when any
// list should be reprocessed by the caller's fixpoint loop.
// \returns true if any instructions were merged.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)23418bcb0991SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock(
23428bcb0991SDimitry Andric                        std::list<std::list<CombineInfo> > &MergeableInsts) {
23438bcb0991SDimitry Andric   bool Modified = false;
23448bcb0991SDimitry Andric
23455ffd83dbSDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
23465ffd83dbSDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
23475ffd83dbSDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23488bcb0991SDimitry Andric
23498bcb0991SDimitry Andric     bool OptimizeListAgain = false;
23508bcb0991SDimitry Andric     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
23515ffd83dbSDimitry Andric       // We weren't able to make any changes, so delete the list so we don't
23528bcb0991SDimitry Andric       // process the same instructions the next time we try to optimize this
23538bcb0991SDimitry Andric       // block.
23545ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23550b57cec5SDimitry Andric       continue;
23560b57cec5SDimitry Andric     }
23570b57cec5SDimitry Andric
23585ffd83dbSDimitry Andric     Modified = true;
23595ffd83dbSDimitry Andric
23608bcb0991SDimitry Andric     // We made changes, but also determined that there were no more optimization
23618bcb0991SDimitry Andric     // opportunities, so we don't need to reprocess the list
23625ffd83dbSDimitry Andric     if (!OptimizeListAgain) {
23635ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23645ffd83dbSDimitry Andric       continue;
23655ffd83dbSDimitry Andric     }
    // Keep this list and ask the caller to run another round over it.
23665ffd83dbSDimitry Andric     OptimizeAgain = true;
23678bcb0991SDimitry Andric   }
23688bcb0991SDimitry Andric   return Modified;
23698bcb0991SDimitry Andric }
23708bcb0991SDimitry Andric
// Walk an offset-sorted list of same-base candidates, merging adjacent pairs
// into wider instructions. The merged CombineInfo replaces the earlier entry
// (re-pointed at the new instruction) and the later entry is erased, so a
// result can participate in further merges on a later round.
//
// \param MergeList         offset-sorted candidates sharing one base address.
// \param OptimizeListAgain set when a merged result is still narrower than
//                          the widest available encoding and may merge again.
// \returns true if at least one pair was merged.
23718bcb0991SDimitry Andric bool
optimizeInstsWithSameBaseAddr(std::list<CombineInfo> & MergeList,bool & OptimizeListAgain)23728bcb0991SDimitry Andric SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
23738bcb0991SDimitry Andric                                          std::list<CombineInfo> &MergeList,
23748bcb0991SDimitry Andric                                          bool &OptimizeListAgain) {
23755ffd83dbSDimitry Andric   if (MergeList.empty())
23765ffd83dbSDimitry Andric     return false;
23775ffd83dbSDimitry Andric
23788bcb0991SDimitry Andric   bool Modified = false;
2379480093f4SDimitry Andric
23805ffd83dbSDimitry Andric   for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
23815ffd83dbSDimitry Andric        Next = std::next(I)) {
23825ffd83dbSDimitry Andric
23835ffd83dbSDimitry Andric     auto First = I;
23845ffd83dbSDimitry Andric     auto Second = Next;
23855ffd83dbSDimitry Andric
    // The list is sorted by offset, not program order; ensure CI is the one
    // that appears earlier in the block.
23865ffd83dbSDimitry Andric     if ((*First).Order > (*Second).Order)
23875ffd83dbSDimitry Andric       std::swap(First, Second);
23885ffd83dbSDimitry Andric     CombineInfo &CI = *First;
23895ffd83dbSDimitry Andric     CombineInfo &Paired = *Second;
23905ffd83dbSDimitry Andric
    // Where (if non-null) designates the entry whose position the merged
    // instruction is inserted at.
239181ad6265SDimitry Andric     CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
239281ad6265SDimitry Andric     if (!Where) {
23935ffd83dbSDimitry Andric       ++I;
2394480093f4SDimitry Andric       continue;
23955ffd83dbSDimitry Andric     }
2396480093f4SDimitry Andric
2397480093f4SDimitry Andric     Modified = true;
23985ffd83dbSDimitry Andric
23995ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);
24000b57cec5SDimitry Andric
    // Dispatch on instruction class; each mergeXPair emits the combined
    // instruction and returns an iterator to it. The width thresholds below
    // (8 for SMEM, 4 otherwise) gate whether the result could widen further.
240181ad6265SDimitry Andric     MachineBasicBlock::iterator NewMI;
24020b57cec5SDimitry Andric     switch (CI.InstClass) {
24030b57cec5SDimitry Andric     default:
2404480093f4SDimitry Andric       llvm_unreachable("unknown InstClass");
24050b57cec5SDimitry Andric       break;
240681ad6265SDimitry Andric     case DS_READ:
240781ad6265SDimitry Andric       NewMI = mergeRead2Pair(CI, Paired, Where->I);
240881ad6265SDimitry Andric       break;
240981ad6265SDimitry Andric     case DS_WRITE:
241081ad6265SDimitry Andric       NewMI = mergeWrite2Pair(CI, Paired, Where->I);
241181ad6265SDimitry Andric       break;
241281ad6265SDimitry Andric     case S_BUFFER_LOAD_IMM:
2413bdd1243dSDimitry Andric     case S_BUFFER_LOAD_SGPR_IMM:
2414bdd1243dSDimitry Andric     case S_LOAD_IMM:
2415bdd1243dSDimitry Andric       NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
241681ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 8;
241781ad6265SDimitry Andric       break;
241881ad6265SDimitry Andric     case BUFFER_LOAD:
241981ad6265SDimitry Andric       NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
242081ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242181ad6265SDimitry Andric       break;
242281ad6265SDimitry Andric     case BUFFER_STORE:
242381ad6265SDimitry Andric       NewMI = mergeBufferStorePair(CI, Paired, Where->I);
242481ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242581ad6265SDimitry Andric       break;
242681ad6265SDimitry Andric     case MIMG:
242781ad6265SDimitry Andric       NewMI = mergeImagePair(CI, Paired, Where->I);
242881ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
242981ad6265SDimitry Andric       break;
243081ad6265SDimitry Andric     case TBUFFER_LOAD:
243181ad6265SDimitry Andric       NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
243281ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
243381ad6265SDimitry Andric       break;
243481ad6265SDimitry Andric     case TBUFFER_STORE:
243581ad6265SDimitry Andric       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
243681ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
243781ad6265SDimitry Andric       break;
243881ad6265SDimitry Andric     case FLAT_LOAD:
243981ad6265SDimitry Andric     case GLOBAL_LOAD:
244081ad6265SDimitry Andric     case GLOBAL_LOAD_SADDR:
244181ad6265SDimitry Andric       NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
244281ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
244381ad6265SDimitry Andric       break;
244481ad6265SDimitry Andric     case FLAT_STORE:
244581ad6265SDimitry Andric     case GLOBAL_STORE:
244681ad6265SDimitry Andric     case GLOBAL_STORE_SADDR:
244781ad6265SDimitry Andric       NewMI = mergeFlatStorePair(CI, Paired, Where->I);
244881ad6265SDimitry Andric       OptimizeListAgain |= CI.Width + Paired.Width < 4;
24498bcb0991SDimitry Andric       break;
2450480093f4SDimitry Andric     }
    // Repoint the surviving entry at the merged instruction and drop the
    // other; if the survivor was the Second slot, advance past it first so
    // the loop iterator stays valid after the erase.
245104eeddc0SDimitry Andric     CI.setMI(NewMI, *this);
245281ad6265SDimitry Andric     CI.Order = Where->Order;
24535ffd83dbSDimitry Andric     if (I == Second)
24545ffd83dbSDimitry Andric       I = Next;
2455480093f4SDimitry Andric
24565ffd83dbSDimitry Andric     MergeList.erase(Second);
24570b57cec5SDimitry Andric   }
24580b57cec5SDimitry Andric
24590b57cec5SDimitry Andric   return Modified;
24600b57cec5SDimitry Andric }
24610b57cec5SDimitry Andric
// Pass entry point: for each basic block, repeatedly (a) collect mergeable
// load/store candidates up to the next memory barrier and (b) run the merge
// loop to a fixpoint, then continue after the barrier.
// \returns true if the function was modified.
runOnMachineFunction(MachineFunction & MF)24620b57cec5SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
24630b57cec5SDimitry Andric   if (skipFunction(MF.getFunction()))
24640b57cec5SDimitry Andric     return false;
24650b57cec5SDimitry Andric
24660b57cec5SDimitry Andric   STM = &MF.getSubtarget<GCNSubtarget>();
24670b57cec5SDimitry Andric   if (!STM->loadStoreOptEnabled())
24680b57cec5SDimitry Andric     return false;
24690b57cec5SDimitry Andric
24700b57cec5SDimitry Andric   TII = STM->getInstrInfo();
24710b57cec5SDimitry Andric   TRI = &TII->getRegisterInfo();
24720b57cec5SDimitry Andric
24730b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
24740b57cec5SDimitry Andric   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
24750b57cec5SDimitry Andric
24760b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
24770b57cec5SDimitry Andric
24780b57cec5SDimitry Andric   bool Modified = false;
24790b57cec5SDimitry Andric
24805ffd83dbSDimitry Andric   // Contains the list of instructions for which constant offsets are being
24815ffd83dbSDimitry Andric   // promoted to the IMM. This is tracked for an entire block at time.
24825ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> AnchorList;
24835ffd83dbSDimitry Andric   MemInfoMap Visited;
24848bcb0991SDimitry Andric
24850b57cec5SDimitry Andric   for (MachineBasicBlock &MBB : MF) {
    // Each iteration of the inner loop handles one barrier-free section;
    // SectionEnd is where collectMergeableInsts stopped scanning.
24865ffd83dbSDimitry Andric     MachineBasicBlock::iterator SectionEnd;
24875ffd83dbSDimitry Andric     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
24885ffd83dbSDimitry Andric          I = SectionEnd) {
24895ffd83dbSDimitry Andric       bool CollectModified;
24908bcb0991SDimitry Andric       std::list<std::list<CombineInfo>> MergeableInsts;
24915ffd83dbSDimitry Andric
24925ffd83dbSDimitry Andric       // First pass: Collect list of all instructions we know how to merge in a
24935ffd83dbSDimitry Andric       // subset of the block.
24945ffd83dbSDimitry Andric       std::tie(SectionEnd, CollectModified) =
24955ffd83dbSDimitry Andric           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
24965ffd83dbSDimitry Andric
24975ffd83dbSDimitry Andric       Modified |= CollectModified;
24985ffd83dbSDimitry Andric
      // Iterate until optimizeBlock finds no list worth revisiting.
24990b57cec5SDimitry Andric       do {
25000b57cec5SDimitry Andric         OptimizeAgain = false;
25018bcb0991SDimitry Andric         Modified |= optimizeBlock(MergeableInsts);
25020b57cec5SDimitry Andric       } while (OptimizeAgain);
25030b57cec5SDimitry Andric     }
25040b57cec5SDimitry Andric
    // Promotion caches are per-block; reset before the next block.
25055ffd83dbSDimitry Andric     Visited.clear();
25065ffd83dbSDimitry Andric     AnchorList.clear();
25075ffd83dbSDimitry Andric   }
25085ffd83dbSDimitry Andric
25090b57cec5SDimitry Andric   return Modified;
25100b57cec5SDimitry Andric }
2511