xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass combines split register tuple initialization into a single pseudo:
11 ///
12 ///   undef %0.sub1:sreg_64 = S_MOV_B32 1
13 ///   %0.sub0:sreg_64 = S_MOV_B32 2
14 /// =>
15 ///   %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
16 ///
17 /// This is to allow rematerialization of a value instead of spilling. It is
18 /// supposed to be done after register coalescer to allow it to do its job and
19 /// before actual register allocation to allow rematerialization.
20 ///
21 /// Right now the pass only handles 64 bit SGPRs with immediate initializers,
22 /// although the same shall be possible with other register classes and
23 /// instructions if necessary.
24 ///
25 /// This pass also adds register allocation hints to COPY.
26 /// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27 /// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
28 /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
29 /// the VGPR_32, the COPY can be completely eliminated.
30 ///
31 //===----------------------------------------------------------------------===//
32 
33 #include "GCNPreRAOptimizations.h"
34 #include "AMDGPU.h"
35 #include "GCNSubtarget.h"
36 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
37 #include "SIRegisterInfo.h"
38 #include "llvm/CodeGen/LiveIntervals.h"
39 #include "llvm/CodeGen/MachineFunctionPass.h"
40 #include "llvm/InitializePasses.h"
41 
42 using namespace llvm;
43 
44 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
45 
46 namespace {
47 
// Implementation shared by the legacy-PM and new-PM entry points.
class GCNPreRAOptimizationsImpl {
private:
  const SIInstrInfo *TII;      // Set by run() from the subtarget.
  const SIRegisterInfo *TRI;   // Set by run() from the subtarget.
  MachineRegisterInfo *MRI;    // Set by run() from the function.
  LiveIntervals *LIS;          // Provided by the pass wrapper.

  // Try to optimize the definitions of virtual register \p Reg: merge a
  // sub0/sub1 pair of S_MOV_B32 immediates into S_MOV_B64_IMM_PSEUDO, or
  // forward the VGPR source of an accvgpr_write through an AGPR-to-AGPR
  // COPY. Returns true if any change was made.
  bool processReg(Register Reg);

public:
  GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};
61 
// Legacy pass-manager wrapper around GCNPreRAOptimizationsImpl.
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
    initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Needs live intervals (updated in place when instructions change);
    // all analyses are preserved.
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
82 } // End anonymous namespace.
83 
84 INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
85                       "AMDGPU Pre-RA optimizations", false, false)
86 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
87 INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE,
88                     "Pre-RA optimizations", false, false)
89 
90 char GCNPreRAOptimizationsLegacy::ID = 0;
91 
92 char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizationsLegacy::ID;
93 
94 FunctionPass *llvm::createGCNPreRAOptimizationsLegacyPass() {
95   return new GCNPreRAOptimizationsLegacy();
96 }
97 
// Examine every definition of \p Reg and apply one of two rewrites:
//  * AGPR destination: forward the virtual-register source of a defining
//    accvgpr_write through an AGPR-to-AGPR COPY, avoiding a temporary VGPR.
//  * 64-bit SGPR destination: merge an S_MOV_B32 pair writing sub0/sub1
//    with immediates into a single S_MOV_B64_IMM_PSEUDO.
// Returns true if the function was changed. Bails out (returns false)
// as soon as an unsupported defining instruction is seen.
bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
  MachineInstr *Def0 = nullptr; // S_MOV_B32 defining sub0.
  MachineInstr *Def1 = nullptr; // S_MOV_B32 defining sub1.
  uint64_t Init = 0;            // Combined 64-bit immediate.
  bool Changed = false;
  SmallSet<Register, 32> ModifiedRegs;
  bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg));

  for (MachineInstr &I : MRI->def_instructions(Reg)) {
    switch (I.getOpcode()) {
    default:
      return false;
    case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
      break;
    case AMDGPU::COPY: {
      // Some subtargets cannot do an AGPR to AGPR copy directly, and need an
      // intermediate temporary VGPR register. Try to find the defining
      // accvgpr_write to avoid temporary registers.

      if (!IsAGPRDst)
        return false;

      Register SrcReg = I.getOperand(1).getReg();

      if (!SrcReg.isVirtual())
        break;

      // Check if source of copy is from another AGPR.
      bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg));
      if (!IsAGPRSrc)
        break;

      // def_instructions() does not look at subregs so it may give us a
      // different instruction that defines the same vreg but different subreg
      // so we have to manually check subreg.
      Register SrcSubReg = I.getOperand(1).getSubReg();
      for (auto &Def : MRI->def_instructions(SrcReg)) {
        if (SrcSubReg != Def.getOperand(0).getSubReg())
          continue;

        if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
          MachineOperand DefSrcMO = Def.getOperand(1);

          // Immediates are not an issue and can be propagated in
          // postrapseudos pass. Only handle cases where defining
          // accvgpr_write source is a vreg.
          if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
            // Propagate source reg of accvgpr write to this copy instruction
            I.getOperand(1).setReg(DefSrcMO.getReg());
            I.getOperand(1).setSubReg(DefSrcMO.getSubReg());

            // Reg uses were changed, collect unique set of registers to update
            // live intervals at the end.
            ModifiedRegs.insert(DefSrcMO.getReg());
            ModifiedRegs.insert(SrcReg);

            Changed = true;
          }

          // Found the defining accvgpr_write, stop looking any further.
          break;
        }
      }
      break;
    }
    case AMDGPU::S_MOV_B32:
      // Only a plain subregister def of Reg with an immediate source (no
      // implicit operands) is foldable.
      if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() ||
          I.getNumOperands() != 2)
        return false;

      switch (I.getOperand(0).getSubReg()) {
      default:
        return false;
      case AMDGPU::sub0:
        // Exactly one def per half; a second def of the same half bails out.
        if (Def0)
          return false;
        Def0 = &I;
        Init |= Lo_32(I.getOperand(1).getImm());
        break;
      case AMDGPU::sub1:
        if (Def1)
          return false;
        Def1 = &I;
        Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
        break;
      }
      break;
    }
  }

  // For AGPR reg, check if live intervals need to be updated.
  if (IsAGPRDst) {
    if (Changed) {
      for (Register RegToUpdate : ModifiedRegs) {
        LIS->removeInterval(RegToUpdate);
        LIS->createAndComputeVirtRegInterval(RegToUpdate);
      }
    }

    return Changed;
  }

  // For SGPR reg, check if we can combine instructions. Both halves must be
  // defined, and in the same basic block.
  if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
    return Changed;

  LLVM_DEBUG(dbgs() << "Combining:\n  " << *Def0 << "  " << *Def1
                    << "    =>\n");

  // Ensure Def0 is the earlier instruction so the pseudo is inserted at the
  // first def point.
  if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
                                LIS->getInstructionIndex(*Def0)))
    std::swap(Def0, Def1);

  // Unmap the old instructions before mutating, then rebuild Reg's interval.
  LIS->RemoveMachineInstrFromMaps(*Def0);
  LIS->RemoveMachineInstrFromMaps(*Def1);
  auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
                      TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
                  .addImm(Init);

  Def0->eraseFromParent();
  Def1->eraseFromParent();
  LIS->InsertMachineInstrInMaps(*NewI);
  LIS->removeInterval(Reg);
  LIS->createAndComputeVirtRegInterval(Reg);

  LLVM_DEBUG(dbgs() << "  " << *NewI);

  return true;
}
227 
228 bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
229   if (skipFunction(MF.getFunction()))
230     return false;
231   LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
232   return GCNPreRAOptimizationsImpl(LIS).run(MF);
233 }
234 
235 PreservedAnalyses
236 GCNPreRAOptimizationsPass::run(MachineFunction &MF,
237                                MachineFunctionAnalysisManager &MFAM) {
238   LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
239   GCNPreRAOptimizationsImpl(LIS).run(MF);
240   return PreservedAnalyses::all();
241 }
242 
// Walk all virtual registers, running processReg() on candidates, then (on
// True16 subtargets) add register-allocation hints for 16-bit COPYs.
bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();

  bool Changed = false;

  for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;
    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
    // Candidates are 64-bit SGPRs (immediate-pair merge) or, on subtargets
    // without direct AGPR-to-AGPR copies (pre-gfx90a), AGPRs (copy
    // forwarding).
    if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
        (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
      continue;

    Changed |= processReg(Reg);
  }

  if (!ST.useRealTrue16Insts())
    return Changed;

  // Add RA hints to improve True16 COPY elimination. Hints are consumed
  // later by SIRegisterInfo::getRegAllocationHints.
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;
      Register Dst = MI.getOperand(0).getReg();
      Register Src = MI.getOperand(1).getReg();
      // Virtual 16-bit reg copied from/to a physical 32-bit reg: hint the
      // lo16 half of the physical register so the COPY can disappear.
      if (Dst.isVirtual() &&
          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
          Src.isPhysical() &&
          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
      if (Src.isVirtual() &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
          Dst.isPhysical() &&
          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
      // Remaining cases need both sides virtual.
      if (!Dst.isVirtual() || !Src.isVirtual())
        continue;
      // Virtual 32 <- virtual 16: hint each at the other (with size tags
      // interpreted by getRegAllocationHints).
      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
      }
      // Virtual 16 <- virtual 32: hint only the 16-bit side.
      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
          MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
        MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
    }
  }

  return Changed;
}
298