xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- AMDGPURegBankLegalize.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// Lower G_ instructions that can't be inst-selected with register bank
10 /// assignment from AMDGPURegBankSelect based on machine uniformity info.
11 /// Given types on all operands, some register bank assignments require lowering
12 /// while others do not.
13 /// Note: cases where all register bank assignments would require lowering are
14 /// lowered in legalizer.
15 /// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not.
16 /// Eliminate sgpr S1 by lowering to sgpr S32.
17 //
18 //===----------------------------------------------------------------------===//
19 
20 #include "AMDGPU.h"
21 #include "AMDGPUGlobalISelUtils.h"
22 #include "AMDGPURegBankLegalizeHelper.h"
23 #include "GCNSubtarget.h"
24 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
25 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
28 #include "llvm/CodeGen/TargetPassConfig.h"
29 #include "llvm/InitializePasses.h"
30 
31 #define DEBUG_TYPE "amdgpu-regbanklegalize"
32 
33 using namespace llvm;
34 using namespace AMDGPU;
35 
36 namespace {
37 
38 class AMDGPURegBankLegalize : public MachineFunctionPass {
39 public:
40   static char ID;
41 
42 public:
AMDGPURegBankLegalize()43   AMDGPURegBankLegalize() : MachineFunctionPass(ID) {}
44 
45   bool runOnMachineFunction(MachineFunction &MF) override;
46 
getPassName() const47   StringRef getPassName() const override {
48     return "AMDGPU Register Bank Legalize";
49   }
50 
getAnalysisUsage(AnalysisUsage & AU) const51   void getAnalysisUsage(AnalysisUsage &AU) const override {
52     AU.addRequired<TargetPassConfig>();
53     AU.addRequired<GISelCSEAnalysisWrapperPass>();
54     AU.addRequired<MachineUniformityAnalysisPass>();
55     MachineFunctionPass::getAnalysisUsage(AU);
56   }
57 
58   // If there were no phis and we do waterfall expansion machine verifier would
59   // fail.
getClearedProperties() const60   MachineFunctionProperties getClearedProperties() const override {
61     return MachineFunctionProperties().setNoPHIs();
62   }
63 };
64 
65 } // End anonymous namespace.
66 
67 INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE,
68                       "AMDGPU Register Bank Legalize", false, false)
69 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
70 INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
71 INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
72 INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE,
73                     "AMDGPU Register Bank Legalize", false, false)
74 
75 char AMDGPURegBankLegalize::ID = 0;
76 
77 char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID;
78 
createAMDGPURegBankLegalizePass()79 FunctionPass *llvm::createAMDGPURegBankLegalizePass() {
80   return new AMDGPURegBankLegalize();
81 }
82 
getRules(const GCNSubtarget & ST,MachineRegisterInfo & MRI)83 const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
84                                      MachineRegisterInfo &MRI) {
85   static std::mutex GlobalMutex;
86   static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
87       CacheForRuleSet;
88   std::lock_guard<std::mutex> Lock(GlobalMutex);
89   auto [It, Inserted] = CacheForRuleSet.try_emplace(ST.getGeneration());
90   if (Inserted)
91     It->second = std::make_unique<RegBankLegalizeRules>(ST, MRI);
92   else
93     It->second->refreshRefs(ST, MRI);
94   return *It->second;
95 }
96 
97 class AMDGPURegBankLegalizeCombiner {
98   MachineIRBuilder &B;
99   MachineRegisterInfo &MRI;
100   const SIRegisterInfo &TRI;
101   const RegisterBank *SgprRB;
102   const RegisterBank *VgprRB;
103   const RegisterBank *VccRB;
104 
105   static constexpr LLT S1 = LLT::scalar(1);
106   static constexpr LLT S16 = LLT::scalar(16);
107   static constexpr LLT S32 = LLT::scalar(32);
108   static constexpr LLT S64 = LLT::scalar(64);
109 
110 public:
AMDGPURegBankLegalizeCombiner(MachineIRBuilder & B,const SIRegisterInfo & TRI,const RegisterBankInfo & RBI)111   AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI,
112                                 const RegisterBankInfo &RBI)
113       : B(B), MRI(*B.getMRI()), TRI(TRI),
114         SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
115         VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
116         VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
117 
isLaneMask(Register Reg)118   bool isLaneMask(Register Reg) {
119     const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
120     if (RB && RB->getID() == AMDGPU::VCCRegBankID)
121       return true;
122 
123     const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
124     return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
125   }
126 
cleanUpAfterCombine(MachineInstr & MI,MachineInstr * Optional0)127   void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
128     MI.eraseFromParent();
129     if (Optional0 && isTriviallyDead(*Optional0, MRI))
130       Optional0->eraseFromParent();
131   }
132 
tryMatch(Register Src,unsigned Opcode)133   std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
134     MachineInstr *MatchMI = MRI.getVRegDef(Src);
135     if (MatchMI->getOpcode() != Opcode)
136       return {nullptr, Register()};
137     return {MatchMI, MatchMI->getOperand(1).getReg()};
138   }
139 
tryCombineCopy(MachineInstr & MI)140   void tryCombineCopy(MachineInstr &MI) {
141     Register Dst = MI.getOperand(0).getReg();
142     Register Src = MI.getOperand(1).getReg();
143     // Skip copies of physical registers.
144     if (!Dst.isVirtual() || !Src.isVirtual())
145       return;
146 
147     // This is a cross bank copy, sgpr S1 to lane mask.
148     //
149     // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
150     // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
151     // ->
152     // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
153     if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
154       auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
155       assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
156              "sgpr S1 must be result of G_TRUNC of sgpr S32");
157 
158       B.setInstr(MI);
159       // Ensure that truncated bits in BoolSrc are 0.
160       auto One = B.buildConstant({SgprRB, S32}, 1);
161       auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
162       B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
163       cleanUpAfterCombine(MI, Trunc);
164       return;
165     }
166 
167     // Src = G_AMDGPU_READANYLANE RALSrc
168     // Dst = COPY Src
169     // ->
170     // Dst = RALSrc
171     if (MRI.getRegBankOrNull(Dst) == VgprRB &&
172         MRI.getRegBankOrNull(Src) == SgprRB) {
173       auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
174       if (!RAL)
175         return;
176 
177       assert(MRI.getRegBank(RALSrc) == VgprRB);
178       MRI.replaceRegWith(Dst, RALSrc);
179       cleanUpAfterCombine(MI, RAL);
180       return;
181     }
182   }
183 
tryCombineS1AnyExt(MachineInstr & MI)184   void tryCombineS1AnyExt(MachineInstr &MI) {
185     // %Src:sgpr(S1) = G_TRUNC %TruncSrc
186     // %Dst = G_ANYEXT %Src:sgpr(S1)
187     // ->
188     // %Dst = G_... %TruncSrc
189     Register Dst = MI.getOperand(0).getReg();
190     Register Src = MI.getOperand(1).getReg();
191     if (MRI.getType(Src) != S1)
192       return;
193 
194     auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
195     if (!Trunc)
196       return;
197 
198     LLT DstTy = MRI.getType(Dst);
199     LLT TruncSrcTy = MRI.getType(TruncSrc);
200 
201     if (DstTy == TruncSrcTy) {
202       MRI.replaceRegWith(Dst, TruncSrc);
203       cleanUpAfterCombine(MI, Trunc);
204       return;
205     }
206 
207     B.setInstr(MI);
208 
209     if (DstTy == S32 && TruncSrcTy == S64) {
210       auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
211       MRI.replaceRegWith(Dst, Unmerge.getReg(0));
212       cleanUpAfterCombine(MI, Trunc);
213       return;
214     }
215 
216     if (DstTy == S64 && TruncSrcTy == S32) {
217       B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
218                             {TruncSrc, B.buildUndef({SgprRB, S32})});
219       cleanUpAfterCombine(MI, Trunc);
220       return;
221     }
222 
223     if (DstTy == S32 && TruncSrcTy == S16) {
224       B.buildAnyExt(Dst, TruncSrc);
225       cleanUpAfterCombine(MI, Trunc);
226       return;
227     }
228 
229     if (DstTy == S16 && TruncSrcTy == S32) {
230       B.buildTrunc(Dst, TruncSrc);
231       cleanUpAfterCombine(MI, Trunc);
232       return;
233     }
234 
235     llvm_unreachable("missing anyext + trunc combine");
236   }
237 };
238 
239 // Search through MRI for virtual registers with sgpr register bank and S1 LLT.
getAnySgprS1(const MachineRegisterInfo & MRI)240 [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
241   const LLT S1 = LLT::scalar(1);
242   for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
243     Register Reg = Register::index2VirtReg(i);
244     if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1)
245       continue;
246 
247     const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
248     if (RB && RB->getID() == AMDGPU::SGPRRegBankID) {
249       LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: ";
250                  MRI.getVRegDef(Reg)->dump(););
251       return Reg;
252     }
253   }
254 
255   return {};
256 }
257 
runOnMachineFunction(MachineFunction & MF)258 bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
259   if (MF.getProperties().hasFailedISel())
260     return false;
261 
262   // Setup the instruction builder with CSE.
263   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
264   GISelCSEAnalysisWrapper &Wrapper =
265       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
266   GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig());
267   GISelObserverWrapper Observer;
268   Observer.addObserver(&CSEInfo);
269 
270   CSEMIRBuilder B(MF);
271   B.setCSEInfo(&CSEInfo);
272   B.setChangeObserver(Observer);
273 
274   RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
275   RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
276 
277   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
278   MachineRegisterInfo &MRI = MF.getRegInfo();
279   const RegisterBankInfo &RBI = *ST.getRegBankInfo();
280   const MachineUniformityInfo &MUI =
281       getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
282 
283   // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
284   const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
285 
286   // Logic that does legalization based on IDs assigned to Opcode.
287   RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules);
288 
289   SmallVector<MachineInstr *> AllInst;
290 
291   for (MachineBasicBlock &MBB : MF) {
292     for (MachineInstr &MI : MBB) {
293       AllInst.push_back(&MI);
294     }
295   }
296 
297   for (MachineInstr *MI : AllInst) {
298     if (!MI->isPreISelOpcode())
299       continue;
300 
301     unsigned Opc = MI->getOpcode();
302     // Insert point for use operands needs some calculation.
303     if (Opc == AMDGPU::G_PHI) {
304       RBLHelper.applyMappingPHI(*MI);
305       continue;
306     }
307 
308     // Opcodes that support pretty much all combinations of reg banks and LLTs
309     // (except S1). There is no point in writing rules for them.
310     if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
311         Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
312       RBLHelper.applyMappingTrivial(*MI);
313       continue;
314     }
315 
316     // Opcodes that also support S1.
317     if (Opc == G_FREEZE &&
318         MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
319       RBLHelper.applyMappingTrivial(*MI);
320       continue;
321     }
322 
323     if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
324          Opc == AMDGPU::G_IMPLICIT_DEF)) {
325       Register Dst = MI->getOperand(0).getReg();
326       // Non S1 types are trivially accepted.
327       if (MRI.getType(Dst) != LLT::scalar(1)) {
328         assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID);
329         continue;
330       }
331 
332       // S1 rules are in RegBankLegalizeRules.
333     }
334 
335     RBLHelper.findRuleAndApplyMapping(*MI);
336   }
337 
338   // Sgpr S1 clean up combines:
339   // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine.
340   //   In RegBankLegalize 'S1 Dst' are legalized into S32 as
341   //   'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'.
342   //   S1 Truncs and Anyexts that come from legalizer, that can have non-S32
343   //   types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up.
344   // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine.
345   //   Divergent instruction uses sgpr S1 as input that should be lane mask(vcc)
346   //   Legalizing this use creates sgpr S1(S32) to vcc Copy.
347 
348   // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1:
349   // - Vcc to vcc Copy: nothing to do here, just a regular copy.
350   // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*).
351   //   Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used
352   //   instead. When only available instruction creates vcc result, use of
353   //   UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC.
354 
355   // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)':
356   // Copy from divergent to uniform register indicates an error in either:
357   // - Uniformity analysis: Uniform instruction has divergent input. If one of
358   //   the inputs is divergent, instruction should be divergent!
359   // - RegBankLegalizer not executing in waterfall loop (missing implementation)
360 
361   AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI);
362 
363   for (MachineBasicBlock &MBB : MF) {
364     for (MachineInstr &MI : make_early_inc_range(MBB)) {
365       if (MI.getOpcode() == AMDGPU::COPY) {
366         Combiner.tryCombineCopy(MI);
367         continue;
368       }
369       if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
370         Combiner.tryCombineS1AnyExt(MI);
371         continue;
372       }
373     }
374   }
375 
376   assert(!getAnySgprS1(MRI).isValid() &&
377          "Registers with sgpr reg bank and S1 LLT are not legal after "
378          "AMDGPURegBankLegalize. Should lower to sgpr S32");
379 
380   return true;
381 }
382