xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp (revision 2c2ec6bbc9cc7762a250ffe903bda6c2e44d25ff)
1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// RenameIndependentSubregs pass leaves large partially used super registers,
10 /// for example:
11 ///   undef %0.sub4:VReg_1024 = ...
12 ///   %0.sub5:VReg_1024 = ...
13 ///   %0.sub6:VReg_1024 = ...
14 ///   %0.sub7:VReg_1024 = ...
15 ///   use %0.sub4_sub5_sub6_sub7
16 ///   use %0.sub6_sub7
17 ///
18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
19 /// rewrites such partially used super registers with registers of minimal size:
20 ///   undef %0.sub0:VReg_128 = ...
21 ///   %0.sub1:VReg_128 = ...
22 ///   %0.sub2:VReg_128 = ...
23 ///   %0.sub3:VReg_128 = ...
24 ///   use %0.sub0_sub1_sub2_sub3
25 ///   use %0.sub2_sub3
26 ///
/// This allows us to avoid tracking subreg lanemasks during register pressure
/// calculation and creates more possibilities for code that is unaware of
/// lanemasks.
29 //===----------------------------------------------------------------------===//
30 
31 #include "GCNRewritePartialRegUses.h"
32 #include "AMDGPU.h"
33 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
34 #include "SIRegisterInfo.h"
35 #include "llvm/CodeGen/LiveInterval.h"
36 #include "llvm/CodeGen/LiveIntervals.h"
37 #include "llvm/CodeGen/MachineFunctionPass.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetInstrInfo.h"
40 #include "llvm/Pass.h"
41 
42 using namespace llvm;
43 
44 #define DEBUG_TYPE "rewrite-partial-reg-uses"
45 
46 namespace {
47 
class GCNRewritePartialRegUsesImpl {
  // Per-function state, initialized in run(); valid only while a function is
  // being processed.
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  // May be null: live intervals are updated only when the analysis is
  // available (see updateLiveIntervals and rewriteReg).
  LiveIntervals *LIS;

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Map OldSubReg -> NewSubReg. Used as in/out container.
  using SubRegMap = SmallDenseMap<unsigned, unsigned>;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, NewSubReg] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
  ///      all other subregs in pairs. Basically such subreg becomes a whole
  ///      register.
  ///   2. Resulting register class contains registers of minimal size.
  ///
  /// SubRegs is map of OldSubReg -> NewSubReg and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  /// mutable because caches are filled lazily from const methods.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  ///   AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;

public:
  // LS may be null; in that case live intervals are neither read nor updated.
  GCNRewritePartialRegUsesImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};
129 
class GCNRewritePartialRegUsesLegacy : public MachineFunctionPass {
public:
  static char ID; // Pass identification.
  GCNRewritePartialRegUsesLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    // The rewrite keeps live intervals and slot indexes up to date itself
    // (see updateLiveIntervals), so both analyses can be preserved.
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
148 
149 } // end anonymous namespace
150 
151 // TODO: move this to the tablegen and use binary search by Offset.
152 unsigned GCNRewritePartialRegUsesImpl::getSubReg(unsigned Offset,
153                                                  unsigned Size) const {
154   const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
155   if (Inserted) {
156     for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
157       if (TRI->getSubRegIdxOffset(Idx) == Offset &&
158           TRI->getSubRegIdxSize(Idx) == Size) {
159         I->second = Idx;
160         break;
161       }
162     }
163   }
164   return I->second;
165 }
166 
167 unsigned GCNRewritePartialRegUsesImpl::shiftSubReg(unsigned SubReg,
168                                                    unsigned RShift) const {
169   unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
170   return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
171 }
172 
173 const uint32_t *GCNRewritePartialRegUsesImpl::getSuperRegClassMask(
174     const TargetRegisterClass *RC, unsigned SubRegIdx) const {
175   const auto [I, Inserted] =
176       SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
177   if (Inserted) {
178     for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
179       if (RCI.getSubReg() == SubRegIdx) {
180         I->second = RCI.getMask();
181         break;
182       }
183     }
184   }
185   return I->second;
186 }
187 
188 const BitVector &
189 GCNRewritePartialRegUsesImpl::getAllocatableAndAlignedRegClassMask(
190     unsigned AlignNumBits) const {
191   const auto [I, Inserted] =
192       AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
193   if (Inserted) {
194     BitVector &BV = I->second;
195     BV.resize(TRI->getNumRegClasses());
196     for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
197       auto *RC = TRI->getRegClass(ClassID);
198       if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
199         BV.set(ClassID);
200     }
201   }
202   return I->second;
203 }
204 
205 const TargetRegisterClass *
206 GCNRewritePartialRegUsesImpl::getRegClassWithShiftedSubregs(
207     const TargetRegisterClass *RC, unsigned RShift, unsigned CoverSubregIdx,
208     SubRegMap &SubRegs) const {
209 
210   unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
211   LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
212                     << '\n');
213 
214   BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
215   for (auto &[OldSubReg, NewSubReg] : SubRegs) {
216     LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':');
217 
218     auto *SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);
219     if (!SubRegRC) {
220       LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
221       return nullptr;
222     }
223     LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC)
224                       << (SubRegRC->isAllocatable() ? "" : " not alloc")
225                       << " -> ");
226 
227     if (OldSubReg == CoverSubregIdx) {
228       // Covering subreg will become a full register, RC should be allocatable.
229       assert(SubRegRC->isAllocatable());
230       NewSubReg = AMDGPU::NoSubRegister;
231       LLVM_DEBUG(dbgs() << "whole reg");
232     } else {
233       NewSubReg = shiftSubReg(OldSubReg, RShift);
234       if (!NewSubReg) {
235         LLVM_DEBUG(dbgs() << "none\n");
236         return nullptr;
237       }
238       LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
239     }
240 
241     const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
242                                      : SubRegRC->getSubClassMask();
243     if (!Mask)
244       llvm_unreachable("no register class mask?");
245 
246     ClassMask.clearBitsNotInMask(Mask);
247     // Don't try to early exit because checking if ClassMask has set bits isn't
248     // that cheap and we expect it to pass in most cases.
249     LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
250   }
251 
252   // ClassMask is the set of all register classes such that each class is
253   // allocatable, aligned, has all shifted subregs and each subreg has required
254   // register class (see SubRegRC above). Now select first (that is largest)
255   // register class with registers of minimal size.
256   const TargetRegisterClass *MinRC = nullptr;
257   unsigned MinNumBits = std::numeric_limits<unsigned>::max();
258   for (unsigned ClassID : ClassMask.set_bits()) {
259     auto *RC = TRI->getRegClass(ClassID);
260     unsigned NumBits = TRI->getRegSizeInBits(*RC);
261     if (NumBits < MinNumBits) {
262       MinNumBits = NumBits;
263       MinRC = RC;
264     }
265   }
266 #ifndef NDEBUG
267   if (MinRC) {
268     assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
269     for (auto [OldSubReg, NewSubReg] : SubRegs)
270       // Check that all registers in MinRC support NewSubReg subregister.
271       assert(MinRC == TRI->getSubClassWithSubReg(MinRC, NewSubReg));
272   }
273 #endif
274   // There might be zero RShift - in this case we just trying to find smaller
275   // register.
276   return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
277 }
278 
279 const TargetRegisterClass *
280 GCNRewritePartialRegUsesImpl::getMinSizeReg(const TargetRegisterClass *RC,
281                                             SubRegMap &SubRegs) const {
282   unsigned CoverSubreg = AMDGPU::NoSubRegister;
283   unsigned Offset = std::numeric_limits<unsigned>::max();
284   unsigned End = 0;
285   for (auto [SubReg, SRI] : SubRegs) {
286     unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
287     unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
288     if (SubRegOffset < Offset) {
289       Offset = SubRegOffset;
290       CoverSubreg = AMDGPU::NoSubRegister;
291     }
292     if (SubRegEnd > End) {
293       End = SubRegEnd;
294       CoverSubreg = AMDGPU::NoSubRegister;
295     }
296     if (SubRegOffset == Offset && SubRegEnd == End)
297       CoverSubreg = SubReg;
298   }
299   // If covering subreg is found shift everything so the covering subreg would
300   // be in the rightmost position.
301   if (CoverSubreg != AMDGPU::NoSubRegister)
302     return getRegClassWithShiftedSubregs(RC, Offset, CoverSubreg, SubRegs);
303 
304   // Otherwise find subreg with maximum required alignment and shift it and all
305   // other subregs to the rightmost possible position with respect to the
306   // alignment.
307   unsigned MaxAlign = 0;
308   for (auto [SubReg, SRI] : SubRegs)
309     MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));
310 
311   unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
312   for (auto [SubReg, SRI] : SubRegs) {
313     if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
314       continue;
315     FirstMaxAlignedSubRegOffset =
316         std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
317     if (FirstMaxAlignedSubRegOffset == Offset)
318       break;
319   }
320 
321   unsigned NewOffsetOfMaxAlignedSubReg =
322       alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);
323 
324   if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
325     llvm_unreachable("misaligned subreg");
326 
327   unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
328   return getRegClassWithShiftedSubregs(RC, RShift, 0, SubRegs);
329 }
330 
// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
void GCNRewritePartialRegUsesImpl::updateLiveIntervals(
    Register OldReg, Register NewReg, SubRegMap &SubRegs) const {
  // Nothing to transfer if the old register has no interval.
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  for (auto &SR : OldLI.subranges()) {
    // Match this subrange to the used subreg whose lanemask equals it exactly.
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //    L000000000000C000 [160r,1392r:0) 0@160r
      //    L0000000000003000 [160r,1392r:0) 0@160r
      //    L0000000000000C00 [160r,1392r:0) 0@160r
      //    L0000000000000300 [160r,1392r:0) 0@160r
      //    L0000000000000003 [160r,1104r:0) 0@160r
      //    L000000000000000C [160r,1104r:0) 0@160r
      //    L0000000000000030 [160r,1104r:0) 0@160r
      //    L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //    sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //    sub0_sub1_sub2_sub3, L00000000000000FF
      //    sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      // Drop both intervals (NewLI was partially populated above) and
      // recompute NewReg's interval from scratch off the rewritten operands.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    // Each used subreg corresponds to at most one subrange; consuming the
    // entry prevents matching the same subreg twice.
    SubRegs.erase(I);
  }
  // If the old interval had no subranges at all, copy its main range directly.
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  assert(NewLI.verify(MRI));
  LIS->removeInterval(OldReg);
}
387 
388 bool GCNRewritePartialRegUsesImpl::rewriteReg(Register Reg) const {
389 
390   // Collect used subregs.
391   SubRegMap SubRegs;
392   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
393     if (MO.getSubReg() == AMDGPU::NoSubRegister)
394       return false; // Whole reg used.
395     SubRegs.try_emplace(MO.getSubReg());
396   }
397 
398   if (SubRegs.empty())
399     return false;
400 
401   auto *RC = MRI->getRegClass(Reg);
402   LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
403                     << ':' << TRI->getRegClassName(RC) << '\n');
404 
405   auto *NewRC = getMinSizeReg(RC, SubRegs);
406   if (!NewRC) {
407     LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
408     return false;
409   }
410 
411   Register NewReg = MRI->createVirtualRegister(NewRC);
412   LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
413                     << TRI->getRegClassName(RC) << " -> "
414                     << printReg(NewReg, TRI) << ':'
415                     << TRI->getRegClassName(NewRC) << '\n');
416 
417   for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
418     MO.setReg(NewReg);
419     // Debug info can refer to the whole reg, just leave it as it is for now.
420     // TODO: create some DI shift expression?
421     if (MO.isDebug() && MO.getSubReg() == 0)
422       continue;
423     unsigned NewSubReg = SubRegs[MO.getSubReg()];
424     MO.setSubReg(NewSubReg);
425     if (NewSubReg == AMDGPU::NoSubRegister && MO.isDef())
426       MO.setIsUndef(false);
427   }
428 
429   if (LIS)
430     updateLiveIntervals(Reg, NewReg, SubRegs);
431 
432   return true;
433 }
434 
435 bool GCNRewritePartialRegUsesImpl::run(MachineFunction &MF) {
436   MRI = &MF.getRegInfo();
437   TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
438   TII = MF.getSubtarget().getInstrInfo();
439   bool Changed = false;
440   for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
441     Changed |= rewriteReg(Register::index2VirtReg(I));
442   }
443   return Changed;
444 }
445 
446 bool GCNRewritePartialRegUsesLegacy::runOnMachineFunction(MachineFunction &MF) {
447   LiveIntervalsWrapperPass *LISWrapper =
448       getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
449   LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
450   GCNRewritePartialRegUsesImpl Impl(LIS);
451   return Impl.run(MF);
452 }
453 
454 PreservedAnalyses
455 GCNRewritePartialRegUsesPass::run(MachineFunction &MF,
456                                   MachineFunctionAnalysisManager &MFAM) {
457   auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
458   if (!GCNRewritePartialRegUsesImpl(LIS).run(MF))
459     return PreservedAnalyses::all();
460 
461   auto PA = getMachineFunctionPassPreservedAnalyses();
462   PA.preserveSet<CFGAnalyses>();
463   PA.preserve<LiveIntervalsAnalysis>();
464   PA.preserve<SlotIndexesAnalysis>();
465   return PA;
466 }
467 
468 char GCNRewritePartialRegUsesLegacy::ID;
469 
470 char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUsesLegacy::ID;
471 
472 INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
473                       "Rewrite Partial Register Uses", false, false)
474 INITIALIZE_PASS_END(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
475                     "Rewrite Partial Register Uses", false, false)
476