xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp (revision f126890ac5386406dadf7c4cfa9566cbb56537c5)
1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// RenameIndependentSubregs pass leaves large partially used super registers,
10 /// for example:
11 ///   undef %0.sub4:VReg_1024 = ...
12 ///   %0.sub5:VReg_1024 = ...
13 ///   %0.sub6:VReg_1024 = ...
14 ///   %0.sub7:VReg_1024 = ...
15 ///   use %0.sub4_sub5_sub6_sub7
16 ///   use %0.sub6_sub7
17 ///
18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
19 /// rewrites such partially used super registers with registers of minimal size:
20 ///   undef %0.sub0:VReg_128 = ...
21 ///   %0.sub1:VReg_128 = ...
22 ///   %0.sub2:VReg_128 = ...
23 ///   %0.sub3:VReg_128 = ...
24 ///   use %0.sub0_sub1_sub2_sub3
25 ///   use %0.sub2_sub3
26 ///
27 /// This allows to avoid subreg lanemasks tracking during register pressure
28 /// calculation and creates more possibilities for the code unaware of lanemasks
29 //===----------------------------------------------------------------------===//
30 
31 #include "AMDGPU.h"
32 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
33 #include "SIRegisterInfo.h"
34 #include "llvm/CodeGen/LiveInterval.h"
35 #include "llvm/CodeGen/LiveIntervals.h"
36 #include "llvm/CodeGen/MachineFunctionPass.h"
37 #include "llvm/CodeGen/MachineInstrBuilder.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetInstrInfo.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Pass.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "rewrite-partial-reg-uses"
46 
namespace {

/// Pass that replaces a partially used virtual super register with a new
/// virtual register of the minimal register class that still provides every
/// subregister the code actually uses (see file header for an example).
class GCNRewritePartialRegUses : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only virtual registers and subreg indexes are rewritten; the CFG is
    // untouched and live intervals are patched explicitly (updateLiveIntervals)
    // so both analyses can be preserved.
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  // Null when live intervals are not available at this point in the pipeline
  // (set via getAnalysisIfAvailable in runOnMachineFunction).
  LiveIntervals *LIS;

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Value type for SubRegMap below.
  struct SubRegInfo {
    /// Register class required to hold the value stored in the SubReg.
    const TargetRegisterClass *RC;

    /// Index for the right-shifted subregister. If 0 this is the "covering"
    /// subreg i.e. subreg that covers all others. Covering subreg becomes the
    /// whole register after the replacement.
    unsigned SubReg = AMDGPU::NoSubRegister;
    SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
  };

  /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
  typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. SubRegRC can be null, in this case it initialized using
  ///      getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that
  ///      "covers" all other subregs in pairs. Basically such subreg becomes a
  ///      whole register.
  ///   2. Resulting register class contains registers of minimal size but not
  ///      less than RegNumBits.
  ///
  /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   SubRegRC  - in/out, should be changed for unknown regclass,
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned RegNumBits, unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Return reg class expected by a MO's parent instruction for a given MO.
  const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  ///   AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
};

} // end anonymous namespace
164 
165 // TODO: move this to the tablegen and use binary search by Offset.
166 unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
167                                              unsigned Size) const {
168   const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
169   if (Inserted) {
170     for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
171       if (TRI->getSubRegIdxOffset(Idx) == Offset &&
172           TRI->getSubRegIdxSize(Idx) == Size) {
173         I->second = Idx;
174         break;
175       }
176     }
177   }
178   return I->second;
179 }
180 
181 unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
182                                                unsigned RShift) const {
183   unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
184   return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
185 }
186 
187 const uint32_t *
188 GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
189                                                unsigned SubRegIdx) const {
190   const auto [I, Inserted] =
191       SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
192   if (Inserted) {
193     for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
194       if (RCI.getSubReg() == SubRegIdx) {
195         I->second = RCI.getMask();
196         break;
197       }
198     }
199   }
200   return I->second;
201 }
202 
203 const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
204     unsigned AlignNumBits) const {
205   const auto [I, Inserted] =
206       AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
207   if (Inserted) {
208     BitVector &BV = I->second;
209     BV.resize(TRI->getNumRegClasses());
210     for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
211       auto *RC = TRI->getRegClass(ClassID);
212       if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
213         BV.set(ClassID);
214     }
215   }
216   return I->second;
217 }
218 
const TargetRegisterClass *
GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
    const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
    unsigned CoverSubregIdx, SubRegMap &SubRegs) const {

  unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
  LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
                    << '\n');

  // Start with every allocatable class that satisfies RC's alignment; each
  // subreg processed below intersects this candidate set with its own mask.
  BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
  for (auto &[OldSubReg, SRI] : SubRegs) {
    auto &[SubRegRC, NewSubReg] = SRI;

    // Register class may be unknown, for example:
    //   undef %0.sub4:sgpr_1024 = S_MOV_B32 01
    //   %0.sub5:sgpr_1024 = S_MOV_B32 02
    //   %1:vreg_64 = COPY %0.sub4_sub5
    // Register classes for subregs 'sub4' and 'sub5' are known from the
    // description of destination operand of S_MOV_B32 instruction but the
    // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction.
    if (!SubRegRC)
      SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);

    // Still unknown - cannot constrain the candidate set, give up.
    if (!SubRegRC)
      return nullptr;

    LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':'
                      << TRI->getRegClassName(SubRegRC)
                      << (SubRegRC->isAllocatable() ? "" : " not alloc")
                      << " -> ");

    if (OldSubReg == CoverSubregIdx) {
      // The covering subreg becomes the whole register after the rewrite, so
      // it maps to "no subregister" rather than a shifted index.
      NewSubReg = AMDGPU::NoSubRegister;
      LLVM_DEBUG(dbgs() << "whole reg");
    } else {
      NewSubReg = shiftSubReg(OldSubReg, RShift);
      if (!NewSubReg) {
        // No subreg index exists at the shifted offset/size.
        LLVM_DEBUG(dbgs() << "none\n");
        return nullptr;
      }
      LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
    }

    // Real subreg: keep classes whose NewSubReg subregister lies in SubRegRC.
    // Covering subreg (NewSubReg == 0): keep SubRegRC and its subclasses,
    // since the value will occupy the whole new register.
    const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
                                     : SubRegRC->getSubClassMask();
    if (!Mask)
      llvm_unreachable("no register class mask?");

    ClassMask.clearBitsNotInMask(Mask);
    // Don't try to early exit because checking if ClassMask has set bits isn't
    // that cheap and we expect it to pass in most cases.
    LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
  }

  // ClassMask is the set of all register classes such that each class is
  // allocatable, aligned, has all shifted subregs and each subreg has required
  // register class (see SubRegRC above). Now select first (that is largest)
  // register class with registers of minimal but not less than RegNumBits size.
  // We have to check register size because we may encounter classes of smaller
  // registers like VReg_1 in some situations.
  const TargetRegisterClass *MinRC = nullptr;
  unsigned MinNumBits = std::numeric_limits<unsigned>::max();
  for (unsigned ClassID : ClassMask.set_bits()) {
    auto *RC = TRI->getRegClass(ClassID);
    unsigned NumBits = TRI->getRegSizeInBits(*RC);
    if (NumBits < MinNumBits && NumBits >= RegNumBits) {
      MinNumBits = NumBits;
      MinRC = RC;
    }
    // RegNumBits is the lower bound, so a class of exactly that size is
    // optimal and the scan can stop.
    if (MinNumBits == RegNumBits)
      break;
  }
#ifndef NDEBUG
  if (MinRC) {
    assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
    for (auto [SubReg, SRI] : SubRegs)
      // Check that all registers in MinRC support SRI.SubReg subregister.
      assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
  }
#endif
  // There might be zero RShift - in this case we just trying to find smaller
  // register.
  return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
}
303 
const TargetRegisterClass *
GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
                                        SubRegMap &SubRegs) const {
  // Scan used subregs tracking the minimal start offset and maximal end
  // offset. A "covering" subreg is one spanning exactly [Offset, End), i.e.
  // containing every other used subreg; any update of either bound
  // invalidates the current candidate.
  unsigned CoverSubreg = AMDGPU::NoSubRegister;
  unsigned Offset = std::numeric_limits<unsigned>::max();
  unsigned End = 0;
  for (auto [SubReg, SRI] : SubRegs) {
    unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
    unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
    if (SubRegOffset < Offset) {
      Offset = SubRegOffset;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegEnd > End) {
      End = SubRegEnd;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegOffset == Offset && SubRegEnd == End)
      CoverSubreg = SubReg;
  }
  // If covering subreg is found shift everything so the covering subreg would
  // be in the rightmost position.
  if (CoverSubreg != AMDGPU::NoSubRegister)
    return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg,
                                         SubRegs);

  // Otherwise find subreg with maximum required alignment and shift it and all
  // other subregs to the rightmost possible position with respect to the
  // alignment.
  unsigned MaxAlign = 0;
  for (auto [SubReg, SRI] : SubRegs)
    MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));

  unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
  for (auto [SubReg, SRI] : SubRegs) {
    if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
      continue;
    FirstMaxAlignedSubRegOffset =
        std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
    // Cannot get lower than the minimal used offset - stop early.
    if (FirstMaxAlignedSubRegOffset == Offset)
      break;
  }

  // New position of the most-aligned subreg: its distance from Offset rounded
  // up to its own (maximal) alignment requirement.
  unsigned NewOffsetOfMaxAlignedSubReg =
      alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);

  // The original offset satisfied MaxAlign, so rounding up can never move the
  // subreg past its original position.
  if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
    llvm_unreachable("misaligned subreg");

  unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
  return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs);
}
356 
// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
                                                   Register NewReg,
                                                   SubRegMap &SubRegs) const {
  // Nothing to transfer if OldReg has no computed interval.
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  for (auto &SR : OldLI.subranges()) {
    // Find the used subreg whose lane mask exactly matches this subrange.
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //    L000000000000C000 [160r,1392r:0) 0@160r
      //    L0000000000003000 [160r,1392r:0) 0@160r
      //    L0000000000000C00 [160r,1392r:0) 0@160r
      //    L0000000000000300 [160r,1392r:0) 0@160r
      //    L0000000000000003 [160r,1104r:0) 0@160r
      //    L000000000000000C [160r,1104r:0) 0@160r
      //    L0000000000000030 [160r,1104r:0) 0@160r
      //    L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //    sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //    sub0_sub1_sub2_sub3, L00000000000000FF
      //    sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second.SubReg)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    // Each map entry should be consumed by at most one subrange.
    SubRegs.erase(I);
  }
  // No covering subreg made its subrange the main range (e.g. OldLI had no
  // subranges at all): use the whole old interval as the new main range.
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  NewLI.verify(MRI);
  LIS->removeInterval(OldReg);
}
414 
415 const TargetRegisterClass *
416 GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
417   MachineInstr *MI = MO.getParent();
418   return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI,
419                           *MI->getParent()->getParent());
420 }
421 
bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
  // Skip registers with no non-debug operands at all.
  auto Range = MRI->reg_nodbg_operands(Reg);
  if (Range.begin() == Range.end())
    return false;

  for (MachineOperand &MO : Range) {
    if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit.
      return false;
  }

  auto *RC = MRI->getRegClass(Reg);
  LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
                    << ':' << TRI->getRegClassName(RC) << '\n');

  // Collect used subregs and constrained reg classes inferred from instruction
  // operands.
  SubRegMap SubRegs;
  for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
    assert(MO.getSubReg() != AMDGPU::NoSubRegister);
    auto *OpDescRC = getOperandRegClass(MO);
    const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC);
    if (!Inserted && OpDescRC) {
      // Subreg already seen on another operand: narrow its class so both
      // operands' constraints are satisfied; bail out if incompatible.
      SubRegInfo &SRI = I->second;
      SRI.RC = SRI.RC ? TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC;
      if (!SRI.RC) {
        LLVM_DEBUG(dbgs() << "  Couldn't find common target regclass\n");
        return false;
      }
    }
  }

  // getMinSizeReg also fills in the new (shifted) subreg indexes in SubRegs.
  auto *NewRC = getMinSizeReg(RC, SubRegs);
  if (!NewRC) {
    LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
    return false;
  }

  Register NewReg = MRI->createVirtualRegister(NewRC);
  LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
                    << TRI->getRegClassName(RC) << " -> "
                    << printReg(NewReg, TRI) << ':'
                    << TRI->getRegClassName(NewRC) << '\n');

  // Rewrite every operand (including debug ones) to NewReg with the shifted
  // subreg index computed above.
  for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
    MO.setReg(NewReg);
    // Debug info can refer to the whole reg, just leave it as it is for now.
    // TODO: create some DI shift expression?
    if (MO.isDebug() && MO.getSubReg() == 0)
      continue;
    unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
    MO.setSubReg(SubReg);
    // A covering-subreg def now defines the whole new register, so an undef
    // flag left over from the partial def would be incorrect.
    if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
      MO.setIsUndef(false);
  }

  if (LIS)
    updateLiveIntervals(Reg, NewReg, SubRegs);

  return true;
}
482 
483 bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
484   MRI = &MF.getRegInfo();
485   TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
486   TII = MF.getSubtarget().getInstrInfo();
487   LIS = getAnalysisIfAvailable<LiveIntervals>();
488   bool Changed = false;
489   for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
490     Changed |= rewriteReg(Register::index2VirtReg(I));
491   }
492   return Changed;
493 }
494 
char GCNRewritePartialRegUses::ID;

// Opaque handle used by the AMDGPU pass pipeline to reference this pass.
char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;

// Register the pass with the legacy pass manager under DEBUG_TYPE
// ("rewrite-partial-reg-uses"); not a CFG-only pass, not an analysis.
INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)
503