xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// RenameIndependentSubregs pass leaves large partially used super registers,
10 /// for example:
11 ///   undef %0.sub4:VReg_1024 = ...
12 ///   %0.sub5:VReg_1024 = ...
13 ///   %0.sub6:VReg_1024 = ...
14 ///   %0.sub7:VReg_1024 = ...
15 ///   use %0.sub4_sub5_sub6_sub7
16 ///   use %0.sub6_sub7
17 ///
18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
19 /// rewrites such partially used super registers with registers of minimal size:
20 ///   undef %0.sub0:VReg_128 = ...
21 ///   %0.sub1:VReg_128 = ...
22 ///   %0.sub2:VReg_128 = ...
23 ///   %0.sub3:VReg_128 = ...
24 ///   use %0.sub0_sub1_sub2_sub3
25 ///   use %0.sub2_sub3
26 ///
27 /// This allows to avoid subreg lanemasks tracking during register pressure
28 /// calculation and creates more possibilities for the code unaware of lanemasks
29 //===----------------------------------------------------------------------===//
30 
31 #include "AMDGPU.h"
32 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
33 #include "SIRegisterInfo.h"
34 #include "llvm/CodeGen/LiveInterval.h"
35 #include "llvm/CodeGen/LiveIntervals.h"
36 #include "llvm/CodeGen/MachineFunctionPass.h"
37 #include "llvm/CodeGen/MachineInstrBuilder.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetInstrInfo.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Pass.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "rewrite-partial-reg-uses"
46 
47 namespace {
48 
49 class GCNRewritePartialRegUses : public MachineFunctionPass {
50 public:
51   static char ID;
52   GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}
53 
54   StringRef getPassName() const override {
55     return "Rewrite Partial Register Uses";
56   }
57 
58   void getAnalysisUsage(AnalysisUsage &AU) const override {
59     AU.setPreservesCFG();
60     AU.addPreserved<LiveIntervals>();
61     AU.addPreserved<SlotIndexes>();
62     MachineFunctionPass::getAnalysisUsage(AU);
63   }
64 
65   bool runOnMachineFunction(MachineFunction &MF) override;
66 
67 private:
68   MachineRegisterInfo *MRI;
69   const SIRegisterInfo *TRI;
70   const TargetInstrInfo *TII;
71   LiveIntervals *LIS;
72 
73   /// Rewrite partially used register Reg by shifting all its subregisters to
74   /// the right and replacing the original register with a register of minimal
75   /// size. Return true if the change has been made.
76   bool rewriteReg(Register Reg) const;
77 
78   /// Value type for SubRegMap below.
79   struct SubRegInfo {
80     /// Register class required to hold the value stored in the SubReg.
81     const TargetRegisterClass *RC;
82 
83     /// Index for the right-shifted subregister. If 0 this is the "covering"
84     /// subreg i.e. subreg that covers all others. Covering subreg becomes the
85     /// whole register after the replacement.
86     unsigned SubReg = AMDGPU::NoSubRegister;
87     SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
88   };
89 
90   /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
91   typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap;
92 
93   /// Given register class RC and the set of used subregs as keys in the SubRegs
94   /// map return new register class and indexes of right-shifted subregs as
95   /// values in SubRegs map such that the resulting regclass would contain
96   /// registers of minimal size.
97   const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
98                                            SubRegMap &SubRegs) const;
99 
100   /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
101   /// find new regclass such that:
102   ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
103   ///      of bits to the right. Every "shifted" subreg should have the same
104   ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
105   ///      all other subregs in pairs. Basically such subreg becomes a whole
106   ///      register.
107   ///   2. Resulting register class contains registers of minimal size but not
108   ///      less than RegNumBits.
109   ///
110   /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
111   /// parameter:
112   ///   OldSubReg - input parameter,
113   ///   SubRegRC  - input parameter (cannot be null),
114   ///   NewSubReg - output, contains shifted subregs on return.
115   const TargetRegisterClass *
116   getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
117                                 unsigned RegNumBits, unsigned CoverSubregIdx,
118                                 SubRegMap &SubRegs) const;
119 
120   /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
121   /// describing OldSubReg -> NewSubReg mapping.
122   void updateLiveIntervals(Register OldReg, Register NewReg,
123                            SubRegMap &SubRegs) const;
124 
125   /// Helper methods.
126 
127   /// Return reg class expected by a MO's parent instruction for a given MO.
128   const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;
129 
130   /// Find right-shifted by RShift amount version of the SubReg if it exists,
131   /// return 0 otherwise.
132   unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;
133 
134   /// Find subreg index with a given Offset and Size, return 0 if there is no
135   /// such subregister index. The result is cached in SubRegs data-member.
136   unsigned getSubReg(unsigned Offset, unsigned Size) const;
137 
138   /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
139   mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;
140 
141   /// Return bit mask that contains all register classes that are projected into
142   /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
143   const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
144                                        unsigned SubRegIdx) const;
145 
146   /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
147   mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
148                         const uint32_t *>
149       SuperRegMasks;
150 
151   /// Return bitmask containing all allocatable register classes with registers
152   /// aligned at AlignNumBits. The result is cached in
153   /// AllocatableAndAlignedRegClassMasks data-member.
154   const BitVector &
155   getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;
156 
157   /// Cache for getAllocatableAndAlignedRegClassMask method:
158   ///   AlignNumBits -> Class bitmask.
159   mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
160 };
161 
162 } // end anonymous namespace
163 
164 // TODO: move this to the tablegen and use binary search by Offset.
165 unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
166                                              unsigned Size) const {
167   const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
168   if (Inserted) {
169     for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
170       if (TRI->getSubRegIdxOffset(Idx) == Offset &&
171           TRI->getSubRegIdxSize(Idx) == Size) {
172         I->second = Idx;
173         break;
174       }
175     }
176   }
177   return I->second;
178 }
179 
180 unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
181                                                unsigned RShift) const {
182   unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
183   return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
184 }
185 
186 const uint32_t *
187 GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
188                                                unsigned SubRegIdx) const {
189   const auto [I, Inserted] =
190       SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
191   if (Inserted) {
192     for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
193       if (RCI.getSubReg() == SubRegIdx) {
194         I->second = RCI.getMask();
195         break;
196       }
197     }
198   }
199   return I->second;
200 }
201 
202 const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
203     unsigned AlignNumBits) const {
204   const auto [I, Inserted] =
205       AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
206   if (Inserted) {
207     BitVector &BV = I->second;
208     BV.resize(TRI->getNumRegClasses());
209     for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
210       auto *RC = TRI->getRegClass(ClassID);
211       if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
212         BV.set(ClassID);
213     }
214   }
215   return I->second;
216 }
217 
218 const TargetRegisterClass *
219 GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
220     const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
221     unsigned CoverSubregIdx, SubRegMap &SubRegs) const {
222 
223   unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
224   LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
225                     << '\n');
226 
227   BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
228   for (auto &[OldSubReg, SRI] : SubRegs) {
229     auto &[SubRegRC, NewSubReg] = SRI;
230     assert(SubRegRC);
231 
232     LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':'
233                       << TRI->getRegClassName(SubRegRC)
234                       << (SubRegRC->isAllocatable() ? "" : " not alloc")
235                       << " -> ");
236 
237     if (OldSubReg == CoverSubregIdx) {
238       // Covering subreg will become a full register, RC should be allocatable.
239       assert(SubRegRC->isAllocatable());
240       NewSubReg = AMDGPU::NoSubRegister;
241       LLVM_DEBUG(dbgs() << "whole reg");
242     } else {
243       NewSubReg = shiftSubReg(OldSubReg, RShift);
244       if (!NewSubReg) {
245         LLVM_DEBUG(dbgs() << "none\n");
246         return nullptr;
247       }
248       LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
249     }
250 
251     const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
252                                      : SubRegRC->getSubClassMask();
253     if (!Mask)
254       llvm_unreachable("no register class mask?");
255 
256     ClassMask.clearBitsNotInMask(Mask);
257     // Don't try to early exit because checking if ClassMask has set bits isn't
258     // that cheap and we expect it to pass in most cases.
259     LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
260   }
261 
262   // ClassMask is the set of all register classes such that each class is
263   // allocatable, aligned, has all shifted subregs and each subreg has required
264   // register class (see SubRegRC above). Now select first (that is largest)
265   // register class with registers of minimal but not less than RegNumBits size.
266   // We have to check register size because we may encounter classes of smaller
267   // registers like VReg_1 in some situations.
268   const TargetRegisterClass *MinRC = nullptr;
269   unsigned MinNumBits = std::numeric_limits<unsigned>::max();
270   for (unsigned ClassID : ClassMask.set_bits()) {
271     auto *RC = TRI->getRegClass(ClassID);
272     unsigned NumBits = TRI->getRegSizeInBits(*RC);
273     if (NumBits < MinNumBits && NumBits >= RegNumBits) {
274       MinNumBits = NumBits;
275       MinRC = RC;
276     }
277     if (MinNumBits == RegNumBits)
278       break;
279   }
280 #ifndef NDEBUG
281   if (MinRC) {
282     assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
283     for (auto [SubReg, SRI] : SubRegs)
284       // Check that all registers in MinRC support SRI.SubReg subregister.
285       assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
286   }
287 #endif
288   // There might be zero RShift - in this case we just trying to find smaller
289   // register.
290   return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
291 }
292 
293 const TargetRegisterClass *
294 GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
295                                         SubRegMap &SubRegs) const {
296   unsigned CoverSubreg = AMDGPU::NoSubRegister;
297   unsigned Offset = std::numeric_limits<unsigned>::max();
298   unsigned End = 0;
299   for (auto [SubReg, SRI] : SubRegs) {
300     unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
301     unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
302     if (SubRegOffset < Offset) {
303       Offset = SubRegOffset;
304       CoverSubreg = AMDGPU::NoSubRegister;
305     }
306     if (SubRegEnd > End) {
307       End = SubRegEnd;
308       CoverSubreg = AMDGPU::NoSubRegister;
309     }
310     if (SubRegOffset == Offset && SubRegEnd == End)
311       CoverSubreg = SubReg;
312   }
313   // If covering subreg is found shift everything so the covering subreg would
314   // be in the rightmost position.
315   if (CoverSubreg != AMDGPU::NoSubRegister)
316     return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg,
317                                          SubRegs);
318 
319   // Otherwise find subreg with maximum required alignment and shift it and all
320   // other subregs to the rightmost possible position with respect to the
321   // alignment.
322   unsigned MaxAlign = 0;
323   for (auto [SubReg, SRI] : SubRegs)
324     MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));
325 
326   unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
327   for (auto [SubReg, SRI] : SubRegs) {
328     if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
329       continue;
330     FirstMaxAlignedSubRegOffset =
331         std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
332     if (FirstMaxAlignedSubRegOffset == Offset)
333       break;
334   }
335 
336   unsigned NewOffsetOfMaxAlignedSubReg =
337       alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);
338 
339   if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
340     llvm_unreachable("misaligned subreg");
341 
342   unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
343   return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs);
344 }
345 
346 // Only the subrange's lanemasks of the original interval need to be modified.
347 // Subrange for a covering subreg becomes the main range.
348 void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
349                                                    Register NewReg,
350                                                    SubRegMap &SubRegs) const {
351   if (!LIS->hasInterval(OldReg))
352     return;
353 
354   auto &OldLI = LIS->getInterval(OldReg);
355   auto &NewLI = LIS->createEmptyInterval(NewReg);
356 
357   auto &Allocator = LIS->getVNInfoAllocator();
358   NewLI.setWeight(OldLI.weight());
359 
360   for (auto &SR : OldLI.subranges()) {
361     auto I = find_if(SubRegs, [&](auto &P) {
362       return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
363     });
364 
365     if (I == SubRegs.end()) {
366       // There might be a situation when subranges don't exactly match used
367       // subregs, for example:
368       // %120 [160r,1392r:0) 0@160r
369       //    L000000000000C000 [160r,1392r:0) 0@160r
370       //    L0000000000003000 [160r,1392r:0) 0@160r
371       //    L0000000000000C00 [160r,1392r:0) 0@160r
372       //    L0000000000000300 [160r,1392r:0) 0@160r
373       //    L0000000000000003 [160r,1104r:0) 0@160r
374       //    L000000000000000C [160r,1104r:0) 0@160r
375       //    L0000000000000030 [160r,1104r:0) 0@160r
376       //    L00000000000000C0 [160r,1104r:0) 0@160r
377       // but used subregs are:
378       //    sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
379       //    sub0_sub1_sub2_sub3, L00000000000000FF
380       //    sub4_sub5_sub6_sub7, L000000000000FF00
381       // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
382       // have several subranges with the same lifetime. For such cases just
383       // recreate the interval.
384       LIS->removeInterval(OldReg);
385       LIS->removeInterval(NewReg);
386       LIS->createAndComputeVirtRegInterval(NewReg);
387       return;
388     }
389 
390     if (unsigned NewSubReg = I->second.SubReg)
391       NewLI.createSubRangeFrom(Allocator,
392                                TRI->getSubRegIndexLaneMask(NewSubReg), SR);
393     else // This is the covering subreg (0 index) - set it as main range.
394       NewLI.assign(SR, Allocator);
395 
396     SubRegs.erase(I);
397   }
398   if (NewLI.empty())
399     NewLI.assign(OldLI, Allocator);
400   NewLI.verify(MRI);
401   LIS->removeInterval(OldReg);
402 }
403 
404 const TargetRegisterClass *
405 GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
406   MachineInstr *MI = MO.getParent();
407   return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI,
408                           *MI->getParent()->getParent());
409 }
410 
411 bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
412   auto Range = MRI->reg_nodbg_operands(Reg);
413   if (Range.empty() || any_of(Range, [](MachineOperand &MO) {
414         return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1]
415       }))
416     return false;
417 
418   auto *RC = MRI->getRegClass(Reg);
419   LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
420                     << ':' << TRI->getRegClassName(RC) << '\n');
421 
422   // Collect used subregs and their reg classes infered from instruction
423   // operands.
424   SubRegMap SubRegs;
425   for (MachineOperand &MO : Range) {
426     const unsigned SubReg = MO.getSubReg();
427     assert(SubReg != AMDGPU::NoSubRegister); // Due to [1].
428     LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(SubReg) << ':');
429 
430     const auto [I, Inserted] = SubRegs.try_emplace(SubReg);
431     const TargetRegisterClass *&SubRegRC = I->second.RC;
432 
433     if (Inserted)
434       SubRegRC = TRI->getSubRegisterClass(RC, SubReg);
435 
436     if (SubRegRC) {
437       if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) {
438         LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & "
439                           << TRI->getRegClassName(OpDescRC) << " = ");
440         SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC);
441       }
442     }
443 
444     if (!SubRegRC) {
445       LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
446       return false;
447     }
448     LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n');
449   }
450 
451   auto *NewRC = getMinSizeReg(RC, SubRegs);
452   if (!NewRC) {
453     LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
454     return false;
455   }
456 
457   Register NewReg = MRI->createVirtualRegister(NewRC);
458   LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
459                     << TRI->getRegClassName(RC) << " -> "
460                     << printReg(NewReg, TRI) << ':'
461                     << TRI->getRegClassName(NewRC) << '\n');
462 
463   for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
464     MO.setReg(NewReg);
465     // Debug info can refer to the whole reg, just leave it as it is for now.
466     // TODO: create some DI shift expression?
467     if (MO.isDebug() && MO.getSubReg() == 0)
468       continue;
469     unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
470     MO.setSubReg(SubReg);
471     if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
472       MO.setIsUndef(false);
473   }
474 
475   if (LIS)
476     updateLiveIntervals(Reg, NewReg, SubRegs);
477 
478   return true;
479 }
480 
481 bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
482   MRI = &MF.getRegInfo();
483   TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
484   TII = MF.getSubtarget().getInstrInfo();
485   LIS = getAnalysisIfAvailable<LiveIntervals>();
486   bool Changed = false;
487   for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
488     Changed |= rewriteReg(Register::index2VirtReg(I));
489   }
490   return Changed;
491 }
492 
493 char GCNRewritePartialRegUses::ID;
494 
495 char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;
496 
497 INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
498                       "Rewrite Partial Register Uses", false, false)
499 INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
500                     "Rewrite Partial Register Uses", false, false)
501