//===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// RenameIndependentSubregs pass leaves large partially used super registers,
/// for example:
///   undef %0.sub4:VReg_1024 = ...
///   %0.sub5:VReg_1024 = ...
///   %0.sub6:VReg_1024 = ...
///   %0.sub7:VReg_1024 = ...
///   use %0.sub4_sub5_sub6_sub7
///   use %0.sub6_sub7
///
/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
/// rewrites such partially used super registers with registers of minimal size:
///   undef %0.sub0:VReg_128 = ...
///   %0.sub1:VReg_128 = ...
///   %0.sub2:VReg_128 = ...
///   %0.sub3:VReg_128 = ...
///   use %0.sub0_sub1_sub2_sub3
///   use %0.sub2_sub3
///
/// This allows avoiding subreg lanemask tracking during register pressure
/// calculation and creates more possibilities for code unaware of lanemasks.
//===----------------------------------------------------------------------===//

#include "GCNRewritePartialRegUses.h"
#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "rewrite-partial-reg-uses"

namespace {

class GCNRewritePartialRegUsesImpl {
  // Per-function state. MRI/TRI/TII are set by run(); LIS comes from the
  // constructor and may be null when live intervals are unavailable.
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS;

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Map OldSubReg -> NewSubReg. Used as in/out container.
  using SubRegMap = SmallDenseMap<unsigned, unsigned>;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, NewSubReg] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
  ///      all other subregs in pairs. Basically such subreg becomes a whole
  ///      register.
  ///   2. Resulting register class contains registers of minimal size.
  ///
  /// SubRegs is map of OldSubReg -> NewSubReg and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  /// AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;

public:
  GCNRewritePartialRegUsesImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};

/// Legacy pass-manager wrapper around GCNRewritePartialRegUsesImpl.
class GCNRewritePartialRegUsesLegacy : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUsesLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The rewrite keeps live intervals and slot indexes up to date (see
    // updateLiveIntervals), so both analyses are preserved.
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

// TODO: move this to the tablegen and use binary search by Offset.
unsigned GCNRewritePartialRegUsesImpl::getSubReg(unsigned Offset,
                                                 unsigned Size) const {
  // Memoized linear scan over all target subreg indexes (index 0 is reserved
  // for NoSubRegister, hence the scan starts at 1).
  const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
  if (Inserted) {
    for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
      if (TRI->getSubRegIdxOffset(Idx) == Offset &&
          TRI->getSubRegIdxSize(Idx) == Size) {
        I->second = Idx;
        break;
      }
    }
  }
  return I->second;
}

unsigned GCNRewritePartialRegUsesImpl::shiftSubReg(unsigned SubReg,
                                                   unsigned RShift) const {
  // NOTE(review): assumes RShift <= getSubRegIdxOffset(SubReg); callers in
  // getMinSizeReg pass a shift no larger than the minimal used subreg offset,
  // so the subtraction cannot wrap.
  unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
  return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
}

const uint32_t *GCNRewritePartialRegUsesImpl::getSuperRegClassMask(
    const TargetRegisterClass *RC, unsigned SubRegIdx) const {
  // Memoized search through the super-register classes of RC for the entry
  // matching SubRegIdx; stays nullptr if no such projection exists.
  const auto [I, Inserted] =
      SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
  if (Inserted) {
    for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
      if (RCI.getSubReg() == SubRegIdx) {
        I->second = RCI.getMask();
        break;
      }
    }
  }
  return I->second;
}

const BitVector &
GCNRewritePartialRegUsesImpl::getAllocatableAndAlignedRegClassMask(
    unsigned AlignNumBits) const {
  // Memoized: build the bit vector of all register classes that are both
  // allocatable and aligned to AlignNumBits, indexed by register class ID.
  const auto [I, Inserted] =
      AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
  if (Inserted) {
    BitVector &BV = I->second;
    BV.resize(TRI->getNumRegClasses());
    for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
      auto *RC = TRI->getRegClass(ClassID);
      if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
        BV.set(ClassID);
    }
  }
  return I->second;
}

const TargetRegisterClass *
GCNRewritePartialRegUsesImpl::getRegClassWithShiftedSubregs(
    const TargetRegisterClass *RC, unsigned RShift, unsigned CoverSubregIdx,
    SubRegMap &SubRegs) const {

  unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
  LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
                    << '\n');

  // Start from every allocatable, suitably aligned class and intersect away
  // classes that cannot provide all of the shifted subregs.
  BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
  for (auto &[OldSubReg, NewSubReg] : SubRegs) {
    LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':');

    auto *SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);
    if (!SubRegRC) {
      LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
      return nullptr;
    }
    LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC)
                      << (SubRegRC->isAllocatable() ? "" : " not alloc")
                      << " -> ");

    if (OldSubReg == CoverSubregIdx) {
      // Covering subreg will become a full register, RC should be allocatable.
      assert(SubRegRC->isAllocatable());
      NewSubReg = AMDGPU::NoSubRegister;
      LLVM_DEBUG(dbgs() << "whole reg");
    } else {
      NewSubReg = shiftSubReg(OldSubReg, RShift);
      if (!NewSubReg) {
        // No subreg index exists at the shifted position - give up.
        LLVM_DEBUG(dbgs() << "none\n");
        return nullptr;
      }
      LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
    }

    // For a real (shifted) subreg keep only classes whose NewSubReg subregs
    // project into SubRegRC; for the covering subreg (which becomes the whole
    // register) keep only subclasses of SubRegRC itself.
    const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
                                     : SubRegRC->getSubClassMask();
    if (!Mask)
      llvm_unreachable("no register class mask?");

    ClassMask.clearBitsNotInMask(Mask);
    // Don't try to early exit because checking if ClassMask has set bits isn't
    // that cheap and we expect it to pass in most cases.
    LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
  }

  // ClassMask is the set of all register classes such that each class is
  // allocatable, aligned, has all shifted subregs and each subreg has required
  // register class (see SubRegRC above). Now select first (that is largest)
  // register class with registers of minimal size.
  const TargetRegisterClass *MinRC = nullptr;
  unsigned MinNumBits = std::numeric_limits<unsigned>::max();
  for (unsigned ClassID : ClassMask.set_bits()) {
    auto *RC = TRI->getRegClass(ClassID);
    unsigned NumBits = TRI->getRegSizeInBits(*RC);
    if (NumBits < MinNumBits) {
      MinNumBits = NumBits;
      MinRC = RC;
    }
  }
#ifndef NDEBUG
  if (MinRC) {
    assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
    for (auto [OldSubReg, NewSubReg] : SubRegs)
      // Check that all registers in MinRC support NewSubReg subregister.
      assert(MinRC == TRI->getSubClassWithSubReg(MinRC, NewSubReg));
  }
#endif
  // There might be zero RShift - in this case we just trying to find smaller
  // register.
  return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
}

const TargetRegisterClass *
GCNRewritePartialRegUsesImpl::getMinSizeReg(const TargetRegisterClass *RC,
                                            SubRegMap &SubRegs) const {
  // Look for a single used subreg that spans [min offset, max end) of all used
  // subregs - the "covering" subreg. Any new min/max invalidates the current
  // candidate, so only a subreg matching both final bounds survives.
  unsigned CoverSubreg = AMDGPU::NoSubRegister;
  unsigned Offset = std::numeric_limits<unsigned>::max();
  unsigned End = 0;
  for (auto [SubReg, SRI] : SubRegs) {
    unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
    unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
    if (SubRegOffset < Offset) {
      Offset = SubRegOffset;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegEnd > End) {
      End = SubRegEnd;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegOffset == Offset && SubRegEnd == End)
      CoverSubreg = SubReg;
  }
  // If covering subreg is found shift everything so the covering subreg would
  // be in the rightmost position.
  if (CoverSubreg != AMDGPU::NoSubRegister)
    return getRegClassWithShiftedSubregs(RC, Offset, CoverSubreg, SubRegs);

  // Otherwise find subreg with maximum required alignment and shift it and all
  // other subregs to the rightmost possible position with respect to the
  // alignment.
  // Continuation of getMinSizeReg: no covering subreg exists, so compute the
  // largest alignment any used subreg requires within RC.
  unsigned MaxAlign = 0;
  for (auto [SubReg, SRI] : SubRegs)
    MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));

  // Find the lowest offset among maximally-aligned subregs; Offset (the
  // minimal offset of all used subregs, computed above) is a lower bound, so
  // reaching it allows an early exit.
  unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
  for (auto [SubReg, SRI] : SubRegs) {
    if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
      continue;
    FirstMaxAlignedSubRegOffset =
        std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
    if (FirstMaxAlignedSubRegOffset == Offset)
      break;
  }

  // The smallest aligned offset the maximally-aligned subreg can be shifted
  // down to while keeping all used subregs at non-negative offsets.
  unsigned NewOffsetOfMaxAlignedSubReg =
      alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);

  if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
    llvm_unreachable("misaligned subreg");

  unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
  return getRegClassWithShiftedSubregs(RC, RShift, 0, SubRegs);
}

// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
void GCNRewritePartialRegUsesImpl::updateLiveIntervals(
    Register OldReg, Register NewReg, SubRegMap &SubRegs) const {
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  // Transfer each subrange of the old interval to the new one, translating
  // its lanemask through the OldSubReg -> NewSubReg mapping.
  for (auto &SR : OldLI.subranges()) {
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //   L000000000000C000 [160r,1392r:0) 0@160r
      //   L0000000000003000 [160r,1392r:0) 0@160r
      //   L0000000000000C00 [160r,1392r:0) 0@160r
      //   L0000000000000300 [160r,1392r:0) 0@160r
      //   L0000000000000003 [160r,1104r:0) 0@160r
      //   L000000000000000C [160r,1104r:0) 0@160r
      //   L0000000000000030 [160r,1104r:0) 0@160r
      //   L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //   sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //   sub0_sub1_sub2_sub3, L00000000000000FF
      //   sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    // Each mapping entry corresponds to at most one subrange; drop it so the
    // early-exit check above stays precise for the remaining subranges.
    SubRegs.erase(I);
  }
  // If no subrange became the main range (no covering subreg), derive the main
  // range from the old interval as a whole.
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  assert(NewLI.verify(MRI));
  LIS->removeInterval(OldReg);
}

bool GCNRewritePartialRegUsesImpl::rewriteReg(Register Reg) const {

  // Collect used subregs.
  SubRegMap SubRegs;
  for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
    if (MO.getSubReg() == AMDGPU::NoSubRegister)
      return false; // Whole reg used.
    SubRegs.try_emplace(MO.getSubReg());
  }

  // No non-debug operands at all - nothing to rewrite.
  if (SubRegs.empty())
    return false;

  auto *RC = MRI->getRegClass(Reg);
  LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
                    << ':' << TRI->getRegClassName(RC) << '\n');

  auto *NewRC = getMinSizeReg(RC, SubRegs);
  if (!NewRC) {
    LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
    return false;
  }

  Register NewReg = MRI->createVirtualRegister(NewRC);
  LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
                    << TRI->getRegClassName(RC) << " -> "
                    << printReg(NewReg, TRI) << ':'
                    << TRI->getRegClassName(NewRC) << '\n');

  // Rewrite every operand (including debug uses) to the new register with the
  // shifted subreg index.
  for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
    MO.setReg(NewReg);
    // Debug info can refer to the whole reg, just leave it as it is for now.
    // TODO: create some DI shift expression?
    if (MO.isDebug() && MO.getSubReg() == 0)
      continue;
    unsigned NewSubReg = SubRegs[MO.getSubReg()];
    MO.setSubReg(NewSubReg);
    // A def of the covering subreg now defines the whole register, so an
    // "undef" marker (partial def) would no longer be valid.
    if (NewSubReg == AMDGPU::NoSubRegister && MO.isDef())
      MO.setIsUndef(false);
  }

  if (LIS)
    updateLiveIntervals(Reg, NewReg, SubRegs);

  return true;
}

bool GCNRewritePartialRegUsesImpl::run(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
  TII = MF.getSubtarget().getInstrInfo();
  bool Changed = false;
  // Iterate over all virtual registers present at this point; registers
  // created during rewriting get indexes >= E and are not revisited.
  for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
    Changed |= rewriteReg(Register::index2VirtReg(I));
  }
  return Changed;
}

bool GCNRewritePartialRegUsesLegacy::runOnMachineFunction(MachineFunction &MF) {
  // Live intervals are optional - update them only if the analysis is present.
  LiveIntervalsWrapperPass *LISWrapper =
      getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
  LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
  GCNRewritePartialRegUsesImpl Impl(LIS);
  return Impl.run(MF);
}

PreservedAnalyses
GCNRewritePartialRegUsesPass::run(MachineFunction &MF,
                                  MachineFunctionAnalysisManager &MFAM) {
  auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
  if (!GCNRewritePartialRegUsesImpl(LIS).run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<LiveIntervalsAnalysis>();
  PA.preserve<SlotIndexesAnalysis>();
  return PA;
}

char GCNRewritePartialRegUsesLegacy::ID;

char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUsesLegacy::ID;

INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)