//===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// RenameIndependentSubregs pass leaves large partially used super registers,
/// for example:
///   undef %0.sub4:VReg_1024 = ...
///   %0.sub5:VReg_1024 = ...
///   %0.sub6:VReg_1024 = ...
///   %0.sub7:VReg_1024 = ...
///   use %0.sub4_sub5_sub6_sub7
///   use %0.sub6_sub7
///
/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
/// rewrites such partially used super registers with registers of minimal size:
///   undef %0.sub0:VReg_128 = ...
///   %0.sub1:VReg_128 = ...
///   %0.sub2:VReg_128 = ...
///   %0.sub3:VReg_128 = ...
///   use %0.sub0_sub1_sub2_sub3
///   use %0.sub2_sub3
///
/// This allows to avoid subreg lanemasks tracking during register pressure
/// calculation and creates more possibilities for the code unaware of lanemasks
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "rewrite-partial-reg-uses"

namespace {

class GCNRewritePartialRegUses : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only renumbers subregs within virtual registers; the CFG and
    // (when present) the live intervals / slot indexes are kept up to date.
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS; // May be null: LIS is only updated if available.

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Value type for SubRegMap below.
  struct SubRegInfo {
    /// Register class required to hold the value stored in the SubReg.
    const TargetRegisterClass *RC;

    /// Index for the right-shifted subregister. If 0 this is the "covering"
    /// subreg i.e. subreg that covers all others. Covering subreg becomes the
    /// whole register after the replacement.
    unsigned SubReg = AMDGPU::NoSubRegister;
    SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
  };

  /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
  typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. SubRegRC can be null, in this case it initialized using
  ///      getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that
  ///      "covers" all other subregs in pairs. Basically such subreg becomes a
  ///      whole register.
  ///   2. Resulting register class contains registers of minimal size but not
  ///      less than RegNumBits.
  ///
  /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   SubRegRC  - in/out, should be changed for unknown regclass,
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned RegNumBits, unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Return reg class expected by a MO's parent instruction for a given MO.
  const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  /// AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
};

} // end anonymous namespace

// Find the subreg index whose range is exactly [Offset, Offset + Size) bits,
// memoizing the (possibly zero = "not found") answer in the SubRegs cache.
// TODO: move this to the tablegen and use binary search by Offset.
unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
                                             unsigned Size) const {
  const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
  if (Inserted) {
    // Subreg index 0 means "no subregister", so start scanning from 1.
    for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
      if (TRI->getSubRegIdxOffset(Idx) == Offset &&
          TRI->getSubRegIdxSize(Idx) == Size) {
        I->second = Idx;
        break;
      }
    }
  }
  return I->second;
}

// Return the subreg index with the same size as SubReg but whose bit offset is
// RShift bits lower, or 0 if no such index exists.
unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
                                               unsigned RShift) const {
  unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
  return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
}

const uint32_t *
GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
                                               unsigned SubRegIdx) const {
  const auto [I, Inserted] =
      SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
  if (Inserted) {
    // Walk the (SubRegIndex, ClassMask) table of RC looking for the entry
    // describing super register classes reachable through SubRegIdx.
    for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
      if (RCI.getSubReg() == SubRegIdx) {
        I->second = RCI.getMask();
        break;
      }
    }
  }
  return I->second;
}

const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
    unsigned AlignNumBits) const {
  const auto [I, Inserted] =
      AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
  if (Inserted) {
    // First query for this alignment: build the bit vector over all register
    // classes once; subsequent queries return the cached result.
    BitVector &BV = I->second;
    BV.resize(TRI->getNumRegClasses());
    for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
      auto *RC = TRI->getRegClass(ClassID);
      if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
        BV.set(ClassID);
    }
  }
  return I->second;
}

const TargetRegisterClass *
GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
    const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
    unsigned CoverSubregIdx, SubRegMap &SubRegs) const {

  unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
  LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign
                    << '\n');

  // Start from every allocatable class with suitable alignment and narrow the
  // candidate set down with one mask intersection per used subreg.
  BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
  for (auto &[OldSubReg, SRI] : SubRegs) {
    auto &[SubRegRC, NewSubReg] = SRI;

    // Register class may be unknown, for example:
    //   undef %0.sub4:sgpr_1024 = S_MOV_B32 01
    //   %0.sub5:sgpr_1024 = S_MOV_B32 02
    //   %1:vreg_64 = COPY %0.sub4_sub5
    // Register classes for subregs 'sub4' and 'sub5' are known from the
    // description of destination operand of S_MOV_B32 instruction but the
    // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction.
    if (!SubRegRC)
      SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);

    if (!SubRegRC)
      return nullptr;

    LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':'
                      << TRI->getRegClassName(SubRegRC)
                      << (SubRegRC->isAllocatable() ? "" : " not alloc")
                      << " -> ");

    if (OldSubReg == CoverSubregIdx) {
      // The covering subreg maps to the whole new register (no subreg index).
      NewSubReg = AMDGPU::NoSubRegister;
      LLVM_DEBUG(dbgs() << "whole reg");
    } else {
      NewSubReg = shiftSubReg(OldSubReg, RShift);
      if (!NewSubReg) {
        // No subreg index exists at the shifted position - can't rewrite.
        LLVM_DEBUG(dbgs() << "none\n");
        return nullptr;
      }
      LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
    }

    // For a shifted subreg keep only classes that project into SubRegRC via
    // NewSubReg; for the covering subreg keep only subclasses of SubRegRC.
    const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
                                     : SubRegRC->getSubClassMask();
    if (!Mask)
      llvm_unreachable("no register class mask?");

    ClassMask.clearBitsNotInMask(Mask);
    // Don't try to early exit because checking if ClassMask has set bits isn't
    // that cheap and we expect it to pass in most cases.
    LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
  }

  // ClassMask is the set of all register classes such that each class is
  // allocatable, aligned, has all shifted subregs and each subreg has required
  // register class (see SubRegRC above). Now select first (that is largest)
  // register class with registers of minimal but not less than RegNumBits size.
  // We have to check register size because we may encounter classes of smaller
  // registers like VReg_1 in some situations.
  const TargetRegisterClass *MinRC = nullptr;
  unsigned MinNumBits = std::numeric_limits<unsigned>::max();
  for (unsigned ClassID : ClassMask.set_bits()) {
    auto *RC = TRI->getRegClass(ClassID);
    unsigned NumBits = TRI->getRegSizeInBits(*RC);
    if (NumBits < MinNumBits && NumBits >= RegNumBits) {
      MinNumBits = NumBits;
      MinRC = RC;
    }
    // RegNumBits is a lower bound, so a class of exactly that size is optimal.
    if (MinNumBits == RegNumBits)
      break;
  }
#ifndef NDEBUG
  if (MinRC) {
    assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
    for (auto [SubReg, SRI] : SubRegs)
      // Check that all registers in MinRC support SRI.SubReg subregister.
      assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
  }
#endif
  // There might be zero RShift - in this case we just trying to find smaller
  // register.
  return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
}

const TargetRegisterClass *
GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
                                        SubRegMap &SubRegs) const {
  // Look for a used subreg that exactly spans [Offset, End) over all used
  // subregs - the "covering" subreg. It is reset whenever either bound grows.
  unsigned CoverSubreg = AMDGPU::NoSubRegister;
  unsigned Offset = std::numeric_limits<unsigned>::max();
  unsigned End = 0;
  for (auto [SubReg, SRI] : SubRegs) {
    unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
    unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
    if (SubRegOffset < Offset) {
      Offset = SubRegOffset;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegEnd > End) {
      End = SubRegEnd;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegOffset == Offset && SubRegEnd == End)
      CoverSubreg = SubReg;
  }
  // If covering subreg is found shift everything so the covering subreg would
  // be in the rightmost position.
  if (CoverSubreg != AMDGPU::NoSubRegister)
    return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg,
                                         SubRegs);

  // Otherwise find subreg with maximum required alignment and shift it and all
  // other subregs to the rightmost possible position with respect to the
  // alignment.
  unsigned MaxAlign = 0;
  for (auto [SubReg, SRI] : SubRegs)
    MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));

  unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
  for (auto [SubReg, SRI] : SubRegs) {
    if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
      continue;
    FirstMaxAlignedSubRegOffset =
        std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
    // Can't do better than the overall minimal offset - stop searching.
    if (FirstMaxAlignedSubRegOffset == Offset)
      break;
  }

  unsigned NewOffsetOfMaxAlignedSubReg =
      alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);

  if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
    llvm_unreachable("misaligned subreg");

  unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
  return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs);
}

// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
                                                   Register NewReg,
                                                   SubRegMap &SubRegs) const {
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  for (auto &SR : OldLI.subranges()) {
    // Match this subrange to a used subreg by comparing lane masks.
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //    L000000000000C000 [160r,1392r:0) 0@160r
      //    L0000000000003000 [160r,1392r:0) 0@160r
      //    L0000000000000C00 [160r,1392r:0) 0@160r
      //    L0000000000000300 [160r,1392r:0) 0@160r
      //    L0000000000000003 [160r,1104r:0) 0@160r
      //    L000000000000000C [160r,1104r:0) 0@160r
      //    L0000000000000030 [160r,1104r:0) 0@160r
      //    L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //    sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //    sub0_sub1_sub2_sub3, L00000000000000FF
      //    sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second.SubReg)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    SubRegs.erase(I);
  }
  // No covering subreg was seen among the subranges: fall back to copying the
  // old main range into the new interval as-is.
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  NewLI.verify(MRI);
  LIS->removeInterval(OldReg);
}

const TargetRegisterClass *
GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
  MachineInstr *MI = MO.getParent();
  return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI,
                          *MI->getParent()->getParent());
}

bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
  // Skip registers with no non-debug uses/defs.
  auto Range = MRI->reg_nodbg_operands(Reg);
  if (Range.begin() == Range.end())
    return false;

  for (MachineOperand &MO : Range) {
    if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit.
      return false;
  }

  auto *RC = MRI->getRegClass(Reg);
  LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
                    << ':' << TRI->getRegClassName(RC) << '\n');

  // Collect used subregs and constrained reg classes inferred from instruction
  // operands. When the same subreg appears in several operands the required
  // class is the common subclass of all per-operand classes.
  SubRegMap SubRegs;
  for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
    assert(MO.getSubReg() != AMDGPU::NoSubRegister);
    auto *OpDescRC = getOperandRegClass(MO);
    const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC);
    if (!Inserted && OpDescRC) {
      SubRegInfo &SRI = I->second;
      SRI.RC = SRI.RC ? TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC;
      if (!SRI.RC) {
        LLVM_DEBUG(dbgs() << "  Couldn't find common target regclass\n");
        return false;
      }
    }
  }

  auto *NewRC = getMinSizeReg(RC, SubRegs);
  if (!NewRC) {
    LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
    return false;
  }

  Register NewReg = MRI->createVirtualRegister(NewRC);
  LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
                    << TRI->getRegClassName(RC) << " -> "
                    << printReg(NewReg, TRI) << ':'
                    << TRI->getRegClassName(NewRC) << '\n');

  // Rewrite every operand (including debug ones) to the new register and
  // shifted subreg index. early_inc_range: setReg modifies the use list
  // being iterated.
  for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
    MO.setReg(NewReg);
    // Debug info can refer to the whole reg, just leave it as it is for now.
    // TODO: create some DI shift expression?
    if (MO.isDebug() && MO.getSubReg() == 0)
      continue;
    unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
    MO.setSubReg(SubReg);
    // A def of the covering subreg now defines the whole register, so the
    // undef flag (meaning "other lanes undefined") no longer applies.
    if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
      MO.setIsUndef(false);
  }

  if (LIS)
    updateLiveIntervals(Reg, NewReg, SubRegs);

  return true;
}

bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
  TII = MF.getSubtarget().getInstrInfo();
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  bool Changed = false;
  // Visit every virtual register; rewriteReg skips unsuitable ones.
  for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
    Changed |= rewriteReg(Register::index2VirtReg(I));
  }
  return Changed;
}

char GCNRewritePartialRegUses::ID;

char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;

INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)