1 //===-------------- GCNRewritePartialRegUses.cpp --------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// RenameIndependentSubregs pass leaves large partially used super registers, 10 /// for example: 11 /// undef %0.sub4:VReg_1024 = ... 12 /// %0.sub5:VReg_1024 = ... 13 /// %0.sub6:VReg_1024 = ... 14 /// %0.sub7:VReg_1024 = ... 15 /// use %0.sub4_sub5_sub6_sub7 16 /// use %0.sub6_sub7 17 /// 18 /// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and 19 /// rewrites such partially used super registers with registers of minimal size: 20 /// undef %0.sub0:VReg_128 = ... 21 /// %0.sub1:VReg_128 = ... 22 /// %0.sub2:VReg_128 = ... 23 /// %0.sub3:VReg_128 = ... 24 /// use %0.sub0_sub1_sub2_sub3 25 /// use %0.sub2_sub3 26 /// 27 /// This allows to avoid subreg lanemasks tracking during register pressure 28 /// calculation and creates more possibilities for the code unaware of lanemasks 29 //===----------------------------------------------------------------------===// 30 31 #include "AMDGPU.h" 32 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 33 #include "SIRegisterInfo.h" 34 #include "llvm/CodeGen/LiveInterval.h" 35 #include "llvm/CodeGen/LiveIntervals.h" 36 #include "llvm/CodeGen/MachineFunctionPass.h" 37 #include "llvm/CodeGen/MachineInstrBuilder.h" 38 #include "llvm/CodeGen/MachineRegisterInfo.h" 39 #include "llvm/CodeGen/TargetInstrInfo.h" 40 #include "llvm/InitializePasses.h" 41 #include "llvm/Pass.h" 42 43 using namespace llvm; 44 45 #define DEBUG_TYPE "rewrite-partial-reg-uses" 46 47 namespace { 48 49 class GCNRewritePartialRegUses : public MachineFunctionPass { 50 public: 51 static char ID; 52 GCNRewritePartialRegUses() : MachineFunctionPass(ID) {} 53 54 StringRef getPassName() const override { 55 return "Rewrite Partial Register Uses"; 56 } 57 58 void getAnalysisUsage(AnalysisUsage &AU) const override { 59 AU.setPreservesCFG(); 60 AU.addPreserved<LiveIntervals>(); 61 AU.addPreserved<SlotIndexes>(); 62 MachineFunctionPass::getAnalysisUsage(AU); 63 } 64 65 bool runOnMachineFunction(MachineFunction &MF) override; 66 67 private: 68 MachineRegisterInfo *MRI; 69 const SIRegisterInfo *TRI; 70 const TargetInstrInfo *TII; 71 LiveIntervals *LIS; 72 73 /// Rewrite partially used register Reg by shifting all its subregisters to 74 /// the right and replacing the original register with a register of minimal 75 /// size. Return true if the change has been made. 76 bool rewriteReg(Register Reg) const; 77 78 /// Value type for SubRegMap below. 79 struct SubRegInfo { 80 /// Register class required to hold the value stored in the SubReg. 81 const TargetRegisterClass *RC; 82 83 /// Index for the right-shifted subregister. If 0 this is the "covering" 84 /// subreg i.e. subreg that covers all others. Covering subreg becomes the 85 /// whole register after the replacement. 86 unsigned SubReg = AMDGPU::NoSubRegister; 87 SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {} 88 }; 89 90 /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container. 91 typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap; 92 93 /// Given register class RC and the set of used subregs as keys in the SubRegs 94 /// map return new register class and indexes of right-shifted subregs as 95 /// values in SubRegs map such that the resulting regclass would contain 96 /// registers of minimal size. 97 const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC, 98 SubRegMap &SubRegs) const; 99 100 /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to 101 /// find new regclass such that: 102 /// 1. It has subregs obtained by shifting each OldSubReg by RShift number 103 /// of bits to the right. Every "shifted" subreg should have the same 104 /// SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers" 105 /// all other subregs in pairs. Basically such subreg becomes a whole 106 /// register. 107 /// 2. Resulting register class contains registers of minimal size but not 108 /// less than RegNumBits. 109 /// 110 /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out 111 /// parameter: 112 /// OldSubReg - input parameter, 113 /// SubRegRC - input parameter (cannot be null), 114 /// NewSubReg - output, contains shifted subregs on return. 115 const TargetRegisterClass * 116 getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift, 117 unsigned RegNumBits, unsigned CoverSubregIdx, 118 SubRegMap &SubRegs) const; 119 120 /// Update live intervals after rewriting OldReg to NewReg with SubRegs map 121 /// describing OldSubReg -> NewSubReg mapping. 122 void updateLiveIntervals(Register OldReg, Register NewReg, 123 SubRegMap &SubRegs) const; 124 125 /// Helper methods. 126 127 /// Return reg class expected by a MO's parent instruction for a given MO. 128 const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const; 129 130 /// Find right-shifted by RShift amount version of the SubReg if it exists, 131 /// return 0 otherwise. 132 unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const; 133 134 /// Find subreg index with a given Offset and Size, return 0 if there is no 135 /// such subregister index. The result is cached in SubRegs data-member. 136 unsigned getSubReg(unsigned Offset, unsigned Size) const; 137 138 /// Cache for getSubReg method: {Offset, Size} -> SubReg index. 139 mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs; 140 141 /// Return bit mask that contains all register classes that are projected into 142 /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member. 143 const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC, 144 unsigned SubRegIdx) const; 145 146 /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask. 147 mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>, 148 const uint32_t *> 149 SuperRegMasks; 150 151 /// Return bitmask containing all allocatable register classes with registers 152 /// aligned at AlignNumBits. The result is cached in 153 /// AllocatableAndAlignedRegClassMasks data-member. 154 const BitVector & 155 getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const; 156 157 /// Cache for getAllocatableAndAlignedRegClassMask method: 158 /// AlignNumBits -> Class bitmask. 159 mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks; 160 }; 161 162 } // end anonymous namespace 163 164 // TODO: move this to the tablegen and use binary search by Offset. 165 unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset, 166 unsigned Size) const { 167 const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0); 168 if (Inserted) { 169 for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { 170 if (TRI->getSubRegIdxOffset(Idx) == Offset && 171 TRI->getSubRegIdxSize(Idx) == Size) { 172 I->second = Idx; 173 break; 174 } 175 } 176 } 177 return I->second; 178 } 179 180 unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg, 181 unsigned RShift) const { 182 unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift; 183 return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg)); 184 } 185 186 const uint32_t * 187 GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC, 188 unsigned SubRegIdx) const { 189 const auto [I, Inserted] = 190 SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr); 191 if (Inserted) { 192 for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) { 193 if (RCI.getSubReg() == SubRegIdx) { 194 I->second = RCI.getMask(); 195 break; 196 } 197 } 198 } 199 return I->second; 200 } 201 202 const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( 203 unsigned AlignNumBits) const { 204 const auto [I, Inserted] = 205 AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits); 206 if (Inserted) { 207 BitVector &BV = I->second; 208 BV.resize(TRI->getNumRegClasses()); 209 for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) { 210 auto *RC = TRI->getRegClass(ClassID); 211 if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits)) 212 BV.set(ClassID); 213 } 214 } 215 return I->second; 216 } 217 218 const TargetRegisterClass * 219 GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( 220 const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits, 221 unsigned CoverSubregIdx, SubRegMap &SubRegs) const { 222 223 unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC); 224 LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign 225 << '\n'); 226 227 BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign)); 228 for (auto &[OldSubReg, SRI] : SubRegs) { 229 auto &[SubRegRC, NewSubReg] = SRI; 230 assert(SubRegRC); 231 232 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':' 233 << TRI->getRegClassName(SubRegRC) 234 << (SubRegRC->isAllocatable() ? "" : " not alloc") 235 << " -> "); 236 237 if (OldSubReg == CoverSubregIdx) { 238 // Covering subreg will become a full register, RC should be allocatable. 239 assert(SubRegRC->isAllocatable()); 240 NewSubReg = AMDGPU::NoSubRegister; 241 LLVM_DEBUG(dbgs() << "whole reg"); 242 } else { 243 NewSubReg = shiftSubReg(OldSubReg, RShift); 244 if (!NewSubReg) { 245 LLVM_DEBUG(dbgs() << "none\n"); 246 return nullptr; 247 } 248 LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg)); 249 } 250 251 const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg) 252 : SubRegRC->getSubClassMask(); 253 if (!Mask) 254 llvm_unreachable("no register class mask?"); 255 256 ClassMask.clearBitsNotInMask(Mask); 257 // Don't try to early exit because checking if ClassMask has set bits isn't 258 // that cheap and we expect it to pass in most cases. 259 LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n'); 260 } 261 262 // ClassMask is the set of all register classes such that each class is 263 // allocatable, aligned, has all shifted subregs and each subreg has required 264 // register class (see SubRegRC above). Now select first (that is largest) 265 // register class with registers of minimal but not less than RegNumBits size. 266 // We have to check register size because we may encounter classes of smaller 267 // registers like VReg_1 in some situations. 268 const TargetRegisterClass *MinRC = nullptr; 269 unsigned MinNumBits = std::numeric_limits<unsigned>::max(); 270 for (unsigned ClassID : ClassMask.set_bits()) { 271 auto *RC = TRI->getRegClass(ClassID); 272 unsigned NumBits = TRI->getRegSizeInBits(*RC); 273 if (NumBits < MinNumBits && NumBits >= RegNumBits) { 274 MinNumBits = NumBits; 275 MinRC = RC; 276 } 277 if (MinNumBits == RegNumBits) 278 break; 279 } 280 #ifndef NDEBUG 281 if (MinRC) { 282 assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign)); 283 for (auto [SubReg, SRI] : SubRegs) 284 // Check that all registers in MinRC support SRI.SubReg subregister. 285 assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg)); 286 } 287 #endif 288 // There might be zero RShift - in this case we just trying to find smaller 289 // register. 290 return (MinRC != RC || RShift != 0) ? MinRC : nullptr; 291 } 292 293 const TargetRegisterClass * 294 GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC, 295 SubRegMap &SubRegs) const { 296 unsigned CoverSubreg = AMDGPU::NoSubRegister; 297 unsigned Offset = std::numeric_limits<unsigned>::max(); 298 unsigned End = 0; 299 for (auto [SubReg, SRI] : SubRegs) { 300 unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg); 301 unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg); 302 if (SubRegOffset < Offset) { 303 Offset = SubRegOffset; 304 CoverSubreg = AMDGPU::NoSubRegister; 305 } 306 if (SubRegEnd > End) { 307 End = SubRegEnd; 308 CoverSubreg = AMDGPU::NoSubRegister; 309 } 310 if (SubRegOffset == Offset && SubRegEnd == End) 311 CoverSubreg = SubReg; 312 } 313 // If covering subreg is found shift everything so the covering subreg would 314 // be in the rightmost position. 315 if (CoverSubreg != AMDGPU::NoSubRegister) 316 return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg, 317 SubRegs); 318 319 // Otherwise find subreg with maximum required alignment and shift it and all 320 // other subregs to the rightmost possible position with respect to the 321 // alignment. 322 unsigned MaxAlign = 0; 323 for (auto [SubReg, SRI] : SubRegs) 324 MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg)); 325 326 unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max(); 327 for (auto [SubReg, SRI] : SubRegs) { 328 if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign) 329 continue; 330 FirstMaxAlignedSubRegOffset = 331 std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg)); 332 if (FirstMaxAlignedSubRegOffset == Offset) 333 break; 334 } 335 336 unsigned NewOffsetOfMaxAlignedSubReg = 337 alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign); 338 339 if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset) 340 llvm_unreachable("misaligned subreg"); 341 342 unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg; 343 return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs); 344 } 345 346 // Only the subrange's lanemasks of the original interval need to be modified. 347 // Subrange for a covering subreg becomes the main range. 348 void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg, 349 Register NewReg, 350 SubRegMap &SubRegs) const { 351 if (!LIS->hasInterval(OldReg)) 352 return; 353 354 auto &OldLI = LIS->getInterval(OldReg); 355 auto &NewLI = LIS->createEmptyInterval(NewReg); 356 357 auto &Allocator = LIS->getVNInfoAllocator(); 358 NewLI.setWeight(OldLI.weight()); 359 360 for (auto &SR : OldLI.subranges()) { 361 auto I = find_if(SubRegs, [&](auto &P) { 362 return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first); 363 }); 364 365 if (I == SubRegs.end()) { 366 // There might be a situation when subranges don't exactly match used 367 // subregs, for example: 368 // %120 [160r,1392r:0) 0@160r 369 // L000000000000C000 [160r,1392r:0) 0@160r 370 // L0000000000003000 [160r,1392r:0) 0@160r 371 // L0000000000000C00 [160r,1392r:0) 0@160r 372 // L0000000000000300 [160r,1392r:0) 0@160r 373 // L0000000000000003 [160r,1104r:0) 0@160r 374 // L000000000000000C [160r,1104r:0) 0@160r 375 // L0000000000000030 [160r,1104r:0) 0@160r 376 // L00000000000000C0 [160r,1104r:0) 0@160r 377 // but used subregs are: 378 // sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF 379 // sub0_sub1_sub2_sub3, L00000000000000FF 380 // sub4_sub5_sub6_sub7, L000000000000FF00 381 // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7 382 // have several subranges with the same lifetime. For such cases just 383 // recreate the interval. 384 LIS->removeInterval(OldReg); 385 LIS->removeInterval(NewReg); 386 LIS->createAndComputeVirtRegInterval(NewReg); 387 return; 388 } 389 390 if (unsigned NewSubReg = I->second.SubReg) 391 NewLI.createSubRangeFrom(Allocator, 392 TRI->getSubRegIndexLaneMask(NewSubReg), SR); 393 else // This is the covering subreg (0 index) - set it as main range. 394 NewLI.assign(SR, Allocator); 395 396 SubRegs.erase(I); 397 } 398 if (NewLI.empty()) 399 NewLI.assign(OldLI, Allocator); 400 NewLI.verify(MRI); 401 LIS->removeInterval(OldReg); 402 } 403 404 const TargetRegisterClass * 405 GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const { 406 MachineInstr *MI = MO.getParent(); 407 return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI, 408 *MI->getParent()->getParent()); 409 } 410 411 bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { 412 auto Range = MRI->reg_nodbg_operands(Reg); 413 if (Range.empty() || any_of(Range, [](MachineOperand &MO) { 414 return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1] 415 })) 416 return false; 417 418 auto *RC = MRI->getRegClass(Reg); 419 LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI) 420 << ':' << TRI->getRegClassName(RC) << '\n'); 421 422 // Collect used subregs and their reg classes infered from instruction 423 // operands. 424 SubRegMap SubRegs; 425 for (MachineOperand &MO : Range) { 426 const unsigned SubReg = MO.getSubReg(); 427 assert(SubReg != AMDGPU::NoSubRegister); // Due to [1]. 428 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(SubReg) << ':'); 429 430 const auto [I, Inserted] = SubRegs.try_emplace(SubReg); 431 const TargetRegisterClass *&SubRegRC = I->second.RC; 432 433 if (Inserted) 434 SubRegRC = TRI->getSubRegisterClass(RC, SubReg); 435 436 if (SubRegRC) { 437 if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) { 438 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & " 439 << TRI->getRegClassName(OpDescRC) << " = "); 440 SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC); 441 } 442 } 443 444 if (!SubRegRC) { 445 LLVM_DEBUG(dbgs() << "couldn't find target regclass\n"); 446 return false; 447 } 448 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n'); 449 } 450 451 auto *NewRC = getMinSizeReg(RC, SubRegs); 452 if (!NewRC) { 453 LLVM_DEBUG(dbgs() << " No improvement achieved\n"); 454 return false; 455 } 456 457 Register NewReg = MRI->createVirtualRegister(NewRC); 458 LLVM_DEBUG(dbgs() << " Success " << printReg(Reg, TRI) << ':' 459 << TRI->getRegClassName(RC) << " -> " 460 << printReg(NewReg, TRI) << ':' 461 << TRI->getRegClassName(NewRC) << '\n'); 462 463 for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) { 464 MO.setReg(NewReg); 465 // Debug info can refer to the whole reg, just leave it as it is for now. 466 // TODO: create some DI shift expression? 467 if (MO.isDebug() && MO.getSubReg() == 0) 468 continue; 469 unsigned SubReg = SubRegs[MO.getSubReg()].SubReg; 470 MO.setSubReg(SubReg); 471 if (SubReg == AMDGPU::NoSubRegister && MO.isDef()) 472 MO.setIsUndef(false); 473 } 474 475 if (LIS) 476 updateLiveIntervals(Reg, NewReg, SubRegs); 477 478 return true; 479 } 480 481 bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) { 482 MRI = &MF.getRegInfo(); 483 TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo()); 484 TII = MF.getSubtarget().getInstrInfo(); 485 LIS = getAnalysisIfAvailable<LiveIntervals>(); 486 bool Changed = false; 487 for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) { 488 Changed |= rewriteReg(Register::index2VirtReg(I)); 489 } 490 return Changed; 491 } 492 493 char GCNRewritePartialRegUses::ID; 494 495 char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID; 496 497 INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE, 498 "Rewrite Partial Register Uses", false, false) 499 INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE, 500 "Rewrite Partial Register Uses", false, false) 501