1 //===-- AMDGPURegBankLegalize.cpp -----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// Lower G_ instructions that can't be inst-selected with register bank 10 /// assignment from AMDGPURegBankSelect based on machine uniformity info. 11 /// Given types on all operands, some register bank assignments require lowering 12 /// while others do not. 13 /// Note: cases where all register bank assignments would require lowering are 14 /// lowered in legalizer. 15 /// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not. 16 /// Eliminate sgpr S1 by lowering to sgpr S32. 17 // 18 //===----------------------------------------------------------------------===// 19 20 #include "AMDGPU.h" 21 #include "AMDGPUGlobalISelUtils.h" 22 #include "AMDGPURegBankLegalizeHelper.h" 23 #include "GCNSubtarget.h" 24 #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 25 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" 26 #include "llvm/CodeGen/MachineFunctionPass.h" 27 #include "llvm/CodeGen/MachineUniformityAnalysis.h" 28 #include "llvm/CodeGen/TargetPassConfig.h" 29 #include "llvm/InitializePasses.h" 30 31 #define DEBUG_TYPE "amdgpu-regbanklegalize" 32 33 using namespace llvm; 34 using namespace AMDGPU; 35 36 namespace { 37 38 class AMDGPURegBankLegalize : public MachineFunctionPass { 39 public: 40 static char ID; 41 42 public: 43 AMDGPURegBankLegalize() : MachineFunctionPass(ID) {} 44 45 bool runOnMachineFunction(MachineFunction &MF) override; 46 47 StringRef getPassName() const override { 48 return "AMDGPU Register Bank Legalize"; 49 } 50 51 void getAnalysisUsage(AnalysisUsage &AU) const override { 52 AU.addRequired<TargetPassConfig>(); 53 AU.addRequired<GISelCSEAnalysisWrapperPass>(); 54 AU.addRequired<MachineUniformityAnalysisPass>(); 55 MachineFunctionPass::getAnalysisUsage(AU); 56 } 57 58 // If there were no phis and we do waterfall expansion machine verifier would 59 // fail. 60 MachineFunctionProperties getClearedProperties() const override { 61 return MachineFunctionProperties().setNoPHIs(); 62 } 63 }; 64 65 } // End anonymous namespace. 66 67 INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE, 68 "AMDGPU Register Bank Legalize", false, false) 69 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 70 INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) 71 INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) 72 INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE, 73 "AMDGPU Register Bank Legalize", false, false) 74 75 char AMDGPURegBankLegalize::ID = 0; 76 77 char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID; 78 79 FunctionPass *llvm::createAMDGPURegBankLegalizePass() { 80 return new AMDGPURegBankLegalize(); 81 } 82 83 const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, 84 MachineRegisterInfo &MRI) { 85 static std::mutex GlobalMutex; 86 static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>> 87 CacheForRuleSet; 88 std::lock_guard<std::mutex> Lock(GlobalMutex); 89 auto [It, Inserted] = CacheForRuleSet.try_emplace(ST.getGeneration()); 90 if (Inserted) 91 It->second = std::make_unique<RegBankLegalizeRules>(ST, MRI); 92 else 93 It->second->refreshRefs(ST, MRI); 94 return *It->second; 95 } 96 97 class AMDGPURegBankLegalizeCombiner { 98 MachineIRBuilder &B; 99 MachineRegisterInfo &MRI; 100 const SIRegisterInfo &TRI; 101 const RegisterBank *SgprRB; 102 const RegisterBank *VgprRB; 103 const RegisterBank *VccRB; 104 105 static constexpr LLT S1 = LLT::scalar(1); 106 static constexpr LLT S16 = LLT::scalar(16); 107 static constexpr LLT S32 = LLT::scalar(32); 108 static constexpr LLT S64 = LLT::scalar(64); 109 110 public: 111 AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, 112 const RegisterBankInfo &RBI) 113 : B(B), MRI(*B.getMRI()), TRI(TRI), 114 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), 115 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), 116 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; 117 118 bool isLaneMask(Register Reg) { 119 const RegisterBank *RB = MRI.getRegBankOrNull(Reg); 120 if (RB && RB->getID() == AMDGPU::VCCRegBankID) 121 return true; 122 123 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); 124 return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); 125 } 126 127 void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { 128 MI.eraseFromParent(); 129 if (Optional0 && isTriviallyDead(*Optional0, MRI)) 130 Optional0->eraseFromParent(); 131 } 132 133 std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { 134 MachineInstr *MatchMI = MRI.getVRegDef(Src); 135 if (MatchMI->getOpcode() != Opcode) 136 return {nullptr, Register()}; 137 return {MatchMI, MatchMI->getOperand(1).getReg()}; 138 } 139 140 void tryCombineCopy(MachineInstr &MI) { 141 Register Dst = MI.getOperand(0).getReg(); 142 Register Src = MI.getOperand(1).getReg(); 143 // Skip copies of physical registers. 144 if (!Dst.isVirtual() || !Src.isVirtual()) 145 return; 146 147 // This is a cross bank copy, sgpr S1 to lane mask. 148 // 149 // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) 150 // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) 151 // -> 152 // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) 153 if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { 154 auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); 155 assert(Trunc && MRI.getType(TruncS32Src) == S32 && 156 "sgpr S1 must be result of G_TRUNC of sgpr S32"); 157 158 B.setInstr(MI); 159 // Ensure that truncated bits in BoolSrc are 0. 160 auto One = B.buildConstant({SgprRB, S32}, 1); 161 auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); 162 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); 163 cleanUpAfterCombine(MI, Trunc); 164 return; 165 } 166 167 // Src = G_AMDGPU_READANYLANE RALSrc 168 // Dst = COPY Src 169 // -> 170 // Dst = RALSrc 171 if (MRI.getRegBankOrNull(Dst) == VgprRB && 172 MRI.getRegBankOrNull(Src) == SgprRB) { 173 auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); 174 if (!RAL) 175 return; 176 177 assert(MRI.getRegBank(RALSrc) == VgprRB); 178 MRI.replaceRegWith(Dst, RALSrc); 179 cleanUpAfterCombine(MI, RAL); 180 return; 181 } 182 } 183 184 void tryCombineS1AnyExt(MachineInstr &MI) { 185 // %Src:sgpr(S1) = G_TRUNC %TruncSrc 186 // %Dst = G_ANYEXT %Src:sgpr(S1) 187 // -> 188 // %Dst = G_... %TruncSrc 189 Register Dst = MI.getOperand(0).getReg(); 190 Register Src = MI.getOperand(1).getReg(); 191 if (MRI.getType(Src) != S1) 192 return; 193 194 auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); 195 if (!Trunc) 196 return; 197 198 LLT DstTy = MRI.getType(Dst); 199 LLT TruncSrcTy = MRI.getType(TruncSrc); 200 201 if (DstTy == TruncSrcTy) { 202 MRI.replaceRegWith(Dst, TruncSrc); 203 cleanUpAfterCombine(MI, Trunc); 204 return; 205 } 206 207 B.setInstr(MI); 208 209 if (DstTy == S32 && TruncSrcTy == S64) { 210 auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); 211 MRI.replaceRegWith(Dst, Unmerge.getReg(0)); 212 cleanUpAfterCombine(MI, Trunc); 213 return; 214 } 215 216 if (DstTy == S64 && TruncSrcTy == S32) { 217 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), 218 {TruncSrc, B.buildUndef({SgprRB, S32})}); 219 cleanUpAfterCombine(MI, Trunc); 220 return; 221 } 222 223 if (DstTy == S32 && TruncSrcTy == S16) { 224 B.buildAnyExt(Dst, TruncSrc); 225 cleanUpAfterCombine(MI, Trunc); 226 return; 227 } 228 229 if (DstTy == S16 && TruncSrcTy == S32) { 230 B.buildTrunc(Dst, TruncSrc); 231 cleanUpAfterCombine(MI, Trunc); 232 return; 233 } 234 235 llvm_unreachable("missing anyext + trunc combine"); 236 } 237 }; 238 239 // Search through MRI for virtual registers with sgpr register bank and S1 LLT. 240 [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { 241 const LLT S1 = LLT::scalar(1); 242 for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) { 243 Register Reg = Register::index2VirtReg(i); 244 if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1) 245 continue; 246 247 const RegisterBank *RB = MRI.getRegBankOrNull(Reg); 248 if (RB && RB->getID() == AMDGPU::SGPRRegBankID) { 249 LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: "; 250 MRI.getVRegDef(Reg)->dump();); 251 return Reg; 252 } 253 } 254 255 return {}; 256 } 257 258 bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { 259 if (MF.getProperties().hasFailedISel()) 260 return false; 261 262 // Setup the instruction builder with CSE. 263 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); 264 GISelCSEAnalysisWrapper &Wrapper = 265 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); 266 GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig()); 267 GISelObserverWrapper Observer; 268 Observer.addObserver(&CSEInfo); 269 270 CSEMIRBuilder B(MF); 271 B.setCSEInfo(&CSEInfo); 272 B.setChangeObserver(Observer); 273 274 RAIIDelegateInstaller DelegateInstaller(MF, &Observer); 275 RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); 276 277 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 278 MachineRegisterInfo &MRI = MF.getRegInfo(); 279 const RegisterBankInfo &RBI = *ST.getRegBankInfo(); 280 const MachineUniformityInfo &MUI = 281 getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo(); 282 283 // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. 284 const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); 285 286 // Logic that does legalization based on IDs assigned to Opcode. 287 RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules); 288 289 SmallVector<MachineInstr *> AllInst; 290 291 for (MachineBasicBlock &MBB : MF) { 292 for (MachineInstr &MI : MBB) { 293 AllInst.push_back(&MI); 294 } 295 } 296 297 for (MachineInstr *MI : AllInst) { 298 if (!MI->isPreISelOpcode()) 299 continue; 300 301 unsigned Opc = MI->getOpcode(); 302 // Insert point for use operands needs some calculation. 303 if (Opc == AMDGPU::G_PHI) { 304 RBLHelper.applyMappingPHI(*MI); 305 continue; 306 } 307 308 // Opcodes that support pretty much all combinations of reg banks and LLTs 309 // (except S1). There is no point in writing rules for them. 310 if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || 311 Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { 312 RBLHelper.applyMappingTrivial(*MI); 313 continue; 314 } 315 316 // Opcodes that also support S1. 317 if (Opc == G_FREEZE && 318 MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { 319 RBLHelper.applyMappingTrivial(*MI); 320 continue; 321 } 322 323 if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || 324 Opc == AMDGPU::G_IMPLICIT_DEF)) { 325 Register Dst = MI->getOperand(0).getReg(); 326 // Non S1 types are trivially accepted. 327 if (MRI.getType(Dst) != LLT::scalar(1)) { 328 assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID); 329 continue; 330 } 331 332 // S1 rules are in RegBankLegalizeRules. 333 } 334 335 RBLHelper.findRuleAndApplyMapping(*MI); 336 } 337 338 // Sgpr S1 clean up combines: 339 // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine. 340 // In RegBankLegalize 'S1 Dst' are legalized into S32 as 341 // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'. 342 // S1 Truncs and Anyexts that come from legalizer, that can have non-S32 343 // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up. 344 // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine. 345 // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc) 346 // Legalizing this use creates sgpr S1(S32) to vcc Copy. 347 348 // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1: 349 // - Vcc to vcc Copy: nothing to do here, just a regular copy. 350 // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*). 351 // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used 352 // instead. When only available instruction creates vcc result, use of 353 // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC. 354 355 // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)': 356 // Copy from divergent to uniform register indicates an error in either: 357 // - Uniformity analysis: Uniform instruction has divergent input. If one of 358 // the inputs is divergent, instruction should be divergent! 359 // - RegBankLegalizer not executing in waterfall loop (missing implementation) 360 361 AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI); 362 363 for (MachineBasicBlock &MBB : MF) { 364 for (MachineInstr &MI : make_early_inc_range(MBB)) { 365 if (MI.getOpcode() == AMDGPU::COPY) { 366 Combiner.tryCombineCopy(MI); 367 continue; 368 } 369 if (MI.getOpcode() == AMDGPU::G_ANYEXT) { 370 Combiner.tryCombineS1AnyExt(MI); 371 continue; 372 } 373 } 374 } 375 376 assert(!getAnySgprS1(MRI).isValid() && 377 "Registers with sgpr reg bank and S1 LLT are not legal after " 378 "AMDGPURegBankLegalize. Should lower to sgpr S32"); 379 380 return true; 381 } 382