//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
  const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping.DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
  }
  if (Mapping.SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
  }

  lower(MI, Mapping, WaterfallSgprs);
}

void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are of same size, concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of same size, need to unmerge them to smaller pieces
    // of MergeTy type, then merge pieces to Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

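// Illustrative note (not from the original source): a divergent i1 extend with
// a vcc (lane mask) source has no direct VALU encoding, so it becomes a
// per-lane select. For example, a hypothetical
//   %d:vgpr(s32) = G_SEXT %c:vcc(s1)
// is rewritten as
//   %d:vgpr(s32) = G_SELECT %c:vcc(s1), -1, 0
// with both constants materialized in vgprs; G_ZEXT uses 1 instead of -1.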
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      llvm_unreachable("Opcode not supported");
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    llvm_unreachable("Type not supported");
  }

  MI.eraseFromParent();
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

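// Illustrative note (assumption, not from the original source): when Width is
// not a compile-time constant, the 64-bit bitfield extract below is expanded
// arithmetically, e.g. for LSBit = 8 and Width = 16:
//   Tmp = Src >> 8          ; shift the field down to bit 0 (ashr if signed)
//   Tmp = Tmp << (64 - 16)  ; drop the bits above the field
//   Dst = Tmp >> (64 - 16)  ; shift back, replicating the sign/zero bits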
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}

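// Illustrative note (assumption, not from the original source): the scalar
// BFE instructions take the field descriptor packed into a single S32, field
// offset in the low bits and width shifted up by 16. For example, extracting
// 8 bits starting at bit 4 packs Src1 = (8 << 16) | (4 & 0x3f) = 0x00080004,
// which is exactly what the AND/SHL/OR sequence below computes.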
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bit field extract there are 4 available instructions, but
  // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
  // field offset in low and size in high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select machine instruction, because of reg class constraining, insert
  // copies from reg class to reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    return;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We are making select here. S1 cond was already 'any-extended to S32' +
    // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      llvm_unreachable("Unsupported Opcode in Ext32To64");
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take lowest bit from each lane and put it in lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
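  // Illustrative note (assumption, not from the original source): SplitLoad
  // breaks a wide load into parts the register banks can handle, e.g. a
  // <8 x s32> (256-bit) load becomes two <4 x s32> loads at byte offsets 0
  // and 16, and an s96 load becomes an s64 plus an s32 load that are merged
  // back into the original destination.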
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
        llvm_unreachable("SplitLoad type not supported for MI");
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("SplitLoad type not supported for MI");
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("WidenLoad type not supported for MI");
    }
    break;
  }
  }

  // TODO: executeInWaterfallLoop(... WaterfallSgprs)
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}

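// Illustrative note (assumption, not from the original source): the B-type
// IDs below (SgprB64, VgprB128, ...) only constrain the total size, so one
// rule can cover every type of that width. SgprB64, for instance, accepts
// s64, <2 x s32>, <4 x s16> and any 64-bit pointer, as checked below.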
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS32:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}

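// Illustrative note (assumption, not from the original source): applyMappingDst
// rewrites def operands to match their mapping ID. For an ID like Sgpr32Trunc
// on an s16 def, the instruction gets a fresh sgpr s32 def and the original
// s16 value is recreated by a G_TRUNC inserted after the instruction.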
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS32:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of the time meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // ALL divergent i1 phis should be already lowered and inst-selected into PHI
  // with sgpr reg class and S1 LLT.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask(vcc) phis");
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return;
  }

  LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
  llvm_unreachable("type not supported");
}

[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}