//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
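/// As a rough illustration (schematic gMIR, not copied from a real test), a
/// divergent boolean AND keeps its s1 value in the VCC bank, while the uniform
/// form of the same operation is widened to 32-bit SGPR values:
///
///   %c:vcc(s1)   = G_AND %a:vcc(s1), %b:vcc(s1)      ; VALU boolean
///   %c:sgpr(s32) = G_AND %a:sgpr(s32), %b:sgpr(s32)  ; SALU boolean, widened
///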
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
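// For example (illustrative only): if LegalizerHelper narrows
//   %d:_(s64) = G_AND %a:_(s64), %b:_(s64)
// into two 32-bit G_ANDs plus G_UNMERGE/G_MERGE artifacts, the newly created
// virtual registers have no bank assigned yet. This observer records each
// created instruction and, when it is destroyed, assigns the requested bank
// (with the special handling below for boolean artifacts).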
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
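// As a rough illustration of the effect of the costs above: a hypothetical
//   %1:sgpr(s32) = COPY %0:vgpr(s32)
// is treated as effectively infinitely expensive, so RegBankSelect will not
// introduce VGPR-to-SGPR copies on its own. Values either stay in VGPRs, or
// operands that truly require an SGPR are handled with the readfirstlane /
// waterfall lowering later in this file.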
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
318 unsigned MappingID = 2; 319 for (const auto &Entry : Table) { 320 for (unsigned I = 0; I < NumOps; ++I) { 321 int OpIdx = RegSrcOpIdx[I]; 322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); 323 } 324 325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, 326 getOperandsMapping(Operands), 327 Operands.size())); 328 } 329 330 return AltMappings; 331 } 332 333 RegisterBankInfo::InstructionMappings 334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( 335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 336 switch (MI.getIntrinsicID()) { 337 case Intrinsic::amdgcn_readlane: { 338 static const OpRegBankEntry<3> Table[2] = { 339 // Perfectly legal. 340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 341 342 // Need a readfirstlane for the index. 343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 344 }; 345 346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 348 } 349 case Intrinsic::amdgcn_writelane: { 350 static const OpRegBankEntry<4> Table[4] = { 351 // Perfectly legal. 352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 353 354 // Need readfirstlane of first op 355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 356 357 // Need readfirstlane of second op 358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 359 360 // Need readfirstlane of both ops 361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } 362 }; 363 364 // rsrc, voffset, offset 365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; 366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 367 } 368 default: 369 return RegisterBankInfo::getInstrAlternativeMappings(MI); 370 } 371 } 372 373 RegisterBankInfo::InstructionMappings 374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( 375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 376 377 switch (MI.getIntrinsicID()) { 378 case Intrinsic::amdgcn_s_buffer_load: { 379 static const OpRegBankEntry<2> Table[4] = { 380 // Perfectly legal. 381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 382 383 // Only need 1 register in loop 384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, 385 386 // Have to waterfall the resource. 387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 388 389 // Have to waterfall the resource, and the offset. 390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } 391 }; 392 393 // rsrc, offset 394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; 395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 396 } 397 case Intrinsic::amdgcn_ds_ordered_add: 398 case Intrinsic::amdgcn_ds_ordered_swap: { 399 // VGPR = M0, VGPR 400 static const OpRegBankEntry<3> Table[2] = { 401 // Perfectly legal. 
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 403 404 // Need a readfirstlane for m0 405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 406 }; 407 408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 410 } 411 case Intrinsic::amdgcn_s_sendmsg: 412 case Intrinsic::amdgcn_s_sendmsghalt: { 413 // FIXME: Should have no register for immediate 414 static const OpRegBankEntry<1> Table[2] = { 415 // Perfectly legal. 416 { { AMDGPU::SGPRRegBankID }, 1 }, 417 418 // Need readlane 419 { { AMDGPU::VGPRRegBankID }, 3 } 420 }; 421 422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; 423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 424 } 425 default: 426 return RegisterBankInfo::getInstrAlternativeMappings(MI); 427 } 428 } 429 430 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) { 431 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue()); 432 return I && I->getMetadata("amdgpu.noclobber"); 433 } 434 435 // FIXME: Returns uniform if there's no source value information. This is 436 // probably wrong. 437 static bool isScalarLoadLegal(const MachineInstr &MI) { 438 if (!MI.hasOneMemOperand()) 439 return false; 440 441 const MachineMemOperand *MMO = *MI.memoperands_begin(); 442 const unsigned AS = MMO->getAddrSpace(); 443 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || 444 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 445 446 // There are no extending SMRD/SMEM loads, and they require 4-byte alignment. 447 return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) && 448 // Can't do a scalar atomic load. 449 !MMO->isAtomic() && 450 // Don't use scalar loads for volatile accesses to non-constant address 451 // spaces. 452 (IsConst || !MMO->isVolatile()) && 453 // Memory must be known constant, or not written before this load. 454 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && 455 AMDGPUInstrInfo::isUniformMMO(MMO); 456 } 457 458 RegisterBankInfo::InstructionMappings 459 AMDGPURegisterBankInfo::getInstrAlternativeMappings( 460 const MachineInstr &MI) const { 461 462 const MachineFunction &MF = *MI.getParent()->getParent(); 463 const MachineRegisterInfo &MRI = MF.getRegInfo(); 464 465 466 InstructionMappings AltMappings; 467 switch (MI.getOpcode()) { 468 case TargetOpcode::G_CONSTANT: { 469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 470 if (Size == 1) { 471 static const OpRegBankEntry<1> Table[3] = { 472 { { AMDGPU::VGPRRegBankID }, 1 }, 473 { { AMDGPU::SGPRRegBankID }, 1 }, 474 { { AMDGPU::VCCRegBankID }, 1 } 475 }; 476 477 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 478 } 479 480 LLVM_FALLTHROUGH; 481 } 482 case TargetOpcode::G_FCONSTANT: 483 case TargetOpcode::G_FRAME_INDEX: 484 case TargetOpcode::G_GLOBAL_VALUE: { 485 static const OpRegBankEntry<1> Table[2] = { 486 { { AMDGPU::VGPRRegBankID }, 1 }, 487 { { AMDGPU::SGPRRegBankID }, 1 } 488 }; 489 490 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 491 } 492 case TargetOpcode::G_AND: 493 case TargetOpcode::G_OR: 494 case TargetOpcode::G_XOR: { 495 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 496 497 if (Size == 1) { 498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. 
499 const InstructionMapping &SCCMapping = getInstructionMapping( 500 1, 1, getOperandsMapping( 501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), 504 3); // Num Operands 505 AltMappings.push_back(&SCCMapping); 506 507 const InstructionMapping &VCCMapping0 = getInstructionMapping( 508 2, 1, getOperandsMapping( 509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), 512 3); // Num Operands 513 AltMappings.push_back(&VCCMapping0); 514 return AltMappings; 515 } 516 517 if (Size != 64) 518 break; 519 520 const InstructionMapping &SSMapping = getInstructionMapping( 521 1, 1, getOperandsMapping( 522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 525 3); // Num Operands 526 AltMappings.push_back(&SSMapping); 527 528 const InstructionMapping &VVMapping = getInstructionMapping( 529 2, 2, getOperandsMapping( 530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 533 3); // Num Operands 534 AltMappings.push_back(&VVMapping); 535 break; 536 } 537 case TargetOpcode::G_LOAD: 538 case TargetOpcode::G_ZEXTLOAD: 539 case TargetOpcode::G_SEXTLOAD: { 540 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 541 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 542 unsigned PtrSize = PtrTy.getSizeInBits(); 543 unsigned AS = PtrTy.getAddressSpace(); 544 545 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && 546 AS != AMDGPUAS::PRIVATE_ADDRESS) && 547 isScalarLoadLegal(MI)) { 548 const InstructionMapping &SSMapping = getInstructionMapping( 549 1, 1, getOperandsMapping( 550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 552 2); // Num Operands 553 AltMappings.push_back(&SSMapping); 554 } 555 556 const InstructionMapping &VVMapping = getInstructionMapping( 557 2, 1, 558 getOperandsMapping( 559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 561 2); // Num Operands 562 AltMappings.push_back(&VVMapping); 563 564 // It may be possible to have a vgpr = load sgpr mapping here, because 565 // the mubuf instructions support this kind of load, but probably for only 566 // gfx7 and older. However, the addressing mode matching in the instruction 567 // selector should be able to do a better job of detecting and selecting 568 // these kinds of loads from the vgpr = load vgpr mapping. 
569 570 return AltMappings; 571 572 } 573 case TargetOpcode::G_SELECT: { 574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 575 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 580 4); // Num Operands 581 AltMappings.push_back(&SSMapping); 582 583 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 588 4); // Num Operands 589 AltMappings.push_back(&VVMapping); 590 591 return AltMappings; 592 } 593 case TargetOpcode::G_SMIN: 594 case TargetOpcode::G_SMAX: 595 case TargetOpcode::G_UMIN: 596 case TargetOpcode::G_UMAX: { 597 static const OpRegBankEntry<3> Table[2] = { 598 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 599 600 // Scalar requires cmp+select, and extends if 16-bit. 601 // FIXME: Should there be separate costs for 32 and 16-bit 602 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } 603 }; 604 605 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; 606 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 607 } 608 case TargetOpcode::G_UADDE: 609 case TargetOpcode::G_USUBE: 610 case TargetOpcode::G_SADDE: 611 case TargetOpcode::G_SSUBE: { 612 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 613 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 614 getOperandsMapping( 615 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 617 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 618 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 619 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), 620 5); // Num Operands 621 AltMappings.push_back(&SSMapping); 622 623 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 624 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 626 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 627 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 628 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), 629 5); // Num Operands 630 AltMappings.push_back(&VVMapping); 631 return AltMappings; 632 } 633 case AMDGPU::G_BRCOND: { 634 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 635 636 // TODO: Change type to 32 for scalar 637 const InstructionMapping &SMapping = getInstructionMapping( 638 1, 1, getOperandsMapping( 639 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), 640 2); // Num Operands 641 AltMappings.push_back(&SMapping); 642 643 const InstructionMapping &VMapping = getInstructionMapping( 644 1, 1, getOperandsMapping( 645 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), 646 2); // Num Operands 647 AltMappings.push_back(&VMapping); 648 return AltMappings; 649 } 650 case AMDGPU::G_INTRINSIC: 651 return getInstrAlternativeMappingsIntrinsic(MI, MRI); 652 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 653 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, 
                                                            MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to compare values in order to identify the
/// unique values used.
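///
/// A schematic of the emitted control flow (simplified; the exact opcodes
/// depend on the wave size, and multiple SGPR operands AND their compare
/// results together):
///
///   entry:
///     %SaveExec = s_mov_b64 exec
///   loop:
///     %Cur     = v_readfirstlane_b32 %VgprOp
///     %Cond    = v_cmp_eq_u32 %Cur, %VgprOp
///     %NewExec = s_and_saveexec_b64 %Cond
///     ... original instruction(s), using %Cur ...
///     exec = s_xor_b64 exec, %NewExec
///     s_cbranch_execnz loop
///   restore:
///     exec = s_mov_b64 %SaveExec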
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

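  // Rewrite every use operand that was flagged in SGPROperandRegs: read the
  // per-lane value with V_READFIRSTLANE_B32 (in 32-bit pieces for wider
  // types), compare it against the original VGPR value, and AND the compare
  // results into the single loop condition in CondReg.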
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read SGPR value against the value in each lane.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.

        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands( 1045 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, 1046 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { 1047 for (unsigned Op : OpIndices) { 1048 assert(MI.getOperand(Op).isUse()); 1049 Register Reg = MI.getOperand(Op).getReg(); 1050 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); 1051 if (OpBank->getID() == AMDGPU::VGPRRegBankID) 1052 SGPROperandRegs.insert(Reg); 1053 } 1054 1055 // No operands need to be replaced, so no need to loop. 1056 return !SGPROperandRegs.empty(); 1057 } 1058 1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1060 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, 1061 ArrayRef<unsigned> OpIndices) const { 1062 // Use a set to avoid extra readfirstlanes in the case where multiple operands 1063 // are the same register. 1064 SmallSet<Register, 4> SGPROperandRegs; 1065 1066 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) 1067 return false; 1068 1069 MachineBasicBlock::iterator I = MI.getIterator(); 1070 return executeInWaterfallLoop(B, make_range(I, std::next(I)), 1071 SGPROperandRegs, MRI); 1072 } 1073 1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1075 MachineInstr &MI, MachineRegisterInfo &MRI, 1076 ArrayRef<unsigned> OpIndices) const { 1077 MachineIRBuilder B(MI); 1078 return executeInWaterfallLoop(B, MI, MRI, OpIndices); 1079 } 1080 1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane. 1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( 1083 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { 1084 Register Reg = MI.getOperand(OpIdx).getReg(); 1085 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 1086 if (Bank != &AMDGPU::VGPRRegBank) 1087 return; 1088 1089 MachineIRBuilder B(MI); 1090 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1091 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) 1092 .addDef(SGPR) 1093 .addReg(Reg); 1094 1095 MRI.setType(SGPR, MRI.getType(Reg)); 1096 1097 const TargetRegisterClass *Constrained = 1098 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); 1099 (void)Constrained; 1100 assert(Constrained && "Failed to constrain readfirstlane src reg"); 1101 1102 MI.getOperand(OpIdx).setReg(SGPR); 1103 } 1104 1105 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the 1106 /// rest will be in the remainder. 
1107 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1108 unsigned TotalSize = Ty.getSizeInBits(); 1109 if (!Ty.isVector()) 1110 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1111 1112 LLT EltTy = Ty.getElementType(); 1113 unsigned EltSize = EltTy.getSizeInBits(); 1114 assert(FirstSize % EltSize == 0); 1115 1116 unsigned FirstPartNumElts = FirstSize / EltSize; 1117 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1118 1119 return {LLT::scalarOrVector(FirstPartNumElts, EltTy), 1120 LLT::scalarOrVector(RemainderElts, EltTy)}; 1121 } 1122 1123 static LLT widen96To128(LLT Ty) { 1124 if (!Ty.isVector()) 1125 return LLT::scalar(128); 1126 1127 LLT EltTy = Ty.getElementType(); 1128 assert(128 % EltTy.getSizeInBits() == 0); 1129 return LLT::vector(128 / EltTy.getSizeInBits(), EltTy); 1130 } 1131 1132 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, 1133 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1134 MachineRegisterInfo &MRI) const { 1135 Register DstReg = MI.getOperand(0).getReg(); 1136 const LLT LoadTy = MRI.getType(DstReg); 1137 unsigned LoadSize = LoadTy.getSizeInBits(); 1138 const unsigned MaxNonSmrdLoadSize = 128; 1139 1140 const RegisterBank *PtrBank = 1141 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1142 if (PtrBank == &AMDGPU::SGPRRegBank) { 1143 // If the pointer is an SGPR, we ordinarily have nothing to do. 1144 if (LoadSize != 96) 1145 return false; 1146 1147 MachineMemOperand *MMO = *MI.memoperands_begin(); 1148 Register PtrReg = MI.getOperand(1).getReg(); 1149 // 96-bit loads are only available for vector loads. We need to split this 1150 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 1151 1152 MachineIRBuilder B(MI); 1153 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); 1154 GISelObserverWrapper Observer(&O); 1155 B.setChangeObserver(Observer); 1156 1157 if (MMO->getAlign() < Align(16)) { 1158 LLT Part64, Part32; 1159 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1160 auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); 1161 auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); 1162 1163 auto Undef = B.buildUndef(LoadTy); 1164 auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); 1165 B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); 1166 } else { 1167 LLT WiderTy = widen96To128(LoadTy); 1168 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1169 B.buildExtract(MI.getOperand(0), WideLoad, 0); 1170 } 1171 1172 MI.eraseFromParent(); 1173 return true; 1174 } 1175 1176 // 128-bit loads are supported for all instruction types. 1177 if (LoadSize <= MaxNonSmrdLoadSize) 1178 return false; 1179 1180 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); 1181 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1182 1183 if (SrcRegs.empty()) 1184 SrcRegs.push_back(MI.getOperand(1).getReg()); 1185 1186 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1187 1188 // RegBankSelect only emits scalar types, so we need to reset the pointer 1189 // operand to a pointer type. 
1190 Register BasePtrReg = SrcRegs[0]; 1191 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1192 MRI.setType(BasePtrReg, PtrTy); 1193 1194 MachineIRBuilder B(MI); 1195 1196 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1197 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1198 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 1199 GISelObserverWrapper Observer(&O); 1200 B.setChangeObserver(Observer); 1201 LegalizerHelper Helper(B.getMF(), Observer, B); 1202 1203 if (LoadTy.isVector()) { 1204 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1205 return false; 1206 } else { 1207 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1208 return false; 1209 } 1210 1211 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1212 return true; 1213 } 1214 1215 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1216 MachineInstr &MI, 1217 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1218 MachineRegisterInfo &MRI) const { 1219 const MachineFunction &MF = *MI.getMF(); 1220 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1221 const auto &TFI = *ST.getFrameLowering(); 1222 1223 // Guard in case the stack growth direction ever changes with scratch 1224 // instructions. 1225 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) 1226 return false; 1227 1228 Register Dst = MI.getOperand(0).getReg(); 1229 Register AllocSize = MI.getOperand(1).getReg(); 1230 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1231 1232 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1233 1234 // TODO: Need to emit a wave reduction to get the maximum size. 1235 if (SizeBank != &AMDGPU::SGPRRegBank) 1236 return false; 1237 1238 LLT PtrTy = MRI.getType(Dst); 1239 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1240 1241 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1242 Register SPReg = Info->getStackPtrOffsetReg(); 1243 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1244 GISelObserverWrapper Observer(&ApplyBank); 1245 1246 MachineIRBuilder B(MI); 1247 B.setChangeObserver(Observer); 1248 1249 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1250 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1251 1252 auto SPCopy = B.buildCopy(PtrTy, SPReg); 1253 if (Alignment > TFI.getStackAlign()) { 1254 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); 1255 B.buildMaskLowPtrBits(Dst, PtrAdd, 1256 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1257 } else { 1258 B.buildPtrAdd(Dst, SPCopy, ScaledSize); 1259 } 1260 1261 MI.eraseFromParent(); 1262 return true; 1263 } 1264 1265 bool AMDGPURegisterBankInfo::applyMappingImage( 1266 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1267 MachineRegisterInfo &MRI, int RsrcIdx) const { 1268 const int NumDefs = MI.getNumExplicitDefs(); 1269 1270 // The reported argument index is relative to the IR intrinsic call arguments, 1271 // so we need to shift by the number of defs and the intrinsic ID. 1272 RsrcIdx += NumDefs + 1; 1273 1274 // Insert copies to VGPR arguments. 1275 applyDefaultMapping(OpdMapper); 1276 1277 // Fixup any SGPR arguments. 1278 SmallVector<unsigned, 4> SGPRIndexes; 1279 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1280 if (!MI.getOperand(I).isReg()) 1281 continue; 1282 1283 // If this intrinsic has a sampler, it immediately follows rsrc. 
1284 if (I == RsrcIdx || I == RsrcIdx + 1) 1285 SGPRIndexes.push_back(I); 1286 } 1287 1288 executeInWaterfallLoop(MI, MRI, SGPRIndexes); 1289 return true; 1290 } 1291 1292 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, 1293 Register Reg) { 1294 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 1295 if (!Def) 1296 return Reg; 1297 1298 // TODO: Guard against this being an implicit def 1299 return Def->getOperand(0).getReg(); 1300 } 1301 1302 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1303 // the three offsets (voffset, soffset and instoffset) 1304 static unsigned setBufferOffsets(MachineIRBuilder &B, 1305 const AMDGPURegisterBankInfo &RBI, 1306 Register CombinedOffset, Register &VOffsetReg, 1307 Register &SOffsetReg, int64_t &InstOffsetVal, 1308 Align Alignment) { 1309 const LLT S32 = LLT::scalar(32); 1310 MachineRegisterInfo *MRI = B.getMRI(); 1311 1312 if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) { 1313 uint32_t SOffset, ImmOffset; 1314 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, 1315 Alignment)) { 1316 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1317 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1318 InstOffsetVal = ImmOffset; 1319 1320 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1321 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1322 return SOffset + ImmOffset; 1323 } 1324 } 1325 1326 Register Base; 1327 unsigned Offset; 1328 MachineInstr *Unused; 1329 1330 std::tie(Base, Offset, Unused) 1331 = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1332 1333 uint32_t SOffset, ImmOffset; 1334 if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, 1335 &RBI.Subtarget, Alignment)) { 1336 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1337 VOffsetReg = Base; 1338 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1339 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1340 InstOffsetVal = ImmOffset; 1341 return 0; // XXX - Why is this 0? 1342 } 1343 1344 // If we have SGPR base, we can use it for soffset. 1345 if (SOffset == 0) { 1346 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1347 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1348 SOffsetReg = Base; 1349 InstOffsetVal = ImmOffset; 1350 return 0; // XXX - Why is this 0? 1351 } 1352 } 1353 1354 // Handle the variable sgpr + vgpr case. 1355 if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) { 1356 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); 1357 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); 1358 1359 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); 1360 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); 1361 1362 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1363 VOffsetReg = Src0; 1364 SOffsetReg = Src1; 1365 return 0; 1366 } 1367 1368 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1369 VOffsetReg = Src1; 1370 SOffsetReg = Src0; 1371 return 0; 1372 } 1373 } 1374 1375 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1376 // have an SGPR offset and a VGPR resource. 
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
  const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
1444 1445 Register RSrc = MI.getOperand(1).getReg(); 1446 Register VIndex = B.buildConstant(S32, 0).getReg(0); 1447 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); 1448 1449 SmallVector<Register, 4> LoadParts(NumLoads); 1450 1451 MachineBasicBlock::iterator MII = MI.getIterator(); 1452 MachineInstrSpan Span(MII, &B.getMBB()); 1453 1454 for (int i = 0; i < NumLoads; ++i) { 1455 if (NumLoads == 1) { 1456 LoadParts[i] = Dst; 1457 } else { 1458 LoadParts[i] = MRI.createGenericVirtualRegister(Ty); 1459 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); 1460 } 1461 1462 MachineMemOperand *MMO = BaseMMO; 1463 if (i != 0) 1464 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); 1465 1466 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) 1467 .addDef(LoadParts[i]) // vdata 1468 .addUse(RSrc) // rsrc 1469 .addUse(VIndex) // vindex 1470 .addUse(VOffset) // voffset 1471 .addUse(SOffset) // soffset 1472 .addImm(ImmOffset + 16 * i) // offset(imm) 1473 .addImm(0) // cachepolicy, swizzled buffer(imm) 1474 .addImm(0) // idxen(imm) 1475 .addMemOperand(MMO); 1476 } 1477 1478 // TODO: If only the resource is a VGPR, it may be better to execute the 1479 // scalar load in the waterfall loop if the resource is expected to frequently 1480 // be dynamically uniform. 1481 if (RSrcBank != &AMDGPU::SGPRRegBank) { 1482 // Remove the original instruction to avoid potentially confusing the 1483 // waterfall loop logic. 1484 B.setInstr(*Span.begin()); 1485 MI.eraseFromParent(); 1486 1487 SmallSet<Register, 4> OpsToWaterfall; 1488 1489 OpsToWaterfall.insert(RSrc); 1490 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1491 OpsToWaterfall, MRI); 1492 } 1493 1494 if (NumLoads != 1) { 1495 if (Ty.isVector()) 1496 B.buildConcatVectors(Dst, LoadParts); 1497 else 1498 B.buildMerge(Dst, LoadParts); 1499 } 1500 1501 // We removed the instruction earlier with a waterfall loop. 1502 if (RSrcBank == &AMDGPU::SGPRRegBank) 1503 MI.eraseFromParent(); 1504 1505 return true; 1506 } 1507 1508 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( 1509 const OperandsMapper &OpdMapper, bool Signed) const { 1510 MachineInstr &MI = OpdMapper.getMI(); 1511 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1512 1513 // Insert basic copies 1514 applyDefaultMapping(OpdMapper); 1515 1516 Register DstReg = MI.getOperand(0).getReg(); 1517 LLT Ty = MRI.getType(DstReg); 1518 1519 const LLT S32 = LLT::scalar(32); 1520 1521 const RegisterBank *DstBank = 1522 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1523 if (DstBank == &AMDGPU::VGPRRegBank) { 1524 if (Ty == S32) 1525 return true; 1526 1527 // TODO: 64-bit version is scalar only, so we need to expand this. 1528 return false; 1529 } 1530 1531 Register SrcReg = MI.getOperand(2).getReg(); 1532 Register OffsetReg = MI.getOperand(3).getReg(); 1533 Register WidthReg = MI.getOperand(4).getReg(); 1534 1535 // The scalar form packs the offset and width in a single operand. 1536 1537 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1538 GISelObserverWrapper Observer(&ApplyBank); 1539 MachineIRBuilder B(MI); 1540 B.setChangeObserver(Observer); 1541 1542 // Ensure the high bits are clear to insert the offset. 1543 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1544 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1545 1546 // Zeros out the low bits, so don't bother clamping the input value. 
1547 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1548 1549 // Transformation function, pack the offset and width of a BFE into 1550 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1551 // source, bits [5:0] contain the offset and bits [22:16] the width. 1552 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1553 1554 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1555 // register class constraints. 1556 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1557 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1558 1559 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1560 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1561 llvm_unreachable("failed to constrain BFE"); 1562 1563 MI.eraseFromParent(); 1564 return true; 1565 } 1566 1567 // FIXME: Duplicated from LegalizerHelper 1568 static CmpInst::Predicate minMaxToCompare(unsigned Opc) { 1569 switch (Opc) { 1570 case TargetOpcode::G_SMIN: 1571 return CmpInst::ICMP_SLT; 1572 case TargetOpcode::G_SMAX: 1573 return CmpInst::ICMP_SGT; 1574 case TargetOpcode::G_UMIN: 1575 return CmpInst::ICMP_ULT; 1576 case TargetOpcode::G_UMAX: 1577 return CmpInst::ICMP_UGT; 1578 default: 1579 llvm_unreachable("not in integer min/max"); 1580 } 1581 } 1582 1583 static unsigned minMaxToExtend(unsigned Opc) { 1584 switch (Opc) { 1585 case TargetOpcode::G_SMIN: 1586 case TargetOpcode::G_SMAX: 1587 return TargetOpcode::G_SEXT; 1588 case TargetOpcode::G_UMIN: 1589 case TargetOpcode::G_UMAX: 1590 return TargetOpcode::G_ZEXT; 1591 default: 1592 llvm_unreachable("not in integer min/max"); 1593 } 1594 } 1595 1596 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1597 // any illegal vector extend or unmerge operations. 1598 static std::pair<Register, Register> 1599 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1600 const LLT S32 = LLT::scalar(32); 1601 auto Bitcast = B.buildBitcast(S32, Src); 1602 1603 if (ExtOpcode == TargetOpcode::G_SEXT) { 1604 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1605 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1606 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1607 } 1608 1609 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1610 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1611 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1612 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1613 } 1614 1615 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1616 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1617 } 1618 1619 static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B, 1620 CmpInst::Predicate Pred, 1621 Register Dst, Register Src0, 1622 Register Src1) { 1623 const LLT CmpType = LLT::scalar(32); 1624 auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); 1625 return B.buildSelect(Dst, Cmp, Src0, Src1); 1626 } 1627 1628 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type. 
1629 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B, 1630 MachineInstr &MI) const { 1631 Register Dst = MI.getOperand(0).getReg(); 1632 Register Src0 = MI.getOperand(1).getReg(); 1633 Register Src1 = MI.getOperand(2).getReg(); 1634 1635 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); 1636 MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1); 1637 1638 Register CmpReg = Sel->getOperand(1).getReg(); 1639 B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank); 1640 MI.eraseFromParent(); 1641 } 1642 1643 // For cases where only a single copy is inserted for matching register banks. 1644 // Replace the register in the instruction operand 1645 static bool substituteSimpleCopyRegs( 1646 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1647 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1648 if (!SrcReg.empty()) { 1649 assert(SrcReg.size() == 1); 1650 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1651 return true; 1652 } 1653 1654 return false; 1655 } 1656 1657 /// Handle register layout difference for f16 images for some subtargets. 1658 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1659 MachineRegisterInfo &MRI, 1660 Register Reg) const { 1661 if (!Subtarget.hasUnpackedD16VMem()) 1662 return Reg; 1663 1664 const LLT S16 = LLT::scalar(16); 1665 LLT StoreVT = MRI.getType(Reg); 1666 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1667 return Reg; 1668 1669 auto Unmerge = B.buildUnmerge(S16, Reg); 1670 1671 1672 SmallVector<Register, 4> WideRegs; 1673 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1674 WideRegs.push_back(Unmerge.getReg(I)); 1675 1676 const LLT S32 = LLT::scalar(32); 1677 int NumElts = StoreVT.getNumElements(); 1678 1679 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); 1680 } 1681 1682 static std::pair<Register, unsigned> 1683 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1684 int64_t Const; 1685 if (mi_match(Reg, MRI, m_ICst(Const))) 1686 return std::make_pair(Register(), Const); 1687 1688 Register Base; 1689 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1690 return std::make_pair(Base, Const); 1691 1692 // TODO: Handle G_OR used for add case 1693 return std::make_pair(Reg, 0); 1694 } 1695 1696 std::pair<Register, unsigned> 1697 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1698 Register OrigOffset) const { 1699 const unsigned MaxImm = 4095; 1700 Register BaseReg; 1701 unsigned ImmOffset; 1702 const LLT S32 = LLT::scalar(32); 1703 1704 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1705 OrigOffset); 1706 1707 unsigned C1 = 0; 1708 if (ImmOffset != 0) { 1709 // If the immediate value is too big for the immoffset field, put the value 1710 // and -4096 into the immoffset field so that the value that is copied/added 1711 // for the voffset field is a multiple of 4096, and it stands more chance 1712 // of being CSEd with the copy/add for another similar load/store. 1713 // However, do not do that rounding down to a multiple of 4096 if that is a 1714 // negative number, as it appears to be illegal to have a negative offset 1715 // in the vgpr, even if adding the immediate offset makes it positive. 
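// For example, a constant offset of 4100 is split below into an overflow of
// 4096 (materialized/added into the returned base register) and an immediate
// offset of 4.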
1716 unsigned Overflow = ImmOffset & ~MaxImm; 1717 ImmOffset -= Overflow; 1718 if ((int32_t)Overflow < 0) { 1719 Overflow += ImmOffset; 1720 ImmOffset = 0; 1721 } 1722 1723 C1 = ImmOffset; 1724 if (Overflow != 0) { 1725 if (!BaseReg) 1726 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1727 else { 1728 auto OverflowVal = B.buildConstant(S32, Overflow); 1729 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1730 } 1731 } 1732 } 1733 1734 if (!BaseReg) 1735 BaseReg = B.buildConstant(S32, 0).getReg(0); 1736 1737 return {BaseReg, C1}; 1738 } 1739 1740 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 1741 int64_t C; 1742 return mi_match(Reg, MRI, m_ICst(C)) && C == 0; 1743 } 1744 1745 static unsigned extractGLC(unsigned CachePolicy) { 1746 return CachePolicy & 1; 1747 } 1748 1749 static unsigned extractSLC(unsigned CachePolicy) { 1750 return (CachePolicy >> 1) & 1; 1751 } 1752 1753 static unsigned extractDLC(unsigned CachePolicy) { 1754 return (CachePolicy >> 2) & 1; 1755 } 1756 1757 MachineInstr * 1758 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, 1759 MachineInstr &MI) const { 1760 MachineRegisterInfo &MRI = *B.getMRI(); 1761 executeInWaterfallLoop(B, MI, MRI, {2, 4}); 1762 1763 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. 1764 1765 Register VData = MI.getOperand(1).getReg(); 1766 LLT Ty = MRI.getType(VData); 1767 1768 int EltSize = Ty.getScalarSizeInBits(); 1769 int Size = Ty.getSizeInBits(); 1770 1771 // FIXME: Broken integer truncstore. 1772 if (EltSize != 32) 1773 report_fatal_error("unhandled intrinsic store"); 1774 1775 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 1776 const int MemSize = (*MI.memoperands_begin())->getSize(); 1777 1778 1779 Register RSrc = MI.getOperand(2).getReg(); 1780 Register VOffset = MI.getOperand(3).getReg(); 1781 Register SOffset = MI.getOperand(4).getReg(); 1782 unsigned CachePolicy = MI.getOperand(5).getImm(); 1783 1784 unsigned ImmOffset; 1785 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 1786 1787 const bool Offen = !isZero(VOffset, MRI); 1788 1789 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; 1790 switch (8 * MemSize) { 1791 case 8: 1792 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 1793 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 1794 break; 1795 case 16: 1796 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 1797 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 1798 break; 1799 default: 1800 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 1801 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 1802 if (Size > 32) 1803 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 1804 break; 1805 } 1806 1807 1808 // Set the insertion point back to the instruction in case it was moved into a 1809 // loop. 1810 B.setInstr(MI); 1811 1812 MachineInstrBuilder MIB = B.buildInstr(Opc) 1813 .addUse(VData); 1814 1815 if (Offen) 1816 MIB.addUse(VOffset); 1817 1818 MIB.addUse(RSrc) 1819 .addUse(SOffset) 1820 .addImm(ImmOffset) 1821 .addImm(extractGLC(CachePolicy)) 1822 .addImm(extractSLC(CachePolicy)) 1823 .addImm(0) // tfe: FIXME: Remove from inst 1824 .addImm(extractDLC(CachePolicy)) 1825 .cloneMemRefs(MI); 1826 1827 // FIXME: We need a way to report failure from applyMappingImpl. 1828 // Insert constrain copies before inserting the loop. 
1829 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1830 report_fatal_error("failed to constrain selected store intrinsic");
1831
1832 return MIB;
1833 }
1834
1835 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1836 Register SrcReg) const {
1837 MachineRegisterInfo &MRI = *B.getMRI();
1838 LLT SrcTy = MRI.getType(SrcReg);
1839 if (SrcTy.getSizeInBits() == 32) {
1840 // Use a v_mov_b32 here to make the exec dependency explicit.
1841 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1842 .addDef(DstReg)
1843 .addUse(SrcReg);
1844 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1845 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1846 }
1847
1848 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1849 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1850
1851 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1852 .addDef(TmpReg0)
1853 .addUse(SrcReg, 0, AMDGPU::sub0);
1854 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1855 .addDef(TmpReg1)
1856 .addUse(SrcReg, 0, AMDGPU::sub1);
1857 B.buildInstr(AMDGPU::REG_SEQUENCE)
1858 .addDef(DstReg)
1859 .addUse(TmpReg0)
1860 .addImm(AMDGPU::sub0)
1861 .addUse(TmpReg1)
1862 .addImm(AMDGPU::sub1);
1863
1864 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1865 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1866 }
1867
1868 /// Utility function for pushing dynamic vector indexes with a constant offset
1869 /// into waterfall loops.
1870 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1871 MachineInstr &IdxUseInstr,
1872 unsigned OpIdx,
1873 unsigned ConstOffset) {
1874 MachineRegisterInfo &MRI = *B.getMRI();
1875 const LLT S32 = LLT::scalar(32);
1876 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1877 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1878
1879 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1880
1881 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1882 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1883 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1884 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1885 }
1886
1887 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1888 /// original 32-bit source value (to be inserted in the low part of the combined
1889 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1890 /// value.
1891 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1892 Register Hi32Reg, Register Lo32Reg,
1893 unsigned ExtOpc,
1894 const RegisterBank &RegBank,
1895 bool IsBooleanSrc = false) {
1896 if (ExtOpc == AMDGPU::G_ZEXT) {
1897 B.buildConstant(Hi32Reg, 0);
1898 } else if (ExtOpc == AMDGPU::G_SEXT) {
1899 if (IsBooleanSrc) {
1900 // If we know the original source was an s1, the high half is the same as
1901 // the low.
1902 B.buildCopy(Hi32Reg, Lo32Reg);
1903 } else {
1904 // Replicate sign bit from 32-bit extended part.
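// (An arithmetic shift right by 31 broadcasts the sign bit of the low half
// into every bit of the high half.)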
1905 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1906 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1907 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1908 } 1909 } else { 1910 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1911 B.buildUndef(Hi32Reg); 1912 } 1913 } 1914 1915 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1916 MachineInstr &MI, MachineRegisterInfo &MRI, 1917 const OperandsMapper &OpdMapper) const { 1918 1919 Register VecReg = MI.getOperand(1).getReg(); 1920 Register Idx = MI.getOperand(2).getReg(); 1921 1922 const RegisterBank &IdxBank = 1923 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1924 1925 bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; 1926 1927 LLT VecTy = MRI.getType(VecReg); 1928 unsigned EltSize = VecTy.getScalarSizeInBits(); 1929 unsigned NumElem = VecTy.getNumElements(); 1930 1931 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1932 IsDivergentIdx)) 1933 return false; 1934 1935 MachineIRBuilder B(MI); 1936 LLT S32 = LLT::scalar(32); 1937 1938 const RegisterBank &DstBank = 1939 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1940 const RegisterBank &SrcBank = 1941 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1942 1943 const RegisterBank &CCBank = 1944 (DstBank == AMDGPU::SGPRRegBank && 1945 SrcBank == AMDGPU::SGPRRegBank && 1946 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1947 : AMDGPU::VCCRegBank; 1948 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1949 1950 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1951 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1952 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1953 } 1954 1955 LLT EltTy = VecTy.getScalarType(); 1956 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1957 unsigned NumLanes = DstRegs.size(); 1958 if (!NumLanes) 1959 NumLanes = 1; 1960 else 1961 EltTy = MRI.getType(DstRegs[0]); 1962 1963 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1964 SmallVector<Register, 2> Res(NumLanes); 1965 for (unsigned L = 0; L < NumLanes; ++L) 1966 Res[L] = UnmergeToEltTy.getReg(L); 1967 1968 for (unsigned I = 1; I < NumElem; ++I) { 1969 auto IC = B.buildConstant(S32, I); 1970 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1971 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1972 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1973 1974 for (unsigned L = 0; L < NumLanes; ++L) { 1975 auto S = B.buildSelect(EltTy, Cmp, 1976 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1977 1978 for (unsigned N : { 0, 2, 3 }) 1979 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1980 1981 Res[L] = S->getOperand(0).getReg(); 1982 } 1983 } 1984 1985 for (unsigned L = 0; L < NumLanes; ++L) { 1986 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 1987 B.buildCopy(DstReg, Res[L]); 1988 MRI.setRegBank(DstReg, DstBank); 1989 } 1990 1991 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1992 MI.eraseFromParent(); 1993 1994 return true; 1995 } 1996 1997 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 1998 MachineInstr &MI, MachineRegisterInfo &MRI, 1999 const OperandsMapper &OpdMapper) const { 2000 2001 Register VecReg = MI.getOperand(1).getReg(); 2002 Register Idx = MI.getOperand(3).getReg(); 2003 2004 const RegisterBank &IdxBank = 2005 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2006 2007 bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; 2008 2009 LLT VecTy = MRI.getType(VecReg); 2010 unsigned EltSize = VecTy.getScalarSizeInBits(); 2011 unsigned NumElem = VecTy.getNumElements(); 2012 2013 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2014 IsDivergentIdx)) 2015 return false; 2016 2017 MachineIRBuilder B(MI); 2018 LLT S32 = LLT::scalar(32); 2019 2020 const RegisterBank &DstBank = 2021 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2022 const RegisterBank &SrcBank = 2023 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2024 const RegisterBank &InsBank = 2025 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2026 2027 const RegisterBank &CCBank = 2028 (DstBank == AMDGPU::SGPRRegBank && 2029 SrcBank == AMDGPU::SGPRRegBank && 2030 InsBank == AMDGPU::SGPRRegBank && 2031 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2032 : AMDGPU::VCCRegBank; 2033 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 2034 2035 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2036 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2037 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2038 } 2039 2040 LLT EltTy = VecTy.getScalarType(); 2041 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2042 unsigned NumLanes = InsRegs.size(); 2043 if (!NumLanes) { 2044 NumLanes = 1; 2045 InsRegs.push_back(MI.getOperand(2).getReg()); 2046 } else { 2047 EltTy = MRI.getType(InsRegs[0]); 2048 } 2049 2050 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2051 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2052 2053 for (unsigned I = 0; I < NumElem; ++I) { 2054 auto IC = B.buildConstant(S32, I); 2055 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2056 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2057 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2058 2059 for (unsigned L = 0; L < NumLanes; ++L) { 2060 auto S = B.buildSelect(EltTy, Cmp, InsRegs[L], 2061 UnmergeToEltTy.getReg(I * NumLanes + L)); 2062 2063 for (unsigned N : { 0, 2, 3 }) 2064 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 2065 2066 Ops[I * NumLanes + L] = S->getOperand(0).getReg(); 2067 } 2068 } 2069 2070 LLT MergeTy = LLT::vector(Ops.size(), EltTy); 2071 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2072 B.buildBuildVector(MI.getOperand(0), Ops); 2073 } else { 2074 auto Vec = B.buildBuildVector(MergeTy, Ops); 2075 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2076 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2077 } 2078 2079 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2080 MI.eraseFromParent(); 2081 2082 return true; 2083 } 2084 2085 void AMDGPURegisterBankInfo::applyMappingImpl( 2086 const OperandsMapper &OpdMapper) const { 2087 MachineInstr &MI = OpdMapper.getMI(); 2088 unsigned Opc = MI.getOpcode(); 2089 MachineRegisterInfo 
&MRI = OpdMapper.getMRI();
2090 switch (Opc) {
2091 case AMDGPU::G_PHI: {
2092 Register DstReg = MI.getOperand(0).getReg();
2093 LLT DstTy = MRI.getType(DstReg);
2094 if (DstTy != LLT::scalar(1))
2095 break;
2096
2097 const LLT S32 = LLT::scalar(32);
2098 const RegisterBank *DstBank =
2099 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2100 if (DstBank == &AMDGPU::VCCRegBank) {
2101 applyDefaultMapping(OpdMapper);
2102 // The standard handling only considers the result register bank for
2103 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2104 // produce an invalid copy. We can only copy with some kind of compare to
2105 // get a vector boolean result. Insert a register bank copy that will be
2106 // correctly lowered to a compare.
2107 MachineIRBuilder B(*MI.getParent()->getParent());
2108
2109 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2110 Register SrcReg = MI.getOperand(I).getReg();
2111 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2112
2113 if (SrcBank != &AMDGPU::VCCRegBank) {
2114 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2115 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2116
2117 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2118 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2119 MI.getOperand(I).setReg(Copy.getReg(0));
2120 }
2121 }
2122
2123 return;
2124 }
2125
2126 // Phi handling is strange and only considers the bank of the destination.
2127 substituteSimpleCopyRegs(OpdMapper, 0);
2128
2129 // Promote SGPR/VGPR booleans to s32
2130 MachineFunction *MF = MI.getParent()->getParent();
2131 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2132 GISelObserverWrapper Observer(&ApplyBank);
2133 MachineIRBuilder B(MI);
2134 LegalizerHelper Helper(*MF, Observer, B);
2135
2136 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2137 llvm_unreachable("widen scalar should have succeeded");
2138
2139 return;
2140 }
2141 case AMDGPU::G_ICMP:
2142 case AMDGPU::G_UADDO:
2143 case AMDGPU::G_USUBO:
2144 case AMDGPU::G_UADDE:
2145 case AMDGPU::G_SADDE:
2146 case AMDGPU::G_USUBE:
2147 case AMDGPU::G_SSUBE: {
2148 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2149 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2150
2151 const RegisterBank *DstBank =
2152 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2153 if (DstBank != &AMDGPU::SGPRRegBank)
2154 break;
2155
2156 const bool HasCarryIn = MI.getNumOperands() == 5;
2157
2158 // If this is a scalar compare, promote the result to s32, as the selection
2159 // will end up using a copy to a 32-bit vreg.
2160 const LLT S32 = LLT::scalar(32);
2161 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2162 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2163 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2164 MachineIRBuilder B(MI);
2165
2166 if (HasCarryIn) {
2167 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2168 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2169 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2170 MI.getOperand(4).setReg(NewSrcReg);
2171 }
2172
2173 MachineBasicBlock *MBB = MI.getParent();
2174 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2175
2176 // If we had a constrained VCC result register, a copy was inserted to VCC
2177 // from SGPR.
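// In that case OpdMapper.getVRegs(0) holds the new SGPR def feeding that copy;
// truncate the widened s32 result into it, or directly into the original
// destination if no repair register was created.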
2178 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2179 if (DefRegs.empty()) 2180 DefRegs.push_back(DstReg); 2181 B.buildTrunc(DefRegs[0], NewDstReg); 2182 return; 2183 } 2184 case AMDGPU::G_SELECT: { 2185 Register DstReg = MI.getOperand(0).getReg(); 2186 LLT DstTy = MRI.getType(DstReg); 2187 2188 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2189 if (CondRegs.empty()) 2190 CondRegs.push_back(MI.getOperand(1).getReg()); 2191 else { 2192 assert(CondRegs.size() == 1); 2193 } 2194 2195 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2196 if (CondBank == &AMDGPU::SGPRRegBank) { 2197 MachineIRBuilder B(MI); 2198 const LLT S32 = LLT::scalar(32); 2199 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2200 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2201 2202 MI.getOperand(1).setReg(NewCondReg); 2203 B.buildZExt(NewCondReg, CondRegs[0]); 2204 } 2205 2206 if (DstTy.getSizeInBits() != 64) 2207 break; 2208 2209 MachineIRBuilder B(MI); 2210 LLT HalfTy = getHalfSizedType(DstTy); 2211 2212 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2213 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2214 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2215 2216 // All inputs are SGPRs, nothing special to do. 2217 if (DefRegs.empty()) { 2218 assert(Src1Regs.empty() && Src2Regs.empty()); 2219 break; 2220 } 2221 2222 if (Src1Regs.empty()) 2223 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2224 else { 2225 setRegsToType(MRI, Src1Regs, HalfTy); 2226 } 2227 2228 if (Src2Regs.empty()) 2229 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2230 else 2231 setRegsToType(MRI, Src2Regs, HalfTy); 2232 2233 setRegsToType(MRI, DefRegs, HalfTy); 2234 2235 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2236 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2237 2238 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2239 MI.eraseFromParent(); 2240 return; 2241 } 2242 case AMDGPU::G_BRCOND: { 2243 Register CondReg = MI.getOperand(0).getReg(); 2244 // FIXME: Should use legalizer helper, but should change bool ext type. 2245 const RegisterBank *CondBank = 2246 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2247 2248 if (CondBank == &AMDGPU::SGPRRegBank) { 2249 MachineIRBuilder B(MI); 2250 const LLT S32 = LLT::scalar(32); 2251 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2252 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2253 2254 MI.getOperand(0).setReg(NewCondReg); 2255 B.buildZExt(NewCondReg, CondReg); 2256 return; 2257 } 2258 2259 break; 2260 } 2261 case AMDGPU::G_AND: 2262 case AMDGPU::G_OR: 2263 case AMDGPU::G_XOR: { 2264 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2265 // there is a VGPR input. 
2266 Register DstReg = MI.getOperand(0).getReg(); 2267 LLT DstTy = MRI.getType(DstReg); 2268 2269 if (DstTy.getSizeInBits() == 1) { 2270 const RegisterBank *DstBank = 2271 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2272 if (DstBank == &AMDGPU::VCCRegBank) 2273 break; 2274 2275 MachineFunction *MF = MI.getParent()->getParent(); 2276 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2277 GISelObserverWrapper Observer(&ApplyBank); 2278 MachineIRBuilder B(MI); 2279 LegalizerHelper Helper(*MF, Observer, B); 2280 2281 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2282 LegalizerHelper::Legalized) 2283 llvm_unreachable("widen scalar should have succeeded"); 2284 return; 2285 } 2286 2287 if (DstTy.getSizeInBits() != 64) 2288 break; 2289 2290 LLT HalfTy = getHalfSizedType(DstTy); 2291 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2292 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2293 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2294 2295 // All inputs are SGPRs, nothing special to do. 2296 if (DefRegs.empty()) { 2297 assert(Src0Regs.empty() && Src1Regs.empty()); 2298 break; 2299 } 2300 2301 assert(DefRegs.size() == 2); 2302 assert(Src0Regs.size() == Src1Regs.size() && 2303 (Src0Regs.empty() || Src0Regs.size() == 2)); 2304 2305 // Depending on where the source registers came from, the generic code may 2306 // have decided to split the inputs already or not. If not, we still need to 2307 // extract the values. 2308 MachineIRBuilder B(MI); 2309 2310 if (Src0Regs.empty()) 2311 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2312 else 2313 setRegsToType(MRI, Src0Regs, HalfTy); 2314 2315 if (Src1Regs.empty()) 2316 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2317 else 2318 setRegsToType(MRI, Src1Regs, HalfTy); 2319 2320 setRegsToType(MRI, DefRegs, HalfTy); 2321 2322 B.buildInstr(Opc) 2323 .addDef(DefRegs[0]) 2324 .addUse(Src0Regs[0]) 2325 .addUse(Src1Regs[0]); 2326 2327 B.buildInstr(Opc) 2328 .addDef(DefRegs[1]) 2329 .addUse(Src0Regs[1]) 2330 .addUse(Src1Regs[1]); 2331 2332 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2333 MI.eraseFromParent(); 2334 return; 2335 } 2336 case AMDGPU::G_ADD: 2337 case AMDGPU::G_SUB: 2338 case AMDGPU::G_MUL: 2339 case AMDGPU::G_SHL: 2340 case AMDGPU::G_LSHR: 2341 case AMDGPU::G_ASHR: { 2342 Register DstReg = MI.getOperand(0).getReg(); 2343 LLT DstTy = MRI.getType(DstReg); 2344 2345 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2346 // Packed 16-bit operations need to be scalarized and promoted. 
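// For example, a uniform <2 x s16> G_ADD is expanded below into two s32 G_ADDs
// on the unpacked halves, then repacked with G_BUILD_VECTOR_TRUNC.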
2347 if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) 2348 break; 2349 2350 const RegisterBank *DstBank = 2351 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2352 if (DstBank == &AMDGPU::VGPRRegBank) 2353 break; 2354 2355 const LLT S32 = LLT::scalar(32); 2356 MachineBasicBlock *MBB = MI.getParent(); 2357 MachineFunction *MF = MBB->getParent(); 2358 MachineIRBuilder B(MI); 2359 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2360 GISelObserverWrapper Observer(&ApplySALU); 2361 2362 if (DstTy.isVector()) { 2363 B.setChangeObserver(Observer); 2364 2365 Register WideSrc0Lo, WideSrc0Hi; 2366 Register WideSrc1Lo, WideSrc1Hi; 2367 2368 std::tie(WideSrc0Lo, WideSrc0Hi) 2369 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT); 2370 std::tie(WideSrc1Lo, WideSrc1Hi) 2371 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT); 2372 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2373 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2374 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2375 MI.eraseFromParent(); 2376 } else { 2377 LegalizerHelper Helper(*MF, Observer, B); 2378 2379 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2380 llvm_unreachable("widen scalar should have succeeded"); 2381 2382 // FIXME: s16 shift amounts should be legal. 2383 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2384 Opc == AMDGPU::G_ASHR) { 2385 B.setInsertPt(*MBB, MI.getIterator()); 2386 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2387 llvm_unreachable("widen scalar should have succeeded"); 2388 } 2389 } 2390 2391 return; 2392 } 2393 case AMDGPU::G_SMIN: 2394 case AMDGPU::G_SMAX: 2395 case AMDGPU::G_UMIN: 2396 case AMDGPU::G_UMAX: { 2397 Register DstReg = MI.getOperand(0).getReg(); 2398 const RegisterBank *DstBank = 2399 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2400 if (DstBank == &AMDGPU::VGPRRegBank) 2401 break; 2402 2403 MachineFunction *MF = MI.getParent()->getParent(); 2404 MachineIRBuilder B(MI); 2405 2406 // Turn scalar min/max into a compare and select. 2407 LLT Ty = MRI.getType(DstReg); 2408 const LLT S32 = LLT::scalar(32); 2409 const LLT S16 = LLT::scalar(16); 2410 const LLT V2S16 = LLT::vector(2, 16); 2411 2412 if (Ty == V2S16) { 2413 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2414 GISelObserverWrapper Observer(&ApplySALU); 2415 B.setChangeObserver(Observer); 2416 2417 // Need to widen to s32, and expand as cmp + select, and avoid producing 2418 // illegal vector extends or unmerges that would need further 2419 // legalization. 2420 // 2421 // TODO: Should we just readfirstlane? That should probably be handled 2422 // with a UniformVGPR register bank that wouldn't need special 2423 // consideration here. 
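// Sketch of the uniform <2 x s16> expansion: extend each half to s32 (sext for
// signed min/max, zext for unsigned), compare + select each half, then repack
// the two results with G_BUILD_VECTOR_TRUNC.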
2424 2425 Register Dst = MI.getOperand(0).getReg(); 2426 Register Src0 = MI.getOperand(1).getReg(); 2427 Register Src1 = MI.getOperand(2).getReg(); 2428 2429 Register WideSrc0Lo, WideSrc0Hi; 2430 Register WideSrc1Lo, WideSrc1Hi; 2431 2432 unsigned ExtendOp = minMaxToExtend(MI.getOpcode()); 2433 2434 std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp); 2435 std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp); 2436 2437 Register Lo = MRI.createGenericVirtualRegister(S32); 2438 Register Hi = MRI.createGenericVirtualRegister(S32); 2439 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); 2440 buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo); 2441 buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi); 2442 2443 B.buildBuildVectorTrunc(Dst, {Lo, Hi}); 2444 MI.eraseFromParent(); 2445 } else if (Ty == S16) { 2446 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2447 GISelObserverWrapper Observer(&ApplySALU); 2448 LegalizerHelper Helper(*MF, Observer, B); 2449 2450 // Need to widen to s32, and expand as cmp + select. 2451 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2452 llvm_unreachable("widenScalar should have succeeded"); 2453 2454 // FIXME: This is relying on widenScalar leaving MI in place. 2455 lowerScalarMinMax(B, MI); 2456 } else 2457 lowerScalarMinMax(B, MI); 2458 2459 return; 2460 } 2461 case AMDGPU::G_SEXT_INREG: { 2462 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2463 if (SrcRegs.empty()) 2464 break; // Nothing to repair 2465 2466 const LLT S32 = LLT::scalar(32); 2467 MachineIRBuilder B(MI); 2468 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2469 GISelObserverWrapper Observer(&O); 2470 B.setChangeObserver(Observer); 2471 2472 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2473 // we would need to further expand, and doesn't let us directly set the 2474 // result registers. 2475 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2476 2477 int Amt = MI.getOperand(2).getImm(); 2478 if (Amt <= 32) { 2479 if (Amt == 32) { 2480 // The low bits are unchanged. 2481 B.buildCopy(DstRegs[0], SrcRegs[0]); 2482 } else { 2483 // Extend in the low bits and propagate the sign bit to the high half. 2484 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); 2485 } 2486 2487 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2488 } else { 2489 // The low bits are unchanged, and extend in the high bits. 
2490 B.buildCopy(DstRegs[0], SrcRegs[0]); 2491 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2492 } 2493 2494 Register DstReg = MI.getOperand(0).getReg(); 2495 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2496 MI.eraseFromParent(); 2497 return; 2498 } 2499 case AMDGPU::G_CTPOP: 2500 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2501 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2502 MachineIRBuilder B(MI); 2503 MachineFunction &MF = B.getMF(); 2504 2505 const RegisterBank *DstBank = 2506 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2507 if (DstBank == &AMDGPU::SGPRRegBank) 2508 break; 2509 2510 Register SrcReg = MI.getOperand(1).getReg(); 2511 const LLT S32 = LLT::scalar(32); 2512 LLT Ty = MRI.getType(SrcReg); 2513 if (Ty == S32) 2514 break; 2515 2516 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2517 GISelObserverWrapper Observer(&ApplyVALU); 2518 LegalizerHelper Helper(MF, Observer, B); 2519 2520 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2521 llvm_unreachable("narrowScalar should have succeeded"); 2522 return; 2523 } 2524 case AMDGPU::G_SEXT: 2525 case AMDGPU::G_ZEXT: 2526 case AMDGPU::G_ANYEXT: { 2527 Register SrcReg = MI.getOperand(1).getReg(); 2528 LLT SrcTy = MRI.getType(SrcReg); 2529 const bool Signed = Opc == AMDGPU::G_SEXT; 2530 2531 assert(empty(OpdMapper.getVRegs(1))); 2532 2533 MachineIRBuilder B(MI); 2534 const RegisterBank *SrcBank = 2535 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2536 2537 Register DstReg = MI.getOperand(0).getReg(); 2538 LLT DstTy = MRI.getType(DstReg); 2539 if (DstTy.isScalar() && 2540 SrcBank != &AMDGPU::SGPRRegBank && 2541 SrcBank != &AMDGPU::VCCRegBank && 2542 // FIXME: Should handle any type that round to s64 when irregular 2543 // breakdowns supported. 2544 DstTy.getSizeInBits() == 64 && 2545 SrcTy.getSizeInBits() <= 32) { 2546 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2547 2548 // Extend to 32-bit, and then extend the low half. 2549 if (Signed) { 2550 // TODO: Should really be buildSExtOrCopy 2551 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2552 } else if (Opc == AMDGPU::G_ZEXT) { 2553 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2554 } else { 2555 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2556 } 2557 2558 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2559 MRI.setRegBank(DstReg, *SrcBank); 2560 MI.eraseFromParent(); 2561 return; 2562 } 2563 2564 if (SrcTy != LLT::scalar(1)) 2565 return; 2566 2567 // It is not legal to have a legalization artifact with a VCC source. Rather 2568 // than introducing a copy, insert the select we would have to select the 2569 // copy to. 2570 if (SrcBank == &AMDGPU::VCCRegBank) { 2571 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2572 2573 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2574 2575 unsigned DstSize = DstTy.getSizeInBits(); 2576 // 64-bit select is SGPR only 2577 const bool UseSel64 = DstSize > 32 && 2578 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2579 2580 // TODO: Should s16 select be legal? 2581 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2582 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2583 auto False = B.buildConstant(SelType, 0); 2584 2585 MRI.setRegBank(True.getReg(0), *DstBank); 2586 MRI.setRegBank(False.getReg(0), *DstBank); 2587 MRI.setRegBank(DstReg, *DstBank); 2588 2589 if (DstSize > 32) { 2590 B.buildSelect(DefRegs[0], SrcReg, True, False); 2591 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2592 } else if (DstSize < 32) { 2593 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2594 MRI.setRegBank(Sel.getReg(0), *DstBank); 2595 B.buildTrunc(DstReg, Sel); 2596 } else { 2597 B.buildSelect(DstReg, SrcReg, True, False); 2598 } 2599 2600 MI.eraseFromParent(); 2601 return; 2602 } 2603 2604 break; 2605 } 2606 case AMDGPU::G_BUILD_VECTOR: 2607 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2608 Register DstReg = MI.getOperand(0).getReg(); 2609 LLT DstTy = MRI.getType(DstReg); 2610 if (DstTy != LLT::vector(2, 16)) 2611 break; 2612 2613 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2614 substituteSimpleCopyRegs(OpdMapper, 1); 2615 substituteSimpleCopyRegs(OpdMapper, 2); 2616 2617 const RegisterBank *DstBank = 2618 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2619 if (DstBank == &AMDGPU::SGPRRegBank) 2620 break; // Can use S_PACK_* instructions. 2621 2622 MachineIRBuilder B(MI); 2623 2624 Register Lo = MI.getOperand(1).getReg(); 2625 Register Hi = MI.getOperand(2).getReg(); 2626 const LLT S32 = LLT::scalar(32); 2627 2628 const RegisterBank *BankLo = 2629 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2630 const RegisterBank *BankHi = 2631 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2632 2633 Register ZextLo; 2634 Register ShiftHi; 2635 2636 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2637 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2638 MRI.setRegBank(ZextLo, *BankLo); 2639 2640 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2641 MRI.setRegBank(ZextHi, *BankHi); 2642 2643 auto ShiftAmt = B.buildConstant(S32, 16); 2644 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2645 2646 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2647 MRI.setRegBank(ShiftHi, *BankHi); 2648 } else { 2649 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2650 MRI.setRegBank(MaskLo, *BankLo); 2651 2652 auto ShiftAmt = B.buildConstant(S32, 16); 2653 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2654 2655 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2656 MRI.setRegBank(ShiftHi, *BankHi); 2657 2658 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2659 MRI.setRegBank(ZextLo, *BankLo); 2660 } 2661 2662 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2663 MRI.setRegBank(Or.getReg(0), *DstBank); 2664 2665 B.buildBitcast(DstReg, Or); 2666 MI.eraseFromParent(); 2667 return; 2668 } 2669 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2670 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2671 2672 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2673 2674 Register DstReg = MI.getOperand(0).getReg(); 2675 Register SrcReg = MI.getOperand(1).getReg(); 2676 2677 const LLT S32 = LLT::scalar(32); 2678 LLT DstTy = MRI.getType(DstReg); 2679 LLT SrcTy = MRI.getType(SrcReg); 2680 2681 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2682 return; 2683 2684 MachineIRBuilder B(MI); 2685 2686 const ValueMapping &DstMapping 2687 = OpdMapper.getInstrMapping().getOperandMapping(0); 2688 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2689 const RegisterBank *SrcBank = 2690 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2691 const 
RegisterBank *IdxBank = 2692 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2693 2694 Register BaseIdxReg; 2695 unsigned ConstOffset; 2696 MachineInstr *OffsetDef; 2697 std::tie(BaseIdxReg, ConstOffset, OffsetDef) = 2698 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2699 2700 // See if the index is an add of a constant which will be foldable by moving 2701 // the base register of the index later if this is going to be executed in a 2702 // waterfall loop. This is essentially to reassociate the add of a constant 2703 // with the readfirstlane. 2704 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2705 ConstOffset > 0 && 2706 ConstOffset < SrcTy.getNumElements(); 2707 2708 // Move the base register. We'll re-insert the add later. 2709 if (ShouldMoveIndexIntoLoop) 2710 MI.getOperand(2).setReg(BaseIdxReg); 2711 2712 // If this is a VGPR result only because the index was a VGPR result, the 2713 // actual indexing will be done on the SGPR source vector, which will 2714 // produce a scalar result. We need to copy to the VGPR result inside the 2715 // waterfall loop. 2716 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2717 SrcBank == &AMDGPU::SGPRRegBank; 2718 if (DstRegs.empty()) { 2719 applyDefaultMapping(OpdMapper); 2720 2721 executeInWaterfallLoop(MI, MRI, { 2 }); 2722 2723 if (NeedCopyToVGPR) { 2724 // We don't want a phi for this temporary reg. 2725 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2726 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2727 MI.getOperand(0).setReg(TmpReg); 2728 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2729 2730 // Use a v_mov_b32 here to make the exec dependency explicit. 2731 buildVCopy(B, DstReg, TmpReg); 2732 } 2733 2734 // Re-insert the constant offset add inside the waterfall loop. 2735 if (ShouldMoveIndexIntoLoop) 2736 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2737 2738 return; 2739 } 2740 2741 assert(DstTy.getSizeInBits() == 64); 2742 2743 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 2744 2745 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2746 auto One = B.buildConstant(S32, 1); 2747 2748 MachineBasicBlock::iterator MII = MI.getIterator(); 2749 2750 // Split the vector index into 32-bit pieces. Prepare to move all of the 2751 // new instructions into a waterfall loop if necessary. 2752 // 2753 // Don't put the bitcast or constant in the loop. 2754 MachineInstrSpan Span(MII, &B.getMBB()); 2755 2756 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2757 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2758 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2759 2760 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2761 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2762 2763 MRI.setRegBank(DstReg, *DstBank); 2764 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2765 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2766 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2767 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2768 2769 SmallSet<Register, 4> OpsToWaterfall; 2770 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2771 MI.eraseFromParent(); 2772 return; 2773 } 2774 2775 // Remove the original instruction to avoid potentially confusing the 2776 // waterfall loop logic. 
2777 B.setInstr(*Span.begin()); 2778 MI.eraseFromParent(); 2779 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2780 OpsToWaterfall, MRI); 2781 2782 if (NeedCopyToVGPR) { 2783 MachineBasicBlock *LoopBB = Extract1->getParent(); 2784 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2785 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2786 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2787 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2788 2789 Extract0->getOperand(0).setReg(TmpReg0); 2790 Extract1->getOperand(0).setReg(TmpReg1); 2791 2792 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2793 2794 buildVCopy(B, DstRegs[0], TmpReg0); 2795 buildVCopy(B, DstRegs[1], TmpReg1); 2796 } 2797 2798 if (ShouldMoveIndexIntoLoop) 2799 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2800 2801 return; 2802 } 2803 case AMDGPU::G_INSERT_VECTOR_ELT: { 2804 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2805 2806 Register DstReg = MI.getOperand(0).getReg(); 2807 LLT VecTy = MRI.getType(DstReg); 2808 2809 assert(OpdMapper.getVRegs(0).empty()); 2810 assert(OpdMapper.getVRegs(3).empty()); 2811 2812 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2813 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2814 2815 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2816 return; 2817 2818 const RegisterBank *IdxBank = 2819 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2820 2821 Register SrcReg = MI.getOperand(1).getReg(); 2822 Register InsReg = MI.getOperand(2).getReg(); 2823 LLT InsTy = MRI.getType(InsReg); 2824 (void)InsTy; 2825 2826 Register BaseIdxReg; 2827 unsigned ConstOffset; 2828 MachineInstr *OffsetDef; 2829 std::tie(BaseIdxReg, ConstOffset, OffsetDef) = 2830 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2831 2832 // See if the index is an add of a constant which will be foldable by moving 2833 // the base register of the index later if this is going to be executed in a 2834 // waterfall loop. This is essentially to reassociate the add of a constant 2835 // with the readfirstlane. 2836 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2837 ConstOffset > 0 && 2838 ConstOffset < VecTy.getNumElements(); 2839 2840 // Move the base register. We'll re-insert the add later. 2841 if (ShouldMoveIndexIntoLoop) 2842 MI.getOperand(3).setReg(BaseIdxReg); 2843 2844 2845 if (InsRegs.empty()) { 2846 executeInWaterfallLoop(MI, MRI, { 3 }); 2847 2848 // Re-insert the constant offset add inside the waterfall loop. 2849 if (ShouldMoveIndexIntoLoop) { 2850 MachineIRBuilder B(MI); 2851 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2852 } 2853 2854 return; 2855 } 2856 2857 2858 assert(InsTy.getSizeInBits() == 64); 2859 2860 const LLT S32 = LLT::scalar(32); 2861 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); 2862 2863 MachineIRBuilder B(MI); 2864 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2865 auto One = B.buildConstant(S32, 1); 2866 2867 // Split the vector index into 32-bit pieces. Prepare to move all of the 2868 // new instructions into a waterfall loop if necessary. 2869 // 2870 // Don't put the bitcast or constant in the loop. 2871 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2872 2873 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
2874 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2875 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2876 2877 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2878 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2879 2880 const RegisterBank *DstBank = 2881 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2882 const RegisterBank *SrcBank = 2883 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2884 const RegisterBank *InsSrcBank = 2885 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2886 2887 MRI.setRegBank(InsReg, *InsSrcBank); 2888 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2889 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2890 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2891 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2892 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2893 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2894 2895 2896 SmallSet<Register, 4> OpsToWaterfall; 2897 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2898 B.setInsertPt(B.getMBB(), MI); 2899 B.buildBitcast(DstReg, InsHi); 2900 MI.eraseFromParent(); 2901 return; 2902 } 2903 2904 B.setInstr(*Span.begin()); 2905 MI.eraseFromParent(); 2906 2907 // Figure out the point after the waterfall loop before mangling the control 2908 // flow. 2909 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2910 OpsToWaterfall, MRI); 2911 2912 // The insertion point is now right after the original instruction. 2913 // 2914 // Keep the bitcast to the original vector type out of the loop. Doing this 2915 // saved an extra phi we don't need inside the loop. 2916 B.buildBitcast(DstReg, InsHi); 2917 2918 // Re-insert the constant offset add inside the waterfall loop. 
2919 if (ShouldMoveIndexIntoLoop) 2920 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2921 2922 return; 2923 } 2924 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2925 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2926 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2927 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2928 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2929 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2930 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2931 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2932 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2933 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2934 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2935 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2936 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2937 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2938 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2939 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2940 applyDefaultMapping(OpdMapper); 2941 executeInWaterfallLoop(MI, MRI, {1, 4}); 2942 return; 2943 } 2944 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2945 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2948 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2949 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2950 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2951 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2955 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2956 applyDefaultMapping(OpdMapper); 2957 executeInWaterfallLoop(MI, MRI, {2, 5}); 2958 return; 2959 } 2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2961 applyDefaultMapping(OpdMapper); 2962 executeInWaterfallLoop(MI, MRI, {3, 6}); 2963 return; 2964 } 2965 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2966 applyMappingSBufferLoad(OpdMapper); 2967 return; 2968 } 2969 case AMDGPU::G_INTRINSIC: { 2970 switch (MI.getIntrinsicID()) { 2971 case Intrinsic::amdgcn_readlane: { 2972 substituteSimpleCopyRegs(OpdMapper, 2); 2973 2974 assert(OpdMapper.getVRegs(0).empty()); 2975 assert(OpdMapper.getVRegs(3).empty()); 2976 2977 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2978 // waterfall loop, so assume it's a uniform value. 2979 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2980 return; 2981 } 2982 case Intrinsic::amdgcn_writelane: { 2983 assert(OpdMapper.getVRegs(0).empty()); 2984 assert(OpdMapper.getVRegs(2).empty()); 2985 assert(OpdMapper.getVRegs(3).empty()); 2986 2987 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2988 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2989 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2990 return; 2991 } 2992 case Intrinsic::amdgcn_ballot: 2993 case Intrinsic::amdgcn_interp_p1: 2994 case Intrinsic::amdgcn_interp_p2: 2995 case Intrinsic::amdgcn_interp_mov: 2996 case Intrinsic::amdgcn_interp_p1_f16: 2997 case Intrinsic::amdgcn_interp_p2_f16: { 2998 applyDefaultMapping(OpdMapper); 2999 3000 // Readlane for m0 value, which is always the last operand. 3001 // FIXME: Should this be a waterfall loop instead? 3002 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index 3003 return; 3004 } 3005 case Intrinsic::amdgcn_permlane16: 3006 case Intrinsic::amdgcn_permlanex16: { 3007 // Doing a waterfall loop over these wouldn't make any sense. 
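// Operands 4 and 5 must be uniform inputs here, so force them into SGPRs with
// readfirstlane rather than trying to waterfall them.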
3008 substituteSimpleCopyRegs(OpdMapper, 2);
3009 substituteSimpleCopyRegs(OpdMapper, 3);
3010 constrainOpWithReadfirstlane(MI, MRI, 4);
3011 constrainOpWithReadfirstlane(MI, MRI, 5);
3012 return;
3013 }
3014 case Intrinsic::amdgcn_sbfe:
3015 applyMappingBFEIntrinsic(OpdMapper, true);
3016 return;
3017 case Intrinsic::amdgcn_ubfe:
3018 applyMappingBFEIntrinsic(OpdMapper, false);
3019 return;
3020 }
3021 break;
3022 }
3023 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3024 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3025 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3026 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3027 assert(RSrcIntrin && RSrcIntrin->IsImage);
3028 // Non-images can have complications from operands that allow both SGPR
3029 // and VGPR. For now it's too complicated to figure out the final opcode
3030 // to derive the register bank from the MCInstrDesc.
3031 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3032 return;
3033 }
3034 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3035 auto IntrID = MI.getIntrinsicID();
3036 switch (IntrID) {
3037 case Intrinsic::amdgcn_ds_ordered_add:
3038 case Intrinsic::amdgcn_ds_ordered_swap: {
3039 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3040 assert(OpdMapper.getVRegs(0).empty());
3041 substituteSimpleCopyRegs(OpdMapper, 3);
3042 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3043 return;
3044 }
3045 case Intrinsic::amdgcn_ds_gws_init:
3046 case Intrinsic::amdgcn_ds_gws_barrier:
3047 case Intrinsic::amdgcn_ds_gws_sema_br: {
3048 // Only the first lane executes, so readfirstlane is safe.
3049 substituteSimpleCopyRegs(OpdMapper, 1);
3050 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3051 return;
3052 }
3053 case Intrinsic::amdgcn_ds_gws_sema_v:
3054 case Intrinsic::amdgcn_ds_gws_sema_p:
3055 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3056 // Only the first lane executes, so readfirstlane is safe.
3057 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3058 return;
3059 }
3060 case Intrinsic::amdgcn_ds_append:
3061 case Intrinsic::amdgcn_ds_consume: {
3062 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3063 return;
3064 }
3065 case Intrinsic::amdgcn_s_sendmsg:
3066 case Intrinsic::amdgcn_s_sendmsghalt: {
3067 // FIXME: Should this use a waterfall loop?
3068 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3069 return;
3070 }
3071 case Intrinsic::amdgcn_s_setreg: {
3072 constrainOpWithReadfirstlane(MI, MRI, 2);
3073 return;
3074 }
3075 default: {
3076 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3077 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3078 // Non-images can have complications from operands that allow both SGPR
3079 // and VGPR. For now it's too complicated to figure out the final opcode
3080 // to derive the register bank from the MCInstrDesc.
3081 if (RSrcIntrin->IsImage) { 3082 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3083 return; 3084 } 3085 } 3086 3087 break; 3088 } 3089 } 3090 break; 3091 } 3092 case AMDGPU::G_LOAD: 3093 case AMDGPU::G_ZEXTLOAD: 3094 case AMDGPU::G_SEXTLOAD: { 3095 if (applyMappingLoad(MI, OpdMapper, MRI)) 3096 return; 3097 break; 3098 } 3099 case AMDGPU::G_DYN_STACKALLOC: 3100 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3101 return; 3102 default: 3103 break; 3104 } 3105 3106 return applyDefaultMapping(OpdMapper); 3107 } 3108 3109 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3110 const MachineFunction &MF = *MI.getParent()->getParent(); 3111 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3112 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { 3113 if (!MI.getOperand(i).isReg()) 3114 continue; 3115 Register Reg = MI.getOperand(i).getReg(); 3116 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3117 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3118 return false; 3119 } 3120 } 3121 return true; 3122 } 3123 3124 const RegisterBankInfo::InstructionMapping & 3125 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3126 const MachineFunction &MF = *MI.getParent()->getParent(); 3127 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3128 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3129 3130 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3131 const MachineOperand &SrcOp = MI.getOperand(i); 3132 if (!SrcOp.isReg()) 3133 continue; 3134 3135 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3136 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3137 } 3138 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3139 MI.getNumOperands()); 3140 } 3141 3142 const RegisterBankInfo::InstructionMapping & 3143 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3144 const MachineFunction &MF = *MI.getParent()->getParent(); 3145 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3146 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3147 3148 // Even though we technically could use SGPRs, this would require knowledge of 3149 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3150 // 3151 // TODO: Unary ops are trivially OK, so accept SGPRs? 3152 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3153 const MachineOperand &Src = MI.getOperand(i); 3154 if (!Src.isReg()) 3155 continue; 3156 3157 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3158 unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3159 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3160 }
3161
3162 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3163 MI.getNumOperands());
3164 }
3165
3166 const RegisterBankInfo::InstructionMapping &
3167 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3168 const MachineFunction &MF = *MI.getParent()->getParent();
3169 const MachineRegisterInfo &MRI = MF.getRegInfo();
3170 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3171
3172 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3173 const MachineOperand &Op = MI.getOperand(I);
3174 if (!Op.isReg())
3175 continue;
3176
3177 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3178 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3179 }
3180
3181 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3182 MI.getNumOperands());
3183 }
3184
3185 const RegisterBankInfo::InstructionMapping &
3186 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3187 const MachineInstr &MI,
3188 int RsrcIdx) const {
3189 // The reported argument index is relative to the IR intrinsic call arguments,
3190 // so we need to shift by the number of defs and the intrinsic ID.
3191 RsrcIdx += MI.getNumExplicitDefs() + 1;
3192
3193 const int NumOps = MI.getNumOperands();
3194 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3195
3196 // TODO: Should packed/unpacked D16 difference be reported here as part of
3197 // the value mapping?
3198 for (int I = 0; I != NumOps; ++I) {
3199 if (!MI.getOperand(I).isReg())
3200 continue;
3201
3202 Register OpReg = MI.getOperand(I).getReg();
3203 // We replace some dead address operands with $noreg.
3204 if (!OpReg)
3205 continue;
3206
3207 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3208
3209 // FIXME: Probably need a new intrinsic register bank searchable table to
3210 // handle arbitrary intrinsics easily.
3211 //
3212 // If this has a sampler, it immediately follows rsrc.
3213 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3214
3215 if (MustBeSGPR) {
3216 // This must be an SGPR, so we must report whatever it is as legal.
3217 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3218 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3219 } else {
3220 // Some operands must be VGPR, and these are easy to copy to.
3221 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3222 }
3223 }
3224
3225 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3226 }
3227
3228 /// Return the mapping for a pointer argument.
3229 const RegisterBankInfo::ValueMapping *
3230 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3231 Register PtrReg) const {
3232 LLT PtrTy = MRI.getType(PtrReg);
3233 unsigned Size = PtrTy.getSizeInBits();
3234 if (Subtarget.useFlatForGlobal() ||
3235 !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3236 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3237
3238 // If we're using MUBUF instructions for global memory, an SGPR base register
3239 // is possible. Otherwise this needs to be a VGPR.
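// In that case a scalar base is acceptable, so just report whatever bank the
// pointer register currently has.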
3240 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3241 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3242 }
3243
3244 const RegisterBankInfo::InstructionMapping &
3245 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3246
3247 const MachineFunction &MF = *MI.getParent()->getParent();
3248 const MachineRegisterInfo &MRI = MF.getRegInfo();
3249 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3250 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3251 Register PtrReg = MI.getOperand(1).getReg();
3252 LLT PtrTy = MRI.getType(PtrReg);
3253 unsigned AS = PtrTy.getAddressSpace();
3254 unsigned PtrSize = PtrTy.getSizeInBits();
3255
3256 const ValueMapping *ValMapping;
3257 const ValueMapping *PtrMapping;
3258
3259 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3260
3261 if (PtrBank == &AMDGPU::SGPRRegBank &&
3262 SITargetLowering::isFlatGlobalAddrSpace(AS)) {
3263 if (isScalarLoadLegal(MI)) {
3264 // We have a uniform instruction, so we want to use an SMRD load.
3265 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3266 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3267 } else {
3268 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3269
3270 // If we're using MUBUF instructions for global memory, an SGPR base
3271 // register is possible. Otherwise this needs to be a VGPR.
3272 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3273 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3274
3275 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3276 }
3277 } else {
3278 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3279 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3280 }
3281
3282 OpdsMapping[0] = ValMapping;
3283 OpdsMapping[1] = PtrMapping;
3284 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3285 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3286 return Mapping;
3287
3288 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3289 // handle that during instruction selection?
3290 }
3291
3292 unsigned
3293 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3294 const MachineRegisterInfo &MRI,
3295 const TargetRegisterInfo &TRI,
3296 unsigned Default) const {
3297 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
3298 return Bank ? Bank->getID() : Default;
3299 }
3300
3301
3302 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3303 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
3304 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3305 }
3306
3307 static int regBankBoolUnion(int RB0, int RB1) {
3308 if (RB0 == -1)
3309 return RB1;
3310 if (RB1 == -1)
3311 return RB0;
3312
3313 // vcc, vcc -> vcc
3314 // vcc, sgpr -> vcc
3315 // vcc, vgpr -> vcc
3316 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3317 return AMDGPU::VCCRegBankID;
3318
3319 // sgpr, sgpr -> sgpr; sgpr, vgpr -> vgpr
3320 return regBankUnion(RB0, RB1);
3321 }
3322
3323 const RegisterBankInfo::ValueMapping *
3324 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3325 const MachineRegisterInfo &MRI,
3326 const TargetRegisterInfo &TRI) const {
3327 // Lie and claim anything is legal, even though this needs to be an SGPR.
3328 // applyMapping will have to deal with it as a waterfall loop.
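// If no bank has been assigned yet, default to SGPR so the operand is at
// least reported with the bank it is ultimately required to have.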
3329 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
3330 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3331 return AMDGPU::getValueMapping(Bank, Size);
3332 }
3333
3334 const RegisterBankInfo::ValueMapping *
3335 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3336 const MachineRegisterInfo &MRI,
3337 const TargetRegisterInfo &TRI) const {
3338 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3339 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3340 }
3341
3342 const RegisterBankInfo::ValueMapping *
3343 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3344 const MachineRegisterInfo &MRI,
3345 const TargetRegisterInfo &TRI) const {
3346 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3347 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3348 }
3349
3350 ///
3351 /// This function must return a legal mapping, because
3352 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3353 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3354 /// VGPR to SGPR copy to be generated is illegal.
3355 ///
3356 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3357 // legal. These will be dealt with in applyMappingImpl.
3358 //
3359 const RegisterBankInfo::InstructionMapping &
3360 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3361 const MachineFunction &MF = *MI.getParent()->getParent();
3362 const MachineRegisterInfo &MRI = MF.getRegInfo();
3363
3364 if (MI.isCopy()) {
3365 // The default logic bothers to analyze impossible alternative mappings. We
3366 // want the most straightforward mapping, so just directly handle this.
3367 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3368 *TRI);
3369 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3370 *TRI);
3371 assert(SrcBank && "src bank should have been assigned already");
3372 if (!DstBank)
3373 DstBank = SrcBank;
3374
3375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3376 if (cannotCopy(*DstBank, *SrcBank, Size))
3377 return getInvalidInstructionMapping();
3378
3379 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3380 return getInstructionMapping(
3381 1, /*Cost*/ 1,
3382 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3383 }
3384
3385 if (MI.isRegSequence()) {
3386 // If any input is a VGPR, the result must be a VGPR. The default handling
3387 // assumes any copy between banks is legal.
3388 unsigned BankID = AMDGPU::SGPRRegBankID;
3389
3390 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3391 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
3392 // It doesn't make sense to use vcc or scc banks here, so just ignore
3393 // them.
3394 if (OpBank != AMDGPU::SGPRRegBankID) {
3395 BankID = AMDGPU::VGPRRegBankID;
3396 break;
3397 }
3398 }
3399 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3400
3401 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3402 return getInstructionMapping(
3403 1, /*Cost*/ 1,
3404 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3405 }
3406
3407 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
3408 // properly.
3409 //
3410 // TODO: There are additional exec masking dependencies to analyze.
3411 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3412 // TODO: Generate proper invalid bank enum.
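// -1 acts as a placeholder for "no bank determined yet" until such an enum
// exists.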
3413 int ResultBank = -1; 3414 Register DstReg = MI.getOperand(0).getReg(); 3415 3416 // Sometimes the result may have already been assigned a bank. 3417 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3418 ResultBank = DstBank->getID(); 3419 3420 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3421 Register Reg = MI.getOperand(I).getReg(); 3422 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3423 3424 // FIXME: Assuming VGPR for any undetermined inputs. 3425 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3426 ResultBank = AMDGPU::VGPRRegBankID; 3427 break; 3428 } 3429 3430 // FIXME: Need to promote SGPR case to s32 3431 unsigned OpBank = Bank->getID(); 3432 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3433 } 3434 3435 assert(ResultBank != -1); 3436 3437 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3438 3439 const ValueMapping &ValMap = 3440 getValueMapping(0, Size, getRegBank(ResultBank)); 3441 return getInstructionMapping( 3442 1, /*Cost*/ 1, 3443 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3444 } 3445 3446 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3447 if (Mapping.isValid()) 3448 return Mapping; 3449 3450 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3451 3452 switch (MI.getOpcode()) { 3453 default: 3454 return getInvalidInstructionMapping(); 3455 3456 case AMDGPU::G_AND: 3457 case AMDGPU::G_OR: 3458 case AMDGPU::G_XOR: { 3459 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3460 if (Size == 1) { 3461 const RegisterBank *DstBank 3462 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3463 3464 unsigned TargetBankID = -1; 3465 unsigned BankLHS = -1; 3466 unsigned BankRHS = -1; 3467 if (DstBank) { 3468 TargetBankID = DstBank->getID(); 3469 if (DstBank == &AMDGPU::VCCRegBank) { 3470 TargetBankID = AMDGPU::VCCRegBankID; 3471 BankLHS = AMDGPU::VCCRegBankID; 3472 BankRHS = AMDGPU::VCCRegBankID; 3473 } else { 3474 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 3475 AMDGPU::SGPRRegBankID); 3476 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3477 AMDGPU::SGPRRegBankID); 3478 } 3479 } else { 3480 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 3481 AMDGPU::VCCRegBankID); 3482 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 3483 AMDGPU::VCCRegBankID); 3484 3485 // Both inputs should be true booleans to produce a boolean result. 
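// Illustrative summary of the resolution below:
//   (vgpr, *)    -> vgpr
//   (vcc,  sgpr) -> vcc   (both inputs are then forced to vcc)
//   (sgpr, sgpr) -> sgpr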
3486 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3487 TargetBankID = AMDGPU::VGPRRegBankID; 3488 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3489 TargetBankID = AMDGPU::VCCRegBankID; 3490 BankLHS = AMDGPU::VCCRegBankID; 3491 BankRHS = AMDGPU::VCCRegBankID; 3492 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3493 TargetBankID = AMDGPU::SGPRRegBankID; 3494 } 3495 } 3496 3497 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3498 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3499 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3500 break; 3501 } 3502 3503 if (Size == 64) { 3504 3505 if (isSALUMapping(MI)) { 3506 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3507 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3508 } else { 3509 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3510 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); 3511 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3512 3513 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); 3514 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3515 } 3516 3517 break; 3518 } 3519 3520 LLVM_FALLTHROUGH; 3521 } 3522 case AMDGPU::G_PTR_ADD: 3523 case AMDGPU::G_PTRMASK: 3524 case AMDGPU::G_ADD: 3525 case AMDGPU::G_SUB: 3526 case AMDGPU::G_MUL: 3527 case AMDGPU::G_SHL: 3528 case AMDGPU::G_LSHR: 3529 case AMDGPU::G_ASHR: 3530 case AMDGPU::G_UADDO: 3531 case AMDGPU::G_USUBO: 3532 case AMDGPU::G_UADDE: 3533 case AMDGPU::G_SADDE: 3534 case AMDGPU::G_USUBE: 3535 case AMDGPU::G_SSUBE: 3536 case AMDGPU::G_SMIN: 3537 case AMDGPU::G_SMAX: 3538 case AMDGPU::G_UMIN: 3539 case AMDGPU::G_UMAX: 3540 case AMDGPU::G_SHUFFLE_VECTOR: 3541 if (isSALUMapping(MI)) 3542 return getDefaultMappingSOP(MI); 3543 LLVM_FALLTHROUGH; 3544 3545 case AMDGPU::G_FADD: 3546 case AMDGPU::G_FSUB: 3547 case AMDGPU::G_FPTOSI: 3548 case AMDGPU::G_FPTOUI: 3549 case AMDGPU::G_FMUL: 3550 case AMDGPU::G_FMA: 3551 case AMDGPU::G_FMAD: 3552 case AMDGPU::G_FSQRT: 3553 case AMDGPU::G_FFLOOR: 3554 case AMDGPU::G_FCEIL: 3555 case AMDGPU::G_FRINT: 3556 case AMDGPU::G_SITOFP: 3557 case AMDGPU::G_UITOFP: 3558 case AMDGPU::G_FPTRUNC: 3559 case AMDGPU::G_FPEXT: 3560 case AMDGPU::G_FEXP2: 3561 case AMDGPU::G_FLOG2: 3562 case AMDGPU::G_FMINNUM: 3563 case AMDGPU::G_FMAXNUM: 3564 case AMDGPU::G_FMINNUM_IEEE: 3565 case AMDGPU::G_FMAXNUM_IEEE: 3566 case AMDGPU::G_FCANONICALIZE: 3567 case AMDGPU::G_INTRINSIC_TRUNC: 3568 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3569 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3570 case AMDGPU::G_AMDGPU_FFBH_U32: 3571 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3572 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3573 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3574 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3575 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3576 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3577 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3578 return getDefaultMappingVOP(MI); 3579 case AMDGPU::G_UMULH: 3580 case AMDGPU::G_SMULH: { 3581 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3582 return getDefaultMappingSOP(MI); 3583 return getDefaultMappingVOP(MI); 3584 } 3585 case AMDGPU::G_IMPLICIT_DEF: { 3586 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3587 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3588 break; 3589 } 3590 case AMDGPU::G_FCONSTANT: 3591 case AMDGPU::G_CONSTANT: 3592 case AMDGPU::G_GLOBAL_VALUE: 3593 case AMDGPU::G_BLOCK_ADDR: 3594 case AMDGPU::G_READCYCLECOUNTER: { 3595 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3596 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3597 break; 3598 } 3599 case AMDGPU::G_FRAME_INDEX: { 3600 // TODO: This should be the same as other constants, but eliminateFrameIndex 3601 // currently assumes VALU uses. 3602 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3603 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3604 break; 3605 } 3606 case AMDGPU::G_DYN_STACKALLOC: { 3607 // Result is always uniform, and a wave reduction is needed for the source. 3608 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3609 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 3610 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3611 break; 3612 } 3613 case AMDGPU::G_INSERT: { 3614 unsigned BankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : 3615 AMDGPU::VGPRRegBankID; 3616 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3617 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3618 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3619 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3620 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3621 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3622 OpdsMapping[3] = nullptr; 3623 break; 3624 } 3625 case AMDGPU::G_EXTRACT: { 3626 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 3627 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3628 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3629 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3630 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3631 OpdsMapping[2] = nullptr; 3632 break; 3633 } 3634 case AMDGPU::G_BUILD_VECTOR: 3635 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3636 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3637 if (DstTy == LLT::vector(2, 16)) { 3638 unsigned DstSize = DstTy.getSizeInBits(); 3639 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3640 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 3641 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 3642 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3643 3644 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3645 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3646 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3647 break; 3648 } 3649 3650 LLVM_FALLTHROUGH; 3651 } 3652 case AMDGPU::G_MERGE_VALUES: 3653 case AMDGPU::G_CONCAT_VECTORS: { 3654 unsigned Bank = isSALUMapping(MI) ? 3655 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3656 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3657 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3658 3659 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3660 // Op1 and Dst should use the same register bank. 3661 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3662 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3663 break; 3664 } 3665 case AMDGPU::G_BITCAST: 3666 case AMDGPU::G_INTTOPTR: 3667 case AMDGPU::G_PTRTOINT: 3668 case AMDGPU::G_BITREVERSE: 3669 case AMDGPU::G_FABS: 3670 case AMDGPU::G_FNEG: { 3671 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3672 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 3673 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3674 break; 3675 } 3676 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3677 case AMDGPU::G_CTTZ_ZERO_UNDEF: 3678 case AMDGPU::G_CTPOP: { 3679 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3680 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 3681 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3682 3683 // This should really be getValueMappingSGPR64Only, but allowing the generic 3684 // code to handle the register split just makes using LegalizerHelper more 3685 // difficult. 
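// Note that the result above is always reported as 32 bits; only the
// (possibly 64-bit) source would need the split.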
3686 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3687 break; 3688 } 3689 case AMDGPU::G_TRUNC: { 3690 Register Dst = MI.getOperand(0).getReg(); 3691 Register Src = MI.getOperand(1).getReg(); 3692 unsigned Bank = getRegBankID(Src, MRI, *TRI); 3693 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3694 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3695 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3696 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3697 break; 3698 } 3699 case AMDGPU::G_ZEXT: 3700 case AMDGPU::G_SEXT: 3701 case AMDGPU::G_ANYEXT: 3702 case AMDGPU::G_SEXT_INREG: { 3703 Register Dst = MI.getOperand(0).getReg(); 3704 Register Src = MI.getOperand(1).getReg(); 3705 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3706 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3707 3708 unsigned DstBank; 3709 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3710 assert(SrcBank); 3711 switch (SrcBank->getID()) { 3712 case AMDGPU::SGPRRegBankID: 3713 DstBank = AMDGPU::SGPRRegBankID; 3714 break; 3715 default: 3716 DstBank = AMDGPU::VGPRRegBankID; 3717 break; 3718 } 3719 3720 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3721 // 32-bits, and then to 64. 3722 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3723 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3724 SrcSize); 3725 break; 3726 } 3727 case AMDGPU::G_FCMP: { 3728 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3729 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 3730 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3731 OpdsMapping[1] = nullptr; // Predicate Operand. 3732 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 3733 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3734 break; 3735 } 3736 case AMDGPU::G_STORE: { 3737 assert(MI.getOperand(0).isReg()); 3738 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3739 3740 // FIXME: We need to specify a different reg bank once scalar stores are 3741 // supported. 3742 const ValueMapping *ValMapping = 3743 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3744 OpdsMapping[0] = ValMapping; 3745 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 3746 break; 3747 } 3748 case AMDGPU::G_ICMP: { 3749 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 3750 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3751 3752 // See if the result register has already been constrained to vcc, which may 3753 // happen due to control flow intrinsic lowering. 3754 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, 3755 AMDGPU::SGPRRegBankID); 3756 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 3757 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 3758 3759 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 3760 Op2Bank == AMDGPU::SGPRRegBankID && 3761 Op3Bank == AMDGPU::SGPRRegBankID && 3762 (Size == 32 || (Size == 64 && 3763 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 3764 Subtarget.hasScalarCompareEq64())); 3765 3766 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3767 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3768 3769 // TODO: Use 32-bit for scalar output size. 3770 // SCC results will need to be copied to a 32-bit SGPR virtual register. 
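// For now both the SCC and VCC cases are reported as a 1-bit result.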
3771 const unsigned ResultSize = 1;
3772
3773 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3774 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3775 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3776 break;
3777 }
3778 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3779 // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
3780 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3781 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3782 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3783 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3784 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3785 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3786
3787 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3788 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3789
3790 // The index can be in either bank if the source vector is VGPR.
3791 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3792 break;
3793 }
3794 case AMDGPU::G_INSERT_VECTOR_ELT: {
3795 unsigned OutputBankID = isSALUMapping(MI) ?
3796 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3797
3798 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3799 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3800 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3801 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
3802 MRI, *TRI);
3803 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3804
3805 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3806 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3807
3808 // This is a weird case, because we need to break down the mapping based on
3809 // the register bank of a different operand.
3810 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3811 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3812 InsertSize);
3813 } else {
3814 assert(InsertSize == 32 || InsertSize == 64);
3815 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3816 }
3817
3818 // The index can be in either bank if the source vector is VGPR.
3819 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3820 break;
3821 }
3822 case AMDGPU::G_UNMERGE_VALUES: {
3823 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
3824 AMDGPU::VGPRRegBankID;
3825
3826 // Op1 and Dst should use the same register bank.
3827 // FIXME: Shouldn't this be the default? Why do we need to handle this?
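// All of the result registers and the single source get the common bank.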
3828 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3829 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 3830 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 3831 } 3832 break; 3833 } 3834 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3835 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3837 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3838 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3839 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3840 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3841 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3842 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3843 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3844 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 3845 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3846 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3847 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3848 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3849 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 3850 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3851 3852 // rsrc 3853 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3854 3855 // vindex 3856 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3857 3858 // voffset 3859 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3860 3861 // soffset 3862 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3863 3864 // Any remaining operands are immediates and were correctly null 3865 // initialized. 3866 break; 3867 } 3868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3869 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3875 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3876 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3877 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3878 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3879 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 3880 // vdata_out 3881 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3882 3883 // vdata_in 3884 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3885 3886 // rsrc 3887 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3888 3889 // vindex 3890 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3891 3892 // voffset 3893 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3894 3895 // soffset 3896 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3897 3898 // Any remaining operands are immediates and were correctly null 3899 // initialized. 
3900 break;
3901 }
3902 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3903 // vdata_out
3904 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3905
3906 // vdata_in
3907 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3908
3909 // cmp
3910 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3911
3912 // rsrc
3913 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3914
3915 // vindex
3916 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3917
3918 // voffset
3919 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3920
3921 // soffset
3922 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3923
3924 // Any remaining operands are immediates and were correctly null
3925 // initialized.
3926 break;
3927 }
3928 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3929 // Lie and claim everything is legal, even though some need to be
3930 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3931 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3932 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3933
3934 // We need to convert this to a MUBUF if either the resource or offset is
3935 // VGPR.
3936 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3937 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3938 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3939
3940 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3941 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3942 break;
3943 }
3944 case AMDGPU::G_INTRINSIC: {
3945 switch (MI.getIntrinsicID()) {
3946 default:
3947 return getInvalidInstructionMapping();
3948 case Intrinsic::amdgcn_div_fmas:
3949 case Intrinsic::amdgcn_div_fixup:
3950 case Intrinsic::amdgcn_trig_preop:
3951 case Intrinsic::amdgcn_sin:
3952 case Intrinsic::amdgcn_cos:
3953 case Intrinsic::amdgcn_log_clamp:
3954 case Intrinsic::amdgcn_rcp:
3955 case Intrinsic::amdgcn_rcp_legacy:
3956 case Intrinsic::amdgcn_sqrt:
3957 case Intrinsic::amdgcn_rsq:
3958 case Intrinsic::amdgcn_rsq_legacy:
3959 case Intrinsic::amdgcn_rsq_clamp:
3960 case Intrinsic::amdgcn_fmul_legacy:
3961 case Intrinsic::amdgcn_ldexp:
3962 case Intrinsic::amdgcn_frexp_mant:
3963 case Intrinsic::amdgcn_frexp_exp:
3964 case Intrinsic::amdgcn_fract:
3965 case Intrinsic::amdgcn_cvt_pkrtz:
3966 case Intrinsic::amdgcn_cvt_pknorm_i16:
3967 case Intrinsic::amdgcn_cvt_pknorm_u16:
3968 case Intrinsic::amdgcn_cvt_pk_i16:
3969 case Intrinsic::amdgcn_cvt_pk_u16:
3970 case Intrinsic::amdgcn_fmed3:
3971 case Intrinsic::amdgcn_cubeid:
3972 case Intrinsic::amdgcn_cubema:
3973 case Intrinsic::amdgcn_cubesc:
3974 case Intrinsic::amdgcn_cubetc:
3975 case Intrinsic::amdgcn_sffbh:
3976 case Intrinsic::amdgcn_fmad_ftz:
3977 case Intrinsic::amdgcn_mbcnt_lo:
3978 case Intrinsic::amdgcn_mbcnt_hi:
3979 case Intrinsic::amdgcn_mul_u24:
3980 case Intrinsic::amdgcn_mul_i24:
3981 case Intrinsic::amdgcn_lerp:
3982 case Intrinsic::amdgcn_sad_u8:
3983 case Intrinsic::amdgcn_msad_u8:
3984 case Intrinsic::amdgcn_sad_hi_u8:
3985 case Intrinsic::amdgcn_sad_u16:
3986 case Intrinsic::amdgcn_qsad_pk_u16_u8:
3987 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3988 case Intrinsic::amdgcn_mqsad_u32_u8:
3989 case Intrinsic::amdgcn_cvt_pk_u8_f32:
3990 case Intrinsic::amdgcn_alignbit:
3991 case Intrinsic::amdgcn_alignbyte:
3992 case Intrinsic::amdgcn_fdot2:
3993 case
Intrinsic::amdgcn_sdot2: 3994 case Intrinsic::amdgcn_udot2: 3995 case Intrinsic::amdgcn_sdot4: 3996 case Intrinsic::amdgcn_udot4: 3997 case Intrinsic::amdgcn_sdot8: 3998 case Intrinsic::amdgcn_udot8: 3999 return getDefaultMappingVOP(MI); 4000 case Intrinsic::amdgcn_sbfe: 4001 case Intrinsic::amdgcn_ubfe: 4002 if (isSALUMapping(MI)) 4003 return getDefaultMappingSOP(MI); 4004 return getDefaultMappingVOP(MI); 4005 case Intrinsic::amdgcn_ds_swizzle: 4006 case Intrinsic::amdgcn_ds_permute: 4007 case Intrinsic::amdgcn_ds_bpermute: 4008 case Intrinsic::amdgcn_update_dpp: 4009 case Intrinsic::amdgcn_mov_dpp8: 4010 case Intrinsic::amdgcn_mov_dpp: 4011 case Intrinsic::amdgcn_wwm: 4012 case Intrinsic::amdgcn_wqm: 4013 case Intrinsic::amdgcn_softwqm: 4014 return getDefaultMappingAllVGPR(MI); 4015 case Intrinsic::amdgcn_kernarg_segment_ptr: 4016 case Intrinsic::amdgcn_s_getpc: 4017 case Intrinsic::amdgcn_groupstaticsize: { 4018 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4019 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4020 break; 4021 } 4022 case Intrinsic::amdgcn_wqm_vote: { 4023 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4024 OpdsMapping[0] = OpdsMapping[2] 4025 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4026 break; 4027 } 4028 case Intrinsic::amdgcn_ps_live: { 4029 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4030 break; 4031 } 4032 case Intrinsic::amdgcn_div_scale: { 4033 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4034 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4035 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4036 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4037 4038 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4039 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4040 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4041 break; 4042 } 4043 case Intrinsic::amdgcn_class: { 4044 Register Src0Reg = MI.getOperand(2).getReg(); 4045 Register Src1Reg = MI.getOperand(3).getReg(); 4046 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4047 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4048 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4049 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4050 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4051 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4052 break; 4053 } 4054 case Intrinsic::amdgcn_icmp: 4055 case Intrinsic::amdgcn_fcmp: { 4056 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4057 // This is not VCCRegBank because this is not used in boolean contexts. 4058 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4059 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4060 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4061 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4062 break; 4063 } 4064 case Intrinsic::amdgcn_readlane: { 4065 // This must be an SGPR, but accept a VGPR. 
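// A divergent index is still reported as legal here; it is expected to be
// fixed up later (e.g. with a readfirstlane) rather than rejected.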
4066 Register IdxReg = MI.getOperand(3).getReg(); 4067 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4068 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 4069 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4070 LLVM_FALLTHROUGH; 4071 } 4072 case Intrinsic::amdgcn_readfirstlane: { 4073 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4074 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4075 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4076 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4077 break; 4078 } 4079 case Intrinsic::amdgcn_writelane: { 4080 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4081 Register SrcReg = MI.getOperand(2).getReg(); 4082 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4083 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 4084 Register IdxReg = MI.getOperand(3).getReg(); 4085 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4086 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 4087 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4088 4089 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4090 // to legalize. 4091 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4092 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4093 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4094 break; 4095 } 4096 case Intrinsic::amdgcn_if_break: { 4097 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4098 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4099 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4100 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4101 break; 4102 } 4103 case Intrinsic::amdgcn_permlane16: 4104 case Intrinsic::amdgcn_permlanex16: { 4105 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4106 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4107 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4108 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4109 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4110 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4111 break; 4112 } 4113 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4114 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4115 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4116 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4117 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4118 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4119 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4120 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4121 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4122 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4123 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4124 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4125 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4126 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4127 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4128 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4129 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4130 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4131 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4132 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: { 4133 // Default for MAI intrinsics. 
4134 // srcC can also be an immediate which can be folded later. 4135 // FIXME: Should we eventually add an alternative mapping with AGPR src 4136 // for srcA/srcB? 4137 // 4138 // vdst, srcA, srcB, srcC 4139 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4140 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4141 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4142 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4143 break; 4144 } 4145 case Intrinsic::amdgcn_interp_p1: 4146 case Intrinsic::amdgcn_interp_p2: 4147 case Intrinsic::amdgcn_interp_mov: 4148 case Intrinsic::amdgcn_interp_p1_f16: 4149 case Intrinsic::amdgcn_interp_p2_f16: { 4150 const int M0Idx = MI.getNumOperands() - 1; 4151 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4152 unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID); 4153 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4154 4155 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4156 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4157 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4158 4159 // Must be SGPR, but we must take whatever the original bank is and fix it 4160 // later. 4161 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4162 break; 4163 } 4164 case Intrinsic::amdgcn_ballot: { 4165 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4166 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4167 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4168 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4169 break; 4170 } 4171 } 4172 break; 4173 } 4174 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4175 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 4176 auto IntrID = MI.getIntrinsicID(); 4177 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4178 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4179 // Non-images can have complications from operands that allow both SGPR 4180 // and VGPR. For now it's too complicated to figure out the final opcode 4181 // to derive the register bank from the MCInstrDesc. 
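// Only image intrinsics reach this point, and they use the rsrc/sampler
// aware mapping below.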
4182 assert(RSrcIntrin->IsImage); 4183 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4184 } 4185 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4186 auto IntrID = MI.getIntrinsicID(); 4187 switch (IntrID) { 4188 case Intrinsic::amdgcn_s_getreg: 4189 case Intrinsic::amdgcn_s_memtime: 4190 case Intrinsic::amdgcn_s_memrealtime: 4191 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 4192 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4193 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4194 break; 4195 } 4196 case Intrinsic::amdgcn_ds_fadd: 4197 case Intrinsic::amdgcn_ds_fmin: 4198 case Intrinsic::amdgcn_ds_fmax: 4199 return getDefaultMappingAllVGPR(MI); 4200 case Intrinsic::amdgcn_ds_ordered_add: 4201 case Intrinsic::amdgcn_ds_ordered_swap: { 4202 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4203 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4204 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 4205 AMDGPU::SGPRRegBankID); 4206 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4207 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4208 break; 4209 } 4210 case Intrinsic::amdgcn_ds_append: 4211 case Intrinsic::amdgcn_ds_consume: { 4212 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4213 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4214 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4215 break; 4216 } 4217 case Intrinsic::amdgcn_exp_compr: 4218 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4219 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4220 break; 4221 case Intrinsic::amdgcn_exp: 4222 // FIXME: Could we support packed types here? 4223 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4224 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4225 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4226 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4227 break; 4228 case Intrinsic::amdgcn_s_sendmsg: 4229 case Intrinsic::amdgcn_s_sendmsghalt: { 4230 // This must be an SGPR, but accept a VGPR. 4231 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 4232 AMDGPU::SGPRRegBankID); 4233 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4234 break; 4235 } 4236 case Intrinsic::amdgcn_s_setreg: { 4237 // This must be an SGPR, but accept a VGPR. 
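// (applyMappingImpl constrains this operand with a readfirstlane if it ends
// up in a VGPR.)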
4238 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 4239 AMDGPU::SGPRRegBankID); 4240 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4241 break; 4242 } 4243 case Intrinsic::amdgcn_end_cf: { 4244 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4245 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4246 break; 4247 } 4248 case Intrinsic::amdgcn_else: { 4249 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4250 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4251 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4252 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4253 break; 4254 } 4255 case Intrinsic::amdgcn_kill: { 4256 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4257 break; 4258 } 4259 case Intrinsic::amdgcn_raw_buffer_load: 4260 case Intrinsic::amdgcn_raw_tbuffer_load: { 4261 // FIXME: Should make intrinsic ID the last operand of the instruction, 4262 // then this would be the same as store 4263 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4264 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4265 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4266 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4267 break; 4268 } 4269 case Intrinsic::amdgcn_raw_buffer_store: 4270 case Intrinsic::amdgcn_raw_buffer_store_format: 4271 case Intrinsic::amdgcn_raw_tbuffer_store: { 4272 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4273 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4274 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4275 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4276 break; 4277 } 4278 case Intrinsic::amdgcn_struct_buffer_load: 4279 case Intrinsic::amdgcn_struct_tbuffer_load: { 4280 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4281 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4282 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4283 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4284 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4285 break; 4286 } 4287 case Intrinsic::amdgcn_struct_buffer_store: 4288 case Intrinsic::amdgcn_struct_tbuffer_store: { 4289 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4290 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4291 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4292 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4293 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4294 break; 4295 } 4296 case Intrinsic::amdgcn_init_exec_from_input: { 4297 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4298 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4299 break; 4300 } 4301 case Intrinsic::amdgcn_ds_gws_init: 4302 case Intrinsic::amdgcn_ds_gws_barrier: 4303 case Intrinsic::amdgcn_ds_gws_sema_br: { 4304 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4305 4306 // This must be an SGPR, but accept a VGPR. 
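// (For the divergent case, applyMappingImpl constrains the M0 operand with a
// readfirstlane.)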
4307 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 4308 AMDGPU::SGPRRegBankID); 4309 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4310 break; 4311 } 4312 case Intrinsic::amdgcn_ds_gws_sema_v: 4313 case Intrinsic::amdgcn_ds_gws_sema_p: 4314 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 4315 // This must be an SGPR, but accept a VGPR. 4316 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 4317 AMDGPU::SGPRRegBankID); 4318 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 4319 break; 4320 } 4321 default: 4322 return getInvalidInstructionMapping(); 4323 } 4324 break; 4325 } 4326 case AMDGPU::G_SELECT: { 4327 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4328 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 4329 AMDGPU::SGPRRegBankID); 4330 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, 4331 AMDGPU::SGPRRegBankID); 4332 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 4333 Op3Bank == AMDGPU::SGPRRegBankID; 4334 4335 unsigned CondBankDefault = SGPRSrcs ? 4336 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4337 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 4338 CondBankDefault); 4339 if (CondBank == AMDGPU::SGPRRegBankID) 4340 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4341 else if (CondBank == AMDGPU::VGPRRegBankID) 4342 CondBank = AMDGPU::VCCRegBankID; 4343 4344 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 4345 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4346 4347 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 4348 4349 // TODO: Should report 32-bit for scalar condition type. 4350 if (Size == 64) { 4351 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4352 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4353 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4354 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4355 } else { 4356 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 4357 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4358 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 4359 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 4360 } 4361 4362 break; 4363 } 4364 4365 case AMDGPU::G_LOAD: 4366 case AMDGPU::G_ZEXTLOAD: 4367 case AMDGPU::G_SEXTLOAD: 4368 return getInstrMappingForLoad(MI); 4369 4370 case AMDGPU::G_ATOMICRMW_XCHG: 4371 case AMDGPU::G_ATOMICRMW_ADD: 4372 case AMDGPU::G_ATOMICRMW_SUB: 4373 case AMDGPU::G_ATOMICRMW_AND: 4374 case AMDGPU::G_ATOMICRMW_OR: 4375 case AMDGPU::G_ATOMICRMW_XOR: 4376 case AMDGPU::G_ATOMICRMW_MAX: 4377 case AMDGPU::G_ATOMICRMW_MIN: 4378 case AMDGPU::G_ATOMICRMW_UMAX: 4379 case AMDGPU::G_ATOMICRMW_UMIN: 4380 case AMDGPU::G_ATOMICRMW_FADD: 4381 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 4382 case AMDGPU::G_AMDGPU_ATOMIC_INC: 4383 case AMDGPU::G_AMDGPU_ATOMIC_DEC: { 4384 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4385 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4386 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4387 break; 4388 } 4389 case AMDGPU::G_ATOMIC_CMPXCHG: { 4390 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4391 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4392 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4393 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4394 break; 
4395 } 4396 case AMDGPU::G_BRCOND: { 4397 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, 4398 AMDGPU::SGPRRegBankID); 4399 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 4400 if (Bank != AMDGPU::SGPRRegBankID) 4401 Bank = AMDGPU::VCCRegBankID; 4402 4403 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 4404 break; 4405 } 4406 } 4407 4408 return getInstructionMapping(/*ID*/1, /*Cost*/1, 4409 getOperandsMapping(OpdsMapping), 4410 MI.getNumOperands()); 4411 } 4412