//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank.
/// A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
/// complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation.
/// By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
//
// The observer installs itself on the builder in its constructor, records every
// instruction reported through createdInstr(), and assigns banks to the
// registers of those instructions when it is destroyed.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  // Builder being observed; also used to emit replacement instructions.
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  // Bank given to registers that have no class or bank yet (s1 registers get
  // the VCC bank instead, see applyBank).
  const RegisterBank *NewBank;
  // Instructions reported via createdInstr(), processed in the destructor.
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  /// Start observing \p B. The builder must not already have an observer.
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  /// Assign banks for every recorded instruction, then detach from the
  /// builder.
  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to
  /// NewBank (s1 registers are assigned the VCC bank instead).
  ///
  /// G_ANYEXT/G_ZEXT/G_SEXT with a VCC-bank source are replaced by a G_SELECT
  /// between two 32-bit constants, and the original extension is erased.
  // NOTE(review): the previous wording of this comment said "to SALU"; the code
  // assigns NewBank, which is asserted to be the VGPR bank on the s1 paths.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        // The "true" value is all-ones for a sign extension and 1 otherwise.
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      // DstReg is reused as the select result above, so it is still the
      // register that needs a bank here.
      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    // Debug-only sanity check: a G_TRUNC result must not be in the VCC bank.
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  // Erasures need no bookkeeping here.
  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    // Banks are therefore applied later, from the destructor.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

/// Cache the subtarget's register and instruction info, and (once per process)
/// assert that the bank IDs resolve to the expected SGPR/VGPR/AGPR bank objects.
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  // The lambda body is only an assertion; (void)this keeps the capture used
  // when assertions are compiled out.
  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

/// Return true if \p Bank is the VGPR or the AGPR bank.
static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

/// Return true for every bank other than the SGPR bank.
bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

/// Cost of copying a \p Size bit value from \p Src to \p Dst.
///
/// Returns the maximum unsigned value for copies into the SGPR bank from a
/// vector (VGPR/AGPR) or VCC bank, and for any 1-bit copy into the SGPR bank;
/// returns 4 for AGPR-to-AGPR; otherwise defers to the generic implementation.
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

/// Cost of breaking a value down according to \p ValMapping.
///
/// Returns 10 when there are two or more breakdowns or the first breakdown is
/// at least 64 bits long, and 1 otherwise. \p CurBank is not used.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): control only reaches this point when NumBreakDowns < 2, so
  // the NumBreakDowns == 2 condition below cannot hold; confirm which of the
  // early return and the assert reflects the intended contract.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

/// Map register class \p RC (with value type \p Ty) back to a register bank.
///
/// SReg_1 is always VCC. Other SGPR classes are VCC when \p Ty is s1 and SGPR
/// otherwise (including when \p Ty is invalid). Remaining classes are AGPR or
/// VGPR according to TRI->isAGPRClass().
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

/// Build one alternative instruction mapping per entry of \p Table.
///
/// \p RegSrcOpIdx lists the operand indices of \p MI that the table describes;
/// entry column I supplies the bank for operand RegSrcOpIdx[I]. Explicit defs
/// are first given a VGPR mapping, which a table column may then overwrite.
/// Operands not covered by either step keep the default-constructed (null)
/// mapping.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // Sizes of the table-described operands; these do not depend on the entry.
  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

/// Alternative mappings for intrinsics without side effects.
///
/// Handles amdgcn_readlane and amdgcn_writelane through bank/cost tables; every
/// other intrinsic falls back to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Table columns describe operands 0, 2 and 3; operand 1 is skipped.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // rsrc, voffset, offset
    // NOTE(review): the comment above names buffer operands and lists three
    // items while four operand indices follow; it looks copied from another
    // intrinsic - confirm and update.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Alternative mappings for intrinsics with side effects.
///
/// Handles amdgcn_s_buffer_load, amdgcn_ds_ordered_add/swap and
/// amdgcn_s_sendmsg/sendmsghalt through bank/cost tables; every other intrinsic
/// falls back to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Return true if the load \p MI may be mapped to the scalar (SGPR) bank.
///
/// Requires exactly one memory operand, sufficient alignment, a non-atomic
/// access, no volatile access outside the constant address spaces, memory that
/// is constant/invariant/marked MONoClobber, and a uniform memory operand.
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Memory access size in bits.
  const unsigned MemSize = 8 * MMO->getSize().getValue();

  // Require 4-byte alignment.
  // Subtargets with scalar sub-word loads additionally accept naturally
  // aligned 16-bit and 8-bit accesses.
  return (MMO->getAlign() >= Align(4) ||
          (Subtarget.hasScalarSubwordLoads() &&
           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPU::isUniformMMO(MMO);
}

/// Return the alternative register bank mappings for \p MI.
///
/// Opcodes handled here return their own list directly; anything that leaves
/// the switch through `break` or `default` gets the generic implementation's
/// result.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      // 1-bit results may additionally live in the VCC bank.
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only the 64-bit case gets explicit alternatives below.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    // NOTE(review): this `break` reaches the final return of the function,
    // which returns the generic implementation's result rather than
    // AltMappings, so the two mappings pushed above are not returned - confirm
    // whether `return AltMappings;` was intended.
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    // Offer the all-SGPR mapping only outside the local, region and private
    // address spaces, and only when a scalar load is legal.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // Scalar form: every operand, including the 1-bit condition, in SGPRs.
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector form: VCC condition, VGPR values.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    // Scalar form: the 1-bit carry operands (operands 1 and 4) are SGPRs.
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector form: the 1-bit carry operands use the VCC bank.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    // NOTE(review): this alternative reuses mapping ID 1, whereas the other
    // cases in this switch give their second alternative ID 2 - confirm this
    // is intentional.
    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

/// Split \p Reg into two \p HalfTy (32-bit) registers with G_UNMERGE_VALUES.
///
/// The two new registers get the same bank as \p Reg and are appended to
/// \p Regs, low half first.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
/// The new type must have the same total size as the old one.
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

/// Return a type with half the elements of vector \p Ty, or a scalar of half
/// the bit width when \p Ty is not a vector.
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
703 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, 704 MachineRegisterInfo &MRI, 705 Register Src) const { 706 LLT Ty = MRI.getType(Src); 707 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); 708 709 if (Bank == &AMDGPU::SGPRRegBank) 710 return Src; 711 712 unsigned Bits = Ty.getSizeInBits(); 713 assert(Bits % 32 == 0); 714 715 if (Bank != &AMDGPU::VGPRRegBank) { 716 // We need to copy from AGPR to VGPR 717 Src = B.buildCopy(Ty, Src).getReg(0); 718 MRI.setRegBank(Src, AMDGPU::VGPRRegBank); 719 } 720 721 LLT S32 = LLT::scalar(32); 722 unsigned NumParts = Bits / 32; 723 SmallVector<Register, 8> SrcParts; 724 SmallVector<Register, 8> DstParts; 725 726 if (Bits == 32) { 727 SrcParts.push_back(Src); 728 } else { 729 auto Unmerge = B.buildUnmerge(S32, Src); 730 for (unsigned i = 0; i < NumParts; ++i) 731 SrcParts.push_back(Unmerge.getReg(i)); 732 } 733 734 for (unsigned i = 0; i < NumParts; ++i) { 735 Register SrcPart = SrcParts[i]; 736 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 737 MRI.setType(DstPart, NumParts == 1 ? Ty : S32); 738 739 const TargetRegisterClass *Constrained = 740 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); 741 (void)Constrained; 742 assert(Constrained && "Failed to constrain readfirstlane src reg"); 743 744 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); 745 746 DstParts.push_back(DstPart); 747 } 748 749 if (Bits == 32) 750 return DstParts[0]; 751 752 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0); 753 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); 754 return Dst; 755 } 756 757 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If 758 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to 759 /// execute the instruction for each unique combination of values in all lanes 760 /// in the wave. The block will be split such that rest of the instructions are 761 /// moved to a new block. 
762 /// 763 /// Essentially performs this loop: 764 // 765 /// Save Execution Mask 766 /// For (Lane : Wavefront) { 767 /// Enable Lane, Disable all other lanes 768 /// SGPR = read SGPR value for current lane from VGPR 769 /// VGPRResult[Lane] = use_op SGPR 770 /// } 771 /// Restore Execution Mask 772 /// 773 /// There is additional complexity to try for compare values to identify the 774 /// unique values used. 775 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 776 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, 777 SmallSet<Register, 4> &SGPROperandRegs) const { 778 // Track use registers which have already been expanded with a readfirstlane 779 // sequence. This may have multiple uses if moving a sequence. 780 DenseMap<Register, Register> WaterfalledRegMap; 781 782 MachineBasicBlock &MBB = B.getMBB(); 783 MachineFunction *MF = &B.getMF(); 784 785 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); 786 const unsigned MovExecOpc = 787 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 788 const unsigned MovExecTermOpc = 789 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 790 791 const unsigned XorTermOpc = Subtarget.isWave32() ? 792 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 793 const unsigned AndSaveExecOpc = Subtarget.isWave32() ? 794 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 795 const unsigned ExecReg = Subtarget.isWave32() ? 796 AMDGPU::EXEC_LO : AMDGPU::EXEC; 797 798 #ifndef NDEBUG 799 const int OrigRangeSize = std::distance(Range.begin(), Range.end()); 800 #endif 801 802 MachineRegisterInfo &MRI = *B.getMRI(); 803 Register SaveExecReg = MRI.createVirtualRegister(WaveRC); 804 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); 805 806 // Don't bother using generic instructions/registers for the exec mask. 
807 B.buildInstr(TargetOpcode::IMPLICIT_DEF) 808 .addDef(InitSaveExecReg); 809 810 Register PhiExec = MRI.createVirtualRegister(WaveRC); 811 Register NewExec = MRI.createVirtualRegister(WaveRC); 812 813 // To insert the loop we need to split the block. Move everything before this 814 // point to a new block, and insert a new empty block before this instruction. 815 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 816 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); 817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); 819 MachineFunction::iterator MBBI(MBB); 820 ++MBBI; 821 MF->insert(MBBI, LoopBB); 822 MF->insert(MBBI, BodyBB); 823 MF->insert(MBBI, RestoreExecBB); 824 MF->insert(MBBI, RemainderBB); 825 826 LoopBB->addSuccessor(BodyBB); 827 BodyBB->addSuccessor(RestoreExecBB); 828 BodyBB->addSuccessor(LoopBB); 829 830 // Move the rest of the block into a new block. 831 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 832 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); 833 834 MBB.addSuccessor(LoopBB); 835 RestoreExecBB->addSuccessor(RemainderBB); 836 837 B.setInsertPt(*LoopBB, LoopBB->end()); 838 839 B.buildInstr(TargetOpcode::PHI) 840 .addDef(PhiExec) 841 .addReg(InitSaveExecReg) 842 .addMBB(&MBB) 843 .addReg(NewExec) 844 .addMBB(BodyBB); 845 846 const DebugLoc &DL = B.getDL(); 847 848 MachineInstr &FirstInst = *Range.begin(); 849 850 // Move the instruction into the loop body. Note we moved everything after 851 // Range.end() already into a new block, so Range.end() is no longer valid. 852 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); 853 854 // Figure out the iterator range after splicing the instructions. 
855 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); 856 auto NewEnd = BodyBB->end(); 857 858 B.setMBB(*LoopBB); 859 860 LLT S1 = LLT::scalar(1); 861 Register CondReg; 862 863 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); 864 865 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { 866 for (MachineOperand &Op : MI.all_uses()) { 867 Register OldReg = Op.getReg(); 868 if (!SGPROperandRegs.count(OldReg)) 869 continue; 870 871 // See if we already processed this register in another instruction in the 872 // sequence. 873 auto OldVal = WaterfalledRegMap.find(OldReg); 874 if (OldVal != WaterfalledRegMap.end()) { 875 Op.setReg(OldVal->second); 876 continue; 877 } 878 879 Register OpReg = Op.getReg(); 880 LLT OpTy = MRI.getType(OpReg); 881 882 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); 883 if (OpBank != &AMDGPU::VGPRRegBank) { 884 // Insert copy from AGPR to VGPR before the loop. 885 B.setMBB(MBB); 886 OpReg = B.buildCopy(OpTy, OpReg).getReg(0); 887 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); 888 B.setMBB(*LoopBB); 889 } 890 891 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg); 892 893 // Build the comparison(s). 894 unsigned OpSize = OpTy.getSizeInBits(); 895 bool Is64 = OpSize % 64 == 0; 896 unsigned PartSize = Is64 ? 
64 : 32; 897 LLT PartTy = LLT::scalar(PartSize); 898 unsigned NumParts = OpSize / PartSize; 899 SmallVector<Register, 8> OpParts; 900 SmallVector<Register, 8> CurrentLaneParts; 901 902 if (NumParts == 1) { 903 OpParts.push_back(OpReg); 904 CurrentLaneParts.push_back(CurrentLaneReg); 905 } else { 906 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg); 907 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg); 908 for (unsigned i = 0; i < NumParts; ++i) { 909 OpParts.push_back(UnmergeOp.getReg(i)); 910 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i)); 911 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); 912 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); 913 } 914 } 915 916 for (unsigned i = 0; i < NumParts; ++i) { 917 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i], 918 OpParts[i]).getReg(0); 919 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); 920 921 if (!CondReg) { 922 CondReg = CmpReg; 923 } else { 924 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0); 925 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); 926 } 927 } 928 929 Op.setReg(CurrentLaneReg); 930 931 // Make sure we don't re-process this register again. 932 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg())); 933 } 934 } 935 936 // The ballot becomes a no-op during instruction selection. 937 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, 938 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}) 939 .addReg(CondReg) 940 .getReg(0); 941 MRI.setRegClass(CondReg, WaveRC); 942 943 // Update EXEC, save the original EXEC value to VCC. 944 B.buildInstr(AndSaveExecOpc) 945 .addDef(NewExec) 946 .addReg(CondReg, RegState::Kill); 947 948 MRI.setSimpleHint(NewExec, CondReg); 949 950 B.setInsertPt(*BodyBB, BodyBB->end()); 951 952 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
953 B.buildInstr(XorTermOpc) 954 .addDef(ExecReg) 955 .addReg(ExecReg) 956 .addReg(NewExec); 957 958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 959 // s_cbranch_scc0? 960 961 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 962 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); 963 964 // Save the EXEC mask before the loop. 965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) 966 .addReg(ExecReg); 967 968 // Restore the EXEC mask after the loop. 969 B.setMBB(*RestoreExecBB); 970 B.buildInstr(MovExecTermOpc) 971 .addDef(ExecReg) 972 .addReg(SaveExecReg); 973 974 // Set the insert point after the original instruction, so any new 975 // instructions will be in the remainder. 976 B.setInsertPt(*RemainderBB, RemainderBB->begin()); 977 978 return true; 979 } 980 981 // Return any unique registers used by \p MI at \p OpIndices that need to be 982 // handled in a waterfall loop. Returns these registers in \p 983 // SGPROperandRegs. Returns true if there are any operands to handle and a 984 // waterfall loop is necessary. 985 bool AMDGPURegisterBankInfo::collectWaterfallOperands( 986 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, 987 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { 988 for (unsigned Op : OpIndices) { 989 assert(MI.getOperand(Op).isUse()); 990 Register Reg = MI.getOperand(Op).getReg(); 991 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); 992 if (OpBank->getID() != AMDGPU::SGPRRegBankID) 993 SGPROperandRegs.insert(Reg); 994 } 995 996 // No operands need to be replaced, so no need to loop. 997 return !SGPROperandRegs.empty(); 998 } 999 1000 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1001 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { 1002 // Use a set to avoid extra readfirstlanes in the case where multiple operands 1003 // are the same register. 
1004 SmallSet<Register, 4> SGPROperandRegs; 1005 1006 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices)) 1007 return false; 1008 1009 MachineBasicBlock::iterator I = MI.getIterator(); 1010 return executeInWaterfallLoop(B, make_range(I, std::next(I)), 1011 SGPROperandRegs); 1012 } 1013 1014 // Legalize an operand that must be an SGPR by inserting a readfirstlane. 1015 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( 1016 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { 1017 Register Reg = MI.getOperand(OpIdx).getReg(); 1018 MachineRegisterInfo &MRI = *B.getMRI(); 1019 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 1020 if (Bank == &AMDGPU::SGPRRegBank) 1021 return; 1022 1023 Reg = buildReadFirstLane(B, MRI, Reg); 1024 MI.getOperand(OpIdx).setReg(Reg); 1025 } 1026 1027 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the 1028 /// rest will be in the remainder. 1029 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1030 unsigned TotalSize = Ty.getSizeInBits(); 1031 if (!Ty.isVector()) 1032 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1033 1034 LLT EltTy = Ty.getElementType(); 1035 unsigned EltSize = EltTy.getSizeInBits(); 1036 assert(FirstSize % EltSize == 0); 1037 1038 unsigned FirstPartNumElts = FirstSize / EltSize; 1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1040 1041 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), 1042 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; 1043 } 1044 1045 static LLT widen96To128(LLT Ty) { 1046 if (!Ty.isVector()) 1047 return LLT::scalar(128); 1048 1049 LLT EltTy = Ty.getElementType(); 1050 assert(128 % EltTy.getSizeInBits() == 0); 1051 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); 1052 } 1053 1054 bool AMDGPURegisterBankInfo::applyMappingLoad( 1055 MachineIRBuilder &B, 1056 const AMDGPURegisterBankInfo::OperandsMapper 
&OpdMapper, 1057 MachineInstr &MI) const { 1058 MachineRegisterInfo &MRI = *B.getMRI(); 1059 Register DstReg = MI.getOperand(0).getReg(); 1060 const LLT LoadTy = MRI.getType(DstReg); 1061 unsigned LoadSize = LoadTy.getSizeInBits(); 1062 MachineMemOperand *MMO = *MI.memoperands_begin(); 1063 const unsigned MaxNonSmrdLoadSize = 128; 1064 1065 const RegisterBank *DstBank = 1066 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1067 if (DstBank == &AMDGPU::SGPRRegBank) { 1068 // There are some special cases that we need to look at for 32 bit and 96 1069 // bit SGPR loads otherwise we have nothing to do. 1070 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) 1071 return false; 1072 1073 const unsigned MemSize = 8 * MMO->getSize().getValue(); 1074 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to 1075 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit 1076 // scalar loads should have a load size of 32 but memory access size of less 1077 // than 32. 1078 if (LoadSize == 32 && 1079 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) 1080 return false; 1081 1082 if (LoadSize == 32 && 1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) || 1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) && 1085 isScalarLoadLegal(MI) && 1086 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) 1087 return false; 1088 1089 Register PtrReg = MI.getOperand(1).getReg(); 1090 1091 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 1092 1093 if (LoadSize == 32) { 1094 // This is an extending load from a sub-dword size. 
Widen the memory 1095 // access size to 4 bytes and clear the extra high bits appropriately 1096 const LLT S32 = LLT::scalar(32); 1097 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { 1098 // Must extend the sign bit into higher bits for a G_SEXTLOAD 1099 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1100 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); 1101 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { 1102 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD 1103 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1104 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); 1105 } else 1106 // We do not need to touch the higher bits for regular loads. 1107 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); 1108 } else { 1109 // 96-bit loads are only available for vector loads. We need to split this 1110 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 1111 if (MMO->getAlign() < Align(16)) { 1112 LegalizerHelper Helper(B.getMF(), ApplyBank, B); 1113 LLT Part64, Part32; 1114 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1115 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != 1116 LegalizerHelper::Legalized) 1117 return false; 1118 return true; 1119 } 1120 LLT WiderTy = widen96To128(LoadTy); 1121 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1122 if (WiderTy.isScalar()) { 1123 B.buildTrunc(MI.getOperand(0), WideLoad); 1124 } else { 1125 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), 1126 WideLoad); 1127 } 1128 } 1129 1130 MI.eraseFromParent(); 1131 return true; 1132 } 1133 1134 // 128-bit loads are supported for all instruction types. 
1135 if (LoadSize <= MaxNonSmrdLoadSize) 1136 return false; 1137 1138 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1139 1140 if (SrcRegs.empty()) 1141 SrcRegs.push_back(MI.getOperand(1).getReg()); 1142 1143 // RegBankSelect only emits scalar types, so we need to reset the pointer 1144 // operand to a pointer type. 1145 Register BasePtrReg = SrcRegs[0]; 1146 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1147 MRI.setType(BasePtrReg, PtrTy); 1148 1149 // The following are the loads not splitted enough during legalization 1150 // because it was not clear they are smem-load or vmem-load 1151 if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) || 1152 MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) { 1153 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1154 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1155 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1156 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); 1157 LegalizerHelper Helper(B.getMF(), O, B); 1158 if (LoadTy.isVector()) { 1159 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != 1160 LegalizerHelper::Legalized) 1161 return false; 1162 } else { 1163 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1164 return false; 1165 } 1166 } 1167 1168 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1169 return true; 1170 } 1171 1172 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1173 MachineIRBuilder &B, 1174 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1175 MachineInstr &MI) const { 1176 MachineRegisterInfo &MRI = *B.getMRI(); 1177 const MachineFunction &MF = B.getMF(); 1178 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1179 const auto &TFI = *ST.getFrameLowering(); 1180 1181 // Guard in case the stack growth direction ever changes with scratch 1182 // instructions. 
1183 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp && 1184 "Stack grows upwards for AMDGPU"); 1185 1186 Register Dst = MI.getOperand(0).getReg(); 1187 Register AllocSize = MI.getOperand(1).getReg(); 1188 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1189 1190 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1191 1192 if (SizeBank != &AMDGPU::SGPRRegBank) { 1193 auto WaveReduction = 1194 B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)}) 1195 .addUse(AllocSize) 1196 .addImm(0); 1197 AllocSize = WaveReduction.getReg(0); 1198 } 1199 1200 LLT PtrTy = MRI.getType(Dst); 1201 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1202 1203 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1204 Register SPReg = Info->getStackPtrOffsetReg(); 1205 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); 1206 1207 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1208 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1209 1210 auto OldSP = B.buildCopy(PtrTy, SPReg); 1211 if (Alignment > TFI.getStackAlign()) { 1212 auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1; 1213 auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP, 1214 B.buildConstant(LLT::scalar(32), StackAlignMask)); 1215 B.buildMaskLowPtrBits(Dst, Tmp1, 1216 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1217 } else { 1218 B.buildCopy(Dst, OldSP); 1219 } 1220 auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize); 1221 B.buildCopy(SPReg, PtrAdd); 1222 MI.eraseFromParent(); 1223 return true; 1224 } 1225 1226 bool AMDGPURegisterBankInfo::applyMappingImage( 1227 MachineIRBuilder &B, MachineInstr &MI, 1228 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1229 int RsrcIdx) const { 1230 const int NumDefs = MI.getNumExplicitDefs(); 1231 1232 // The reported argument index is relative to the IR intrinsic call arguments, 1233 // so we need to shift by the number of 
defs and the intrinsic ID. 1234 RsrcIdx += NumDefs + 1; 1235 1236 // Insert copies to VGPR arguments. 1237 applyDefaultMapping(OpdMapper); 1238 1239 // Fixup any SGPR arguments. 1240 SmallVector<unsigned, 4> SGPRIndexes; 1241 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1242 if (!MI.getOperand(I).isReg()) 1243 continue; 1244 1245 // If this intrinsic has a sampler, it immediately follows rsrc. 1246 if (I == RsrcIdx || I == RsrcIdx + 1) 1247 SGPRIndexes.push_back(I); 1248 } 1249 1250 executeInWaterfallLoop(B, MI, SGPRIndexes); 1251 return true; 1252 } 1253 1254 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1255 // the three offsets (voffset, soffset and instoffset) 1256 unsigned AMDGPURegisterBankInfo::setBufferOffsets( 1257 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, 1258 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { 1259 const LLT S32 = LLT::scalar(32); 1260 MachineRegisterInfo *MRI = B.getMRI(); 1261 1262 if (std::optional<int64_t> Imm = 1263 getIConstantVRegSExtVal(CombinedOffset, *MRI)) { 1264 uint32_t SOffset, ImmOffset; 1265 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) { 1266 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1267 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1268 InstOffsetVal = ImmOffset; 1269 1270 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1271 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1272 return SOffset + ImmOffset; 1273 } 1274 } 1275 1276 Register Base; 1277 unsigned Offset; 1278 1279 std::tie(Base, Offset) = 1280 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1281 1282 uint32_t SOffset, ImmOffset; 1283 if ((int)Offset > 0 && 1284 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { 1285 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { 1286 VOffsetReg = Base; 1287 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1288 
B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1289 InstOffsetVal = ImmOffset; 1290 return 0; // XXX - Why is this 0? 1291 } 1292 1293 // If we have SGPR base, we can use it for soffset. 1294 if (SOffset == 0) { 1295 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1296 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1297 SOffsetReg = Base; 1298 InstOffsetVal = ImmOffset; 1299 return 0; // XXX - Why is this 0? 1300 } 1301 } 1302 1303 // Handle the variable sgpr + vgpr case. 1304 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); 1305 if (Add && (int)Offset >= 0) { 1306 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); 1307 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); 1308 1309 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI); 1310 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI); 1311 1312 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1313 VOffsetReg = Src0; 1314 SOffsetReg = Src1; 1315 return 0; 1316 } 1317 1318 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1319 VOffsetReg = Src1; 1320 SOffsetReg = Src0; 1321 return 0; 1322 } 1323 } 1324 1325 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1326 // have an SGPR offset and a VGPR resource. 
1327 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { 1328 VOffsetReg = CombinedOffset; 1329 } else { 1330 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); 1331 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1332 } 1333 1334 SOffsetReg = B.buildConstant(S32, 0).getReg(0); 1335 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1336 return 0; 1337 } 1338 1339 static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) { 1340 switch (Opc) { 1341 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: 1342 return AMDGPU::G_AMDGPU_BUFFER_LOAD; 1343 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: 1344 return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 1345 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: 1346 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE; 1347 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: 1348 return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 1349 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: 1350 return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT; 1351 default: 1352 break; 1353 } 1354 llvm_unreachable("Unexpected s_buffer_load opcode"); 1355 } 1356 1357 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( 1358 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 1359 MachineInstr &MI = OpdMapper.getMI(); 1360 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1361 1362 const LLT S32 = LLT::scalar(32); 1363 Register Dst = MI.getOperand(0).getReg(); 1364 LLT Ty = MRI.getType(Dst); 1365 1366 const RegisterBank *RSrcBank = 1367 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1368 const RegisterBank *OffsetBank = 1369 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1370 if (RSrcBank == &AMDGPU::SGPRRegBank && 1371 OffsetBank == &AMDGPU::SGPRRegBank) 1372 return true; // Legal mapping 1373 1374 // FIXME: 96-bit case was widened during legalize. We need to narrow it back 1375 // here but don't have an MMO. 
1376 1377 unsigned LoadSize = Ty.getSizeInBits(); 1378 int NumLoads = 1; 1379 if (LoadSize == 256 || LoadSize == 512) { 1380 NumLoads = LoadSize / 128; 1381 Ty = Ty.divide(NumLoads); 1382 } 1383 1384 // Use the alignment to ensure that the required offsets will fit into the 1385 // immediate offsets. 1386 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); 1387 1388 MachineFunction &MF = B.getMF(); 1389 1390 Register SOffset; 1391 Register VOffset; 1392 int64_t ImmOffset = 0; 1393 1394 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset, 1395 SOffset, ImmOffset, Alignment); 1396 1397 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we 1398 // can, but we need to track an MMO for that. 1399 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; 1400 const Align MemAlign(4); // FIXME: ABI type alignment? 1401 MachineMemOperand *BaseMMO = MF.getMachineMemOperand( 1402 MachinePointerInfo(), 1403 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1404 MachineMemOperand::MOInvariant, 1405 MemSize, MemAlign); 1406 if (MMOOffset != 0) 1407 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); 1408 1409 // If only the offset is divergent, emit a MUBUF buffer load instead. We can 1410 // assume that the buffer is unswizzled. 
1411 1412 Register RSrc = MI.getOperand(1).getReg(); 1413 Register VIndex = B.buildConstant(S32, 0).getReg(0); 1414 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); 1415 1416 SmallVector<Register, 4> LoadParts(NumLoads); 1417 1418 MachineBasicBlock::iterator MII = MI.getIterator(); 1419 MachineInstrSpan Span(MII, &B.getMBB()); 1420 1421 for (int i = 0; i < NumLoads; ++i) { 1422 if (NumLoads == 1) { 1423 LoadParts[i] = Dst; 1424 } else { 1425 LoadParts[i] = MRI.createGenericVirtualRegister(Ty); 1426 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); 1427 } 1428 1429 MachineMemOperand *MMO = BaseMMO; 1430 if (i != 0) 1431 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); 1432 1433 B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode())) 1434 .addDef(LoadParts[i]) // vdata 1435 .addUse(RSrc) // rsrc 1436 .addUse(VIndex) // vindex 1437 .addUse(VOffset) // voffset 1438 .addUse(SOffset) // soffset 1439 .addImm(ImmOffset + 16 * i) // offset(imm) 1440 .addImm(0) // cachepolicy, swizzled buffer(imm) 1441 .addImm(0) // idxen(imm) 1442 .addMemOperand(MMO); 1443 } 1444 1445 // TODO: If only the resource is a VGPR, it may be better to execute the 1446 // scalar load in the waterfall loop if the resource is expected to frequently 1447 // be dynamically uniform. 1448 if (RSrcBank != &AMDGPU::SGPRRegBank) { 1449 // Remove the original instruction to avoid potentially confusing the 1450 // waterfall loop logic. 1451 B.setInstr(*Span.begin()); 1452 MI.eraseFromParent(); 1453 1454 SmallSet<Register, 4> OpsToWaterfall; 1455 1456 OpsToWaterfall.insert(RSrc); 1457 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1458 OpsToWaterfall); 1459 } 1460 1461 if (NumLoads != 1) { 1462 if (Ty.isVector()) 1463 B.buildConcatVectors(Dst, LoadParts); 1464 else 1465 B.buildMergeLikeInstr(Dst, LoadParts); 1466 } 1467 1468 // We removed the instruction earlier with a waterfall loop. 
1469 if (RSrcBank == &AMDGPU::SGPRRegBank) 1470 MI.eraseFromParent(); 1471 1472 return true; 1473 } 1474 1475 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, 1476 const OperandsMapper &OpdMapper, 1477 bool Signed) const { 1478 MachineInstr &MI = OpdMapper.getMI(); 1479 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1480 1481 // Insert basic copies 1482 applyDefaultMapping(OpdMapper); 1483 1484 Register DstReg = MI.getOperand(0).getReg(); 1485 LLT Ty = MRI.getType(DstReg); 1486 1487 const LLT S32 = LLT::scalar(32); 1488 1489 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1; 1490 Register SrcReg = MI.getOperand(FirstOpnd).getReg(); 1491 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg(); 1492 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg(); 1493 1494 const RegisterBank *DstBank = 1495 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1496 if (DstBank == &AMDGPU::VGPRRegBank) { 1497 if (Ty == S32) 1498 return true; 1499 1500 // There is no 64-bit vgpr bitfield extract instructions so the operation 1501 // is expanded to a sequence of instructions that implement the operation. 1502 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); 1503 1504 const LLT S64 = LLT::scalar(64); 1505 // Shift the source operand so that extracted bits start at bit 0. 1506 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg) 1507 : B.buildLShr(S64, SrcReg, OffsetReg); 1508 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset); 1509 1510 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions 1511 // if the width is a constant. 1512 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) { 1513 // Use the 32-bit bitfield extract instruction if the width is a constant. 1514 // Depending on the width size, use either the low or high 32-bits. 
1515 auto Zero = B.buildConstant(S32, 0); 1516 auto WidthImm = ConstWidth->Value.getZExtValue(); 1517 if (WidthImm <= 32) { 1518 // Use bitfield extract on the lower 32-bit source, and then sign-extend 1519 // or clear the upper 32-bits. 1520 auto Extract = 1521 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg) 1522 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg); 1523 auto Extend = 1524 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero; 1525 B.buildMergeLikeInstr(DstReg, {Extract, Extend}); 1526 } else { 1527 // Use bitfield extract on upper 32-bit source, and combine with lower 1528 // 32-bit source. 1529 auto UpperWidth = B.buildConstant(S32, WidthImm - 32); 1530 auto Extract = 1531 Signed 1532 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth) 1533 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth); 1534 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract}); 1535 } 1536 MI.eraseFromParent(); 1537 return true; 1538 } 1539 1540 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit 1541 // operations. 1542 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg); 1543 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift); 1544 if (Signed) 1545 B.buildAShr(S64, SignBit, ExtShift); 1546 else 1547 B.buildLShr(S64, SignBit, ExtShift); 1548 MI.eraseFromParent(); 1549 return true; 1550 } 1551 1552 // The scalar form packs the offset and width in a single operand. 1553 1554 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); 1555 1556 // Ensure the high bits are clear to insert the offset. 1557 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1558 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1559 1560 // Zeros out the low bits, so don't bother clamping the input value. 
1561 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1562 1563 // Transformation function, pack the offset and width of a BFE into 1564 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1565 // source, bits [5:0] contain the offset and bits [22:16] the width. 1566 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1567 1568 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1569 // register class constraints. 1570 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1571 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1572 1573 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1574 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1575 llvm_unreachable("failed to constrain BFE"); 1576 1577 MI.eraseFromParent(); 1578 return true; 1579 } 1580 1581 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1582 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 1583 MachineInstr &MI = OpdMapper.getMI(); 1584 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1585 1586 // Insert basic copies. 1587 applyDefaultMapping(OpdMapper); 1588 1589 Register Dst0 = MI.getOperand(0).getReg(); 1590 Register Dst1 = MI.getOperand(1).getReg(); 1591 Register Src0 = MI.getOperand(2).getReg(); 1592 Register Src1 = MI.getOperand(3).getReg(); 1593 Register Src2 = MI.getOperand(4).getReg(); 1594 1595 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1596 return true; 1597 1598 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1599 LLT S1 = LLT::scalar(1); 1600 LLT S32 = LLT::scalar(32); 1601 1602 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1603 bool Accumulate = true; 1604 1605 if (!DstOnValu) { 1606 if (mi_match(Src2, MRI, m_ZeroInt())) 1607 Accumulate = false; 1608 } 1609 1610 // Keep the multiplication on the SALU. 
1611 Register DstHi; 1612 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1613 bool MulHiInVgpr = false; 1614 1615 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1616 1617 if (Subtarget.hasSMulHi()) { 1618 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1619 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1620 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1621 } else { 1622 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1623 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1624 1625 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1626 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1627 1628 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1629 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1630 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1631 1632 if (!DstOnValu) { 1633 DstHi = buildReadFirstLane(B, MRI, DstHi); 1634 } else { 1635 MulHiInVgpr = true; 1636 } 1637 } 1638 1639 // Accumulate and produce the "carry-out" bit. 1640 // 1641 // The "carry-out" is defined as bit 64 of the result when computed as a 1642 // big integer. For unsigned multiply-add, this matches the usual definition 1643 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1644 // result, which is determined as: 1645 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1646 LLT CarryType = DstOnValu ? S1 : S32; 1647 const RegisterBank &CarryBank = 1648 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1649 const RegisterBank &DstBank = 1650 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1651 Register Carry; 1652 Register Zero; 1653 1654 if (!IsUnsigned) { 1655 Zero = B.buildConstant(S32, 0).getReg(0); 1656 MRI.setRegBank(Zero, 1657 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1658 1659 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1660 .getReg(0); 1661 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1662 : AMDGPU::SGPRRegBank); 1663 1664 if (DstOnValu && !MulHiInVgpr) { 1665 Carry = B.buildTrunc(S1, Carry).getReg(0); 1666 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1667 } 1668 } 1669 1670 if (Accumulate) { 1671 if (DstOnValu) { 1672 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1673 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1674 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1675 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1676 } 1677 1678 auto Unmerge = B.buildUnmerge(S32, Src2); 1679 Register Src2Lo = Unmerge.getReg(0); 1680 Register Src2Hi = Unmerge.getReg(1); 1681 MRI.setRegBank(Src2Lo, DstBank); 1682 MRI.setRegBank(Src2Hi, DstBank); 1683 1684 if (!IsUnsigned) { 1685 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1686 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1687 1688 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1689 MRI.setRegBank(Carry, CarryBank); 1690 } 1691 1692 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1693 DstLo = AddLo.getReg(0); 1694 Register CarryLo = AddLo.getReg(1); 1695 MRI.setRegBank(DstLo, DstBank); 1696 MRI.setRegBank(CarryLo, CarryBank); 1697 1698 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1699 DstHi = AddHi.getReg(0); 1700 MRI.setRegBank(DstHi, DstBank); 1701 1702 Register CarryHi = AddHi.getReg(1); 1703 MRI.setRegBank(CarryHi, CarryBank); 1704 1705 if (IsUnsigned) { 1706 Carry = CarryHi; 1707 } else { 1708 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1709 MRI.setRegBank(Carry, CarryBank); 1710 } 1711 } else { 1712 if (IsUnsigned) { 1713 Carry = B.buildConstant(CarryType, 0).getReg(0); 1714 MRI.setRegBank(Carry, CarryBank); 1715 } 1716 } 1717 1718 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); 1719 1720 if (DstOnValu) { 1721 B.buildCopy(Dst1, Carry); 1722 } else { 1723 B.buildTrunc(Dst1, Carry); 1724 } 1725 1726 MI.eraseFromParent(); 1727 return true; 1728 } 1729 1730 // Return a suitable opcode for extending the operands of Opc 
when widening. 1731 static unsigned getExtendOp(unsigned Opc) { 1732 switch (Opc) { 1733 case TargetOpcode::G_ASHR: 1734 case TargetOpcode::G_SMIN: 1735 case TargetOpcode::G_SMAX: 1736 return TargetOpcode::G_SEXT; 1737 case TargetOpcode::G_LSHR: 1738 case TargetOpcode::G_UMIN: 1739 case TargetOpcode::G_UMAX: 1740 return TargetOpcode::G_ZEXT; 1741 default: 1742 return TargetOpcode::G_ANYEXT; 1743 } 1744 } 1745 1746 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1747 // any illegal vector extend or unmerge operations. 1748 static std::pair<Register, Register> 1749 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1750 const LLT S32 = LLT::scalar(32); 1751 auto Bitcast = B.buildBitcast(S32, Src); 1752 1753 if (ExtOpcode == TargetOpcode::G_SEXT) { 1754 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1755 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1756 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1757 } 1758 1759 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1760 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1761 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1762 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1763 } 1764 1765 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1766 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1767 } 1768 1769 // For cases where only a single copy is inserted for matching register banks. 1770 // Replace the register in the instruction operand 1771 static bool substituteSimpleCopyRegs( 1772 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1773 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1774 if (!SrcReg.empty()) { 1775 assert(SrcReg.size() == 1); 1776 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1777 return true; 1778 } 1779 1780 return false; 1781 } 1782 1783 /// Handle register layout difference for f16 images for some subtargets. 
1784 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1785 MachineRegisterInfo &MRI, 1786 Register Reg) const { 1787 if (!Subtarget.hasUnpackedD16VMem()) 1788 return Reg; 1789 1790 const LLT S16 = LLT::scalar(16); 1791 LLT StoreVT = MRI.getType(Reg); 1792 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1793 return Reg; 1794 1795 auto Unmerge = B.buildUnmerge(S16, Reg); 1796 1797 1798 SmallVector<Register, 4> WideRegs; 1799 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1800 WideRegs.push_back(Unmerge.getReg(I)); 1801 1802 const LLT S32 = LLT::scalar(32); 1803 int NumElts = StoreVT.getNumElements(); 1804 1805 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs) 1806 .getReg(0); 1807 } 1808 1809 static std::pair<Register, unsigned> 1810 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1811 int64_t Const; 1812 if (mi_match(Reg, MRI, m_ICst(Const))) 1813 return std::pair(Register(), Const); 1814 1815 Register Base; 1816 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1817 return std::pair(Base, Const); 1818 1819 // TODO: Handle G_OR used for add case 1820 return std::pair(Reg, 0); 1821 } 1822 1823 std::pair<Register, unsigned> 1824 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1825 Register OrigOffset) const { 1826 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget); 1827 Register BaseReg; 1828 unsigned ImmOffset; 1829 const LLT S32 = LLT::scalar(32); 1830 1831 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. 1832 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1833 OrigOffset); 1834 1835 unsigned C1 = 0; 1836 if (ImmOffset != 0) { 1837 // If the immediate value is too big for the immoffset field, put only bits 1838 // that would normally fit in the immoffset field. 
The remaining value that 1839 // is copied/added for the voffset field is a large power of 2, and it 1840 // stands more chance of being CSEd with the copy/add for another similar 1841 // load/store. 1842 // However, do not do that rounding down if that is a negative 1843 // number, as it appears to be illegal to have a negative offset in the 1844 // vgpr, even if adding the immediate offset makes it positive. 1845 unsigned Overflow = ImmOffset & ~MaxImm; 1846 ImmOffset -= Overflow; 1847 if ((int32_t)Overflow < 0) { 1848 Overflow += ImmOffset; 1849 ImmOffset = 0; 1850 } 1851 1852 C1 = ImmOffset; 1853 if (Overflow != 0) { 1854 if (!BaseReg) 1855 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1856 else { 1857 auto OverflowVal = B.buildConstant(S32, Overflow); 1858 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1859 } 1860 } 1861 } 1862 1863 if (!BaseReg) 1864 BaseReg = B.buildConstant(S32, 0).getReg(0); 1865 1866 return {BaseReg, C1}; 1867 } 1868 1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1870 Register SrcReg) const { 1871 MachineRegisterInfo &MRI = *B.getMRI(); 1872 LLT SrcTy = MRI.getType(SrcReg); 1873 if (SrcTy.getSizeInBits() == 32) { 1874 // Use a v_mov_b32 here to make the exec dependency explicit. 
1875 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1876 .addDef(DstReg) 1877 .addUse(SrcReg); 1878 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1879 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1880 } 1881 1882 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1883 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1884 1885 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1886 .addDef(TmpReg0) 1887 .addUse(SrcReg, 0, AMDGPU::sub0); 1888 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1889 .addDef(TmpReg1) 1890 .addUse(SrcReg, 0, AMDGPU::sub1); 1891 B.buildInstr(AMDGPU::REG_SEQUENCE) 1892 .addDef(DstReg) 1893 .addUse(TmpReg0) 1894 .addImm(AMDGPU::sub0) 1895 .addUse(TmpReg1) 1896 .addImm(AMDGPU::sub1); 1897 1898 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1899 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1900 } 1901 1902 /// Utility function for pushing dynamic vector indexes with a constant offset 1903 /// into waterfall loops. 1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1905 MachineInstr &IdxUseInstr, 1906 unsigned OpIdx, 1907 unsigned ConstOffset) { 1908 MachineRegisterInfo &MRI = *B.getMRI(); 1909 const LLT S32 = LLT::scalar(32); 1910 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1911 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1912 1913 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1914 1915 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1916 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1917 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1918 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1919 } 1920 1921 /// Implement extending a 32-bit value to a 64-bit value. 
\p Lo32Reg is the 1922 /// original 32-bit source value (to be inserted in the low part of the combined 1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1924 /// value. 1925 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1926 Register Hi32Reg, Register Lo32Reg, 1927 unsigned ExtOpc, 1928 const RegisterBank &RegBank, 1929 bool IsBooleanSrc = false) { 1930 if (ExtOpc == AMDGPU::G_ZEXT) { 1931 B.buildConstant(Hi32Reg, 0); 1932 } else if (ExtOpc == AMDGPU::G_SEXT) { 1933 if (IsBooleanSrc) { 1934 // If we know the original source was an s1, the high half is the same as 1935 // the low. 1936 B.buildCopy(Hi32Reg, Lo32Reg); 1937 } else { 1938 // Replicate sign bit from 32-bit extended part. 1939 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1940 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1941 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1942 } 1943 } else { 1944 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1945 B.buildUndef(Hi32Reg); 1946 } 1947 } 1948 1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1950 MachineIRBuilder &B, MachineInstr &MI, 1951 const OperandsMapper &OpdMapper) const { 1952 MachineRegisterInfo &MRI = *B.getMRI(); 1953 1954 Register VecReg = MI.getOperand(1).getReg(); 1955 Register Idx = MI.getOperand(2).getReg(); 1956 1957 const RegisterBank &IdxBank = 1958 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1959 1960 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1961 1962 LLT VecTy = MRI.getType(VecReg); 1963 unsigned EltSize = VecTy.getScalarSizeInBits(); 1964 unsigned NumElem = VecTy.getNumElements(); 1965 1966 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1967 IsDivergentIdx, &Subtarget)) 1968 return false; 1969 1970 LLT S32 = LLT::scalar(32); 1971 1972 const RegisterBank &DstBank = 1973 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1974 const RegisterBank &SrcBank = 1975 
*OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1976 1977 const RegisterBank &CCBank = 1978 (DstBank == AMDGPU::SGPRRegBank && 1979 SrcBank == AMDGPU::SGPRRegBank && 1980 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1981 : AMDGPU::VCCRegBank; 1982 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1983 1984 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1985 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1986 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1987 } 1988 1989 LLT EltTy = VecTy.getScalarType(); 1990 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1991 unsigned NumLanes = DstRegs.size(); 1992 if (!NumLanes) 1993 NumLanes = 1; 1994 else 1995 EltTy = MRI.getType(DstRegs[0]); 1996 1997 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1998 SmallVector<Register, 2> Res(NumLanes); 1999 for (unsigned L = 0; L < NumLanes; ++L) 2000 Res[L] = UnmergeToEltTy.getReg(L); 2001 2002 for (unsigned I = 1; I < NumElem; ++I) { 2003 auto IC = B.buildConstant(S32, I); 2004 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2005 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2006 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2007 2008 for (unsigned L = 0; L < NumLanes; ++L) { 2009 auto S = B.buildSelect(EltTy, Cmp, 2010 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 2011 2012 for (unsigned N : { 0, 2, 3 }) 2013 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 2014 2015 Res[L] = S->getOperand(0).getReg(); 2016 } 2017 } 2018 2019 for (unsigned L = 0; L < NumLanes; ++L) { 2020 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 2021 B.buildCopy(DstReg, Res[L]); 2022 MRI.setRegBank(DstReg, DstBank); 2023 } 2024 2025 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2026 MI.eraseFromParent(); 2027 2028 return true; 2029 } 2030 2031 // Insert a cross regbank copy for a register if it already has a bank that 2032 // differs from the one we want to set. 2033 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2034 MachineIRBuilder &B, Register &Reg, 2035 const RegisterBank &Bank) { 2036 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2037 if (CurrBank && *CurrBank != Bank) { 2038 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2039 MRI.setRegBank(Copy, Bank); 2040 return Copy; 2041 } 2042 2043 MRI.setRegBank(Reg, Bank); 2044 return Reg; 2045 } 2046 2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2048 MachineIRBuilder &B, MachineInstr &MI, 2049 const OperandsMapper &OpdMapper) const { 2050 2051 MachineRegisterInfo &MRI = *B.getMRI(); 2052 Register VecReg = MI.getOperand(1).getReg(); 2053 Register Idx = MI.getOperand(3).getReg(); 2054 2055 const RegisterBank &IdxBank = 2056 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2057 2058 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2059 2060 LLT VecTy = MRI.getType(VecReg); 2061 unsigned EltSize = VecTy.getScalarSizeInBits(); 2062 unsigned NumElem = VecTy.getNumElements(); 2063 2064 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2065 IsDivergentIdx, &Subtarget)) 2066 return false; 2067 2068 LLT S32 = LLT::scalar(32); 2069 2070 const RegisterBank &DstBank = 2071 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2072 const RegisterBank &SrcBank = 2073 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2074 const RegisterBank &InsBank = 2075 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2076 2077 const RegisterBank &CCBank = 2078 (DstBank == 
AMDGPU::SGPRRegBank && 2079 SrcBank == AMDGPU::SGPRRegBank && 2080 InsBank == AMDGPU::SGPRRegBank && 2081 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2082 : AMDGPU::VCCRegBank; 2083 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 2084 2085 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2086 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2087 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2088 } 2089 2090 LLT EltTy = VecTy.getScalarType(); 2091 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2092 unsigned NumLanes = InsRegs.size(); 2093 if (!NumLanes) { 2094 NumLanes = 1; 2095 InsRegs.push_back(MI.getOperand(2).getReg()); 2096 } else { 2097 EltTy = MRI.getType(InsRegs[0]); 2098 } 2099 2100 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2101 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2102 2103 for (unsigned I = 0; I < NumElem; ++I) { 2104 auto IC = B.buildConstant(S32, I); 2105 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2106 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2107 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2108 2109 for (unsigned L = 0; L < NumLanes; ++L) { 2110 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2111 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2112 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2113 2114 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2115 MRI.setRegBank(Select, DstBank); 2116 2117 Ops[I * NumLanes + L] = Select; 2118 } 2119 } 2120 2121 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2122 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2123 B.buildBuildVector(MI.getOperand(0), Ops); 2124 } else { 2125 auto Vec = B.buildBuildVector(MergeTy, Ops); 2126 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2127 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2128 } 2129 2130 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2131 MI.eraseFromParent(); 
2132 2133 return true; 2134 } 2135 2136 // Break s_mul_u64 into 32-bit vector operations. 2137 void AMDGPURegisterBankInfo::applyMappingSMULU64( 2138 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2139 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2140 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2141 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2142 2143 // All inputs are SGPRs, nothing special to do. 2144 if (DefRegs.empty()) { 2145 assert(Src0Regs.empty() && Src1Regs.empty()); 2146 applyDefaultMapping(OpdMapper); 2147 return; 2148 } 2149 2150 assert(DefRegs.size() == 2); 2151 assert(Src0Regs.size() == Src1Regs.size() && 2152 (Src0Regs.empty() || Src0Regs.size() == 2)); 2153 2154 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2155 MachineInstr &MI = OpdMapper.getMI(); 2156 Register DstReg = MI.getOperand(0).getReg(); 2157 LLT HalfTy = LLT::scalar(32); 2158 2159 // Depending on where the source registers came from, the generic code may 2160 // have decided to split the inputs already or not. If not, we still need to 2161 // extract the values. 2162 2163 if (Src0Regs.empty()) 2164 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2165 else 2166 setRegsToType(MRI, Src0Regs, HalfTy); 2167 2168 if (Src1Regs.empty()) 2169 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2170 else 2171 setRegsToType(MRI, Src1Regs, HalfTy); 2172 2173 setRegsToType(MRI, DefRegs, HalfTy); 2174 2175 // The multiplication is done as follows: 2176 // 2177 // Op1H Op1L 2178 // * Op0H Op0L 2179 // -------------------- 2180 // Op1H*Op0L Op1L*Op0L 2181 // + Op1H*Op0H Op1L*Op0H 2182 // ----------------------------------------- 2183 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L 2184 // 2185 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit 2186 // value and that would overflow. 2187 // The low 32-bit value is Op1L*Op0L. 
2188 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from 2189 // Op1L*Op0L). 2190 2191 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); 2192 2193 Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); 2194 Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); 2195 Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); 2196 Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); 2197 B.buildAdd(DefRegs[1], Add, MulHiLo); 2198 B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); 2199 2200 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2201 MI.eraseFromParent(); 2202 } 2203 2204 void AMDGPURegisterBankInfo::applyMappingImpl( 2205 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2206 MachineInstr &MI = OpdMapper.getMI(); 2207 B.setInstrAndDebugLoc(MI); 2208 unsigned Opc = MI.getOpcode(); 2209 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2210 switch (Opc) { 2211 case AMDGPU::G_CONSTANT: 2212 case AMDGPU::G_IMPLICIT_DEF: { 2213 Register DstReg = MI.getOperand(0).getReg(); 2214 LLT DstTy = MRI.getType(DstReg); 2215 if (DstTy != LLT::scalar(1)) 2216 break; 2217 2218 const RegisterBank *DstBank = 2219 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2220 if (DstBank == &AMDGPU::VCCRegBank) 2221 break; 2222 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2223 if (DefRegs.empty()) 2224 DefRegs.push_back(DstReg); 2225 2226 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2227 2228 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 2229 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 2230 2231 MI.getOperand(0).setReg(NewDstReg); 2232 if (Opc != AMDGPU::G_IMPLICIT_DEF) { 2233 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); 2234 MI.getOperand(1).setCImm( 2235 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); 2236 } 2237 2238 MRI.setRegBank(NewDstReg, *DstBank); 2239 B.buildTrunc(DefRegs[0], 
NewDstReg); 2240 return; 2241 } 2242 case AMDGPU::G_PHI: { 2243 Register DstReg = MI.getOperand(0).getReg(); 2244 LLT DstTy = MRI.getType(DstReg); 2245 if (DstTy != LLT::scalar(1)) 2246 break; 2247 2248 const LLT S32 = LLT::scalar(32); 2249 const RegisterBank *DstBank = 2250 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2251 if (DstBank == &AMDGPU::VCCRegBank) { 2252 applyDefaultMapping(OpdMapper); 2253 // The standard handling only considers the result register bank for 2254 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2255 // produce an invalid copy. We can only copy with some kind of compare to 2256 // get a vector boolean result. Insert a register bank copy that will be 2257 // correctly lowered to a compare. 2258 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2259 Register SrcReg = MI.getOperand(I).getReg(); 2260 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2261 2262 if (SrcBank != &AMDGPU::VCCRegBank) { 2263 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2264 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2265 2266 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2267 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2268 MI.getOperand(I).setReg(Copy.getReg(0)); 2269 } 2270 } 2271 2272 return; 2273 } 2274 2275 // Phi handling is strange and only considers the bank of the destination. 
2276 substituteSimpleCopyRegs(OpdMapper, 0); 2277 2278 // Promote SGPR/VGPR booleans to s32 2279 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2280 B.setInsertPt(B.getMBB(), MI); 2281 LegalizerHelper Helper(B.getMF(), ApplyBank, B); 2282 2283 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2284 llvm_unreachable("widen scalar should have succeeded"); 2285 2286 return; 2287 } 2288 case AMDGPU::G_FCMP: 2289 if (!Subtarget.hasSALUFloatInsts()) 2290 break; 2291 [[fallthrough]]; 2292 case AMDGPU::G_ICMP: 2293 case AMDGPU::G_UADDO: 2294 case AMDGPU::G_USUBO: 2295 case AMDGPU::G_UADDE: 2296 case AMDGPU::G_SADDE: 2297 case AMDGPU::G_USUBE: 2298 case AMDGPU::G_SSUBE: { 2299 unsigned BoolDstOp = 2300 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; 2301 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2302 2303 const RegisterBank *DstBank = 2304 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2305 if (DstBank != &AMDGPU::SGPRRegBank) 2306 break; 2307 2308 const bool HasCarryIn = MI.getNumOperands() == 5; 2309 2310 // If this is a scalar compare, promote the result to s32, as the selection 2311 // will end up using a copy to a 32-bit vreg. 2312 const LLT S32 = LLT::scalar(32); 2313 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2314 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2315 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2316 2317 if (HasCarryIn) { 2318 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2319 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2320 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2321 MI.getOperand(4).setReg(NewSrcReg); 2322 } 2323 2324 MachineBasicBlock *MBB = MI.getParent(); 2325 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2326 2327 // If we had a constrained VCC result register, a copy was inserted to VCC 2328 // from SGPR. 
2329 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2330 if (DefRegs.empty()) 2331 DefRegs.push_back(DstReg); 2332 B.buildTrunc(DefRegs[0], NewDstReg); 2333 return; 2334 } 2335 case AMDGPU::G_SELECT: { 2336 Register DstReg = MI.getOperand(0).getReg(); 2337 LLT DstTy = MRI.getType(DstReg); 2338 2339 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2340 if (CondRegs.empty()) 2341 CondRegs.push_back(MI.getOperand(1).getReg()); 2342 else { 2343 assert(CondRegs.size() == 1); 2344 } 2345 2346 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2347 if (CondBank == &AMDGPU::SGPRRegBank) { 2348 const LLT S32 = LLT::scalar(32); 2349 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2350 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2351 2352 MI.getOperand(1).setReg(NewCondReg); 2353 B.buildZExt(NewCondReg, CondRegs[0]); 2354 } 2355 2356 if (DstTy.getSizeInBits() != 64) 2357 break; 2358 2359 LLT HalfTy = getHalfSizedType(DstTy); 2360 2361 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2362 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2363 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2364 2365 // All inputs are SGPRs, nothing special to do. 
2366 if (DefRegs.empty()) { 2367 assert(Src1Regs.empty() && Src2Regs.empty()); 2368 break; 2369 } 2370 2371 if (Src1Regs.empty()) 2372 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2373 else { 2374 setRegsToType(MRI, Src1Regs, HalfTy); 2375 } 2376 2377 if (Src2Regs.empty()) 2378 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2379 else 2380 setRegsToType(MRI, Src2Regs, HalfTy); 2381 2382 setRegsToType(MRI, DefRegs, HalfTy); 2383 2384 auto Flags = MI.getFlags(); 2385 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0], Flags); 2386 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1], Flags); 2387 2388 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2389 MI.eraseFromParent(); 2390 return; 2391 } 2392 case AMDGPU::G_BRCOND: { 2393 Register CondReg = MI.getOperand(0).getReg(); 2394 // FIXME: Should use legalizer helper, but should change bool ext type. 2395 const RegisterBank *CondBank = 2396 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2397 2398 if (CondBank == &AMDGPU::SGPRRegBank) { 2399 const LLT S32 = LLT::scalar(32); 2400 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2401 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2402 2403 MI.getOperand(0).setReg(NewCondReg); 2404 B.buildZExt(NewCondReg, CondReg); 2405 return; 2406 } 2407 2408 break; 2409 } 2410 case AMDGPU::G_AND: 2411 case AMDGPU::G_OR: 2412 case AMDGPU::G_XOR: { 2413 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2414 // there is a VGPR input. 
2415 Register DstReg = MI.getOperand(0).getReg(); 2416 LLT DstTy = MRI.getType(DstReg); 2417 2418 const RegisterBank *DstBank = 2419 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2420 2421 if (DstTy.getSizeInBits() == 1) { 2422 if (DstBank == &AMDGPU::VCCRegBank) 2423 break; 2424 2425 MachineFunction *MF = MI.getParent()->getParent(); 2426 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2427 LegalizerHelper Helper(*MF, ApplyBank, B); 2428 2429 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2430 LegalizerHelper::Legalized) 2431 llvm_unreachable("widen scalar should have succeeded"); 2432 return; 2433 } 2434 2435 if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) { 2436 const LLT S32 = LLT::scalar(32); 2437 MachineBasicBlock *MBB = MI.getParent(); 2438 MachineFunction *MF = MBB->getParent(); 2439 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); 2440 LegalizerHelper Helper(*MF, ApplySALU, B); 2441 // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening 2442 // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1 2443 // as "not". 2444 if (MI.getOpcode() == AMDGPU::G_XOR && 2445 mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) { 2446 Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT); 2447 Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT); 2448 Helper.widenScalarDst(MI, S32); 2449 } else { 2450 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2451 llvm_unreachable("widen scalar should have succeeded"); 2452 } 2453 return; 2454 } 2455 2456 if (DstTy.getSizeInBits() != 64) 2457 break; 2458 2459 LLT HalfTy = getHalfSizedType(DstTy); 2460 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2461 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2462 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2463 2464 // All inputs are SGPRs, nothing special to do. 
2465 if (DefRegs.empty()) { 2466 assert(Src0Regs.empty() && Src1Regs.empty()); 2467 break; 2468 } 2469 2470 assert(DefRegs.size() == 2); 2471 assert(Src0Regs.size() == Src1Regs.size() && 2472 (Src0Regs.empty() || Src0Regs.size() == 2)); 2473 2474 // Depending on where the source registers came from, the generic code may 2475 // have decided to split the inputs already or not. If not, we still need to 2476 // extract the values. 2477 2478 if (Src0Regs.empty()) 2479 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2480 else 2481 setRegsToType(MRI, Src0Regs, HalfTy); 2482 2483 if (Src1Regs.empty()) 2484 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2485 else 2486 setRegsToType(MRI, Src1Regs, HalfTy); 2487 2488 setRegsToType(MRI, DefRegs, HalfTy); 2489 2490 auto Flags = MI.getFlags(); 2491 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}, Flags); 2492 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}, Flags); 2493 2494 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2495 MI.eraseFromParent(); 2496 return; 2497 } 2498 case AMDGPU::G_ABS: { 2499 Register SrcReg = MI.getOperand(1).getReg(); 2500 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2501 2502 // There is no VALU abs instruction so we need to replace it with a sub and 2503 // max combination. 
2504 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2505 MachineFunction *MF = MI.getParent()->getParent(); 2506 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); 2507 LegalizerHelper Helper(*MF, Apply, B); 2508 2509 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2510 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2511 return; 2512 } 2513 [[fallthrough]]; 2514 } 2515 case AMDGPU::G_ADD: 2516 case AMDGPU::G_SUB: 2517 case AMDGPU::G_MUL: 2518 case AMDGPU::G_SHL: 2519 case AMDGPU::G_LSHR: 2520 case AMDGPU::G_ASHR: 2521 case AMDGPU::G_SMIN: 2522 case AMDGPU::G_SMAX: 2523 case AMDGPU::G_UMIN: 2524 case AMDGPU::G_UMAX: { 2525 Register DstReg = MI.getOperand(0).getReg(); 2526 LLT DstTy = MRI.getType(DstReg); 2527 2528 // Special case for s_mul_u64. There is not a vector equivalent of 2529 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector 2530 // multiplications. 2531 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { 2532 applyMappingSMULU64(B, OpdMapper); 2533 return; 2534 } 2535 2536 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2537 // Packed 16-bit operations need to be scalarized and promoted. 
2538 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2539 break; 2540 2541 const RegisterBank *DstBank = 2542 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2543 if (DstBank == &AMDGPU::VGPRRegBank) 2544 break; 2545 2546 const LLT S32 = LLT::scalar(32); 2547 MachineBasicBlock *MBB = MI.getParent(); 2548 MachineFunction *MF = MBB->getParent(); 2549 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); 2550 2551 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { 2552 Register WideSrcLo, WideSrcHi; 2553 2554 std::tie(WideSrcLo, WideSrcHi) = 2555 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT); 2556 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); 2557 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); 2558 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2559 MI.eraseFromParent(); 2560 return; 2561 } 2562 2563 if (DstTy.isVector()) { 2564 Register WideSrc0Lo, WideSrc0Hi; 2565 Register WideSrc1Lo, WideSrc1Hi; 2566 2567 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2568 std::tie(WideSrc0Lo, WideSrc0Hi) 2569 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2570 std::tie(WideSrc1Lo, WideSrc1Hi) 2571 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2572 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2573 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2574 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2575 MI.eraseFromParent(); 2576 } else { 2577 LegalizerHelper Helper(*MF, ApplySALU, B); 2578 2579 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2580 llvm_unreachable("widen scalar should have succeeded"); 2581 2582 // FIXME: s16 shift amounts should be legal. 
2583 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2584 Opc == AMDGPU::G_ASHR) { 2585 B.setInsertPt(*MBB, MI.getIterator()); 2586 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2587 llvm_unreachable("widen scalar should have succeeded"); 2588 } 2589 } 2590 2591 return; 2592 } 2593 case AMDGPU::G_AMDGPU_S_MUL_I64_I32: 2594 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { 2595 // This is a special case for s_mul_u64. We use 2596 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation 2597 // where the 33 higher bits are sign-extended and 2598 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation 2599 // where the 32 higher bits are zero-extended. In case scalar registers are 2600 // selected, both opcodes are lowered as s_mul_u64. If the vector registers 2601 // are selected, then G_AMDGPU_S_MUL_I64_I32 and 2602 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. 2603 2604 // Insert basic copies. 2605 applyDefaultMapping(OpdMapper); 2606 2607 Register DstReg = MI.getOperand(0).getReg(); 2608 Register SrcReg0 = MI.getOperand(1).getReg(); 2609 Register SrcReg1 = MI.getOperand(2).getReg(); 2610 const LLT S32 = LLT::scalar(32); 2611 const LLT S64 = LLT::scalar(64); 2612 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " 2613 "that handles only 64-bit operands."); 2614 const RegisterBank *DstBank = 2615 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2616 2617 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2618 // with s_mul_u64 operation. 2619 if (DstBank == &AMDGPU::SGPRRegBank) { 2620 MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); 2621 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); 2622 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); 2623 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); 2624 return; 2625 } 2626 2627 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2628 // with a vector mad. 
2629 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && 2630 "The destination operand should be in vector registers."); 2631 2632 // Extract the lower subregister from the first operand. 2633 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2634 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); 2635 MRI.setType(Op0L, S32); 2636 B.buildTrunc(Op0L, SrcReg0); 2637 2638 // Extract the lower subregister from the second operand. 2639 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2640 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); 2641 MRI.setType(Op1L, S32); 2642 B.buildTrunc(Op1L, SrcReg1); 2643 2644 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 2645 ? AMDGPU::G_AMDGPU_MAD_U64_U32 2646 : AMDGPU::G_AMDGPU_MAD_I64_I32; 2647 2648 MachineIRBuilder B(MI); 2649 Register Zero64 = B.buildConstant(S64, 0).getReg(0); 2650 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); 2651 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2652 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); 2653 B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); 2654 MI.eraseFromParent(); 2655 return; 2656 } 2657 case AMDGPU::G_SEXT_INREG: { 2658 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2659 if (SrcRegs.empty()) 2660 break; // Nothing to repair 2661 2662 const LLT S32 = LLT::scalar(32); 2663 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); 2664 2665 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2666 // we would need to further expand, and doesn't let us directly set the 2667 // result registers. 2668 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2669 2670 int Amt = MI.getOperand(2).getImm(); 2671 if (Amt <= 32) { 2672 // Downstream users have expectations for the high bit behavior, so freeze 2673 // incoming undefined bits. 2674 if (Amt == 32) { 2675 // The low bits are unchanged. 
2676 B.buildFreeze(DstRegs[0], SrcRegs[0]); 2677 } else { 2678 auto Freeze = B.buildFreeze(S32, SrcRegs[0]); 2679 // Extend in the low bits and propagate the sign bit to the high half. 2680 B.buildSExtInReg(DstRegs[0], Freeze, Amt); 2681 } 2682 2683 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2684 } else { 2685 // The low bits are unchanged, and extend in the high bits. 2686 // No freeze required 2687 B.buildCopy(DstRegs[0], SrcRegs[0]); 2688 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2689 } 2690 2691 Register DstReg = MI.getOperand(0).getReg(); 2692 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2693 MI.eraseFromParent(); 2694 return; 2695 } 2696 case AMDGPU::G_CTPOP: 2697 case AMDGPU::G_BITREVERSE: { 2698 const RegisterBank *DstBank = 2699 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2700 if (DstBank == &AMDGPU::SGPRRegBank) 2701 break; 2702 2703 Register SrcReg = MI.getOperand(1).getReg(); 2704 const LLT S32 = LLT::scalar(32); 2705 LLT Ty = MRI.getType(SrcReg); 2706 if (Ty == S32) 2707 break; 2708 2709 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2710 2711 MachineFunction &MF = B.getMF(); 2712 LegalizerHelper Helper(MF, ApplyVALU, B); 2713 2714 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2715 llvm_unreachable("narrowScalar should have succeeded"); 2716 return; 2717 } 2718 case AMDGPU::G_AMDGPU_FFBH_U32: 2719 case AMDGPU::G_AMDGPU_FFBL_B32: 2720 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2721 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2722 const RegisterBank *DstBank = 2723 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2724 if (DstBank == &AMDGPU::SGPRRegBank) 2725 break; 2726 2727 Register SrcReg = MI.getOperand(1).getReg(); 2728 const LLT S32 = LLT::scalar(32); 2729 LLT Ty = MRI.getType(SrcReg); 2730 if (Ty == S32) 2731 break; 2732 2733 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2734 // which return -1 when the input is zero: 
2735 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2736 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2737 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2738 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2739 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2740 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2741 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2742 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2743 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2744 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2745 : Opc; 2746 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2747 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2748 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2749 unsigned AddOpc = 2750 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2751 ? AMDGPU::G_ADD 2752 : AMDGPU::G_UADDSAT; 2753 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2754 Register DstReg = MI.getOperand(0).getReg(); 2755 B.buildUMin(DstReg, X, Y); 2756 MI.eraseFromParent(); 2757 return; 2758 } 2759 case AMDGPU::G_SEXT: 2760 case AMDGPU::G_ZEXT: 2761 case AMDGPU::G_ANYEXT: { 2762 Register SrcReg = MI.getOperand(1).getReg(); 2763 LLT SrcTy = MRI.getType(SrcReg); 2764 const bool Signed = Opc == AMDGPU::G_SEXT; 2765 2766 assert(OpdMapper.getVRegs(1).empty()); 2767 2768 const RegisterBank *SrcBank = 2769 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2770 2771 Register DstReg = MI.getOperand(0).getReg(); 2772 LLT DstTy = MRI.getType(DstReg); 2773 if (DstTy.isScalar() && 2774 SrcBank != &AMDGPU::SGPRRegBank && 2775 SrcBank != &AMDGPU::VCCRegBank && 2776 // FIXME: Should handle any type that round to s64 when irregular 2777 // breakdowns supported. 
2778 DstTy.getSizeInBits() == 64 && 2779 SrcTy.getSizeInBits() <= 32) { 2780 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2781 2782 // Extend to 32-bit, and then extend the low half. 2783 if (Signed) { 2784 // TODO: Should really be buildSExtOrCopy 2785 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2786 } else if (Opc == AMDGPU::G_ZEXT) { 2787 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2788 } else { 2789 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2790 } 2791 2792 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2793 MRI.setRegBank(DstReg, *SrcBank); 2794 MI.eraseFromParent(); 2795 return; 2796 } 2797 2798 if (SrcTy != LLT::scalar(1)) 2799 return; 2800 2801 // It is not legal to have a legalization artifact with a VCC source. Rather 2802 // than introducing a copy, insert the select we would have to select the 2803 // copy to. 2804 if (SrcBank == &AMDGPU::VCCRegBank) { 2805 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2806 2807 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2808 2809 unsigned DstSize = DstTy.getSizeInBits(); 2810 // 64-bit select is SGPR only 2811 const bool UseSel64 = DstSize > 32 && 2812 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2813 2814 // TODO: Should s16 select be legal? 2815 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2816 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2817 auto False = B.buildConstant(SelType, 0); 2818 2819 MRI.setRegBank(True.getReg(0), *DstBank); 2820 MRI.setRegBank(False.getReg(0), *DstBank); 2821 MRI.setRegBank(DstReg, *DstBank); 2822 2823 if (DstSize > 32) { 2824 B.buildSelect(DefRegs[0], SrcReg, True, False); 2825 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2826 } else if (DstSize < 32) { 2827 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2828 MRI.setRegBank(Sel.getReg(0), *DstBank); 2829 B.buildTrunc(DstReg, Sel); 2830 } else { 2831 B.buildSelect(DstReg, SrcReg, True, False); 2832 } 2833 2834 MI.eraseFromParent(); 2835 return; 2836 } 2837 2838 break; 2839 } 2840 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2841 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2842 2843 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2844 2845 Register DstReg = MI.getOperand(0).getReg(); 2846 Register SrcReg = MI.getOperand(1).getReg(); 2847 2848 const LLT S32 = LLT::scalar(32); 2849 LLT DstTy = MRI.getType(DstReg); 2850 LLT SrcTy = MRI.getType(SrcReg); 2851 2852 if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) 2853 return; 2854 2855 const ValueMapping &DstMapping 2856 = OpdMapper.getInstrMapping().getOperandMapping(0); 2857 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2858 const RegisterBank *SrcBank = 2859 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2860 const RegisterBank *IdxBank = 2861 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2862 2863 Register BaseIdxReg; 2864 unsigned ConstOffset; 2865 std::tie(BaseIdxReg, ConstOffset) = 2866 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2867 2868 // See if the index is an add of a constant which will be foldable by moving 2869 // the base register of the index later if this is going to be executed in a 2870 // waterfall loop. This is essentially to reassociate the add of a constant 2871 // with the readfirstlane. 
2872 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2873 ConstOffset > 0 && 2874 ConstOffset < SrcTy.getNumElements(); 2875 2876 // Move the base register. We'll re-insert the add later. 2877 if (ShouldMoveIndexIntoLoop) 2878 MI.getOperand(2).setReg(BaseIdxReg); 2879 2880 // If this is a VGPR result only because the index was a VGPR result, the 2881 // actual indexing will be done on the SGPR source vector, which will 2882 // produce a scalar result. We need to copy to the VGPR result inside the 2883 // waterfall loop. 2884 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2885 SrcBank == &AMDGPU::SGPRRegBank; 2886 if (DstRegs.empty()) { 2887 applyDefaultMapping(OpdMapper); 2888 2889 executeInWaterfallLoop(B, MI, {2}); 2890 2891 if (NeedCopyToVGPR) { 2892 // We don't want a phi for this temporary reg. 2893 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2894 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2895 MI.getOperand(0).setReg(TmpReg); 2896 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2897 2898 // Use a v_mov_b32 here to make the exec dependency explicit. 2899 buildVCopy(B, DstReg, TmpReg); 2900 } 2901 2902 // Re-insert the constant offset add inside the waterfall loop. 2903 if (ShouldMoveIndexIntoLoop) 2904 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2905 2906 return; 2907 } 2908 2909 assert(DstTy.getSizeInBits() == 64); 2910 2911 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2912 2913 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2914 auto One = B.buildConstant(S32, 1); 2915 2916 MachineBasicBlock::iterator MII = MI.getIterator(); 2917 2918 // Split the vector index into 32-bit pieces. Prepare to move all of the 2919 // new instructions into a waterfall loop if necessary. 2920 // 2921 // Don't put the bitcast or constant in the loop. 2922 MachineInstrSpan Span(MII, &B.getMBB()); 2923 2924 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
2925 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2926 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2927 2928 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2929 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2930 2931 MRI.setRegBank(DstReg, *DstBank); 2932 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2933 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2934 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2935 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2936 2937 SmallSet<Register, 4> OpsToWaterfall; 2938 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2939 MI.eraseFromParent(); 2940 return; 2941 } 2942 2943 // Remove the original instruction to avoid potentially confusing the 2944 // waterfall loop logic. 2945 B.setInstr(*Span.begin()); 2946 MI.eraseFromParent(); 2947 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2948 OpsToWaterfall); 2949 2950 if (NeedCopyToVGPR) { 2951 MachineBasicBlock *LoopBB = Extract1->getParent(); 2952 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2953 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2954 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2955 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2956 2957 Extract0->getOperand(0).setReg(TmpReg0); 2958 Extract1->getOperand(0).setReg(TmpReg1); 2959 2960 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2961 2962 buildVCopy(B, DstRegs[0], TmpReg0); 2963 buildVCopy(B, DstRegs[1], TmpReg1); 2964 } 2965 2966 if (ShouldMoveIndexIntoLoop) 2967 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2968 2969 return; 2970 } 2971 case AMDGPU::G_INSERT_VECTOR_ELT: { 2972 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2973 2974 Register DstReg = MI.getOperand(0).getReg(); 2975 LLT VecTy = MRI.getType(DstReg); 2976 2977 assert(OpdMapper.getVRegs(0).empty()); 2978 assert(OpdMapper.getVRegs(3).empty()); 2979 2980 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2981 
MRI.setType(MI.getOperand(1).getReg(), VecTy); 2982 2983 if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) 2984 return; 2985 2986 const RegisterBank *IdxBank = 2987 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2988 2989 Register SrcReg = MI.getOperand(1).getReg(); 2990 Register InsReg = MI.getOperand(2).getReg(); 2991 LLT InsTy = MRI.getType(InsReg); 2992 (void)InsTy; 2993 2994 Register BaseIdxReg; 2995 unsigned ConstOffset; 2996 std::tie(BaseIdxReg, ConstOffset) = 2997 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2998 2999 // See if the index is an add of a constant which will be foldable by moving 3000 // the base register of the index later if this is going to be executed in a 3001 // waterfall loop. This is essentially to reassociate the add of a constant 3002 // with the readfirstlane. 3003 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 3004 ConstOffset > 0 && 3005 ConstOffset < VecTy.getNumElements(); 3006 3007 // Move the base register. We'll re-insert the add later. 3008 if (ShouldMoveIndexIntoLoop) 3009 MI.getOperand(3).setReg(BaseIdxReg); 3010 3011 3012 if (InsRegs.empty()) { 3013 executeInWaterfallLoop(B, MI, {3}); 3014 3015 // Re-insert the constant offset add inside the waterfall loop. 3016 if (ShouldMoveIndexIntoLoop) { 3017 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 3018 } 3019 3020 return; 3021 } 3022 3023 assert(InsTy.getSizeInBits() == 64); 3024 3025 const LLT S32 = LLT::scalar(32); 3026 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 3027 3028 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 3029 auto One = B.buildConstant(S32, 1); 3030 3031 // Split the vector index into 32-bit pieces. Prepare to move all of the 3032 // new instructions into a waterfall loop if necessary. 3033 // 3034 // Don't put the bitcast or constant in the loop. 
3035 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 3036 3037 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 3038 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 3039 auto IdxHi = B.buildAdd(S32, IdxLo, One); 3040 3041 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 3042 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 3043 3044 const RegisterBank *DstBank = 3045 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 3046 const RegisterBank *SrcBank = 3047 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 3048 const RegisterBank *InsSrcBank = 3049 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 3050 3051 MRI.setRegBank(InsReg, *InsSrcBank); 3052 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 3053 MRI.setRegBank(InsLo.getReg(0), *DstBank); 3054 MRI.setRegBank(InsHi.getReg(0), *DstBank); 3055 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 3056 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 3057 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 3058 3059 3060 SmallSet<Register, 4> OpsToWaterfall; 3061 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 3062 B.setInsertPt(B.getMBB(), MI); 3063 B.buildBitcast(DstReg, InsHi); 3064 MI.eraseFromParent(); 3065 return; 3066 } 3067 3068 B.setInstr(*Span.begin()); 3069 MI.eraseFromParent(); 3070 3071 // Figure out the point after the waterfall loop before mangling the control 3072 // flow. 3073 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 3074 OpsToWaterfall); 3075 3076 // The insertion point is now right after the original instruction. 3077 // 3078 // Keep the bitcast to the original vector type out of the loop. Doing this 3079 // saved an extra phi we don't need inside the loop. 3080 B.buildBitcast(DstReg, InsHi); 3081 3082 // Re-insert the constant offset add inside the waterfall loop. 
3083 if (ShouldMoveIndexIntoLoop) 3084 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 3085 3086 return; 3087 } 3088 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: 3094 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: 3095 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: 3096 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: 3097 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: 3098 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3099 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 3100 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3101 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3102 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3103 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3104 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3105 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3106 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3107 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 3108 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3109 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 3110 applyDefaultMapping(OpdMapper); 3111 executeInWaterfallLoop(B, MI, {1, 4}); 3112 return; 3113 } 3114 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3115 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 3126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3129 
applyDefaultMapping(OpdMapper); 3130 executeInWaterfallLoop(B, MI, {2, 5}); 3131 return; 3132 } 3133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 3134 applyDefaultMapping(OpdMapper); 3135 executeInWaterfallLoop(B, MI, {3, 6}); 3136 return; 3137 } 3138 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: 3139 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: 3140 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: 3141 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: 3142 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { 3143 applyMappingSBufferLoad(B, OpdMapper); 3144 return; 3145 } 3146 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: 3147 constrainOpWithReadfirstlane(B, MI, 0); 3148 constrainOpWithReadfirstlane(B, MI, 2); 3149 return; 3150 case AMDGPU::G_INTRINSIC: 3151 case AMDGPU::G_INTRINSIC_CONVERGENT: { 3152 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3153 case Intrinsic::amdgcn_readlane: { 3154 substituteSimpleCopyRegs(OpdMapper, 2); 3155 3156 assert(OpdMapper.getVRegs(0).empty()); 3157 assert(OpdMapper.getVRegs(3).empty()); 3158 3159 // Make sure the index is an SGPR. It doesn't make sense to run this in a 3160 // waterfall loop, so assume it's a uniform value. 3161 constrainOpWithReadfirstlane(B, MI, 3); // Index 3162 return; 3163 } 3164 case Intrinsic::amdgcn_writelane: { 3165 assert(OpdMapper.getVRegs(0).empty()); 3166 assert(OpdMapper.getVRegs(2).empty()); 3167 assert(OpdMapper.getVRegs(3).empty()); 3168 3169 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 3170 constrainOpWithReadfirstlane(B, MI, 2); // Source value 3171 constrainOpWithReadfirstlane(B, MI, 3); // Index 3172 return; 3173 } 3174 case Intrinsic::amdgcn_interp_p1: 3175 case Intrinsic::amdgcn_interp_p2: 3176 case Intrinsic::amdgcn_interp_mov: 3177 case Intrinsic::amdgcn_interp_p1_f16: 3178 case Intrinsic::amdgcn_interp_p2_f16: 3179 case Intrinsic::amdgcn_lds_param_load: { 3180 applyDefaultMapping(OpdMapper); 3181 3182 // Readlane for m0 value, which is always the last operand. 
3183 // FIXME: Should this be a waterfall loop instead? 3184 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index 3185 return; 3186 } 3187 case Intrinsic::amdgcn_interp_inreg_p10: 3188 case Intrinsic::amdgcn_interp_inreg_p2: 3189 case Intrinsic::amdgcn_interp_inreg_p10_f16: 3190 case Intrinsic::amdgcn_interp_inreg_p2_f16: 3191 case Intrinsic::amdgcn_interp_p10_rtz_f16: 3192 case Intrinsic::amdgcn_interp_p2_rtz_f16: 3193 case Intrinsic::amdgcn_permlane16_swap: 3194 case Intrinsic::amdgcn_permlane32_swap: 3195 applyDefaultMapping(OpdMapper); 3196 return; 3197 case Intrinsic::amdgcn_permlane16: 3198 case Intrinsic::amdgcn_permlanex16: { 3199 // Doing a waterfall loop over these wouldn't make any sense. 3200 substituteSimpleCopyRegs(OpdMapper, 2); 3201 substituteSimpleCopyRegs(OpdMapper, 3); 3202 constrainOpWithReadfirstlane(B, MI, 4); 3203 constrainOpWithReadfirstlane(B, MI, 5); 3204 return; 3205 } 3206 case Intrinsic::amdgcn_sbfe: 3207 applyMappingBFE(B, OpdMapper, true); 3208 return; 3209 case Intrinsic::amdgcn_ubfe: 3210 applyMappingBFE(B, OpdMapper, false); 3211 return; 3212 case Intrinsic::amdgcn_inverse_ballot: 3213 case Intrinsic::amdgcn_s_bitreplicate: 3214 case Intrinsic::amdgcn_s_quadmask: 3215 case Intrinsic::amdgcn_s_wqm: 3216 applyDefaultMapping(OpdMapper); 3217 constrainOpWithReadfirstlane(B, MI, 2); // Mask 3218 return; 3219 case Intrinsic::amdgcn_ballot: 3220 // Use default handling and insert copy to vcc source. 
3221 break; 3222 } 3223 break; 3224 } 3225 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3226 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 3227 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: 3228 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 3229 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 3230 const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3231 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI)); 3232 assert(RSrcIntrin && RSrcIntrin->IsImage); 3233 // Non-images can have complications from operands that allow both SGPR 3234 // and VGPR. For now it's too complicated to figure out the final opcode 3235 // to derive the register bank from the MCInstrDesc. 3236 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); 3237 return; 3238 } 3239 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: 3240 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: 3241 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { 3242 bool IsDualOrBVH8 = 3243 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || 3244 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; 3245 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier 3246 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; 3247 applyDefaultMapping(OpdMapper); 3248 executeInWaterfallLoop(B, MI, {LastRegOpIdx}); 3249 return; 3250 } 3251 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 3252 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { 3253 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 3254 switch (IntrID) { 3255 case Intrinsic::amdgcn_ds_ordered_add: 3256 case Intrinsic::amdgcn_ds_ordered_swap: { 3257 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 3258 assert(OpdMapper.getVRegs(0).empty()); 3259 substituteSimpleCopyRegs(OpdMapper, 3); 3260 constrainOpWithReadfirstlane(B, MI, 2); // M0 3261 return; 3262 } 3263 case Intrinsic::amdgcn_ds_gws_init: 3264 case Intrinsic::amdgcn_ds_gws_barrier: 3265 case Intrinsic::amdgcn_ds_gws_sema_br: { 3266 // Only the first lane is executes, so readfirstlane is safe. 
3267 substituteSimpleCopyRegs(OpdMapper, 1); 3268 constrainOpWithReadfirstlane(B, MI, 2); // M0 3269 return; 3270 } 3271 case Intrinsic::amdgcn_ds_gws_sema_v: 3272 case Intrinsic::amdgcn_ds_gws_sema_p: 3273 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 3274 // Only the first lane is executes, so readfirstlane is safe. 3275 constrainOpWithReadfirstlane(B, MI, 1); // M0 3276 return; 3277 } 3278 case Intrinsic::amdgcn_ds_append: 3279 case Intrinsic::amdgcn_ds_consume: { 3280 constrainOpWithReadfirstlane(B, MI, 2); // M0 3281 return; 3282 } 3283 case Intrinsic::amdgcn_s_sendmsg: 3284 case Intrinsic::amdgcn_s_sendmsghalt: { 3285 // FIXME: Should this use a waterfall loop? 3286 constrainOpWithReadfirstlane(B, MI, 2); // M0 3287 return; 3288 } 3289 case Intrinsic::amdgcn_s_setreg: { 3290 constrainOpWithReadfirstlane(B, MI, 2); 3291 return; 3292 } 3293 case Intrinsic::amdgcn_s_ttracedata: 3294 constrainOpWithReadfirstlane(B, MI, 1); // M0 3295 return; 3296 case Intrinsic::amdgcn_raw_buffer_load_lds: 3297 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { 3298 applyDefaultMapping(OpdMapper); 3299 constrainOpWithReadfirstlane(B, MI, 1); // rsrc 3300 constrainOpWithReadfirstlane(B, MI, 2); // M0 3301 constrainOpWithReadfirstlane(B, MI, 5); // soffset 3302 return; 3303 } 3304 case Intrinsic::amdgcn_struct_buffer_load_lds: 3305 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 3306 applyDefaultMapping(OpdMapper); 3307 constrainOpWithReadfirstlane(B, MI, 1); // rsrc 3308 constrainOpWithReadfirstlane(B, MI, 2); // M0 3309 constrainOpWithReadfirstlane(B, MI, 6); // soffset 3310 return; 3311 } 3312 case Intrinsic::amdgcn_load_to_lds: 3313 case Intrinsic::amdgcn_global_load_lds: { 3314 applyDefaultMapping(OpdMapper); 3315 constrainOpWithReadfirstlane(B, MI, 2); 3316 return; 3317 } 3318 case Intrinsic::amdgcn_lds_direct_load: { 3319 applyDefaultMapping(OpdMapper); 3320 // Readlane for m0 value, which is always the last operand. 
3321 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index 3322 return; 3323 } 3324 case Intrinsic::amdgcn_exp_row: 3325 applyDefaultMapping(OpdMapper); 3326 constrainOpWithReadfirstlane(B, MI, 8); // M0 3327 return; 3328 case Intrinsic::amdgcn_s_sleep_var: 3329 assert(OpdMapper.getVRegs(1).empty()); 3330 constrainOpWithReadfirstlane(B, MI, 1); 3331 return; 3332 case Intrinsic::amdgcn_s_barrier_signal_var: 3333 constrainOpWithReadfirstlane(B, MI, 1); 3334 constrainOpWithReadfirstlane(B, MI, 2); 3335 return; 3336 case Intrinsic::amdgcn_s_get_barrier_state: 3337 case Intrinsic::amdgcn_s_get_named_barrier_state: { 3338 constrainOpWithReadfirstlane(B, MI, 2); 3339 return; 3340 } 3341 case Intrinsic::amdgcn_s_prefetch_data: { 3342 Register PtrReg = MI.getOperand(1).getReg(); 3343 unsigned AS = MRI.getType(PtrReg).getAddressSpace(); 3344 if (AMDGPU::isFlatGlobalAddrSpace(AS)) { 3345 constrainOpWithReadfirstlane(B, MI, 1); 3346 constrainOpWithReadfirstlane(B, MI, 2); 3347 } else 3348 MI.eraseFromParent(); 3349 return; 3350 } 3351 case Intrinsic::amdgcn_tensor_load_to_lds: 3352 case Intrinsic::amdgcn_tensor_store_from_lds: { 3353 constrainOpWithReadfirstlane(B, MI, 1); 3354 constrainOpWithReadfirstlane(B, MI, 2); 3355 constrainOpWithReadfirstlane(B, MI, 3); 3356 constrainOpWithReadfirstlane(B, MI, 4); 3357 return; 3358 } 3359 case Intrinsic::amdgcn_tensor_load_to_lds_d2: 3360 case Intrinsic::amdgcn_tensor_store_from_lds_d2: { 3361 constrainOpWithReadfirstlane(B, MI, 1); 3362 constrainOpWithReadfirstlane(B, MI, 2); 3363 return; 3364 } 3365 default: { 3366 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3367 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 3368 // Non-images can have complications from operands that allow both SGPR 3369 // and VGPR. For now it's too complicated to figure out the final opcode 3370 // to derive the register bank from the MCInstrDesc. 
3371 if (RSrcIntrin->IsImage) { 3372 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); 3373 return; 3374 } 3375 } 3376 3377 break; 3378 } 3379 } 3380 break; 3381 } 3382 case AMDGPU::G_SI_CALL: { 3383 // Use a set to avoid extra readfirstlanes in the case where multiple 3384 // operands are the same register. 3385 SmallSet<Register, 4> SGPROperandRegs; 3386 3387 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3388 break; 3389 3390 // Move all copies to physical SGPRs that are used by the call instruction 3391 // into the loop block. Start searching for these copies until the 3392 // ADJCALLSTACKUP. 3393 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3394 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3395 3396 // Move all non-copies before the copies, so that a complete range can be 3397 // moved into the waterfall loop. 3398 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3399 // Count of NonCopyInstrs found until the current LastCopy. 3400 unsigned NonCopyInstrsLen = 0; 3401 MachineBasicBlock::iterator Start(&MI); 3402 MachineBasicBlock::iterator LastCopy = Start; 3403 MachineBasicBlock *MBB = MI.getParent(); 3404 const SIMachineFunctionInfo *Info = 3405 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3406 while (Start->getOpcode() != FrameSetupOpcode) { 3407 --Start; 3408 bool IsCopy = false; 3409 if (Start->getOpcode() == AMDGPU::COPY) { 3410 auto &Dst = Start->getOperand(0); 3411 if (Dst.isReg()) { 3412 Register Reg = Dst.getReg(); 3413 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3414 IsCopy = true; 3415 } else { 3416 // Also move the copy from the scratch rsrc descriptor into the loop 3417 // to allow it to be optimized away. 
3418 auto &Src = Start->getOperand(1); 3419 if (Src.isReg()) { 3420 Reg = Src.getReg(); 3421 IsCopy = Info->getScratchRSrcReg() == Reg; 3422 } 3423 } 3424 } 3425 } 3426 3427 if (IsCopy) { 3428 LastCopy = Start; 3429 NonCopyInstrsLen = NonCopyInstrs.size(); 3430 } else { 3431 NonCopyInstrs.push_back(&*Start); 3432 } 3433 } 3434 NonCopyInstrs.resize(NonCopyInstrsLen); 3435 3436 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3437 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3438 } 3439 Start = LastCopy; 3440 3441 // Do the same for copies after the loop 3442 NonCopyInstrs.clear(); 3443 NonCopyInstrsLen = 0; 3444 MachineBasicBlock::iterator End(&MI); 3445 LastCopy = End; 3446 while (End->getOpcode() != FrameDestroyOpcode) { 3447 ++End; 3448 bool IsCopy = false; 3449 if (End->getOpcode() == AMDGPU::COPY) { 3450 auto &Src = End->getOperand(1); 3451 if (Src.isReg()) { 3452 Register Reg = Src.getReg(); 3453 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3454 } 3455 } 3456 3457 if (IsCopy) { 3458 LastCopy = End; 3459 NonCopyInstrsLen = NonCopyInstrs.size(); 3460 } else { 3461 NonCopyInstrs.push_back(&*End); 3462 } 3463 } 3464 NonCopyInstrs.resize(NonCopyInstrsLen); 3465 3466 End = LastCopy; 3467 ++LastCopy; 3468 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3469 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3470 } 3471 3472 ++End; 3473 B.setInsertPt(B.getMBB(), Start); 3474 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); 3475 break; 3476 } 3477 case AMDGPU::G_LOAD: 3478 case AMDGPU::G_ZEXTLOAD: 3479 case AMDGPU::G_SEXTLOAD: { 3480 if (applyMappingLoad(B, OpdMapper, MI)) 3481 return; 3482 break; 3483 } 3484 case AMDGPU::G_DYN_STACKALLOC: 3485 applyMappingDynStackAlloc(B, OpdMapper, MI); 3486 return; 3487 case AMDGPU::G_STACKRESTORE: { 3488 applyDefaultMapping(OpdMapper); 3489 constrainOpWithReadfirstlane(B, MI, 0); 3490 return; 3491 } 3492 case AMDGPU::G_SBFX: 3493 applyMappingBFE(B, OpdMapper, /*Signed*/ true); 3494 return; 
3495 case AMDGPU::G_UBFX: 3496 applyMappingBFE(B, OpdMapper, /*Signed*/ false); 3497 return; 3498 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3499 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3500 applyMappingMAD_64_32(B, OpdMapper); 3501 return; 3502 case AMDGPU::G_PREFETCH: { 3503 if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { 3504 MI.eraseFromParent(); 3505 return; 3506 } 3507 Register PtrReg = MI.getOperand(0).getReg(); 3508 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); 3509 if (PtrBank == AMDGPU::VGPRRegBankID) { 3510 MI.eraseFromParent(); 3511 return; 3512 } 3513 unsigned AS = MRI.getType(PtrReg).getAddressSpace(); 3514 if (!AMDGPU::isFlatGlobalAddrSpace(AS) && 3515 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3516 MI.eraseFromParent(); 3517 return; 3518 } 3519 applyDefaultMapping(OpdMapper); 3520 return; 3521 } 3522 default: 3523 break; 3524 } 3525 3526 return applyDefaultMapping(OpdMapper); 3527 } 3528 3529 // vgpr, sgpr -> vgpr 3530 // vgpr, agpr -> vgpr 3531 // agpr, agpr -> agpr 3532 // agpr, sgpr -> vgpr 3533 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3534 if (RB0 == AMDGPU::InvalidRegBankID) 3535 return RB1; 3536 if (RB1 == AMDGPU::InvalidRegBankID) 3537 return RB0; 3538 3539 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3540 return AMDGPU::SGPRRegBankID; 3541 3542 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3543 return AMDGPU::AGPRRegBankID; 3544 3545 return AMDGPU::VGPRRegBankID; 3546 } 3547 3548 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3549 if (RB0 == AMDGPU::InvalidRegBankID) 3550 return RB1; 3551 if (RB1 == AMDGPU::InvalidRegBankID) 3552 return RB0; 3553 3554 // vcc, vcc -> vcc 3555 // vcc, sgpr -> vcc 3556 // vcc, vgpr -> vcc 3557 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3558 return AMDGPU::VCCRegBankID; 3559 3560 // vcc, vgpr -> vgpr 3561 return regBankUnion(RB0, RB1); 3562 } 3563 3564 unsigned 
AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3565 const MachineInstr &MI) const { 3566 unsigned RegBank = AMDGPU::InvalidRegBankID; 3567 3568 for (const MachineOperand &MO : MI.operands()) { 3569 if (!MO.isReg()) 3570 continue; 3571 Register Reg = MO.getReg(); 3572 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3573 RegBank = regBankUnion(RegBank, Bank->getID()); 3574 if (RegBank == AMDGPU::VGPRRegBankID) 3575 break; 3576 } 3577 } 3578 3579 return RegBank; 3580 } 3581 3582 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3583 const MachineFunction &MF = *MI.getParent()->getParent(); 3584 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3585 for (const MachineOperand &MO : MI.operands()) { 3586 if (!MO.isReg()) 3587 continue; 3588 Register Reg = MO.getReg(); 3589 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3590 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3591 return false; 3592 } 3593 } 3594 return true; 3595 } 3596 3597 const RegisterBankInfo::InstructionMapping & 3598 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3599 const MachineFunction &MF = *MI.getParent()->getParent(); 3600 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3601 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3602 3603 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3604 const MachineOperand &SrcOp = MI.getOperand(i); 3605 if (!SrcOp.isReg()) 3606 continue; 3607 3608 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3609 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3610 } 3611 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3612 MI.getNumOperands()); 3613 } 3614 3615 const RegisterBankInfo::InstructionMapping & 3616 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3617 const MachineFunction &MF = *MI.getParent()->getParent(); 3618 const MachineRegisterInfo &MRI = 
MF.getRegInfo(); 3619 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3620 3621 // Even though we technically could use SGPRs, this would require knowledge of 3622 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3623 // 3624 // TODO: Unary ops are trivially OK, so accept SGPRs? 3625 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3626 const MachineOperand &Src = MI.getOperand(i); 3627 if (!Src.isReg()) 3628 continue; 3629 3630 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3631 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3632 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3633 } 3634 3635 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3636 MI.getNumOperands()); 3637 } 3638 3639 const RegisterBankInfo::InstructionMapping & 3640 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3641 const MachineFunction &MF = *MI.getParent()->getParent(); 3642 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3643 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3644 3645 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3646 const MachineOperand &Op = MI.getOperand(I); 3647 if (!Op.isReg()) 3648 continue; 3649 3650 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3651 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3652 } 3653 3654 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3655 MI.getNumOperands()); 3656 } 3657 3658 const RegisterBankInfo::InstructionMapping & 3659 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3660 const MachineInstr &MI, 3661 int RsrcIdx) const { 3662 // The reported argument index is relative to the IR intrinsic call arguments, 3663 // so we need to shift by the number of defs and the intrinsic ID. 
3664 RsrcIdx += MI.getNumExplicitDefs() + 1; 3665 3666 const int NumOps = MI.getNumOperands(); 3667 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3668 3669 // TODO: Should packed/unpacked D16 difference be reported here as part of 3670 // the value mapping? 3671 for (int I = 0; I != NumOps; ++I) { 3672 if (!MI.getOperand(I).isReg()) 3673 continue; 3674 3675 Register OpReg = MI.getOperand(I).getReg(); 3676 // We replace some dead address operands with $noreg 3677 if (!OpReg) 3678 continue; 3679 3680 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3681 3682 // FIXME: Probably need a new intrinsic register bank searchable table to 3683 // handle arbitrary intrinsics easily. 3684 // 3685 // If this has a sampler, it immediately follows rsrc. 3686 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3687 3688 if (MustBeSGPR) { 3689 // If this must be an SGPR, so we must report whatever it is as legal. 3690 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3691 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3692 } else { 3693 // Some operands must be VGPR, and these are easy to copy to. 3694 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3695 } 3696 } 3697 3698 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3699 } 3700 3701 /// Return the mapping for a pointer argument. 3702 const RegisterBankInfo::ValueMapping * 3703 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3704 Register PtrReg) const { 3705 LLT PtrTy = MRI.getType(PtrReg); 3706 unsigned Size = PtrTy.getSizeInBits(); 3707 if (Subtarget.useFlatForGlobal() || 3708 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3709 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3710 3711 // If we're using MUBUF instructions for global memory, an SGPR base register 3712 // is possible. Otherwise this needs to be a VGPR. 
3713 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3714 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3715 } 3716 3717 const RegisterBankInfo::InstructionMapping & 3718 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3719 3720 const MachineFunction &MF = *MI.getParent()->getParent(); 3721 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3722 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3723 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3724 Register PtrReg = MI.getOperand(1).getReg(); 3725 LLT PtrTy = MRI.getType(PtrReg); 3726 unsigned AS = PtrTy.getAddressSpace(); 3727 unsigned PtrSize = PtrTy.getSizeInBits(); 3728 3729 const ValueMapping *ValMapping; 3730 const ValueMapping *PtrMapping; 3731 3732 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3733 3734 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3735 if (isScalarLoadLegal(MI)) { 3736 // We have a uniform instruction so we want to use an SMRD load 3737 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3738 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3739 } else { 3740 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3741 3742 // If we're using MUBUF instructions for global memory, an SGPR base 3743 // register is possible. Otherwise this needs to be a VGPR. 3744 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 
3745 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3746 3747 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3748 } 3749 } else { 3750 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3751 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3752 } 3753 3754 OpdsMapping[0] = ValMapping; 3755 OpdsMapping[1] = PtrMapping; 3756 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3757 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3758 return Mapping; 3759 3760 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3761 // handle that during instruction selection? 3762 } 3763 3764 unsigned 3765 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3766 const MachineRegisterInfo &MRI, 3767 unsigned Default) const { 3768 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3769 return Bank ? Bank->getID() : Default; 3770 } 3771 3772 const RegisterBankInfo::ValueMapping * 3773 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3774 const MachineRegisterInfo &MRI, 3775 const TargetRegisterInfo &TRI) const { 3776 // Lie and claim anything is legal, even though this needs to be an SGPR 3777 // applyMapping will have to deal with it as a waterfall loop. 
3778 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3779 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3780 return AMDGPU::getValueMapping(Bank, Size); 3781 } 3782 3783 const RegisterBankInfo::ValueMapping * 3784 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3785 const MachineRegisterInfo &MRI, 3786 const TargetRegisterInfo &TRI) const { 3787 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3788 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3789 } 3790 3791 const RegisterBankInfo::ValueMapping * 3792 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3793 const MachineRegisterInfo &MRI, 3794 const TargetRegisterInfo &TRI) const { 3795 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3796 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3797 } 3798 3799 /// 3800 /// This function must return a legal mapping, because 3801 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3802 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3803 /// VGPR to SGPR generated is illegal. 3804 /// 3805 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3806 // legal. These will be dealt with in applyMappingImpl. 3807 // 3808 const RegisterBankInfo::InstructionMapping & 3809 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3810 const MachineFunction &MF = *MI.getParent()->getParent(); 3811 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3812 3813 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3814 Register DstReg = MI.getOperand(0).getReg(); 3815 Register SrcReg = MI.getOperand(1).getReg(); 3816 3817 // The default logic bothers to analyze impossible alternative mappings. We 3818 // want the most straightforward mapping, so just directly handle this. 
3819 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 3820 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 3821 assert(SrcBank && "src bank should have been assigned already"); 3822 3823 // For COPY between a physical reg and an s1, there is no type associated so 3824 // we need to take the virtual register's type as a hint on how to interpret 3825 // s1 values. 3826 if (!SrcReg.isVirtual() && !DstBank && 3827 MRI.getType(DstReg) == LLT::scalar(1)) 3828 DstBank = &AMDGPU::VCCRegBank; 3829 else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1)) 3830 DstBank = &AMDGPU::VCCRegBank; 3831 3832 if (!DstBank) 3833 DstBank = SrcBank; 3834 3835 unsigned Size = getSizeInBits(DstReg, MRI, *TRI); 3836 if (MI.getOpcode() != AMDGPU::G_FREEZE && 3837 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) 3838 return getInvalidInstructionMapping(); 3839 3840 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3841 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3842 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3843 OpdsMapping[0] = &ValMap; 3844 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3845 OpdsMapping[1] = &ValMap; 3846 3847 return getInstructionMapping( 3848 1, /*Cost*/ 1, 3849 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3850 } 3851 3852 if (MI.isRegSequence()) { 3853 // If any input is a VGPR, the result must be a VGPR. The default handling 3854 // assumes any copy between banks is legal. 3855 unsigned BankID = AMDGPU::SGPRRegBankID; 3856 3857 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3858 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3859 // It doesn't make sense to use vcc or scc banks here, so just ignore 3860 // them. 
3861 if (OpBank != AMDGPU::SGPRRegBankID) { 3862 BankID = AMDGPU::VGPRRegBankID; 3863 break; 3864 } 3865 } 3866 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3867 3868 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3869 return getInstructionMapping( 3870 1, /*Cost*/ 1, 3871 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3872 } 3873 3874 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3875 // properly. 3876 // 3877 // TODO: There are additional exec masking dependencies to analyze. 3878 if (auto *PHI = dyn_cast<GPhi>(&MI)) { 3879 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3880 Register DstReg = PHI->getReg(0); 3881 3882 // Sometimes the result may have already been assigned a bank. 3883 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3884 ResultBank = DstBank->getID(); 3885 3886 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { 3887 Register Reg = PHI->getIncomingValue(I); 3888 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3889 3890 // FIXME: Assuming VGPR for any undetermined inputs. 
3891 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3892 ResultBank = AMDGPU::VGPRRegBankID; 3893 break; 3894 } 3895 3896 // FIXME: Need to promote SGPR case to s32 3897 unsigned OpBank = Bank->getID(); 3898 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3899 } 3900 3901 assert(ResultBank != AMDGPU::InvalidRegBankID); 3902 3903 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3904 3905 const ValueMapping &ValMap = 3906 getValueMapping(0, Size, getRegBank(ResultBank)); 3907 return getInstructionMapping( 3908 1, /*Cost*/ 1, 3909 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3910 } 3911 3912 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3913 if (Mapping.isValid()) 3914 return Mapping; 3915 3916 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3917 3918 switch (MI.getOpcode()) { 3919 default: 3920 return getInvalidInstructionMapping(); 3921 3922 case AMDGPU::G_AND: 3923 case AMDGPU::G_OR: 3924 case AMDGPU::G_XOR: 3925 case AMDGPU::G_MUL: { 3926 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3927 if (Size == 1) { 3928 const RegisterBank *DstBank 3929 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3930 3931 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3932 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3933 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3934 if (DstBank) { 3935 TargetBankID = DstBank->getID(); 3936 if (DstBank == &AMDGPU::VCCRegBank) { 3937 TargetBankID = AMDGPU::VCCRegBankID; 3938 BankLHS = AMDGPU::VCCRegBankID; 3939 BankRHS = AMDGPU::VCCRegBankID; 3940 } else { 3941 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3942 AMDGPU::SGPRRegBankID); 3943 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3944 AMDGPU::SGPRRegBankID); 3945 } 3946 } else { 3947 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3948 AMDGPU::VCCRegBankID); 3949 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3950 AMDGPU::VCCRegBankID); 3951 3952 // 
Both inputs should be true booleans to produce a boolean result. 3953 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3954 TargetBankID = AMDGPU::VGPRRegBankID; 3955 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3956 TargetBankID = AMDGPU::VCCRegBankID; 3957 BankLHS = AMDGPU::VCCRegBankID; 3958 BankRHS = AMDGPU::VCCRegBankID; 3959 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3960 TargetBankID = AMDGPU::SGPRRegBankID; 3961 } 3962 } 3963 3964 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3965 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3966 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3967 break; 3968 } 3969 3970 if (Size == 64) { 3971 3972 if (isSALUMapping(MI)) { 3973 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3974 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3975 } else { 3976 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3977 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3978 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3979 3980 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3981 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3982 } 3983 3984 break; 3985 } 3986 3987 [[fallthrough]]; 3988 } 3989 case AMDGPU::G_PTR_ADD: 3990 case AMDGPU::G_PTRMASK: 3991 case AMDGPU::G_ADD: 3992 case AMDGPU::G_SUB: 3993 case AMDGPU::G_SHL: 3994 case AMDGPU::G_LSHR: 3995 case AMDGPU::G_ASHR: 3996 case AMDGPU::G_UADDO: 3997 case AMDGPU::G_USUBO: 3998 case AMDGPU::G_UADDE: 3999 case AMDGPU::G_SADDE: 4000 case AMDGPU::G_USUBE: 4001 case AMDGPU::G_SSUBE: 4002 case AMDGPU::G_SMIN: 4003 case AMDGPU::G_SMAX: 4004 case AMDGPU::G_UMIN: 4005 case AMDGPU::G_UMAX: 4006 case AMDGPU::G_ABS: 4007 case AMDGPU::G_SHUFFLE_VECTOR: 4008 case AMDGPU::G_SBFX: 4009 case AMDGPU::G_UBFX: 4010 case 
AMDGPU::G_AMDGPU_S_MUL_I64_I32: 4011 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: 4012 if (isSALUMapping(MI)) 4013 return getDefaultMappingSOP(MI); 4014 return getDefaultMappingVOP(MI); 4015 case AMDGPU::G_FADD: 4016 case AMDGPU::G_FSUB: 4017 case AMDGPU::G_FMUL: 4018 case AMDGPU::G_FMA: 4019 case AMDGPU::G_FFLOOR: 4020 case AMDGPU::G_FCEIL: 4021 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 4022 case AMDGPU::G_FMINNUM: 4023 case AMDGPU::G_FMAXNUM: 4024 case AMDGPU::G_FMINIMUM: 4025 case AMDGPU::G_FMAXIMUM: 4026 case AMDGPU::G_FMINIMUMNUM: 4027 case AMDGPU::G_FMAXIMUMNUM: 4028 case AMDGPU::G_INTRINSIC_TRUNC: 4029 case AMDGPU::G_STRICT_FADD: 4030 case AMDGPU::G_STRICT_FSUB: 4031 case AMDGPU::G_STRICT_FMUL: 4032 case AMDGPU::G_STRICT_FMA: { 4033 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4034 unsigned Size = Ty.getSizeInBits(); 4035 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && 4036 (Size == 32 || Size == 16) && isSALUMapping(MI)) 4037 return getDefaultMappingSOP(MI); 4038 return getDefaultMappingVOP(MI); 4039 } 4040 case AMDGPU::G_FPTOSI: 4041 case AMDGPU::G_FPTOUI: 4042 case AMDGPU::G_SITOFP: 4043 case AMDGPU::G_UITOFP: { 4044 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4045 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4046 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && 4047 isSALUMapping(MI)) 4048 return getDefaultMappingSOP(MI); 4049 return getDefaultMappingVOP(MI); 4050 } 4051 case AMDGPU::G_FPTRUNC: 4052 case AMDGPU::G_FPEXT: { 4053 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4054 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4055 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && 4056 isSALUMapping(MI)) 4057 return getDefaultMappingSOP(MI); 4058 return getDefaultMappingVOP(MI); 4059 } 4060 case AMDGPU::G_FSQRT: 4061 case AMDGPU::G_FEXP2: 4062 case AMDGPU::G_FLOG2: { 4063 unsigned Size = 
MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4064 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 4065 isSALUMapping(MI)) 4066 return getDefaultMappingSOP(MI); 4067 return getDefaultMappingVOP(MI); 4068 } 4069 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 4070 case AMDGPU::G_SSUBSAT: 4071 case AMDGPU::G_UADDSAT: 4072 case AMDGPU::G_USUBSAT: 4073 case AMDGPU::G_FMAD: 4074 case AMDGPU::G_FLDEXP: 4075 case AMDGPU::G_FMINNUM_IEEE: 4076 case AMDGPU::G_FMAXNUM_IEEE: 4077 case AMDGPU::G_FCANONICALIZE: 4078 case AMDGPU::G_STRICT_FLDEXP: 4079 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 4080 case AMDGPU::G_FSHR: // TODO: Expand for scalar 4081 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 4082 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 4083 case AMDGPU::G_AMDGPU_RCP_IFLAG: 4084 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 4085 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 4086 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 4087 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 4088 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 4089 case AMDGPU::G_AMDGPU_SMED3: 4090 case AMDGPU::G_AMDGPU_FMED3: 4091 return getDefaultMappingVOP(MI); 4092 case AMDGPU::G_UMULH: 4093 case AMDGPU::G_SMULH: { 4094 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 4095 return getDefaultMappingSOP(MI); 4096 return getDefaultMappingVOP(MI); 4097 } 4098 case AMDGPU::G_AMDGPU_MAD_U64_U32: 4099 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 4100 // Three possible mappings: 4101 // 4102 // - Default SOP 4103 // - Default VOP 4104 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 4105 // 4106 // This allows instruction selection to keep the multiplication part of the 4107 // instruction on the SALU. 
4108 bool AllSalu = true; 4109 bool MulSalu = true; 4110 for (unsigned i = 0; i < 5; ++i) { 4111 Register Reg = MI.getOperand(i).getReg(); 4112 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 4113 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 4114 AllSalu = false; 4115 if (i == 2 || i == 3) { 4116 MulSalu = false; 4117 break; 4118 } 4119 } 4120 } 4121 } 4122 4123 if (AllSalu) 4124 return getDefaultMappingSOP(MI); 4125 4126 // If the multiply-add is full-rate in VALU, use that even if the 4127 // multiplication part is scalar. Accumulating separately on the VALU would 4128 // take two instructions. 4129 if (!MulSalu || Subtarget.hasFullRate64Ops()) 4130 return getDefaultMappingVOP(MI); 4131 4132 // Keep the multiplication on the SALU, then accumulate on the VALU. 4133 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4134 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4135 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4136 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4137 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4138 break; 4139 } 4140 case AMDGPU::G_IMPLICIT_DEF: { 4141 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4142 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4143 break; 4144 } 4145 case AMDGPU::G_FCONSTANT: 4146 case AMDGPU::G_CONSTANT: 4147 case AMDGPU::G_GLOBAL_VALUE: 4148 case AMDGPU::G_FRAME_INDEX: 4149 case AMDGPU::G_BLOCK_ADDR: 4150 case AMDGPU::G_READSTEADYCOUNTER: 4151 case AMDGPU::G_READCYCLECOUNTER: { 4152 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4153 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4154 break; 4155 } 4156 case AMDGPU::G_DYN_STACKALLOC: { 4157 // Result is always uniform, and a wave reduction is needed for the source. 
4158 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4159 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4160 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 4161 break; 4162 } 4163 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 4164 // This case is weird because we expect a physical register in the source, 4165 // but need to set a bank anyway. 4166 // 4167 // TODO: We could select the result to SGPR or VGPR 4168 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4169 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4170 break; 4171 } 4172 case AMDGPU::G_INSERT: { 4173 unsigned BankID = getMappingType(MRI, MI); 4174 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4175 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4176 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 4177 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4178 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4179 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 4180 OpdsMapping[3] = nullptr; 4181 break; 4182 } 4183 case AMDGPU::G_EXTRACT: { 4184 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4185 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4186 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4187 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4188 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4189 OpdsMapping[2] = nullptr; 4190 break; 4191 } 4192 case AMDGPU::G_BUILD_VECTOR: 4193 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 4194 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 4195 if (DstTy == LLT::fixed_vector(2, 16)) { 4196 unsigned DstSize = DstTy.getSizeInBits(); 4197 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4198 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4199 unsigned 
Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4200 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 4201 4202 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 4203 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 4204 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 4205 break; 4206 } 4207 4208 [[fallthrough]]; 4209 } 4210 case AMDGPU::G_MERGE_VALUES: 4211 case AMDGPU::G_CONCAT_VECTORS: { 4212 unsigned Bank = getMappingType(MRI, MI); 4213 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4214 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4215 4216 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4217 // Op1 and Dst should use the same register bank. 4218 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 4219 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 4220 break; 4221 } 4222 case AMDGPU::G_BITREVERSE: 4223 case AMDGPU::G_BITCAST: 4224 case AMDGPU::G_INTTOPTR: 4225 case AMDGPU::G_PTRTOINT: 4226 case AMDGPU::G_FABS: 4227 case AMDGPU::G_FNEG: { 4228 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4229 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4230 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4231 break; 4232 } 4233 case AMDGPU::G_AMDGPU_FFBH_U32: 4234 case AMDGPU::G_AMDGPU_FFBL_B32: 4235 case AMDGPU::G_CTLZ_ZERO_UNDEF: 4236 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 4237 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4238 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4239 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 4240 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 4241 break; 4242 } 4243 case AMDGPU::G_CTPOP: { 4244 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4245 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4246 OpdsMapping[0] = 
AMDGPU::getValueMapping(BankID, 32); 4247 4248 // This should really be getValueMappingSGPR64Only, but allowing the generic 4249 // code to handle the register split just makes using LegalizerHelper more 4250 // difficult. 4251 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4252 break; 4253 } 4254 case AMDGPU::G_TRUNC: { 4255 Register Dst = MI.getOperand(0).getReg(); 4256 Register Src = MI.getOperand(1).getReg(); 4257 unsigned Bank = getRegBankID(Src, MRI); 4258 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4259 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4260 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4261 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 4262 break; 4263 } 4264 case AMDGPU::G_ZEXT: 4265 case AMDGPU::G_SEXT: 4266 case AMDGPU::G_ANYEXT: 4267 case AMDGPU::G_SEXT_INREG: { 4268 Register Dst = MI.getOperand(0).getReg(); 4269 Register Src = MI.getOperand(1).getReg(); 4270 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4271 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4272 4273 unsigned DstBank; 4274 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 4275 assert(SrcBank); 4276 switch (SrcBank->getID()) { 4277 case AMDGPU::SGPRRegBankID: 4278 DstBank = AMDGPU::SGPRRegBankID; 4279 break; 4280 default: 4281 DstBank = AMDGPU::VGPRRegBankID; 4282 break; 4283 } 4284 4285 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 4286 // 32-bits, and then to 64. 
4287 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 4288 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 4289 SrcSize); 4290 break; 4291 } 4292 case AMDGPU::G_IS_FPCLASS: { 4293 Register SrcReg = MI.getOperand(1).getReg(); 4294 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4295 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4296 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4297 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4298 break; 4299 } 4300 case AMDGPU::G_STORE: { 4301 assert(MI.getOperand(0).isReg()); 4302 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4303 4304 // FIXME: We need to specify a different reg bank once scalar stores are 4305 // supported. 4306 const ValueMapping *ValMapping = 4307 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4308 OpdsMapping[0] = ValMapping; 4309 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4310 break; 4311 } 4312 case AMDGPU::G_ICMP: 4313 case AMDGPU::G_FCMP: { 4314 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4315 4316 // See if the result register has already been constrained to vcc, which may 4317 // happen due to control flow intrinsic lowering. 
4318 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4319 AMDGPU::SGPRRegBankID); 4320 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4321 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 4322 4323 auto canUseSCCICMP = [&]() { 4324 auto Pred = 4325 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 4326 return Size == 32 || 4327 (Size == 64 && 4328 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 4329 Subtarget.hasScalarCompareEq64()); 4330 }; 4331 auto canUseSCCFCMP = [&]() { 4332 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); 4333 }; 4334 4335 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; 4336 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 4337 Op2Bank == AMDGPU::SGPRRegBankID && 4338 Op3Bank == AMDGPU::SGPRRegBankID && 4339 (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); 4340 4341 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4342 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4343 4344 // TODO: Use 32-bit for scalar output size. 4345 // SCC results will need to be copied to a 32-bit SGPR virtual register. 4346 const unsigned ResultSize = 1; 4347 4348 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 4349 OpdsMapping[1] = nullptr; // Predicate Operand. 4350 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 4351 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 4352 break; 4353 } 4354 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 4355 // VGPR index can be used for waterfall when indexing a SGPR vector. 
4356 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4357 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4358 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4359 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4360 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4361 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 4362 4363 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 4364 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 4365 4366 // The index can be either if the source vector is VGPR. 4367 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4368 break; 4369 } 4370 case AMDGPU::G_INSERT_VECTOR_ELT: { 4371 unsigned OutputBankID = isSALUMapping(MI) ? 4372 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4373 4374 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4375 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4376 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4377 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4378 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 4379 4380 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4381 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4382 4383 // This is a weird case, because we need to break down the mapping based on 4384 // the register bank of a different operand. 4385 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 4386 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 4387 InsertSize); 4388 } else { 4389 assert(InsertSize == 32 || InsertSize == 64); 4390 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 4391 } 4392 4393 // The index can be either if the source vector is VGPR. 
4394 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 4395 break; 4396 } 4397 case AMDGPU::G_UNMERGE_VALUES: { 4398 unsigned Bank = getMappingType(MRI, MI); 4399 4400 // Op1 and Dst should use the same register bank. 4401 // FIXME: Shouldn't this be the default? Why do we need to handle this? 4402 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4403 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 4404 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 4405 } 4406 break; 4407 } 4408 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 4409 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4410 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4411 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4412 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4413 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE: 4414 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE: 4415 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE: 4416 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE: 4417 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE: 4418 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 4419 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 4420 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 4421 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 4422 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 4423 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 4424 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 4425 case AMDGPU::G_AMDGPU_BUFFER_STORE: 4426 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 4427 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 4428 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 4429 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 4430 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4431 4432 // rsrc 4433 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4434 4435 // vindex 4436 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4437 4438 // voffset 4439 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4440 4441 // soffset 4442 
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4443 4444 // Any remaining operands are immediates and were correctly null 4445 // initialized. 4446 break; 4447 } 4448 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 4449 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 4450 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 4451 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 4452 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 4453 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 4454 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 4455 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 4456 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 4457 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 4458 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 4459 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 4460 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 4461 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 4462 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 4463 // vdata_out 4464 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4465 4466 // vdata_in 4467 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4468 4469 // rsrc 4470 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4471 4472 // vindex 4473 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4474 4475 // voffset 4476 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4477 4478 // soffset 4479 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4480 4481 // Any remaining operands are immediates and were correctly null 4482 // initialized. 
4483 break; 4484 } 4485 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 4486 // vdata_out 4487 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4488 4489 // vdata_in 4490 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4491 4492 // cmp 4493 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4494 4495 // rsrc 4496 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4497 4498 // vindex 4499 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4500 4501 // voffset 4502 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4503 4504 // soffset 4505 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4506 4507 // Any remaining operands are immediates and were correctly null 4508 // initialized. 4509 break; 4510 } 4511 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: 4512 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: 4513 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: 4514 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: 4515 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { 4516 // Lie and claim everything is legal, even though some need to be 4517 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4518 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4519 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4520 4521 // We need to convert this to a MUBUF if either the resource of offset is 4522 // VGPR. 
4523 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4524 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4525 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4526 4527 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4528 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4529 break; 4530 } 4531 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH: 4532 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4533 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4534 break; 4535 case AMDGPU::G_INTRINSIC: 4536 case AMDGPU::G_INTRINSIC_CONVERGENT: { 4537 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 4538 default: 4539 return getInvalidInstructionMapping(); 4540 case Intrinsic::amdgcn_div_fmas: 4541 case Intrinsic::amdgcn_div_fixup: 4542 case Intrinsic::amdgcn_trig_preop: 4543 case Intrinsic::amdgcn_sin: 4544 case Intrinsic::amdgcn_cos: 4545 case Intrinsic::amdgcn_log_clamp: 4546 case Intrinsic::amdgcn_rcp_legacy: 4547 case Intrinsic::amdgcn_rsq_legacy: 4548 case Intrinsic::amdgcn_rsq_clamp: 4549 case Intrinsic::amdgcn_tanh: 4550 case Intrinsic::amdgcn_fmul_legacy: 4551 case Intrinsic::amdgcn_fma_legacy: 4552 case Intrinsic::amdgcn_frexp_mant: 4553 case Intrinsic::amdgcn_frexp_exp: 4554 case Intrinsic::amdgcn_fract: 4555 case Intrinsic::amdgcn_cvt_pknorm_i16: 4556 case Intrinsic::amdgcn_cvt_pknorm_u16: 4557 case Intrinsic::amdgcn_cvt_pk_i16: 4558 case Intrinsic::amdgcn_cvt_pk_u16: 4559 case Intrinsic::amdgcn_cvt_pk_f16_fp8: 4560 case Intrinsic::amdgcn_cvt_pk_f16_bf8: 4561 case Intrinsic::amdgcn_fmed3: 4562 case Intrinsic::amdgcn_cubeid: 4563 case Intrinsic::amdgcn_cubema: 4564 case Intrinsic::amdgcn_cubesc: 4565 case Intrinsic::amdgcn_cubetc: 4566 case Intrinsic::amdgcn_sffbh: 4567 case Intrinsic::amdgcn_fmad_ftz: 4568 case Intrinsic::amdgcn_mbcnt_lo: 4569 case Intrinsic::amdgcn_mbcnt_hi: 4570 case Intrinsic::amdgcn_mul_u24: 4571 case 
Intrinsic::amdgcn_mul_i24: 4572 case Intrinsic::amdgcn_mulhi_u24: 4573 case Intrinsic::amdgcn_mulhi_i24: 4574 case Intrinsic::amdgcn_lerp: 4575 case Intrinsic::amdgcn_sad_u8: 4576 case Intrinsic::amdgcn_msad_u8: 4577 case Intrinsic::amdgcn_sad_hi_u8: 4578 case Intrinsic::amdgcn_sad_u16: 4579 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4580 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4581 case Intrinsic::amdgcn_mqsad_u32_u8: 4582 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4583 case Intrinsic::amdgcn_alignbyte: 4584 case Intrinsic::amdgcn_perm: 4585 case Intrinsic::amdgcn_prng_b32: 4586 case Intrinsic::amdgcn_fdot2: 4587 case Intrinsic::amdgcn_sdot2: 4588 case Intrinsic::amdgcn_udot2: 4589 case Intrinsic::amdgcn_sdot4: 4590 case Intrinsic::amdgcn_udot4: 4591 case Intrinsic::amdgcn_sdot8: 4592 case Intrinsic::amdgcn_udot8: 4593 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4594 case Intrinsic::amdgcn_fdot2_f16_f16: 4595 case Intrinsic::amdgcn_fdot2_f32_bf16: 4596 case Intrinsic::amdgcn_fdot2c_f32_bf16: 4597 case Intrinsic::amdgcn_sudot4: 4598 case Intrinsic::amdgcn_sudot8: 4599 case Intrinsic::amdgcn_dot4_f32_fp8_bf8: 4600 case Intrinsic::amdgcn_dot4_f32_bf8_fp8: 4601 case Intrinsic::amdgcn_dot4_f32_fp8_fp8: 4602 case Intrinsic::amdgcn_dot4_f32_bf8_bf8: 4603 case Intrinsic::amdgcn_cvt_f32_fp8: 4604 case Intrinsic::amdgcn_cvt_f32_fp8_e5m3: 4605 case Intrinsic::amdgcn_cvt_f32_bf8: 4606 case Intrinsic::amdgcn_cvt_off_f32_i4: 4607 case Intrinsic::amdgcn_cvt_pk_f32_fp8: 4608 case Intrinsic::amdgcn_cvt_pk_f32_bf8: 4609 case Intrinsic::amdgcn_cvt_pk_fp8_f32: 4610 case Intrinsic::amdgcn_cvt_pk_bf8_f32: 4611 case Intrinsic::amdgcn_cvt_sr_fp8_f32: 4612 case Intrinsic::amdgcn_cvt_sr_bf8_f32: 4613 case Intrinsic::amdgcn_cvt_sr_bf16_f32: 4614 case Intrinsic::amdgcn_cvt_sr_f16_f32: 4615 case Intrinsic::amdgcn_cvt_f16_fp8: 4616 case Intrinsic::amdgcn_cvt_f16_bf8: 4617 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16: 4618 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16: 4619 case 
Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16: 4620 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16: 4621 case Intrinsic::amdgcn_cvt_scalef32_f16_fp8: 4622 case Intrinsic::amdgcn_cvt_scalef32_f16_bf8: 4623 case Intrinsic::amdgcn_cvt_scalef32_f32_fp8: 4624 case Intrinsic::amdgcn_cvt_scalef32_f32_bf8: 4625 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32: 4626 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32: 4627 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8: 4628 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8: 4629 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16: 4630 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16: 4631 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16: 4632 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16: 4633 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4: 4634 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32: 4635 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4: 4636 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4: 4637 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6: 4638 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6: 4639 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6: 4640 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6: 4641 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6: 4642 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6: 4643 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8: 4644 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8: 4645 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8: 4646 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8: 4647 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16: 4648 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16: 4649 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16: 4650 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16: 4651 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32: 4652 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16: 4653 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16: 4654 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32: 4655 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16: 
4656 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16: 4657 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32: 4658 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16: 4659 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16: 4660 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32: 4661 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16: 4662 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16: 4663 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32: 4664 case Intrinsic::amdgcn_ashr_pk_i8_i32: 4665 case Intrinsic::amdgcn_ashr_pk_u8_i32: 4666 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32: 4667 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32: 4668 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4669 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4670 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: 4671 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: 4672 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4673 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4674 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4675 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4676 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: 4677 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: 4678 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: 4679 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: 4680 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: 4681 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 4682 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 4683 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 4684 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 4685 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 4686 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 4687 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: 4688 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 4689 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 4690 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 4691 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: 4692 return getDefaultMappingVOP(MI); 4693 case 
Intrinsic::amdgcn_log: 4694 case Intrinsic::amdgcn_exp2: 4695 case Intrinsic::amdgcn_rcp: 4696 case Intrinsic::amdgcn_rsq: 4697 case Intrinsic::amdgcn_sqrt: { 4698 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4699 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 4700 isSALUMapping(MI)) 4701 return getDefaultMappingSOP(MI); 4702 return getDefaultMappingVOP(MI); 4703 } 4704 case Intrinsic::amdgcn_sbfe: 4705 case Intrinsic::amdgcn_ubfe: 4706 if (isSALUMapping(MI)) 4707 return getDefaultMappingSOP(MI); 4708 return getDefaultMappingVOP(MI); 4709 case Intrinsic::amdgcn_ds_swizzle: 4710 case Intrinsic::amdgcn_ds_permute: 4711 case Intrinsic::amdgcn_ds_bpermute: 4712 case Intrinsic::amdgcn_update_dpp: 4713 case Intrinsic::amdgcn_mov_dpp8: 4714 case Intrinsic::amdgcn_mov_dpp: 4715 case Intrinsic::amdgcn_strict_wwm: 4716 case Intrinsic::amdgcn_wwm: 4717 case Intrinsic::amdgcn_strict_wqm: 4718 case Intrinsic::amdgcn_wqm: 4719 case Intrinsic::amdgcn_softwqm: 4720 case Intrinsic::amdgcn_set_inactive: 4721 case Intrinsic::amdgcn_set_inactive_chain_arg: 4722 case Intrinsic::amdgcn_permlane64: 4723 case Intrinsic::amdgcn_ds_bpermute_fi_b32: 4724 return getDefaultMappingAllVGPR(MI); 4725 case Intrinsic::amdgcn_cvt_pkrtz: 4726 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) 4727 return getDefaultMappingSOP(MI); 4728 return getDefaultMappingVOP(MI); 4729 case Intrinsic::amdgcn_kernarg_segment_ptr: 4730 case Intrinsic::amdgcn_s_getpc: 4731 case Intrinsic::amdgcn_groupstaticsize: 4732 case Intrinsic::amdgcn_reloc_constant: 4733 case Intrinsic::returnaddress: { 4734 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4735 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4736 break; 4737 } 4738 case Intrinsic::amdgcn_wqm_vote: { 4739 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4740 OpdsMapping[0] = OpdsMapping[2] 4741 = 
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4742 break; 4743 } 4744 case Intrinsic::amdgcn_ps_live: { 4745 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4746 break; 4747 } 4748 case Intrinsic::amdgcn_div_scale: { 4749 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4750 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4751 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4752 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4753 4754 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4755 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4756 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4757 break; 4758 } 4759 case Intrinsic::amdgcn_class: { 4760 Register Src0Reg = MI.getOperand(2).getReg(); 4761 Register Src1Reg = MI.getOperand(3).getReg(); 4762 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4763 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4764 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4765 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4766 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4767 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4768 break; 4769 } 4770 case Intrinsic::amdgcn_icmp: 4771 case Intrinsic::amdgcn_fcmp: { 4772 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4773 // This is not VCCRegBank because this is not used in boolean contexts. 
4774 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4775 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4776 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4777 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4778 break; 4779 } 4780 case Intrinsic::amdgcn_readlane: { 4781 // This must be an SGPR, but accept a VGPR. 4782 Register IdxReg = MI.getOperand(3).getReg(); 4783 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4784 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4785 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4786 [[fallthrough]]; 4787 } 4788 case Intrinsic::amdgcn_readfirstlane: { 4789 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4790 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4791 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4792 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4793 break; 4794 } 4795 case Intrinsic::amdgcn_writelane: { 4796 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4797 Register SrcReg = MI.getOperand(2).getReg(); 4798 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4799 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4800 Register IdxReg = MI.getOperand(3).getReg(); 4801 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4802 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4803 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4804 4805 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4806 // to legalize. 
4807 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4808 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4809 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4810 break; 4811 } 4812 case Intrinsic::amdgcn_if_break: { 4813 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4814 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4815 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4816 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4817 break; 4818 } 4819 case Intrinsic::amdgcn_permlane16: 4820 case Intrinsic::amdgcn_permlanex16: { 4821 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4822 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4823 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4824 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4825 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4826 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4827 break; 4828 } 4829 case Intrinsic::amdgcn_permlane16_var: 4830 case Intrinsic::amdgcn_permlanex16_var: { 4831 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4832 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4833 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4834 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4835 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4836 break; 4837 } 4838 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4839 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4840 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4841 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4842 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4843 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4844 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4845 case 
Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4846 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4847 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4848 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4849 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4850 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4851 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4852 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4853 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4854 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4855 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4856 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4857 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4858 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4859 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4860 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4861 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4862 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4863 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4864 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: 4865 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: 4866 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: 4867 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: 4868 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: 4869 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: 4870 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: 4871 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: 4872 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: 4873 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: 4874 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: 4875 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: 4876 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: 4877 case Intrinsic::amdgcn_mfma_f32_16x16x32_f16: 4878 case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: 4879 case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: 4880 case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: 4881 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { 4882 // Default for MAI intrinsics. 4883 // srcC can also be an immediate which can be folded later. 
4884 // FIXME: Should we eventually add an alternative mapping with AGPR src 4885 // for srcA/srcB? 4886 // 4887 // vdst, srcA, srcB, srcC 4888 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4889 OpdsMapping[0] = 4890 Info->mayNeedAGPRs() 4891 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4892 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4893 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4894 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4895 OpdsMapping[4] = 4896 Info->mayNeedAGPRs() 4897 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4898 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4899 break; 4900 } 4901 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: 4902 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { 4903 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4904 OpdsMapping[0] = 4905 Info->mayNeedAGPRs() 4906 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4907 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4908 4909 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4910 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4911 OpdsMapping[4] = 4912 Info->mayNeedAGPRs() 4913 ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4914 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4915 4916 OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); 4917 OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); 4918 break; 4919 } 4920 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 4921 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 4922 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 4923 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 4924 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 4925 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 4926 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 4927 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 4928 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 4929 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 4930 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 4931 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 4932 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 4933 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: 4934 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: 4935 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16: 4936 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: 4937 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: 4938 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: 4939 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: 4940 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: 4941 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: 4942 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: 4943 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: 4944 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: 4945 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: 4946 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: 4947 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { 4948 // vdst, srcA, srcB, srcC, idx 4949 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4950 OpdsMapping[2] = 
getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4951 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4952 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4953 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4954 break; 4955 } 4956 case Intrinsic::amdgcn_interp_p1: 4957 case Intrinsic::amdgcn_interp_p2: 4958 case Intrinsic::amdgcn_interp_mov: 4959 case Intrinsic::amdgcn_interp_p1_f16: 4960 case Intrinsic::amdgcn_interp_p2_f16: 4961 case Intrinsic::amdgcn_lds_param_load: { 4962 const int M0Idx = MI.getNumOperands() - 1; 4963 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4964 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4965 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4966 4967 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4968 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4969 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4970 4971 // Must be SGPR, but we must take whatever the original bank is and fix it 4972 // later. 
4973 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4974 break; 4975 } 4976 case Intrinsic::amdgcn_interp_inreg_p10: 4977 case Intrinsic::amdgcn_interp_inreg_p2: 4978 case Intrinsic::amdgcn_interp_inreg_p10_f16: 4979 case Intrinsic::amdgcn_interp_inreg_p2_f16: 4980 case Intrinsic::amdgcn_interp_p10_rtz_f16: 4981 case Intrinsic::amdgcn_interp_p2_rtz_f16: { 4982 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4983 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4984 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4985 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4986 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4987 break; 4988 } 4989 case Intrinsic::amdgcn_permlane16_swap: 4990 case Intrinsic::amdgcn_permlane32_swap: { 4991 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4992 OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] = 4993 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4994 break; 4995 } 4996 case Intrinsic::amdgcn_ballot: { 4997 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4998 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4999 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 5000 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 5001 break; 5002 } 5003 case Intrinsic::amdgcn_inverse_ballot: { 5004 // This must be an SGPR, but accept a VGPR. 
5005 Register MaskReg = MI.getOperand(2).getReg(); 5006 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); 5007 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); 5008 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 5009 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); 5010 break; 5011 } 5012 case Intrinsic::amdgcn_bitop3: { 5013 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 5014 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5015 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5016 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5017 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5018 break; 5019 } 5020 case Intrinsic::amdgcn_s_quadmask: 5021 case Intrinsic::amdgcn_s_wqm: { 5022 Register MaskReg = MI.getOperand(2).getReg(); 5023 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); 5024 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); 5025 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize); 5026 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); 5027 break; 5028 } 5029 case Intrinsic::amdgcn_wave_reduce_add: 5030 case Intrinsic::amdgcn_wave_reduce_sub: 5031 case Intrinsic::amdgcn_wave_reduce_min: 5032 case Intrinsic::amdgcn_wave_reduce_umin: 5033 case Intrinsic::amdgcn_wave_reduce_max: 5034 case Intrinsic::amdgcn_wave_reduce_umax: 5035 case Intrinsic::amdgcn_wave_reduce_and: 5036 case Intrinsic::amdgcn_wave_reduce_or: 5037 case Intrinsic::amdgcn_wave_reduce_xor: { 5038 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5039 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 5040 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 5041 auto regBankID = 5042 isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 5043 OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); 5044 break; 5045 } 5046 case Intrinsic::amdgcn_s_bitreplicate: 5047 Register MaskReg = MI.getOperand(2).getReg(); 5048 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); 5049 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); 5050 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32); 5051 } 5052 break; 5053 } 5054 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 5055 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 5056 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: 5057 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 5058 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 5059 auto IntrID = AMDGPU::getIntrinsicID(MI); 5060 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 5061 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 5062 // Non-images can have complications from operands that allow both SGPR 5063 // and VGPR. For now it's too complicated to figure out the final opcode 5064 // to derive the register bank from the MCInstrDesc. 5065 assert(RSrcIntrin->IsImage); 5066 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 5067 } 5068 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: 5069 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: 5070 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { 5071 bool IsDualOrBVH8 = 5072 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || 5073 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; 5074 unsigned NumMods = IsDualOrBVH8 ? 
0 : 1; // Has A16 modifier 5075 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; 5076 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5077 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 5078 if (IsDualOrBVH8) { 5079 OpdsMapping[1] = AMDGPU::getValueMapping( 5080 AMDGPU::VGPRRegBankID, 5081 MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); 5082 OpdsMapping[2] = AMDGPU::getValueMapping( 5083 AMDGPU::VGPRRegBankID, 5084 MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); 5085 } 5086 OpdsMapping[LastRegOpIdx] = 5087 getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI); 5088 if (LastRegOpIdx == 3) { 5089 // Sequential form: all operands combined into VGPR256/VGPR512 5090 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 5091 if (Size > 256) 5092 Size = 512; 5093 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5094 } else { 5095 // NSA form 5096 unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 
4 : 2; 5097 for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) { 5098 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); 5099 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 5100 } 5101 } 5102 break; 5103 } 5104 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 5105 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { 5106 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 5107 switch (IntrID) { 5108 case Intrinsic::amdgcn_s_getreg: 5109 case Intrinsic::amdgcn_s_memtime: 5110 case Intrinsic::amdgcn_s_memrealtime: 5111 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: 5112 case Intrinsic::amdgcn_s_sendmsg_rtn: { 5113 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5114 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 5115 break; 5116 } 5117 case Intrinsic::amdgcn_global_atomic_csub: 5118 case Intrinsic::amdgcn_global_atomic_fmin_num: 5119 case Intrinsic::amdgcn_global_atomic_fmax_num: 5120 case Intrinsic::amdgcn_flat_atomic_fmin_num: 5121 case Intrinsic::amdgcn_flat_atomic_fmax_num: 5122 case Intrinsic::amdgcn_atomic_cond_sub_u32: 5123 case Intrinsic::amdgcn_global_atomic_ordered_add_b64: 5124 case Intrinsic::amdgcn_global_load_tr_b64: 5125 case Intrinsic::amdgcn_global_load_tr_b128: 5126 case Intrinsic::amdgcn_global_load_tr4_b64: 5127 case Intrinsic::amdgcn_global_load_tr6_b96: 5128 case Intrinsic::amdgcn_ds_load_tr8_b64: 5129 case Intrinsic::amdgcn_ds_load_tr16_b128: 5130 case Intrinsic::amdgcn_ds_load_tr4_b64: 5131 case Intrinsic::amdgcn_ds_load_tr6_b96: 5132 case Intrinsic::amdgcn_ds_read_tr4_b64: 5133 case Intrinsic::amdgcn_ds_read_tr6_b96: 5134 case Intrinsic::amdgcn_ds_read_tr8_b64: 5135 case Intrinsic::amdgcn_ds_read_tr16_b64: 5136 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: 5137 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: 5138 return getDefaultMappingAllVGPR(MI); 5139 case Intrinsic::amdgcn_ds_ordered_add: 5140 case 
Intrinsic::amdgcn_ds_ordered_swap: { 5141 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5142 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 5143 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 5144 AMDGPU::SGPRRegBankID); 5145 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 5146 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5147 break; 5148 } 5149 case Intrinsic::amdgcn_ds_append: 5150 case Intrinsic::amdgcn_ds_consume: { 5151 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5152 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 5153 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5154 break; 5155 } 5156 case Intrinsic::amdgcn_exp_compr: 5157 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5158 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5159 break; 5160 case Intrinsic::amdgcn_exp: 5161 // FIXME: Could we support packed types here? 5162 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5163 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5164 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5165 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5166 break; 5167 case Intrinsic::amdgcn_exp_row: 5168 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5169 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5170 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5171 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5172 OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); 5173 break; 5174 case Intrinsic::amdgcn_s_sendmsg: 5175 case Intrinsic::amdgcn_s_sendmsghalt: { 5176 // This must be an SGPR, but accept a VGPR. 
5177 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 5178 AMDGPU::SGPRRegBankID); 5179 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 5180 break; 5181 } 5182 case Intrinsic::amdgcn_s_setreg: { 5183 // This must be an SGPR, but accept a VGPR. 5184 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 5185 AMDGPU::SGPRRegBankID); 5186 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 5187 break; 5188 } 5189 case Intrinsic::amdgcn_s_ttracedata: { 5190 // This must be an SGPR, but accept a VGPR. 5191 unsigned Bank = 5192 getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); 5193 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 5194 break; 5195 } 5196 case Intrinsic::amdgcn_end_cf: { 5197 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 5198 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 5199 break; 5200 } 5201 case Intrinsic::amdgcn_else: { 5202 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 5203 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 5204 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 5205 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 5206 break; 5207 } 5208 case Intrinsic::amdgcn_init_whole_wave: 5209 case Intrinsic::amdgcn_live_mask: { 5210 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 5211 break; 5212 } 5213 case Intrinsic::amdgcn_wqm_demote: 5214 case Intrinsic::amdgcn_kill: { 5215 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 5216 break; 5217 } 5218 case Intrinsic::amdgcn_raw_buffer_load: 5219 case Intrinsic::amdgcn_raw_ptr_buffer_load: 5220 case Intrinsic::amdgcn_raw_atomic_buffer_load: 5221 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 5222 case Intrinsic::amdgcn_raw_tbuffer_load: 5223 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { 5224 // FIXME: Should make intrinsic ID the last operand of the instruction, 5225 // 
then this would be the same as store 5226 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5227 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5228 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 5229 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5230 break; 5231 } 5232 case Intrinsic::amdgcn_raw_buffer_load_lds: 5233 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { 5234 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5235 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5236 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5237 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 5238 break; 5239 } 5240 case Intrinsic::amdgcn_raw_buffer_store: 5241 case Intrinsic::amdgcn_raw_ptr_buffer_store: 5242 case Intrinsic::amdgcn_raw_buffer_store_format: 5243 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: 5244 case Intrinsic::amdgcn_raw_tbuffer_store: 5245 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { 5246 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5247 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5248 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 5249 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5250 break; 5251 } 5252 case Intrinsic::amdgcn_struct_buffer_load: 5253 case Intrinsic::amdgcn_struct_ptr_buffer_load: 5254 case Intrinsic::amdgcn_struct_tbuffer_load: 5255 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: 5256 case Intrinsic::amdgcn_struct_atomic_buffer_load: 5257 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { 5258 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5259 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5260 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 5261 
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5262 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 5263 break; 5264 } 5265 case Intrinsic::amdgcn_struct_buffer_load_lds: 5266 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 5267 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5268 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5269 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5270 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 5271 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 5272 break; 5273 } 5274 case Intrinsic::amdgcn_struct_buffer_store: 5275 case Intrinsic::amdgcn_struct_ptr_buffer_store: 5276 case Intrinsic::amdgcn_struct_tbuffer_store: 5277 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { 5278 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5279 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5280 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 5281 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 5282 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 5283 break; 5284 } 5285 case Intrinsic::amdgcn_init_exec_from_input: { 5286 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 5287 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 5288 break; 5289 } 5290 case Intrinsic::amdgcn_ds_gws_init: 5291 case Intrinsic::amdgcn_ds_gws_barrier: 5292 case Intrinsic::amdgcn_ds_gws_sema_br: { 5293 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5294 5295 // This must be an SGPR, but accept a VGPR. 
5296 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 5297 AMDGPU::SGPRRegBankID); 5298 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 5299 break; 5300 } 5301 case Intrinsic::amdgcn_ds_gws_sema_v: 5302 case Intrinsic::amdgcn_ds_gws_sema_p: 5303 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 5304 // This must be an SGPR, but accept a VGPR. 5305 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, 5306 AMDGPU::SGPRRegBankID); 5307 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 5308 break; 5309 } 5310 case Intrinsic::amdgcn_load_to_lds: 5311 case Intrinsic::amdgcn_global_load_lds: { 5312 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5313 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5314 break; 5315 } 5316 case Intrinsic::amdgcn_lds_direct_load: { 5317 const int M0Idx = MI.getNumOperands() - 1; 5318 Register M0Reg = MI.getOperand(M0Idx).getReg(); 5319 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 5320 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5321 5322 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 5323 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 5324 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 5325 5326 // Must be SGPR, but we must take whatever the original bank is and fix it 5327 // later. 
5328 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 5329 break; 5330 } 5331 case Intrinsic::amdgcn_ds_add_gs_reg_rtn: 5332 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: 5333 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5334 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5335 break; 5336 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 5337 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: 5338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: 5339 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { 5340 OpdsMapping[0] = 5341 getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst 5342 OpdsMapping[1] = 5343 getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr 5344 OpdsMapping[3] = 5345 getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr 5346 OpdsMapping[4] = 5347 getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0 5348 OpdsMapping[5] = 5349 getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 5350 break; 5351 } 5352 case Intrinsic::amdgcn_s_sleep_var: 5353 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5354 break; 5355 case Intrinsic::amdgcn_s_barrier_signal_var: 5356 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5357 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5358 break; 5359 case Intrinsic::amdgcn_s_barrier_signal_isfirst: { 5360 const unsigned ResultSize = 1; 5361 OpdsMapping[0] = 5362 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize); 5363 break; 5364 } 5365 case Intrinsic::amdgcn_s_get_barrier_state: 5366 case Intrinsic::amdgcn_s_get_named_barrier_state: { 5367 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5368 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5369 break; 5370 } 5371 case Intrinsic::amdgcn_pops_exiting_wave_id: 5372 return getDefaultMappingSOP(MI); 5373 case 
Intrinsic::amdgcn_tensor_load_to_lds_d2: 5374 case Intrinsic::amdgcn_tensor_store_from_lds_d2: 5375 case Intrinsic::amdgcn_tensor_load_to_lds: 5376 case Intrinsic::amdgcn_tensor_store_from_lds: { 5377 // Lie and claim everything is legal, even all operands need to be 5378 // SGPRs. applyMapping will have to deal with it with readfirstlane. 5379 for (unsigned I = 1; I < MI.getNumOperands(); ++I) { 5380 if (MI.getOperand(I).isReg()) { 5381 Register Reg = MI.getOperand(I).getReg(); 5382 auto OpBank = getRegBankID(Reg, MRI); 5383 unsigned Size = getSizeInBits(Reg, MRI, *TRI); 5384 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size); 5385 } 5386 } 5387 break; 5388 } 5389 case Intrinsic::amdgcn_s_prefetch_data: { 5390 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5391 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5392 break; 5393 } 5394 default: 5395 return getInvalidInstructionMapping(); 5396 } 5397 break; 5398 } 5399 case AMDGPU::G_SELECT: { 5400 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 5401 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 5402 AMDGPU::SGPRRegBankID); 5403 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, 5404 AMDGPU::SGPRRegBankID); 5405 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 5406 Op3Bank == AMDGPU::SGPRRegBankID; 5407 5408 unsigned CondBankDefault = SGPRSrcs ? 5409 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 5410 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, 5411 CondBankDefault); 5412 if (CondBank == AMDGPU::SGPRRegBankID) 5413 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 5414 else if (CondBank == AMDGPU::VGPRRegBankID) 5415 CondBank = AMDGPU::VCCRegBankID; 5416 5417 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 
5418 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 5419 5420 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 5421 5422 // TODO: Should report 32-bit for scalar condition type. 5423 if (Size == 64) { 5424 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 5425 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 5426 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 5427 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 5428 } else { 5429 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 5430 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 5431 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 5432 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 5433 } 5434 5435 break; 5436 } 5437 5438 case AMDGPU::G_SI_CALL: { 5439 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); 5440 // Lie and claim everything is legal, even though some need to be 5441 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
5442 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 5443 5444 // Allow anything for implicit arguments 5445 for (unsigned I = 4; I < MI.getNumOperands(); ++I) { 5446 if (MI.getOperand(I).isReg()) { 5447 Register Reg = MI.getOperand(I).getReg(); 5448 auto OpBank = getRegBankID(Reg, MRI); 5449 unsigned Size = getSizeInBits(Reg, MRI, *TRI); 5450 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size); 5451 } 5452 } 5453 break; 5454 } 5455 case AMDGPU::G_LOAD: 5456 case AMDGPU::G_ZEXTLOAD: 5457 case AMDGPU::G_SEXTLOAD: 5458 return getInstrMappingForLoad(MI); 5459 5460 case AMDGPU::G_ATOMICRMW_XCHG: 5461 case AMDGPU::G_ATOMICRMW_ADD: 5462 case AMDGPU::G_ATOMICRMW_SUB: 5463 case AMDGPU::G_ATOMICRMW_AND: 5464 case AMDGPU::G_ATOMICRMW_OR: 5465 case AMDGPU::G_ATOMICRMW_XOR: 5466 case AMDGPU::G_ATOMICRMW_MAX: 5467 case AMDGPU::G_ATOMICRMW_MIN: 5468 case AMDGPU::G_ATOMICRMW_UMAX: 5469 case AMDGPU::G_ATOMICRMW_UMIN: 5470 case AMDGPU::G_ATOMICRMW_FADD: 5471 case AMDGPU::G_ATOMICRMW_FMIN: 5472 case AMDGPU::G_ATOMICRMW_FMAX: 5473 case AMDGPU::G_ATOMICRMW_UINC_WRAP: 5474 case AMDGPU::G_ATOMICRMW_UDEC_WRAP: 5475 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { 5476 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5477 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 5478 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5479 break; 5480 } 5481 case AMDGPU::G_ATOMIC_CMPXCHG: { 5482 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5483 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 5484 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 5485 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 5486 break; 5487 } 5488 case AMDGPU::G_BRCOND: { 5489 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, 5490 AMDGPU::SGPRRegBankID); 5491 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 
5492 if (Bank != AMDGPU::SGPRRegBankID) 5493 Bank = AMDGPU::VCCRegBankID; 5494 5495 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 5496 break; 5497 } 5498 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: 5499 return getDefaultMappingVOP(MI); 5500 case AMDGPU::G_PREFETCH: 5501 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 5502 break; 5503 } 5504 5505 return getInstructionMapping(/*ID*/1, /*Cost*/1, 5506 getOperandsMapping(OpdsMapping), 5507 MI.getNumOperands()); 5508 } 5509