//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks,
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value; otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32 bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
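/// For illustration (the register numbers are arbitrary and the exact
/// per-encoding rules are in the ISA documentation), on pre-gfx10 targets:
///
///   v_add_f32 v0, s1, s1   ; ok: a single unique SGPR is read
///   v_add_f32 v0, s1, v2   ; ok: only one SGPR operand
///   v_add_f32 v0, s1, s2   ; violates the restriction: two unique SGPRs
///
/// The last form requires copying one of the SGPRs into a VGPR first.
///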
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by the
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ?
-1 : 1); 143 auto False = B.buildConstant(S32, 0); 144 B.buildSelect(DstReg, SrcReg, True, False); 145 MRI.setRegBank(True.getReg(0), *NewBank); 146 MRI.setRegBank(False.getReg(0), *NewBank); 147 MI.eraseFromParent(); 148 } 149 150 assert(!MRI.getRegClassOrRegBank(DstReg)); 151 MRI.setRegBank(DstReg, *NewBank); 152 return; 153 } 154 155 #ifndef NDEBUG 156 if (Opc == AMDGPU::G_TRUNC) { 157 Register DstReg = MI.getOperand(0).getReg(); 158 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); 159 assert(DstBank != &AMDGPU::VCCRegBank); 160 } 161 #endif 162 163 for (MachineOperand &Op : MI.operands()) { 164 if (!Op.isReg()) 165 continue; 166 167 // We may see physical registers if building a real MI 168 Register Reg = Op.getReg(); 169 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) 170 continue; 171 172 const RegisterBank *RB = NewBank; 173 if (MRI.getType(Reg) == LLT::scalar(1)) { 174 assert(NewBank == &AMDGPU::VGPRRegBank && 175 "s1 operands should only be used for vector bools"); 176 assert((MI.getOpcode() != AMDGPU::G_TRUNC && 177 MI.getOpcode() != AMDGPU::G_ANYEXT) && 178 "not expecting legalization artifacts here"); 179 RB = &AMDGPU::VCCRegBank; 180 } 181 182 MRI.setRegBank(Reg, *RB); 183 } 184 } 185 186 void erasingInstr(MachineInstr &MI) override {} 187 188 void createdInstr(MachineInstr &MI) override { 189 // At this point, the instruction was just inserted and has no operands. 190 NewInsts.push_back(&MI); 191 } 192 193 void changingInstr(MachineInstr &MI) override {} 194 void changedInstr(MachineInstr &MI) override { 195 // FIXME: In principle we should probably add the instruction to NewInsts, 196 // but the way the LegalizerHelper uses the observer, we will always see the 197 // registers we need to set the regbank on also referenced in a new 198 // instruction. 199 } 200 }; 201 202 } 203 204 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) 205 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), 206 TII(Subtarget.getInstrInfo()) { 207 208 // HACK: Until this is fully tablegen'd. 209 static llvm::once_flag InitializeRegisterBankFlag; 210 211 static auto InitializeRegisterBankOnce = [this]() { 212 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && 213 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && 214 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); 215 (void)this; 216 }; 217 218 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); 219 } 220 221 static bool isVectorRegisterBank(const RegisterBank &Bank) { 222 unsigned BankID = Bank.getID(); 223 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; 224 } 225 226 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { 227 return RB != &AMDGPU::SGPRRegBank; 228 } 229 230 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, 231 const RegisterBank &Src, 232 TypeSize Size) const { 233 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? 234 if (Dst.getID() == AMDGPU::SGPRRegBankID && 235 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { 236 return std::numeric_limits<unsigned>::max(); 237 } 238 239 // Bool values are tricky, because the meaning is based on context. The SCC 240 // and VCC banks are for the natural scalar and vector conditions produced by 241 // a compare. 
242 // 243 // Legalization doesn't know about the necessary context, so an s1 use may 244 // have been a truncate from an arbitrary value, in which case a copy (lowered 245 // as a compare with 0) needs to be inserted. 246 if (Size == 1 && 247 (Dst.getID() == AMDGPU::SGPRRegBankID) && 248 (isVectorRegisterBank(Src) || 249 Src.getID() == AMDGPU::SGPRRegBankID || 250 Src.getID() == AMDGPU::VCCRegBankID)) 251 return std::numeric_limits<unsigned>::max(); 252 253 // There is no direct copy between AGPRs. 254 if (Dst.getID() == AMDGPU::AGPRRegBankID && 255 Src.getID() == AMDGPU::AGPRRegBankID) 256 return 4; 257 258 return RegisterBankInfo::copyCost(Dst, Src, Size); 259 } 260 261 unsigned AMDGPURegisterBankInfo::getBreakDownCost( 262 const ValueMapping &ValMapping, 263 const RegisterBank *CurBank) const { 264 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to 265 // VGPR. 266 // FIXME: Is there a better way to do this? 267 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) 268 return 10; // This is expensive. 269 270 assert(ValMapping.NumBreakDowns == 2 && 271 ValMapping.BreakDown[0].Length == 32 && 272 ValMapping.BreakDown[0].StartIdx == 0 && 273 ValMapping.BreakDown[1].Length == 32 && 274 ValMapping.BreakDown[1].StartIdx == 32 && 275 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); 276 277 // 32-bit extract of a 64-bit value is just access of a subregister, so free. 278 // TODO: Cost of 0 hits assert, though it's not clear it's what we really 279 // want. 280 281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR 282 // alignment restrictions, but this probably isn't important. 283 return 1; 284 } 285 286 const RegisterBank & 287 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, 288 LLT Ty) const { 289 if (&RC == &AMDGPU::SReg_1RegClass) 290 return AMDGPU::VCCRegBank; 291 292 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a 293 // VCC-like use. 294 if (TRI->isSGPRClass(&RC)) { 295 // FIXME: This probably came from a copy from a physical register, which 296 // should be inferable from the copied to-type. We don't have many boolean 297 // physical register constraints so just assume a normal SGPR for now. 298 if (!Ty.isValid()) 299 return AMDGPU::SGPRRegBank; 300 301 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 302 } 303 304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; 305 } 306 307 template <unsigned NumOps> 308 RegisterBankInfo::InstructionMappings 309 AMDGPURegisterBankInfo::addMappingFromTable( 310 const MachineInstr &MI, const MachineRegisterInfo &MRI, 311 const std::array<unsigned, NumOps> RegSrcOpIdx, 312 ArrayRef<OpRegBankEntry<NumOps>> Table) const { 313 314 InstructionMappings AltMappings; 315 316 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); 317 318 unsigned Sizes[NumOps]; 319 for (unsigned I = 0; I < NumOps; ++I) { 320 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); 321 Sizes[I] = getSizeInBits(Reg, MRI, *TRI); 322 } 323 324 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { 325 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); 326 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); 327 } 328 329 // getInstrMapping's default mapping uses ID 1, so start at 2. 
330 unsigned MappingID = 2; 331 for (const auto &Entry : Table) { 332 for (unsigned I = 0; I < NumOps; ++I) { 333 int OpIdx = RegSrcOpIdx[I]; 334 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); 335 } 336 337 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, 338 getOperandsMapping(Operands), 339 Operands.size())); 340 } 341 342 return AltMappings; 343 } 344 345 RegisterBankInfo::InstructionMappings 346 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( 347 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 348 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 349 case Intrinsic::amdgcn_readlane: { 350 static const OpRegBankEntry<3> Table[2] = { 351 // Perfectly legal. 352 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 353 354 // Need a readfirstlane for the index. 355 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 356 }; 357 358 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 359 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); 360 } 361 case Intrinsic::amdgcn_writelane: { 362 static const OpRegBankEntry<4> Table[4] = { 363 // Perfectly legal. 364 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 365 366 // Need readfirstlane of first op 367 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 368 369 // Need readfirstlane of second op 370 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 371 372 // Need readfirstlane of both ops 373 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } 374 }; 375 376 // rsrc, voffset, offset 377 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; 378 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table); 379 } 380 default: 381 return RegisterBankInfo::getInstrAlternativeMappings(MI); 382 } 383 } 384 385 RegisterBankInfo::InstructionMappings 386 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( 387 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 388 389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 390 case Intrinsic::amdgcn_s_buffer_load: { 391 static const OpRegBankEntry<2> Table[4] = { 392 // Perfectly legal. 393 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 394 395 // Only need 1 register in loop 396 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, 397 398 // Have to waterfall the resource. 399 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 400 401 // Have to waterfall the resource, and the offset. 402 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } 403 }; 404 405 // rsrc, offset 406 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; 407 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table); 408 } 409 case Intrinsic::amdgcn_ds_ordered_add: 410 case Intrinsic::amdgcn_ds_ordered_swap: { 411 // VGPR = M0, VGPR 412 static const OpRegBankEntry<3> Table[2] = { 413 // Perfectly legal. 
414 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 415 416 // Need a readfirstlane for m0 417 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 418 }; 419 420 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 421 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); 422 } 423 case Intrinsic::amdgcn_s_sendmsg: 424 case Intrinsic::amdgcn_s_sendmsghalt: { 425 // FIXME: Should have no register for immediate 426 static const OpRegBankEntry<1> Table[2] = { 427 // Perfectly legal. 428 { { AMDGPU::SGPRRegBankID }, 1 }, 429 430 // Need readlane 431 { { AMDGPU::VGPRRegBankID }, 3 } 432 }; 433 434 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; 435 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table); 436 } 437 default: 438 return RegisterBankInfo::getInstrAlternativeMappings(MI); 439 } 440 } 441 442 // FIXME: Returns uniform if there's no source value information. This is 443 // probably wrong. 444 bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { 445 if (!MI.hasOneMemOperand()) 446 return false; 447 448 const MachineMemOperand *MMO = *MI.memoperands_begin(); 449 const unsigned AS = MMO->getAddrSpace(); 450 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || 451 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; 452 const unsigned MemSize = 8 * MMO->getSize(); 453 454 // Require 4-byte alignment. 455 return (MMO->getAlign() >= Align(4) || 456 (Subtarget.hasScalarSubwordLoads() && 457 ((MemSize == 16 && MMO->getAlign() >= Align(2)) || 458 (MemSize == 8 && MMO->getAlign() >= Align(1))))) && 459 // Can't do a scalar atomic load. 460 !MMO->isAtomic() && 461 // Don't use scalar loads for volatile accesses to non-constant address 462 // spaces. 463 (IsConst || !MMO->isVolatile()) && 464 // Memory must be known constant, or not written before this load. 465 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && 466 AMDGPUInstrInfo::isUniformMMO(MMO); 467 } 468 469 RegisterBankInfo::InstructionMappings 470 AMDGPURegisterBankInfo::getInstrAlternativeMappings( 471 const MachineInstr &MI) const { 472 473 const MachineFunction &MF = *MI.getParent()->getParent(); 474 const MachineRegisterInfo &MRI = MF.getRegInfo(); 475 476 477 InstructionMappings AltMappings; 478 switch (MI.getOpcode()) { 479 case TargetOpcode::G_CONSTANT: 480 case TargetOpcode::G_IMPLICIT_DEF: { 481 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 482 if (Size == 1) { 483 static const OpRegBankEntry<1> Table[3] = { 484 { { AMDGPU::VGPRRegBankID }, 1 }, 485 { { AMDGPU::SGPRRegBankID }, 1 }, 486 { { AMDGPU::VCCRegBankID }, 1 } 487 }; 488 489 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 490 } 491 492 [[fallthrough]]; 493 } 494 case TargetOpcode::G_FCONSTANT: 495 case TargetOpcode::G_FRAME_INDEX: 496 case TargetOpcode::G_GLOBAL_VALUE: { 497 static const OpRegBankEntry<1> Table[2] = { 498 { { AMDGPU::VGPRRegBankID }, 1 }, 499 { { AMDGPU::SGPRRegBankID }, 1 } 500 }; 501 502 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 503 } 504 case TargetOpcode::G_AND: 505 case TargetOpcode::G_OR: 506 case TargetOpcode::G_XOR: { 507 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 508 509 if (Size == 1) { 510 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. 
511 const InstructionMapping &SCCMapping = getInstructionMapping( 512 1, 1, getOperandsMapping( 513 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 514 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), 515 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), 516 3); // Num Operands 517 AltMappings.push_back(&SCCMapping); 518 519 const InstructionMapping &VCCMapping0 = getInstructionMapping( 520 2, 1, getOperandsMapping( 521 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 522 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 523 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), 524 3); // Num Operands 525 AltMappings.push_back(&VCCMapping0); 526 return AltMappings; 527 } 528 529 if (Size != 64) 530 break; 531 532 const InstructionMapping &SSMapping = getInstructionMapping( 533 1, 1, getOperandsMapping( 534 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 535 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 536 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 537 3); // Num Operands 538 AltMappings.push_back(&SSMapping); 539 540 const InstructionMapping &VVMapping = getInstructionMapping( 541 2, 2, getOperandsMapping( 542 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 543 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 544 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 545 3); // Num Operands 546 AltMappings.push_back(&VVMapping); 547 break; 548 } 549 case TargetOpcode::G_LOAD: 550 case TargetOpcode::G_ZEXTLOAD: 551 case TargetOpcode::G_SEXTLOAD: { 552 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 553 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 554 unsigned PtrSize = PtrTy.getSizeInBits(); 555 unsigned AS = PtrTy.getAddressSpace(); 556 557 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && 558 AS != AMDGPUAS::PRIVATE_ADDRESS) && 559 isScalarLoadLegal(MI)) { 560 const InstructionMapping &SSMapping = getInstructionMapping( 561 1, 1, getOperandsMapping( 562 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 563 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 564 2); // Num Operands 565 AltMappings.push_back(&SSMapping); 566 } 567 568 const InstructionMapping &VVMapping = getInstructionMapping( 569 2, 1, 570 getOperandsMapping( 571 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 572 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 573 2); // Num Operands 574 AltMappings.push_back(&VVMapping); 575 576 // It may be possible to have a vgpr = load sgpr mapping here, because 577 // the mubuf instructions support this kind of load, but probably for only 578 // gfx7 and older. However, the addressing mode matching in the instruction 579 // selector should be able to do a better job of detecting and selecting 580 // these kinds of loads from the vgpr = load vgpr mapping. 
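    // (For context: the all-SGPR mapping above is the one that typically
    // selects to an SMEM load, while the all-VGPR mapping covers the flat,
    // global, buffer and DS forms.)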
581 582 return AltMappings; 583 584 } 585 case TargetOpcode::G_SELECT: { 586 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 587 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 588 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 589 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 590 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 591 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 592 4); // Num Operands 593 AltMappings.push_back(&SSMapping); 594 595 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 596 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 597 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 598 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 599 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 600 4); // Num Operands 601 AltMappings.push_back(&VVMapping); 602 603 return AltMappings; 604 } 605 case TargetOpcode::G_UADDE: 606 case TargetOpcode::G_USUBE: 607 case TargetOpcode::G_SADDE: 608 case TargetOpcode::G_SSUBE: { 609 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 610 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 611 getOperandsMapping( 612 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 613 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), 614 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 615 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), 617 5); // Num Operands 618 AltMappings.push_back(&SSMapping); 619 620 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 621 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 622 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 623 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 624 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), 626 5); // Num Operands 627 AltMappings.push_back(&VVMapping); 628 return AltMappings; 629 } 630 case AMDGPU::G_BRCOND: { 631 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 632 633 // TODO: Change type to 32 for scalar 634 const InstructionMapping &SMapping = getInstructionMapping( 635 1, 1, getOperandsMapping( 636 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), 637 2); // Num Operands 638 AltMappings.push_back(&SMapping); 639 640 const InstructionMapping &VMapping = getInstructionMapping( 641 1, 1, getOperandsMapping( 642 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), 643 2); // Num Operands 644 AltMappings.push_back(&VMapping); 645 return AltMappings; 646 } 647 case AMDGPU::G_INTRINSIC: 648 case AMDGPU::G_INTRINSIC_CONVERGENT: 649 return getInstrAlternativeMappingsIntrinsic(MI, MRI); 650 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 651 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: 652 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); 653 default: 654 break; 655 } 656 return RegisterBankInfo::getInstrAlternativeMappings(MI); 657 } 658 659 void AMDGPURegisterBankInfo::split64BitValueForMapping( 660 MachineIRBuilder &B, 661 SmallVector<Register, 2> &Regs, 662 LLT HalfTy, 663 Register Reg) const { 664 assert(HalfTy.getSizeInBits() == 32); 665 MachineRegisterInfo *MRI = B.getMRI(); 666 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); 667 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); 668 const RegisterBank *Bank = getRegBank(Reg, 
                                             *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy.
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: the value read for the current lane is
/// compared against the VGPR operand so that all lanes holding the same unique
/// value are handled in a single iteration.
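///
/// A rough sketch of the structure built below (wave64 shown; the block and
/// value names are illustrative only), for a single 32-bit SGPR-required
/// operand held in a VGPR %op:
///
///   entry:
///     %save_exec = S_MOV_B64 $exec
///   loop:
///     %cur_lane = V_READFIRSTLANE_B32 %op
///     %cond = G_ICMP eq %cur_lane, %op
///     %new_exec = S_AND_SAVEEXEC_B64 (llvm.amdgcn.ballot %cond)
///   body:
///     ... original instruction(s), with the operand rewritten to %cur_lane ...
///     $exec = S_XOR_B64_term $exec, %new_exec
///     SI_WATERFALL_LOOP loop
///   restore:
///     $exec = S_MOV_B64_term %save_exec
///   remainder:
///     ... rest of the original block ...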
775 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 776 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, 777 SmallSet<Register, 4> &SGPROperandRegs) const { 778 // Track use registers which have already been expanded with a readfirstlane 779 // sequence. This may have multiple uses if moving a sequence. 780 DenseMap<Register, Register> WaterfalledRegMap; 781 782 MachineBasicBlock &MBB = B.getMBB(); 783 MachineFunction *MF = &B.getMF(); 784 785 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); 786 const unsigned MovExecOpc = 787 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 788 const unsigned MovExecTermOpc = 789 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 790 791 const unsigned XorTermOpc = Subtarget.isWave32() ? 792 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 793 const unsigned AndSaveExecOpc = Subtarget.isWave32() ? 794 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 795 const unsigned ExecReg = Subtarget.isWave32() ? 796 AMDGPU::EXEC_LO : AMDGPU::EXEC; 797 798 #ifndef NDEBUG 799 const int OrigRangeSize = std::distance(Range.begin(), Range.end()); 800 #endif 801 802 MachineRegisterInfo &MRI = *B.getMRI(); 803 Register SaveExecReg = MRI.createVirtualRegister(WaveRC); 804 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); 805 806 // Don't bother using generic instructions/registers for the exec mask. 807 B.buildInstr(TargetOpcode::IMPLICIT_DEF) 808 .addDef(InitSaveExecReg); 809 810 Register PhiExec = MRI.createVirtualRegister(WaveRC); 811 Register NewExec = MRI.createVirtualRegister(WaveRC); 812 813 // To insert the loop we need to split the block. Move everything before this 814 // point to a new block, and insert a new empty block before this instruction. 815 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 816 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); 817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); 819 MachineFunction::iterator MBBI(MBB); 820 ++MBBI; 821 MF->insert(MBBI, LoopBB); 822 MF->insert(MBBI, BodyBB); 823 MF->insert(MBBI, RestoreExecBB); 824 MF->insert(MBBI, RemainderBB); 825 826 LoopBB->addSuccessor(BodyBB); 827 BodyBB->addSuccessor(RestoreExecBB); 828 BodyBB->addSuccessor(LoopBB); 829 830 // Move the rest of the block into a new block. 831 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 832 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); 833 834 MBB.addSuccessor(LoopBB); 835 RestoreExecBB->addSuccessor(RemainderBB); 836 837 B.setInsertPt(*LoopBB, LoopBB->end()); 838 839 B.buildInstr(TargetOpcode::PHI) 840 .addDef(PhiExec) 841 .addReg(InitSaveExecReg) 842 .addMBB(&MBB) 843 .addReg(NewExec) 844 .addMBB(BodyBB); 845 846 const DebugLoc &DL = B.getDL(); 847 848 MachineInstr &FirstInst = *Range.begin(); 849 850 // Move the instruction into the loop body. Note we moved everything after 851 // Range.end() already into a new block, so Range.end() is no longer valid. 852 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); 853 854 // Figure out the iterator range after splicing the instructions. 
855 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); 856 auto NewEnd = BodyBB->end(); 857 858 B.setMBB(*LoopBB); 859 860 LLT S1 = LLT::scalar(1); 861 Register CondReg; 862 863 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); 864 865 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { 866 for (MachineOperand &Op : MI.all_uses()) { 867 Register OldReg = Op.getReg(); 868 if (!SGPROperandRegs.count(OldReg)) 869 continue; 870 871 // See if we already processed this register in another instruction in the 872 // sequence. 873 auto OldVal = WaterfalledRegMap.find(OldReg); 874 if (OldVal != WaterfalledRegMap.end()) { 875 Op.setReg(OldVal->second); 876 continue; 877 } 878 879 Register OpReg = Op.getReg(); 880 LLT OpTy = MRI.getType(OpReg); 881 882 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); 883 if (OpBank != &AMDGPU::VGPRRegBank) { 884 // Insert copy from AGPR to VGPR before the loop. 885 B.setMBB(MBB); 886 OpReg = B.buildCopy(OpTy, OpReg).getReg(0); 887 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); 888 B.setMBB(*LoopBB); 889 } 890 891 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg); 892 893 // Build the comparison(s). 894 unsigned OpSize = OpTy.getSizeInBits(); 895 bool Is64 = OpSize % 64 == 0; 896 unsigned PartSize = Is64 ? 64 : 32; 897 LLT PartTy = LLT::scalar(PartSize); 898 unsigned NumParts = OpSize / PartSize; 899 SmallVector<Register, 8> OpParts; 900 SmallVector<Register, 8> CurrentLaneParts; 901 902 if (NumParts == 1) { 903 OpParts.push_back(OpReg); 904 CurrentLaneParts.push_back(CurrentLaneReg); 905 } else { 906 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg); 907 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg); 908 for (unsigned i = 0; i < NumParts; ++i) { 909 OpParts.push_back(UnmergeOp.getReg(i)); 910 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i)); 911 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); 912 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); 913 } 914 } 915 916 for (unsigned i = 0; i < NumParts; ++i) { 917 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i], 918 OpParts[i]).getReg(0); 919 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); 920 921 if (!CondReg) { 922 CondReg = CmpReg; 923 } else { 924 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0); 925 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); 926 } 927 } 928 929 Op.setReg(CurrentLaneReg); 930 931 // Make sure we don't re-process this register again. 932 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg())); 933 } 934 } 935 936 // The ballot becomes a no-op during instruction selection. 937 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, 938 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}) 939 .addReg(CondReg) 940 .getReg(0); 941 MRI.setRegClass(CondReg, WaveRC); 942 943 // Update EXEC, save the original EXEC value to VCC. 944 B.buildInstr(AndSaveExecOpc) 945 .addDef(NewExec) 946 .addReg(CondReg, RegState::Kill); 947 948 MRI.setSimpleHint(NewExec, CondReg); 949 950 B.setInsertPt(*BodyBB, BodyBB->end()); 951 952 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 953 B.buildInstr(XorTermOpc) 954 .addDef(ExecReg) 955 .addReg(ExecReg) 956 .addReg(NewExec); 957 958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 959 // s_cbranch_scc0? 960 961 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 962 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); 963 964 // Save the EXEC mask before the loop. 
965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) 966 .addReg(ExecReg); 967 968 // Restore the EXEC mask after the loop. 969 B.setMBB(*RestoreExecBB); 970 B.buildInstr(MovExecTermOpc) 971 .addDef(ExecReg) 972 .addReg(SaveExecReg); 973 974 // Set the insert point after the original instruction, so any new 975 // instructions will be in the remainder. 976 B.setInsertPt(*RemainderBB, RemainderBB->begin()); 977 978 return true; 979 } 980 981 // Return any unique registers used by \p MI at \p OpIndices that need to be 982 // handled in a waterfall loop. Returns these registers in \p 983 // SGPROperandRegs. Returns true if there are any operands to handle and a 984 // waterfall loop is necessary. 985 bool AMDGPURegisterBankInfo::collectWaterfallOperands( 986 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, 987 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { 988 for (unsigned Op : OpIndices) { 989 assert(MI.getOperand(Op).isUse()); 990 Register Reg = MI.getOperand(Op).getReg(); 991 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); 992 if (OpBank->getID() != AMDGPU::SGPRRegBankID) 993 SGPROperandRegs.insert(Reg); 994 } 995 996 // No operands need to be replaced, so no need to loop. 997 return !SGPROperandRegs.empty(); 998 } 999 1000 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 1001 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { 1002 // Use a set to avoid extra readfirstlanes in the case where multiple operands 1003 // are the same register. 1004 SmallSet<Register, 4> SGPROperandRegs; 1005 1006 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices)) 1007 return false; 1008 1009 MachineBasicBlock::iterator I = MI.getIterator(); 1010 return executeInWaterfallLoop(B, make_range(I, std::next(I)), 1011 SGPROperandRegs); 1012 } 1013 1014 // Legalize an operand that must be an SGPR by inserting a readfirstlane. 1015 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( 1016 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { 1017 Register Reg = MI.getOperand(OpIdx).getReg(); 1018 MachineRegisterInfo &MRI = *B.getMRI(); 1019 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 1020 if (Bank == &AMDGPU::SGPRRegBank) 1021 return; 1022 1023 Reg = buildReadFirstLane(B, MRI, Reg); 1024 MI.getOperand(OpIdx).setReg(Reg); 1025 } 1026 1027 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the 1028 /// rest will be in the remainder. 
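/// For example, splitting s96 at 64 bits yields (s64, s32), and splitting
/// <3 x s32> at 64 bits yields (<2 x s32>, s32).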
1029 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1030 unsigned TotalSize = Ty.getSizeInBits(); 1031 if (!Ty.isVector()) 1032 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1033 1034 LLT EltTy = Ty.getElementType(); 1035 unsigned EltSize = EltTy.getSizeInBits(); 1036 assert(FirstSize % EltSize == 0); 1037 1038 unsigned FirstPartNumElts = FirstSize / EltSize; 1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1040 1041 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), 1042 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; 1043 } 1044 1045 static LLT widen96To128(LLT Ty) { 1046 if (!Ty.isVector()) 1047 return LLT::scalar(128); 1048 1049 LLT EltTy = Ty.getElementType(); 1050 assert(128 % EltTy.getSizeInBits() == 0); 1051 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); 1052 } 1053 1054 bool AMDGPURegisterBankInfo::applyMappingLoad( 1055 MachineIRBuilder &B, 1056 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1057 MachineInstr &MI) const { 1058 MachineRegisterInfo &MRI = *B.getMRI(); 1059 Register DstReg = MI.getOperand(0).getReg(); 1060 const LLT LoadTy = MRI.getType(DstReg); 1061 unsigned LoadSize = LoadTy.getSizeInBits(); 1062 const unsigned MaxNonSmrdLoadSize = 128; 1063 1064 const RegisterBank *DstBank = 1065 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1066 if (DstBank == &AMDGPU::SGPRRegBank) { 1067 // There are some special cases that we need to look at for 32 bit and 96 1068 // bit SGPR loads otherwise we have nothing to do. 1069 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) 1070 return false; 1071 1072 MachineMemOperand *MMO = *MI.memoperands_begin(); 1073 const unsigned MemSize = 8 * MMO->getSize(); 1074 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to 1075 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit 1076 // scalar loads should have a load size of 32 but memory access size of less 1077 // than 32. 1078 if (LoadSize == 32 && 1079 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) 1080 return false; 1081 1082 if (LoadSize == 32 && 1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) || 1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) && 1085 isScalarLoadLegal(MI) && 1086 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) 1087 return false; 1088 1089 Register PtrReg = MI.getOperand(1).getReg(); 1090 1091 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 1092 1093 if (LoadSize == 32) { 1094 // This is an extending load from a sub-dword size. Widen the memory 1095 // access size to 4 bytes and clear the extra high bits appropriately 1096 const LLT S32 = LLT::scalar(32); 1097 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { 1098 // Must extend the sign bit into higher bits for a G_SEXTLOAD 1099 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1100 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); 1101 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { 1102 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD 1103 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1104 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); 1105 } else 1106 // We do not need to touch the higher bits for regular loads. 1107 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); 1108 } else { 1109 // 96-bit loads are only available for vector loads. 
We need to split this 1110 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 1111 if (MMO->getAlign() < Align(16)) { 1112 LegalizerHelper Helper(B.getMF(), ApplyBank, B); 1113 LLT Part64, Part32; 1114 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1115 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != 1116 LegalizerHelper::Legalized) 1117 return false; 1118 return true; 1119 } else { 1120 LLT WiderTy = widen96To128(LoadTy); 1121 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1122 if (WiderTy.isScalar()) 1123 B.buildTrunc(MI.getOperand(0), WideLoad); 1124 else { 1125 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), 1126 WideLoad); 1127 } 1128 } 1129 } 1130 1131 MI.eraseFromParent(); 1132 return true; 1133 } 1134 1135 // 128-bit loads are supported for all instruction types. 1136 if (LoadSize <= MaxNonSmrdLoadSize) 1137 return false; 1138 1139 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); 1140 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1141 1142 if (SrcRegs.empty()) 1143 SrcRegs.push_back(MI.getOperand(1).getReg()); 1144 1145 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1146 1147 // RegBankSelect only emits scalar types, so we need to reset the pointer 1148 // operand to a pointer type. 1149 Register BasePtrReg = SrcRegs[0]; 1150 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1151 MRI.setType(BasePtrReg, PtrTy); 1152 1153 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1154 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1155 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); 1156 LegalizerHelper Helper(B.getMF(), O, B); 1157 1158 if (LoadTy.isVector()) { 1159 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1160 return false; 1161 } else { 1162 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1163 return false; 1164 } 1165 1166 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1167 return true; 1168 } 1169 1170 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1171 MachineIRBuilder &B, 1172 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1173 MachineInstr &MI) const { 1174 MachineRegisterInfo &MRI = *B.getMRI(); 1175 const MachineFunction &MF = B.getMF(); 1176 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1177 const auto &TFI = *ST.getFrameLowering(); 1178 1179 // Guard in case the stack growth direction ever changes with scratch 1180 // instructions. 1181 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) 1182 return false; 1183 1184 Register Dst = MI.getOperand(0).getReg(); 1185 Register AllocSize = MI.getOperand(1).getReg(); 1186 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1187 1188 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1189 1190 // TODO: Need to emit a wave reduction to get the maximum size. 
1191 if (SizeBank != &AMDGPU::SGPRRegBank) 1192 return false; 1193 1194 LLT PtrTy = MRI.getType(Dst); 1195 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1196 1197 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1198 Register SPReg = Info->getStackPtrOffsetReg(); 1199 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); 1200 1201 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1202 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1203 1204 auto SPCopy = B.buildCopy(PtrTy, SPReg); 1205 if (Alignment > TFI.getStackAlign()) { 1206 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); 1207 B.buildMaskLowPtrBits(Dst, PtrAdd, 1208 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1209 } else { 1210 B.buildPtrAdd(Dst, SPCopy, ScaledSize); 1211 } 1212 1213 MI.eraseFromParent(); 1214 return true; 1215 } 1216 1217 bool AMDGPURegisterBankInfo::applyMappingImage( 1218 MachineIRBuilder &B, MachineInstr &MI, 1219 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1220 int RsrcIdx) const { 1221 const int NumDefs = MI.getNumExplicitDefs(); 1222 1223 // The reported argument index is relative to the IR intrinsic call arguments, 1224 // so we need to shift by the number of defs and the intrinsic ID. 1225 RsrcIdx += NumDefs + 1; 1226 1227 // Insert copies to VGPR arguments. 1228 applyDefaultMapping(OpdMapper); 1229 1230 // Fixup any SGPR arguments. 1231 SmallVector<unsigned, 4> SGPRIndexes; 1232 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1233 if (!MI.getOperand(I).isReg()) 1234 continue; 1235 1236 // If this intrinsic has a sampler, it immediately follows rsrc. 1237 if (I == RsrcIdx || I == RsrcIdx + 1) 1238 SGPRIndexes.push_back(I); 1239 } 1240 1241 executeInWaterfallLoop(B, MI, SGPRIndexes); 1242 return true; 1243 } 1244 1245 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1246 // the three offsets (voffset, soffset and instoffset) 1247 unsigned AMDGPURegisterBankInfo::setBufferOffsets( 1248 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, 1249 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { 1250 const LLT S32 = LLT::scalar(32); 1251 MachineRegisterInfo *MRI = B.getMRI(); 1252 1253 if (std::optional<int64_t> Imm = 1254 getIConstantVRegSExtVal(CombinedOffset, *MRI)) { 1255 uint32_t SOffset, ImmOffset; 1256 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) { 1257 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1258 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1259 InstOffsetVal = ImmOffset; 1260 1261 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1262 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1263 return SOffset + ImmOffset; 1264 } 1265 } 1266 1267 Register Base; 1268 unsigned Offset; 1269 1270 std::tie(Base, Offset) = 1271 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1272 1273 uint32_t SOffset, ImmOffset; 1274 if ((int)Offset > 0 && 1275 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { 1276 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { 1277 VOffsetReg = Base; 1278 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1279 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1280 InstOffsetVal = ImmOffset; 1281 return 0; // XXX - Why is this 0? 1282 } 1283 1284 // If we have SGPR base, we can use it for soffset. 
1285 if (SOffset == 0) { 1286 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1287 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1288 SOffsetReg = Base; 1289 InstOffsetVal = ImmOffset; 1290 return 0; // XXX - Why is this 0? 1291 } 1292 } 1293 1294 // Handle the variable sgpr + vgpr case. 1295 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); 1296 if (Add && (int)Offset >= 0) { 1297 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); 1298 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); 1299 1300 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI); 1301 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI); 1302 1303 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1304 VOffsetReg = Src0; 1305 SOffsetReg = Src1; 1306 return 0; 1307 } 1308 1309 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1310 VOffsetReg = Src1; 1311 SOffsetReg = Src0; 1312 return 0; 1313 } 1314 } 1315 1316 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1317 // have an SGPR offset and a VGPR resource. 1318 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { 1319 VOffsetReg = CombinedOffset; 1320 } else { 1321 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); 1322 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1323 } 1324 1325 SOffsetReg = B.buildConstant(S32, 0).getReg(0); 1326 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1327 return 0; 1328 } 1329 1330 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( 1331 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 1332 MachineInstr &MI = OpdMapper.getMI(); 1333 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1334 1335 const LLT S32 = LLT::scalar(32); 1336 Register Dst = MI.getOperand(0).getReg(); 1337 LLT Ty = MRI.getType(Dst); 1338 1339 const RegisterBank *RSrcBank = 1340 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1341 const RegisterBank *OffsetBank = 1342 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1343 if (RSrcBank == &AMDGPU::SGPRRegBank && 1344 OffsetBank == &AMDGPU::SGPRRegBank) 1345 return true; // Legal mapping 1346 1347 // FIXME: 96-bit case was widened during legalize. We need to narrow it back 1348 // here but don't have an MMO. 1349 1350 unsigned LoadSize = Ty.getSizeInBits(); 1351 int NumLoads = 1; 1352 if (LoadSize == 256 || LoadSize == 512) { 1353 NumLoads = LoadSize / 128; 1354 Ty = Ty.divide(NumLoads); 1355 } 1356 1357 // Use the alignment to ensure that the required offsets will fit into the 1358 // immediate offsets. 1359 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); 1360 1361 MachineFunction &MF = B.getMF(); 1362 1363 Register SOffset; 1364 Register VOffset; 1365 int64_t ImmOffset = 0; 1366 1367 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset, 1368 SOffset, ImmOffset, Alignment); 1369 1370 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we 1371 // can, but we need to track an MMO for that. 1372 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; 1373 const Align MemAlign(4); // FIXME: ABI type alignment? 
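  // Note (illustrative): a 256 or 512-bit result is emitted below as 2 or 4
  // 128-bit buffer loads, with the MMO offset and the immediate offset each
  // stepped by 16 bytes per part.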
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
                                             const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There are no 64-bit vgpr bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement the operation.
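    // A sketch of the expansion below (illustrative only); "Shift" is the
    // source shifted right by the offset:
    //  * Constant width: a 32-bit G_UBFX/G_SBFX of the low (or high) half of
    //    Shift, merged with a zero (or sign-extension) half.
    //  * Variable width: Dst = (Shift << (64 - Width)) >> (64 - Width), using
    //    an arithmetic or logical right shift for the signed or unsigned case.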
1475 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); 1476 1477 const LLT S64 = LLT::scalar(64); 1478 // Shift the source operand so that extracted bits start at bit 0. 1479 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg) 1480 : B.buildLShr(S64, SrcReg, OffsetReg); 1481 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset); 1482 1483 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions 1484 // if the width is a constant. 1485 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) { 1486 // Use the 32-bit bitfield extract instruction if the width is a constant. 1487 // Depending on the width size, use either the low or high 32-bits. 1488 auto Zero = B.buildConstant(S32, 0); 1489 auto WidthImm = ConstWidth->Value.getZExtValue(); 1490 if (WidthImm <= 32) { 1491 // Use bitfield extract on the lower 32-bit source, and then sign-extend 1492 // or clear the upper 32-bits. 1493 auto Extract = 1494 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg) 1495 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg); 1496 auto Extend = 1497 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero; 1498 B.buildMergeLikeInstr(DstReg, {Extract, Extend}); 1499 } else { 1500 // Use bitfield extract on upper 32-bit source, and combine with lower 1501 // 32-bit source. 1502 auto UpperWidth = B.buildConstant(S32, WidthImm - 32); 1503 auto Extract = 1504 Signed 1505 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth) 1506 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth); 1507 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract}); 1508 } 1509 MI.eraseFromParent(); 1510 return true; 1511 } 1512 1513 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit 1514 // operations. 1515 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg); 1516 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift); 1517 if (Signed) 1518 B.buildAShr(S64, SignBit, ExtShift); 1519 else 1520 B.buildLShr(S64, SignBit, ExtShift); 1521 MI.eraseFromParent(); 1522 return true; 1523 } 1524 1525 // The scalar form packs the offset and width in a single operand. 1526 1527 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); 1528 1529 // Ensure the high bits are clear to insert the offset. 1530 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1531 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1532 1533 // Zeros out the low bits, so don't bother clamping the input value. 1534 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1535 1536 // Transformation function, pack the offset and width of a BFE into 1537 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1538 // source, bits [5:0] contain the offset and bits [22:16] the width. 1539 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1540 1541 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1542 // register class constraints. 1543 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1544 (Signed ? 
AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1545 1546 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1547 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1548 llvm_unreachable("failed to constrain BFE"); 1549 1550 MI.eraseFromParent(); 1551 return true; 1552 } 1553 1554 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1555 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 1556 MachineInstr &MI = OpdMapper.getMI(); 1557 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1558 1559 // Insert basic copies. 1560 applyDefaultMapping(OpdMapper); 1561 1562 Register Dst0 = MI.getOperand(0).getReg(); 1563 Register Dst1 = MI.getOperand(1).getReg(); 1564 Register Src0 = MI.getOperand(2).getReg(); 1565 Register Src1 = MI.getOperand(3).getReg(); 1566 Register Src2 = MI.getOperand(4).getReg(); 1567 1568 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1569 return true; 1570 1571 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1572 LLT S1 = LLT::scalar(1); 1573 LLT S32 = LLT::scalar(32); 1574 1575 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1576 bool Accumulate = true; 1577 1578 if (!DstOnValu) { 1579 if (mi_match(Src2, MRI, m_ZeroInt())) 1580 Accumulate = false; 1581 } 1582 1583 // Keep the multiplication on the SALU. 1584 Register DstHi; 1585 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1586 bool MulHiInVgpr = false; 1587 1588 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1589 1590 if (Subtarget.hasSMulHi()) { 1591 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1592 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1593 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1594 } else { 1595 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1596 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1597 1598 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1599 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1600 1601 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1602 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1603 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1604 1605 if (!DstOnValu) { 1606 DstHi = buildReadFirstLane(B, MRI, DstHi); 1607 } else { 1608 MulHiInVgpr = true; 1609 } 1610 } 1611 1612 // Accumulate and produce the "carry-out" bit. 1613 // 1614 // The "carry-out" is defined as bit 64 of the result when computed as a 1615 // big integer. For unsigned multiply-add, this matches the usual definition 1616 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1617 // result, which is determined as: 1618 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1619 LLT CarryType = DstOnValu ? S1 : S32; 1620 const RegisterBank &CarryBank = 1621 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1622 const RegisterBank &DstBank = 1623 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1624 Register Carry; 1625 Register Zero; 1626 1627 if (!IsUnsigned) { 1628 Zero = B.buildConstant(S32, 0).getReg(0); 1629 MRI.setRegBank(Zero, 1630 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1631 1632 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1633 .getReg(0); 1634 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1635 : AMDGPU::SGPRRegBank); 1636 1637 if (DstOnValu && !MulHiInVgpr) { 1638 Carry = B.buildTrunc(S1, Carry).getReg(0); 1639 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1640 } 1641 } 1642 1643 if (Accumulate) { 1644 if (DstOnValu) { 1645 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1646 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1647 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1648 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1649 } 1650 1651 auto Unmerge = B.buildUnmerge(S32, Src2); 1652 Register Src2Lo = Unmerge.getReg(0); 1653 Register Src2Hi = Unmerge.getReg(1); 1654 MRI.setRegBank(Src2Lo, DstBank); 1655 MRI.setRegBank(Src2Hi, DstBank); 1656 1657 if (!IsUnsigned) { 1658 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1659 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1660 1661 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1662 MRI.setRegBank(Carry, CarryBank); 1663 } 1664 1665 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1666 DstLo = AddLo.getReg(0); 1667 Register CarryLo = AddLo.getReg(1); 1668 MRI.setRegBank(DstLo, DstBank); 1669 MRI.setRegBank(CarryLo, CarryBank); 1670 1671 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1672 DstHi = AddHi.getReg(0); 1673 MRI.setRegBank(DstHi, DstBank); 1674 1675 Register CarryHi = AddHi.getReg(1); 1676 MRI.setRegBank(CarryHi, CarryBank); 1677 1678 if (IsUnsigned) { 1679 Carry = CarryHi; 1680 } else { 1681 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1682 MRI.setRegBank(Carry, CarryBank); 1683 } 1684 } else { 1685 if (IsUnsigned) { 1686 Carry = B.buildConstant(CarryType, 0).getReg(0); 1687 MRI.setRegBank(Carry, CarryBank); 1688 } 1689 } 1690 1691 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); 1692 1693 if (DstOnValu) { 1694 B.buildCopy(Dst1, Carry); 1695 } else { 1696 B.buildTrunc(Dst1, Carry); 1697 } 1698 1699 MI.eraseFromParent(); 1700 return true; 1701 } 1702 1703 // Return a suitable opcode for extending the operands of Opc when widening. 1704 static unsigned getExtendOp(unsigned Opc) { 1705 switch (Opc) { 1706 case TargetOpcode::G_ASHR: 1707 case TargetOpcode::G_SMIN: 1708 case TargetOpcode::G_SMAX: 1709 return TargetOpcode::G_SEXT; 1710 case TargetOpcode::G_LSHR: 1711 case TargetOpcode::G_UMIN: 1712 case TargetOpcode::G_UMAX: 1713 return TargetOpcode::G_ZEXT; 1714 default: 1715 return TargetOpcode::G_ANYEXT; 1716 } 1717 } 1718 1719 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1720 // any illegal vector extend or unmerge operations. 1721 static std::pair<Register, Register> 1722 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1723 const LLT S32 = LLT::scalar(32); 1724 auto Bitcast = B.buildBitcast(S32, Src); 1725 1726 if (ExtOpcode == TargetOpcode::G_SEXT) { 1727 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1728 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1729 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1730 } 1731 1732 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1733 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1734 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1735 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1736 } 1737 1738 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1739 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1740 } 1741 1742 // For cases where only a single copy is inserted for matching register banks. 
1743 // Replace the register in the instruction operand 1744 static bool substituteSimpleCopyRegs( 1745 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1746 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1747 if (!SrcReg.empty()) { 1748 assert(SrcReg.size() == 1); 1749 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1750 return true; 1751 } 1752 1753 return false; 1754 } 1755 1756 /// Handle register layout difference for f16 images for some subtargets. 1757 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1758 MachineRegisterInfo &MRI, 1759 Register Reg) const { 1760 if (!Subtarget.hasUnpackedD16VMem()) 1761 return Reg; 1762 1763 const LLT S16 = LLT::scalar(16); 1764 LLT StoreVT = MRI.getType(Reg); 1765 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1766 return Reg; 1767 1768 auto Unmerge = B.buildUnmerge(S16, Reg); 1769 1770 1771 SmallVector<Register, 4> WideRegs; 1772 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1773 WideRegs.push_back(Unmerge.getReg(I)); 1774 1775 const LLT S32 = LLT::scalar(32); 1776 int NumElts = StoreVT.getNumElements(); 1777 1778 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs) 1779 .getReg(0); 1780 } 1781 1782 static std::pair<Register, unsigned> 1783 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1784 int64_t Const; 1785 if (mi_match(Reg, MRI, m_ICst(Const))) 1786 return std::pair(Register(), Const); 1787 1788 Register Base; 1789 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1790 return std::pair(Base, Const); 1791 1792 // TODO: Handle G_OR used for add case 1793 return std::pair(Reg, 0); 1794 } 1795 1796 std::pair<Register, unsigned> 1797 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1798 Register OrigOffset) const { 1799 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget); 1800 Register BaseReg; 1801 unsigned ImmOffset; 1802 const LLT S32 = LLT::scalar(32); 1803 1804 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. 1805 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1806 OrigOffset); 1807 1808 unsigned C1 = 0; 1809 if (ImmOffset != 0) { 1810 // If the immediate value is too big for the immoffset field, put only bits 1811 // that would normally fit in the immoffset field. The remaining value that 1812 // is copied/added for the voffset field is a large power of 2, and it 1813 // stands more chance of being CSEd with the copy/add for another similar 1814 // load/store. 1815 // However, do not do that rounding down if that is a negative 1816 // number, as it appears to be illegal to have a negative offset in the 1817 // vgpr, even if adding the immediate offset makes it positive. 
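// For example, assuming a 12-bit immediate field (MaxImm = 4095), an offset of
// 4100 is split into Overflow = 4096, which is added to the voffset register,
// and an immediate of 4. A negative offset such as -8 would make Overflow
// negative, so the code below instead folds the whole offset back into the
// register and uses an immediate of 0.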
1818 unsigned Overflow = ImmOffset & ~MaxImm; 1819 ImmOffset -= Overflow; 1820 if ((int32_t)Overflow < 0) { 1821 Overflow += ImmOffset; 1822 ImmOffset = 0; 1823 } 1824 1825 C1 = ImmOffset; 1826 if (Overflow != 0) { 1827 if (!BaseReg) 1828 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1829 else { 1830 auto OverflowVal = B.buildConstant(S32, Overflow); 1831 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1832 } 1833 } 1834 } 1835 1836 if (!BaseReg) 1837 BaseReg = B.buildConstant(S32, 0).getReg(0); 1838 1839 return {BaseReg, C1}; 1840 } 1841 1842 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1843 Register SrcReg) const { 1844 MachineRegisterInfo &MRI = *B.getMRI(); 1845 LLT SrcTy = MRI.getType(SrcReg); 1846 if (SrcTy.getSizeInBits() == 32) { 1847 // Use a v_mov_b32 here to make the exec dependency explicit. 1848 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1849 .addDef(DstReg) 1850 .addUse(SrcReg); 1851 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1852 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1853 } 1854 1855 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1856 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1857 1858 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1859 .addDef(TmpReg0) 1860 .addUse(SrcReg, 0, AMDGPU::sub0); 1861 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1862 .addDef(TmpReg1) 1863 .addUse(SrcReg, 0, AMDGPU::sub1); 1864 B.buildInstr(AMDGPU::REG_SEQUENCE) 1865 .addDef(DstReg) 1866 .addUse(TmpReg0) 1867 .addImm(AMDGPU::sub0) 1868 .addUse(TmpReg1) 1869 .addImm(AMDGPU::sub1); 1870 1871 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1872 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1873 } 1874 1875 /// Utility function for pushing dynamic vector indexes with a constant offset 1876 /// into waterfall loops. 1877 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1878 MachineInstr &IdxUseInstr, 1879 unsigned OpIdx, 1880 unsigned ConstOffset) { 1881 MachineRegisterInfo &MRI = *B.getMRI(); 1882 const LLT S32 = LLT::scalar(32); 1883 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1884 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1885 1886 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1887 1888 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1889 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1890 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1891 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1892 } 1893 1894 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1895 /// original 32-bit source value (to be inserted in the low part of the combined 1896 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1897 /// value. 1898 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1899 Register Hi32Reg, Register Lo32Reg, 1900 unsigned ExtOpc, 1901 const RegisterBank &RegBank, 1902 bool IsBooleanSrc = false) { 1903 if (ExtOpc == AMDGPU::G_ZEXT) { 1904 B.buildConstant(Hi32Reg, 0); 1905 } else if (ExtOpc == AMDGPU::G_SEXT) { 1906 if (IsBooleanSrc) { 1907 // If we know the original source was an s1, the high half is the same as 1908 // the low. 1909 B.buildCopy(Hi32Reg, Lo32Reg); 1910 } else { 1911 // Replicate sign bit from 32-bit extended part. 
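// (The arithmetic shift right by 31 below makes the high half all ones for a
// negative low half and all zeros otherwise.)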
1912 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1913 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1914 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1915 } 1916 } else { 1917 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1918 B.buildUndef(Hi32Reg); 1919 } 1920 } 1921 1922 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1923 MachineIRBuilder &B, MachineInstr &MI, 1924 const OperandsMapper &OpdMapper) const { 1925 MachineRegisterInfo &MRI = *B.getMRI(); 1926 1927 Register VecReg = MI.getOperand(1).getReg(); 1928 Register Idx = MI.getOperand(2).getReg(); 1929 1930 const RegisterBank &IdxBank = 1931 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1932 1933 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1934 1935 LLT VecTy = MRI.getType(VecReg); 1936 unsigned EltSize = VecTy.getScalarSizeInBits(); 1937 unsigned NumElem = VecTy.getNumElements(); 1938 1939 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1940 IsDivergentIdx, &Subtarget)) 1941 return false; 1942 1943 LLT S32 = LLT::scalar(32); 1944 1945 const RegisterBank &DstBank = 1946 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1947 const RegisterBank &SrcBank = 1948 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1949 1950 const RegisterBank &CCBank = 1951 (DstBank == AMDGPU::SGPRRegBank && 1952 SrcBank == AMDGPU::SGPRRegBank && 1953 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1954 : AMDGPU::VCCRegBank; 1955 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1956 1957 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1958 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1959 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1960 } 1961 1962 LLT EltTy = VecTy.getScalarType(); 1963 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1964 unsigned NumLanes = DstRegs.size(); 1965 if (!NumLanes) 1966 NumLanes = 1; 1967 else 1968 EltTy = MRI.getType(DstRegs[0]); 1969 1970 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1971 SmallVector<Register, 2> Res(NumLanes); 1972 for (unsigned L = 0; L < NumLanes; ++L) 1973 Res[L] = UnmergeToEltTy.getReg(L); 1974 1975 for (unsigned I = 1; I < NumElem; ++I) { 1976 auto IC = B.buildConstant(S32, I); 1977 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1978 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1979 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1980 1981 for (unsigned L = 0; L < NumLanes; ++L) { 1982 auto S = B.buildSelect(EltTy, Cmp, 1983 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1984 1985 for (unsigned N : { 0, 2, 3 }) 1986 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1987 1988 Res[L] = S->getOperand(0).getReg(); 1989 } 1990 } 1991 1992 for (unsigned L = 0; L < NumLanes; ++L) { 1993 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 1994 B.buildCopy(DstReg, Res[L]); 1995 MRI.setRegBank(DstReg, DstBank); 1996 } 1997 1998 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1999 MI.eraseFromParent(); 2000 2001 return true; 2002 } 2003 2004 // Insert a cross regbank copy for a register if it already has a bank that 2005 // differs from the one we want to set. 
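// Otherwise the requested bank is simply recorded on the register in place.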
2006 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2007 MachineIRBuilder &B, Register &Reg, 2008 const RegisterBank &Bank) { 2009 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2010 if (CurrBank && *CurrBank != Bank) { 2011 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2012 MRI.setRegBank(Copy, Bank); 2013 return Copy; 2014 } 2015 2016 MRI.setRegBank(Reg, Bank); 2017 return Reg; 2018 } 2019 2020 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2021 MachineIRBuilder &B, MachineInstr &MI, 2022 const OperandsMapper &OpdMapper) const { 2023 2024 MachineRegisterInfo &MRI = *B.getMRI(); 2025 Register VecReg = MI.getOperand(1).getReg(); 2026 Register Idx = MI.getOperand(3).getReg(); 2027 2028 const RegisterBank &IdxBank = 2029 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2030 2031 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2032 2033 LLT VecTy = MRI.getType(VecReg); 2034 unsigned EltSize = VecTy.getScalarSizeInBits(); 2035 unsigned NumElem = VecTy.getNumElements(); 2036 2037 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2038 IsDivergentIdx, &Subtarget)) 2039 return false; 2040 2041 LLT S32 = LLT::scalar(32); 2042 2043 const RegisterBank &DstBank = 2044 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2045 const RegisterBank &SrcBank = 2046 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2047 const RegisterBank &InsBank = 2048 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2049 2050 const RegisterBank &CCBank = 2051 (DstBank == AMDGPU::SGPRRegBank && 2052 SrcBank == AMDGPU::SGPRRegBank && 2053 InsBank == AMDGPU::SGPRRegBank && 2054 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2055 : AMDGPU::VCCRegBank; 2056 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2057 2058 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2059 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2060 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2061 } 2062 2063 LLT EltTy = VecTy.getScalarType(); 2064 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2065 unsigned NumLanes = InsRegs.size(); 2066 if (!NumLanes) { 2067 NumLanes = 1; 2068 InsRegs.push_back(MI.getOperand(2).getReg()); 2069 } else { 2070 EltTy = MRI.getType(InsRegs[0]); 2071 } 2072 2073 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2074 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2075 2076 for (unsigned I = 0; I < NumElem; ++I) { 2077 auto IC = B.buildConstant(S32, I); 2078 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2079 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2080 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2081 2082 for (unsigned L = 0; L < NumLanes; ++L) { 2083 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2084 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2085 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2086 2087 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2088 MRI.setRegBank(Select, DstBank); 2089 2090 Ops[I * NumLanes + L] = Select; 2091 } 2092 } 2093 2094 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2095 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2096 B.buildBuildVector(MI.getOperand(0), Ops); 2097 } else { 2098 auto Vec = B.buildBuildVector(MergeTy, Ops); 2099 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2100 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2101 } 2102 2103 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2104 MI.eraseFromParent(); 2105 2106 return true; 2107 } 2108 2109 // Break s_mul_u64 into 32-bit vector operations. 2110 void AMDGPURegisterBankInfo::applyMappingSMULU64( 2111 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2112 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2113 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2114 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2115 2116 // All inputs are SGPRs, nothing special to do. 2117 if (DefRegs.empty()) { 2118 assert(Src0Regs.empty() && Src1Regs.empty()); 2119 applyDefaultMapping(OpdMapper); 2120 return; 2121 } 2122 2123 assert(DefRegs.size() == 2); 2124 assert(Src0Regs.size() == Src1Regs.size() && 2125 (Src0Regs.empty() || Src0Regs.size() == 2)); 2126 2127 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2128 MachineInstr &MI = OpdMapper.getMI(); 2129 Register DstReg = MI.getOperand(0).getReg(); 2130 LLT HalfTy = LLT::scalar(32); 2131 2132 // Depending on where the source registers came from, the generic code may 2133 // have decided to split the inputs already or not. If not, we still need to 2134 // extract the values. 
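// split64BitValueForMapping below produces the missing 32-bit halves when the
// mapping did not already provide them.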
2135 2136 if (Src0Regs.empty()) 2137 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2138 else 2139 setRegsToType(MRI, Src0Regs, HalfTy); 2140 2141 if (Src1Regs.empty()) 2142 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2143 else 2144 setRegsToType(MRI, Src1Regs, HalfTy); 2145 2146 setRegsToType(MRI, DefRegs, HalfTy); 2147 2148 // The multiplication is done as follows: 2149 // 2150 // Op1H Op1L 2151 // * Op0H Op0L 2152 // -------------------- 2153 // Op1H*Op0L Op1L*Op0L 2154 // + Op1H*Op0H Op1L*Op0H 2155 // ----------------------------------------- 2156 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L 2157 // 2158 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit 2159 // value and that would overflow. 2160 // The low 32-bit value is Op1L*Op0L. 2161 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from 2162 // Op1L*Op0L). 2163 2164 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); 2165 2166 Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); 2167 Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); 2168 Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); 2169 Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); 2170 B.buildAdd(DefRegs[1], Add, MulHiLo); 2171 B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); 2172 2173 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2174 MI.eraseFromParent(); 2175 } 2176 2177 void AMDGPURegisterBankInfo::applyMappingImpl( 2178 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2179 MachineInstr &MI = OpdMapper.getMI(); 2180 B.setInstrAndDebugLoc(MI); 2181 unsigned Opc = MI.getOpcode(); 2182 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2183 switch (Opc) { 2184 case AMDGPU::G_CONSTANT: 2185 case AMDGPU::G_IMPLICIT_DEF: { 2186 Register DstReg = MI.getOperand(0).getReg(); 2187 LLT DstTy = MRI.getType(DstReg); 2188 if (DstTy != LLT::scalar(1)) 2189 break; 2190 2191 const RegisterBank *DstBank = 2192 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2193 if (DstBank == &AMDGPU::VCCRegBank) 2194 break; 2195 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2196 if (DefRegs.empty()) 2197 DefRegs.push_back(DstReg); 2198 2199 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2200 2201 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 2202 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 2203 2204 MI.getOperand(0).setReg(NewDstReg); 2205 if (Opc != AMDGPU::G_IMPLICIT_DEF) { 2206 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); 2207 MI.getOperand(1).setCImm( 2208 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); 2209 } 2210 2211 MRI.setRegBank(NewDstReg, *DstBank); 2212 B.buildTrunc(DefRegs[0], NewDstReg); 2213 return; 2214 } 2215 case AMDGPU::G_PHI: { 2216 Register DstReg = MI.getOperand(0).getReg(); 2217 LLT DstTy = MRI.getType(DstReg); 2218 if (DstTy != LLT::scalar(1)) 2219 break; 2220 2221 const LLT S32 = LLT::scalar(32); 2222 const RegisterBank *DstBank = 2223 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2224 if (DstBank == &AMDGPU::VCCRegBank) { 2225 applyDefaultMapping(OpdMapper); 2226 // The standard handling only considers the result register bank for 2227 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2228 // produce an invalid copy. We can only copy with some kind of compare to 2229 // get a vector boolean result. 
Insert a register bank copy that will be 2230 // correctly lowered to a compare. 2231 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2232 Register SrcReg = MI.getOperand(I).getReg(); 2233 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2234 2235 if (SrcBank != &AMDGPU::VCCRegBank) { 2236 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2237 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2238 2239 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2240 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2241 MI.getOperand(I).setReg(Copy.getReg(0)); 2242 } 2243 } 2244 2245 return; 2246 } 2247 2248 // Phi handling is strange and only considers the bank of the destination. 2249 substituteSimpleCopyRegs(OpdMapper, 0); 2250 2251 // Promote SGPR/VGPR booleans to s32 2252 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2253 B.setInsertPt(B.getMBB(), MI); 2254 LegalizerHelper Helper(B.getMF(), ApplyBank, B); 2255 2256 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2257 llvm_unreachable("widen scalar should have succeeded"); 2258 2259 return; 2260 } 2261 case AMDGPU::G_FCMP: 2262 if (!Subtarget.hasSALUFloatInsts()) 2263 break; 2264 LLVM_FALLTHROUGH; 2265 case AMDGPU::G_ICMP: 2266 case AMDGPU::G_UADDO: 2267 case AMDGPU::G_USUBO: 2268 case AMDGPU::G_UADDE: 2269 case AMDGPU::G_SADDE: 2270 case AMDGPU::G_USUBE: 2271 case AMDGPU::G_SSUBE: { 2272 unsigned BoolDstOp = 2273 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; 2274 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2275 2276 const RegisterBank *DstBank = 2277 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2278 if (DstBank != &AMDGPU::SGPRRegBank) 2279 break; 2280 2281 const bool HasCarryIn = MI.getNumOperands() == 5; 2282 2283 // If this is a scalar compare, promote the result to s32, as the selection 2284 // will end up using a copy to a 32-bit vreg. 2285 const LLT S32 = LLT::scalar(32); 2286 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2287 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2288 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2289 2290 if (HasCarryIn) { 2291 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2292 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2293 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2294 MI.getOperand(4).setReg(NewSrcReg); 2295 } 2296 2297 MachineBasicBlock *MBB = MI.getParent(); 2298 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2299 2300 // If we had a constrained VCC result register, a copy was inserted to VCC 2301 // from SGPR. 
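// In that case the trunc below defines the repair SGPR vreg feeding that copy;
// otherwise it defines the original boolean result directly.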
2302 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2303 if (DefRegs.empty()) 2304 DefRegs.push_back(DstReg); 2305 B.buildTrunc(DefRegs[0], NewDstReg); 2306 return; 2307 } 2308 case AMDGPU::G_SELECT: { 2309 Register DstReg = MI.getOperand(0).getReg(); 2310 LLT DstTy = MRI.getType(DstReg); 2311 2312 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2313 if (CondRegs.empty()) 2314 CondRegs.push_back(MI.getOperand(1).getReg()); 2315 else { 2316 assert(CondRegs.size() == 1); 2317 } 2318 2319 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2320 if (CondBank == &AMDGPU::SGPRRegBank) { 2321 const LLT S32 = LLT::scalar(32); 2322 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2323 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2324 2325 MI.getOperand(1).setReg(NewCondReg); 2326 B.buildZExt(NewCondReg, CondRegs[0]); 2327 } 2328 2329 if (DstTy.getSizeInBits() != 64) 2330 break; 2331 2332 LLT HalfTy = getHalfSizedType(DstTy); 2333 2334 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2335 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2336 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2337 2338 // All inputs are SGPRs, nothing special to do. 2339 if (DefRegs.empty()) { 2340 assert(Src1Regs.empty() && Src2Regs.empty()); 2341 break; 2342 } 2343 2344 if (Src1Regs.empty()) 2345 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2346 else { 2347 setRegsToType(MRI, Src1Regs, HalfTy); 2348 } 2349 2350 if (Src2Regs.empty()) 2351 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2352 else 2353 setRegsToType(MRI, Src2Regs, HalfTy); 2354 2355 setRegsToType(MRI, DefRegs, HalfTy); 2356 2357 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2358 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2359 2360 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2361 MI.eraseFromParent(); 2362 return; 2363 } 2364 case AMDGPU::G_BRCOND: { 2365 Register CondReg = MI.getOperand(0).getReg(); 2366 // FIXME: Should use legalizer helper, but should change bool ext type. 2367 const RegisterBank *CondBank = 2368 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2369 2370 if (CondBank == &AMDGPU::SGPRRegBank) { 2371 const LLT S32 = LLT::scalar(32); 2372 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2373 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2374 2375 MI.getOperand(0).setReg(NewCondReg); 2376 B.buildZExt(NewCondReg, CondReg); 2377 return; 2378 } 2379 2380 break; 2381 } 2382 case AMDGPU::G_AND: 2383 case AMDGPU::G_OR: 2384 case AMDGPU::G_XOR: { 2385 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2386 // there is a VGPR input. 
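// For example, a VALU s64 G_XOR is rewritten as two s32 G_XORs, one on the low
// halves and one on the high halves of the unmerged operands.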
2387 Register DstReg = MI.getOperand(0).getReg(); 2388 LLT DstTy = MRI.getType(DstReg); 2389 2390 if (DstTy.getSizeInBits() == 1) { 2391 const RegisterBank *DstBank = 2392 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2393 if (DstBank == &AMDGPU::VCCRegBank) 2394 break; 2395 2396 MachineFunction *MF = MI.getParent()->getParent(); 2397 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2398 LegalizerHelper Helper(*MF, ApplyBank, B); 2399 2400 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2401 LegalizerHelper::Legalized) 2402 llvm_unreachable("widen scalar should have succeeded"); 2403 return; 2404 } 2405 2406 if (DstTy.getSizeInBits() != 64) 2407 break; 2408 2409 LLT HalfTy = getHalfSizedType(DstTy); 2410 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2411 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2412 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2413 2414 // All inputs are SGPRs, nothing special to do. 2415 if (DefRegs.empty()) { 2416 assert(Src0Regs.empty() && Src1Regs.empty()); 2417 break; 2418 } 2419 2420 assert(DefRegs.size() == 2); 2421 assert(Src0Regs.size() == Src1Regs.size() && 2422 (Src0Regs.empty() || Src0Regs.size() == 2)); 2423 2424 // Depending on where the source registers came from, the generic code may 2425 // have decided to split the inputs already or not. If not, we still need to 2426 // extract the values. 2427 2428 if (Src0Regs.empty()) 2429 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2430 else 2431 setRegsToType(MRI, Src0Regs, HalfTy); 2432 2433 if (Src1Regs.empty()) 2434 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2435 else 2436 setRegsToType(MRI, Src1Regs, HalfTy); 2437 2438 setRegsToType(MRI, DefRegs, HalfTy); 2439 2440 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2441 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2442 2443 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2444 MI.eraseFromParent(); 2445 return; 2446 } 2447 case AMDGPU::G_ABS: { 2448 Register SrcReg = MI.getOperand(1).getReg(); 2449 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2450 2451 // There is no VALU abs instruction so we need to replace it with a sub and 2452 // max combination. 2453 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2454 MachineFunction *MF = MI.getParent()->getParent(); 2455 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); 2456 LegalizerHelper Helper(*MF, Apply, B); 2457 2458 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2459 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2460 return; 2461 } 2462 [[fallthrough]]; 2463 } 2464 case AMDGPU::G_ADD: 2465 case AMDGPU::G_SUB: 2466 case AMDGPU::G_MUL: 2467 case AMDGPU::G_SHL: 2468 case AMDGPU::G_LSHR: 2469 case AMDGPU::G_ASHR: 2470 case AMDGPU::G_SMIN: 2471 case AMDGPU::G_SMAX: 2472 case AMDGPU::G_UMIN: 2473 case AMDGPU::G_UMAX: { 2474 Register DstReg = MI.getOperand(0).getReg(); 2475 LLT DstTy = MRI.getType(DstReg); 2476 2477 // Special case for s_mul_u64. There is not a vector equivalent of 2478 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector 2479 // multiplications. 2480 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { 2481 applyMappingSMULU64(B, OpdMapper); 2482 return; 2483 } 2484 2485 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2486 // Packed 16-bit operations need to be scalarized and promoted. 
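// For example, an SGPR <2 x s16> G_ADD is unpacked into two s32 G_ADDs via
// unpackV2S16ToS32, and the results are repacked with G_BUILD_VECTOR_TRUNC.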
2487 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2488 break; 2489 2490 const RegisterBank *DstBank = 2491 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2492 if (DstBank == &AMDGPU::VGPRRegBank) 2493 break; 2494 2495 const LLT S32 = LLT::scalar(32); 2496 MachineBasicBlock *MBB = MI.getParent(); 2497 MachineFunction *MF = MBB->getParent(); 2498 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); 2499 2500 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { 2501 Register WideSrcLo, WideSrcHi; 2502 2503 std::tie(WideSrcLo, WideSrcHi) = 2504 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT); 2505 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); 2506 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); 2507 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2508 MI.eraseFromParent(); 2509 return; 2510 } 2511 2512 if (DstTy.isVector()) { 2513 Register WideSrc0Lo, WideSrc0Hi; 2514 Register WideSrc1Lo, WideSrc1Hi; 2515 2516 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2517 std::tie(WideSrc0Lo, WideSrc0Hi) 2518 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2519 std::tie(WideSrc1Lo, WideSrc1Hi) 2520 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2521 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2522 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2523 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2524 MI.eraseFromParent(); 2525 } else { 2526 LegalizerHelper Helper(*MF, ApplySALU, B); 2527 2528 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2529 llvm_unreachable("widen scalar should have succeeded"); 2530 2531 // FIXME: s16 shift amounts should be legal. 2532 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2533 Opc == AMDGPU::G_ASHR) { 2534 B.setInsertPt(*MBB, MI.getIterator()); 2535 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2536 llvm_unreachable("widen scalar should have succeeded"); 2537 } 2538 } 2539 2540 return; 2541 } 2542 case AMDGPU::G_AMDGPU_S_MUL_I64_I32: 2543 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { 2544 // This is a special case for s_mul_u64. We use 2545 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation 2546 // where the 33 higher bits are sign-extended and 2547 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation 2548 // where the 32 higher bits are zero-extended. In case scalar registers are 2549 // selected, both opcodes are lowered as s_mul_u64. If the vector registers 2550 // are selected, then G_AMDGPU_S_MUL_I64_I32 and 2551 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. 2552 2553 // Insert basic copies. 2554 applyDefaultMapping(OpdMapper); 2555 2556 Register DstReg = MI.getOperand(0).getReg(); 2557 Register SrcReg0 = MI.getOperand(1).getReg(); 2558 Register SrcReg1 = MI.getOperand(2).getReg(); 2559 const LLT S32 = LLT::scalar(32); 2560 const LLT S64 = LLT::scalar(64); 2561 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " 2562 "that handles only 64-bit operands."); 2563 const RegisterBank *DstBank = 2564 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2565 2566 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2567 // with s_mul_u64 operation. 
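// S_MUL_U64 returns the low 64 bits of the product, so the already-extended
// 64-bit operands need no further adjustment on the scalar path.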
2568 if (DstBank == &AMDGPU::SGPRRegBank) { 2569 MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); 2570 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); 2571 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); 2572 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); 2573 return; 2574 } 2575 2576 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2577 // with a vector mad. 2578 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && 2579 "The destination operand should be in vector registers."); 2580 2581 DebugLoc DL = MI.getDebugLoc(); 2582 2583 // Extract the lower subregister from the first operand. 2584 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2585 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); 2586 MRI.setType(Op0L, S32); 2587 B.buildTrunc(Op0L, SrcReg0); 2588 2589 // Extract the lower subregister from the second operand. 2590 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2591 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); 2592 MRI.setType(Op1L, S32); 2593 B.buildTrunc(Op1L, SrcReg1); 2594 2595 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 2596 ? AMDGPU::G_AMDGPU_MAD_U64_U32 2597 : AMDGPU::G_AMDGPU_MAD_I64_I32; 2598 2599 MachineIRBuilder B(MI); 2600 Register Zero64 = B.buildConstant(S64, 0).getReg(0); 2601 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); 2602 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2603 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); 2604 B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); 2605 MI.eraseFromParent(); 2606 return; 2607 } 2608 case AMDGPU::G_SEXT_INREG: { 2609 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2610 if (SrcRegs.empty()) 2611 break; // Nothing to repair 2612 2613 const LLT S32 = LLT::scalar(32); 2614 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); 2615 2616 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2617 // we would need to further expand, and doesn't let us directly set the 2618 // result registers. 2619 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2620 2621 int Amt = MI.getOperand(2).getImm(); 2622 if (Amt <= 32) { 2623 // Downstream users have expectations for the high bit behavior, so freeze 2624 // incoming undefined bits. 2625 if (Amt == 32) { 2626 // The low bits are unchanged. 2627 B.buildFreeze(DstRegs[0], SrcRegs[0]); 2628 } else { 2629 auto Freeze = B.buildFreeze(S32, SrcRegs[0]); 2630 // Extend in the low bits and propagate the sign bit to the high half. 2631 B.buildSExtInReg(DstRegs[0], Freeze, Amt); 2632 } 2633 2634 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2635 } else { 2636 // The low bits are unchanged, and extend in the high bits. 
2637 // No freeze required 2638 B.buildCopy(DstRegs[0], SrcRegs[0]); 2639 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2640 } 2641 2642 Register DstReg = MI.getOperand(0).getReg(); 2643 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2644 MI.eraseFromParent(); 2645 return; 2646 } 2647 case AMDGPU::G_CTPOP: 2648 case AMDGPU::G_BITREVERSE: { 2649 const RegisterBank *DstBank = 2650 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2651 if (DstBank == &AMDGPU::SGPRRegBank) 2652 break; 2653 2654 Register SrcReg = MI.getOperand(1).getReg(); 2655 const LLT S32 = LLT::scalar(32); 2656 LLT Ty = MRI.getType(SrcReg); 2657 if (Ty == S32) 2658 break; 2659 2660 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2661 2662 MachineFunction &MF = B.getMF(); 2663 LegalizerHelper Helper(MF, ApplyVALU, B); 2664 2665 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2666 llvm_unreachable("narrowScalar should have succeeded"); 2667 return; 2668 } 2669 case AMDGPU::G_AMDGPU_FFBH_U32: 2670 case AMDGPU::G_AMDGPU_FFBL_B32: 2671 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2672 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2673 const RegisterBank *DstBank = 2674 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2675 if (DstBank == &AMDGPU::SGPRRegBank) 2676 break; 2677 2678 Register SrcReg = MI.getOperand(1).getReg(); 2679 const LLT S32 = LLT::scalar(32); 2680 LLT Ty = MRI.getType(SrcReg); 2681 if (Ty == S32) 2682 break; 2683 2684 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2685 // which return -1 when the input is zero: 2686 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2687 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2688 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2689 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2690 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2691 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2692 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2693 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2694 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2695 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2696 : Opc; 2697 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2698 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2699 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2700 unsigned AddOpc = 2701 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2702 ? AMDGPU::G_ADD 2703 : AMDGPU::G_UADDSAT; 2704 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2705 Register DstReg = MI.getOperand(0).getReg(); 2706 B.buildUMin(DstReg, X, Y); 2707 MI.eraseFromParent(); 2708 return; 2709 } 2710 case AMDGPU::G_SEXT: 2711 case AMDGPU::G_ZEXT: 2712 case AMDGPU::G_ANYEXT: { 2713 Register SrcReg = MI.getOperand(1).getReg(); 2714 LLT SrcTy = MRI.getType(SrcReg); 2715 const bool Signed = Opc == AMDGPU::G_SEXT; 2716 2717 assert(OpdMapper.getVRegs(1).empty()); 2718 2719 const RegisterBank *SrcBank = 2720 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2721 2722 Register DstReg = MI.getOperand(0).getReg(); 2723 LLT DstTy = MRI.getType(DstReg); 2724 if (DstTy.isScalar() && 2725 SrcBank != &AMDGPU::SGPRRegBank && 2726 SrcBank != &AMDGPU::VCCRegBank && 2727 // FIXME: Should handle any type that round to s64 when irregular 2728 // breakdowns supported. 
2729 DstTy.getSizeInBits() == 64 && 2730 SrcTy.getSizeInBits() <= 32) { 2731 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2732 2733 // Extend to 32-bit, and then extend the low half. 2734 if (Signed) { 2735 // TODO: Should really be buildSExtOrCopy 2736 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2737 } else if (Opc == AMDGPU::G_ZEXT) { 2738 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2739 } else { 2740 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2741 } 2742 2743 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2744 MRI.setRegBank(DstReg, *SrcBank); 2745 MI.eraseFromParent(); 2746 return; 2747 } 2748 2749 if (SrcTy != LLT::scalar(1)) 2750 return; 2751 2752 // It is not legal to have a legalization artifact with a VCC source. Rather 2753 // than introducing a copy, insert the select we would have to select the 2754 // copy to. 2755 if (SrcBank == &AMDGPU::VCCRegBank) { 2756 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2757 2758 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2759 2760 unsigned DstSize = DstTy.getSizeInBits(); 2761 // 64-bit select is SGPR only 2762 const bool UseSel64 = DstSize > 32 && 2763 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2764 2765 // TODO: Should s16 select be legal? 2766 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2767 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 2768 auto False = B.buildConstant(SelType, 0); 2769 2770 MRI.setRegBank(True.getReg(0), *DstBank); 2771 MRI.setRegBank(False.getReg(0), *DstBank); 2772 MRI.setRegBank(DstReg, *DstBank); 2773 2774 if (DstSize > 32) { 2775 B.buildSelect(DefRegs[0], SrcReg, True, False); 2776 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2777 } else if (DstSize < 32) { 2778 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2779 MRI.setRegBank(Sel.getReg(0), *DstBank); 2780 B.buildTrunc(DstReg, Sel); 2781 } else { 2782 B.buildSelect(DstReg, SrcReg, True, False); 2783 } 2784 2785 MI.eraseFromParent(); 2786 return; 2787 } 2788 2789 break; 2790 } 2791 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2792 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2793 2794 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2795 2796 Register DstReg = MI.getOperand(0).getReg(); 2797 Register SrcReg = MI.getOperand(1).getReg(); 2798 2799 const LLT S32 = LLT::scalar(32); 2800 LLT DstTy = MRI.getType(DstReg); 2801 LLT SrcTy = MRI.getType(SrcReg); 2802 2803 if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) 2804 return; 2805 2806 const ValueMapping &DstMapping 2807 = OpdMapper.getInstrMapping().getOperandMapping(0); 2808 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2809 const RegisterBank *SrcBank = 2810 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2811 const RegisterBank *IdxBank = 2812 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2813 2814 Register BaseIdxReg; 2815 unsigned ConstOffset; 2816 std::tie(BaseIdxReg, ConstOffset) = 2817 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2818 2819 // See if the index is an add of a constant which will be foldable by moving 2820 // the base register of the index later if this is going to be executed in a 2821 // waterfall loop. This is essentially to reassociate the add of a constant 2822 // with the readfirstlane. 2823 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2824 ConstOffset > 0 && 2825 ConstOffset < SrcTy.getNumElements(); 2826 2827 // Move the base register. 
We'll re-insert the add later. 2828 if (ShouldMoveIndexIntoLoop) 2829 MI.getOperand(2).setReg(BaseIdxReg); 2830 2831 // If this is a VGPR result only because the index was a VGPR result, the 2832 // actual indexing will be done on the SGPR source vector, which will 2833 // produce a scalar result. We need to copy to the VGPR result inside the 2834 // waterfall loop. 2835 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2836 SrcBank == &AMDGPU::SGPRRegBank; 2837 if (DstRegs.empty()) { 2838 applyDefaultMapping(OpdMapper); 2839 2840 executeInWaterfallLoop(B, MI, {2}); 2841 2842 if (NeedCopyToVGPR) { 2843 // We don't want a phi for this temporary reg. 2844 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2845 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2846 MI.getOperand(0).setReg(TmpReg); 2847 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2848 2849 // Use a v_mov_b32 here to make the exec dependency explicit. 2850 buildVCopy(B, DstReg, TmpReg); 2851 } 2852 2853 // Re-insert the constant offset add inside the waterfall loop. 2854 if (ShouldMoveIndexIntoLoop) 2855 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2856 2857 return; 2858 } 2859 2860 assert(DstTy.getSizeInBits() == 64); 2861 2862 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2863 2864 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2865 auto One = B.buildConstant(S32, 1); 2866 2867 MachineBasicBlock::iterator MII = MI.getIterator(); 2868 2869 // Split the vector index into 32-bit pieces. Prepare to move all of the 2870 // new instructions into a waterfall loop if necessary. 2871 // 2872 // Don't put the bitcast or constant in the loop. 2873 MachineInstrSpan Span(MII, &B.getMBB()); 2874 2875 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2876 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2877 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2878 2879 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2880 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2881 2882 MRI.setRegBank(DstReg, *DstBank); 2883 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2884 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2885 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2886 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2887 2888 SmallSet<Register, 4> OpsToWaterfall; 2889 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2890 MI.eraseFromParent(); 2891 return; 2892 } 2893 2894 // Remove the original instruction to avoid potentially confusing the 2895 // waterfall loop logic. 
2896 B.setInstr(*Span.begin()); 2897 MI.eraseFromParent(); 2898 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2899 OpsToWaterfall); 2900 2901 if (NeedCopyToVGPR) { 2902 MachineBasicBlock *LoopBB = Extract1->getParent(); 2903 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2904 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2905 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2906 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2907 2908 Extract0->getOperand(0).setReg(TmpReg0); 2909 Extract1->getOperand(0).setReg(TmpReg1); 2910 2911 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2912 2913 buildVCopy(B, DstRegs[0], TmpReg0); 2914 buildVCopy(B, DstRegs[1], TmpReg1); 2915 } 2916 2917 if (ShouldMoveIndexIntoLoop) 2918 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2919 2920 return; 2921 } 2922 case AMDGPU::G_INSERT_VECTOR_ELT: { 2923 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2924 2925 Register DstReg = MI.getOperand(0).getReg(); 2926 LLT VecTy = MRI.getType(DstReg); 2927 2928 assert(OpdMapper.getVRegs(0).empty()); 2929 assert(OpdMapper.getVRegs(3).empty()); 2930 2931 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2932 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2933 2934 if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) 2935 return; 2936 2937 const RegisterBank *IdxBank = 2938 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2939 2940 Register SrcReg = MI.getOperand(1).getReg(); 2941 Register InsReg = MI.getOperand(2).getReg(); 2942 LLT InsTy = MRI.getType(InsReg); 2943 (void)InsTy; 2944 2945 Register BaseIdxReg; 2946 unsigned ConstOffset; 2947 std::tie(BaseIdxReg, ConstOffset) = 2948 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2949 2950 // See if the index is an add of a constant which will be foldable by moving 2951 // the base register of the index later if this is going to be executed in a 2952 // waterfall loop. This is essentially to reassociate the add of a constant 2953 // with the readfirstlane. 2954 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2955 ConstOffset > 0 && 2956 ConstOffset < VecTy.getNumElements(); 2957 2958 // Move the base register. We'll re-insert the add later. 2959 if (ShouldMoveIndexIntoLoop) 2960 MI.getOperand(3).setReg(BaseIdxReg); 2961 2962 2963 if (InsRegs.empty()) { 2964 executeInWaterfallLoop(B, MI, {3}); 2965 2966 // Re-insert the constant offset add inside the waterfall loop. 2967 if (ShouldMoveIndexIntoLoop) { 2968 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2969 } 2970 2971 return; 2972 } 2973 2974 assert(InsTy.getSizeInBits() == 64); 2975 2976 const LLT S32 = LLT::scalar(32); 2977 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2978 2979 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2980 auto One = B.buildConstant(S32, 1); 2981 2982 // Split the vector index into 32-bit pieces. Prepare to move all of the 2983 // new instructions into a waterfall loop if necessary. 2984 // 2985 // Don't put the bitcast or constant in the loop. 2986 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2987 2988 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
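// For example, inserting an s64 element at index 3 of a <4 x s64> vector
// becomes two 32-bit insert-elements at indices 6 and 7 of the <8 x s32>
// bitcast.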
2989 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2990 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2991 2992 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2993 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2994 2995 const RegisterBank *DstBank = 2996 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2997 const RegisterBank *SrcBank = 2998 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2999 const RegisterBank *InsSrcBank = 3000 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 3001 3002 MRI.setRegBank(InsReg, *InsSrcBank); 3003 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 3004 MRI.setRegBank(InsLo.getReg(0), *DstBank); 3005 MRI.setRegBank(InsHi.getReg(0), *DstBank); 3006 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 3007 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 3008 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 3009 3010 3011 SmallSet<Register, 4> OpsToWaterfall; 3012 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 3013 B.setInsertPt(B.getMBB(), MI); 3014 B.buildBitcast(DstReg, InsHi); 3015 MI.eraseFromParent(); 3016 return; 3017 } 3018 3019 B.setInstr(*Span.begin()); 3020 MI.eraseFromParent(); 3021 3022 // Figure out the point after the waterfall loop before mangling the control 3023 // flow. 3024 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 3025 OpsToWaterfall); 3026 3027 // The insertion point is now right after the original instruction. 3028 // 3029 // Keep the bitcast to the original vector type out of the loop. Doing this 3030 // saved an extra phi we don't need inside the loop. 3031 B.buildBitcast(DstReg, InsHi); 3032 3033 // Re-insert the constant offset add inside the waterfall loop. 
3034 if (ShouldMoveIndexIntoLoop) 3035 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 3036 3037 return; 3038 } 3039 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3040 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3041 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3042 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3044 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3045 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 3046 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3047 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3048 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3049 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3050 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3051 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3052 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3053 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 3054 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3055 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 3056 applyDefaultMapping(OpdMapper); 3057 executeInWaterfallLoop(B, MI, {1, 4}); 3058 return; 3059 } 3060 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3061 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3062 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3063 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3064 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3065 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3066 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3067 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3068 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3069 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3070 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3071 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 3072 applyDefaultMapping(OpdMapper); 3073 executeInWaterfallLoop(B, MI, {2, 5}); 3074 return; 3075 } 3076 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3077 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: 3078 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3079 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3080 applyDefaultMapping(OpdMapper); 3081 executeInWaterfallLoop(B, MI, {2, 5}); 3082 return; 3083 } 3084 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 3085 applyDefaultMapping(OpdMapper); 3086 executeInWaterfallLoop(B, MI, {3, 6}); 3087 return; 3088 } 3089 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: 3090 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: 3091 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: 3092 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: 3093 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { 3094 applyMappingSBufferLoad(B, OpdMapper); 3095 return; 3096 } 3097 case AMDGPU::G_INTRINSIC: 3098 case AMDGPU::G_INTRINSIC_CONVERGENT: { 3099 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3100 case Intrinsic::amdgcn_readlane: { 3101 substituteSimpleCopyRegs(OpdMapper, 2); 3102 3103 assert(OpdMapper.getVRegs(0).empty()); 3104 assert(OpdMapper.getVRegs(3).empty()); 3105 3106 // Make sure the index is an SGPR. It doesn't make sense to run this in a 3107 // waterfall loop, so assume it's a uniform value. 
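// constrainOpWithReadfirstlane inserts a readfirstlane of the operand if it is
// not already in an SGPR.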
3108 constrainOpWithReadfirstlane(B, MI, 3); // Index 3109 return; 3110 } 3111 case Intrinsic::amdgcn_writelane: { 3112 assert(OpdMapper.getVRegs(0).empty()); 3113 assert(OpdMapper.getVRegs(2).empty()); 3114 assert(OpdMapper.getVRegs(3).empty()); 3115 3116 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 3117 constrainOpWithReadfirstlane(B, MI, 2); // Source value 3118 constrainOpWithReadfirstlane(B, MI, 3); // Index 3119 return; 3120 } 3121 case Intrinsic::amdgcn_interp_p1: 3122 case Intrinsic::amdgcn_interp_p2: 3123 case Intrinsic::amdgcn_interp_mov: 3124 case Intrinsic::amdgcn_interp_p1_f16: 3125 case Intrinsic::amdgcn_interp_p2_f16: 3126 case Intrinsic::amdgcn_lds_param_load: { 3127 applyDefaultMapping(OpdMapper); 3128 3129 // Readlane for m0 value, which is always the last operand. 3130 // FIXME: Should this be a waterfall loop instead? 3131 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index 3132 return; 3133 } 3134 case Intrinsic::amdgcn_interp_inreg_p10: 3135 case Intrinsic::amdgcn_interp_inreg_p2: 3136 case Intrinsic::amdgcn_interp_inreg_p10_f16: 3137 case Intrinsic::amdgcn_interp_inreg_p2_f16: 3138 applyDefaultMapping(OpdMapper); 3139 return; 3140 case Intrinsic::amdgcn_permlane16: 3141 case Intrinsic::amdgcn_permlanex16: { 3142 // Doing a waterfall loop over these wouldn't make any sense. 3143 substituteSimpleCopyRegs(OpdMapper, 2); 3144 substituteSimpleCopyRegs(OpdMapper, 3); 3145 constrainOpWithReadfirstlane(B, MI, 4); 3146 constrainOpWithReadfirstlane(B, MI, 5); 3147 return; 3148 } 3149 case Intrinsic::amdgcn_sbfe: 3150 applyMappingBFE(B, OpdMapper, true); 3151 return; 3152 case Intrinsic::amdgcn_ubfe: 3153 applyMappingBFE(B, OpdMapper, false); 3154 return; 3155 case Intrinsic::amdgcn_inverse_ballot: 3156 case Intrinsic::amdgcn_s_bitreplicate: 3157 case Intrinsic::amdgcn_s_quadmask: 3158 case Intrinsic::amdgcn_s_wqm: 3159 applyDefaultMapping(OpdMapper); 3160 constrainOpWithReadfirstlane(B, MI, 2); // Mask 3161 return; 3162 case Intrinsic::amdgcn_ballot: 3163 // Use default handling and insert copy to vcc source. 3164 break; 3165 } 3166 break; 3167 } 3168 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3169 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 3170 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 3171 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 3172 const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3173 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI)); 3174 assert(RSrcIntrin && RSrcIntrin->IsImage); 3175 // Non-images can have complications from operands that allow both SGPR 3176 // and VGPR. For now it's too complicated to figure out the final opcode 3177 // to derive the register bank from the MCInstrDesc. 3178 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); 3179 return; 3180 } 3181 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 3182 unsigned N = MI.getNumExplicitOperands() - 2; 3183 applyDefaultMapping(OpdMapper); 3184 executeInWaterfallLoop(B, MI, {N}); 3185 return; 3186 } 3187 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 3188 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { 3189 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 3190 switch (IntrID) { 3191 case Intrinsic::amdgcn_ds_ordered_add: 3192 case Intrinsic::amdgcn_ds_ordered_swap: { 3193 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 
3194 assert(OpdMapper.getVRegs(0).empty());
3195 substituteSimpleCopyRegs(OpdMapper, 3);
3196 constrainOpWithReadfirstlane(B, MI, 2); // M0
3197 return;
3198 }
3199 case Intrinsic::amdgcn_ds_gws_init:
3200 case Intrinsic::amdgcn_ds_gws_barrier:
3201 case Intrinsic::amdgcn_ds_gws_sema_br: {
3202 // Only the first lane is executed, so readfirstlane is safe.
3203 substituteSimpleCopyRegs(OpdMapper, 1);
3204 constrainOpWithReadfirstlane(B, MI, 2); // M0
3205 return;
3206 }
3207 case Intrinsic::amdgcn_ds_gws_sema_v:
3208 case Intrinsic::amdgcn_ds_gws_sema_p:
3209 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3210 // Only the first lane is executed, so readfirstlane is safe.
3211 constrainOpWithReadfirstlane(B, MI, 1); // M0
3212 return;
3213 }
3214 case Intrinsic::amdgcn_ds_append:
3215 case Intrinsic::amdgcn_ds_consume: {
3216 constrainOpWithReadfirstlane(B, MI, 2); // M0
3217 return;
3218 }
3219 case Intrinsic::amdgcn_s_sendmsg:
3220 case Intrinsic::amdgcn_s_sendmsghalt: {
3221 // FIXME: Should this use a waterfall loop?
3222 constrainOpWithReadfirstlane(B, MI, 2); // M0
3223 return;
3224 }
3225 case Intrinsic::amdgcn_s_setreg: {
3226 constrainOpWithReadfirstlane(B, MI, 2);
3227 return;
3228 }
3229 case Intrinsic::amdgcn_s_ttracedata:
3230 constrainOpWithReadfirstlane(B, MI, 1); // M0
3231 return;
3232 case Intrinsic::amdgcn_raw_buffer_load_lds:
3233 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3234 applyDefaultMapping(OpdMapper);
3235 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3236 constrainOpWithReadfirstlane(B, MI, 2); // M0
3237 constrainOpWithReadfirstlane(B, MI, 5); // soffset
3238 return;
3239 }
3240 case Intrinsic::amdgcn_struct_buffer_load_lds:
3241 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3242 applyDefaultMapping(OpdMapper);
3243 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3244 constrainOpWithReadfirstlane(B, MI, 2); // M0
3245 constrainOpWithReadfirstlane(B, MI, 6); // soffset
3246 return;
3247 }
3248 case Intrinsic::amdgcn_global_load_lds: {
3249 applyDefaultMapping(OpdMapper);
3250 constrainOpWithReadfirstlane(B, MI, 2);
3251 return;
3252 }
3253 case Intrinsic::amdgcn_lds_direct_load: {
3254 applyDefaultMapping(OpdMapper);
3255 // Readlane for m0 value, which is always the last operand.
3256 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3257 return;
3258 }
3259 case Intrinsic::amdgcn_exp_row:
3260 applyDefaultMapping(OpdMapper);
3261 constrainOpWithReadfirstlane(B, MI, 8); // M0
3262 return;
3263 case Intrinsic::amdgcn_s_sleep_var:
3264 assert(OpdMapper.getVRegs(1).empty());
3265 constrainOpWithReadfirstlane(B, MI, 1);
3266 return;
3267 case Intrinsic::amdgcn_s_barrier_signal_var:
3268 case Intrinsic::amdgcn_s_barrier_join:
3269 case Intrinsic::amdgcn_s_wakeup_barrier:
3270 constrainOpWithReadfirstlane(B, MI, 1);
3271 return;
3272 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3273 constrainOpWithReadfirstlane(B, MI, 2);
3274 return;
3275 case Intrinsic::amdgcn_s_barrier_init:
3276 constrainOpWithReadfirstlane(B, MI, 1);
3277 constrainOpWithReadfirstlane(B, MI, 2);
3278 return;
3279 case Intrinsic::amdgcn_s_get_barrier_state: {
3280 constrainOpWithReadfirstlane(B, MI, 2);
3281 return;
3282 }
3283 default: {
3284 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3285 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3286 // Non-images can have complications from operands that allow both SGPR
3287 // and VGPR. For now it's too complicated to figure out the final opcode
3288 // to derive the register bank from the MCInstrDesc.
3289 if (RSrcIntrin->IsImage) { 3290 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); 3291 return; 3292 } 3293 } 3294 3295 break; 3296 } 3297 } 3298 break; 3299 } 3300 case AMDGPU::G_SI_CALL: { 3301 // Use a set to avoid extra readfirstlanes in the case where multiple 3302 // operands are the same register. 3303 SmallSet<Register, 4> SGPROperandRegs; 3304 3305 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3306 break; 3307 3308 // Move all copies to physical SGPRs that are used by the call instruction 3309 // into the loop block. Start searching for these copies until the 3310 // ADJCALLSTACKUP. 3311 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3312 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3313 3314 // Move all non-copies before the copies, so that a complete range can be 3315 // moved into the waterfall loop. 3316 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3317 // Count of NonCopyInstrs found until the current LastCopy. 3318 unsigned NonCopyInstrsLen = 0; 3319 MachineBasicBlock::iterator Start(&MI); 3320 MachineBasicBlock::iterator LastCopy = Start; 3321 MachineBasicBlock *MBB = MI.getParent(); 3322 const SIMachineFunctionInfo *Info = 3323 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3324 while (Start->getOpcode() != FrameSetupOpcode) { 3325 --Start; 3326 bool IsCopy = false; 3327 if (Start->getOpcode() == AMDGPU::COPY) { 3328 auto &Dst = Start->getOperand(0); 3329 if (Dst.isReg()) { 3330 Register Reg = Dst.getReg(); 3331 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3332 IsCopy = true; 3333 } else { 3334 // Also move the copy from the scratch rsrc descriptor into the loop 3335 // to allow it to be optimized away. 3336 auto &Src = Start->getOperand(1); 3337 if (Src.isReg()) { 3338 Reg = Src.getReg(); 3339 IsCopy = Info->getScratchRSrcReg() == Reg; 3340 } 3341 } 3342 } 3343 } 3344 3345 if (IsCopy) { 3346 LastCopy = Start; 3347 NonCopyInstrsLen = NonCopyInstrs.size(); 3348 } else { 3349 NonCopyInstrs.push_back(&*Start); 3350 } 3351 } 3352 NonCopyInstrs.resize(NonCopyInstrsLen); 3353 3354 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3355 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3356 } 3357 Start = LastCopy; 3358 3359 // Do the same for copies after the loop 3360 NonCopyInstrs.clear(); 3361 NonCopyInstrsLen = 0; 3362 MachineBasicBlock::iterator End(&MI); 3363 LastCopy = End; 3364 while (End->getOpcode() != FrameDestroyOpcode) { 3365 ++End; 3366 bool IsCopy = false; 3367 if (End->getOpcode() == AMDGPU::COPY) { 3368 auto &Src = End->getOperand(1); 3369 if (Src.isReg()) { 3370 Register Reg = Src.getReg(); 3371 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3372 } 3373 } 3374 3375 if (IsCopy) { 3376 LastCopy = End; 3377 NonCopyInstrsLen = NonCopyInstrs.size(); 3378 } else { 3379 NonCopyInstrs.push_back(&*End); 3380 } 3381 } 3382 NonCopyInstrs.resize(NonCopyInstrsLen); 3383 3384 End = LastCopy; 3385 ++LastCopy; 3386 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3387 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3388 } 3389 3390 ++End; 3391 B.setInsertPt(B.getMBB(), Start); 3392 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); 3393 break; 3394 } 3395 case AMDGPU::G_LOAD: 3396 case AMDGPU::G_ZEXTLOAD: 3397 case AMDGPU::G_SEXTLOAD: { 3398 if (applyMappingLoad(B, OpdMapper, MI)) 3399 return; 3400 break; 3401 } 3402 case AMDGPU::G_DYN_STACKALLOC: 3403 applyMappingDynStackAlloc(B, OpdMapper, MI); 3404 return; 3405 case AMDGPU::G_STACKRESTORE: { 3406 applyDefaultMapping(OpdMapper); 3407 
constrainOpWithReadfirstlane(B, MI, 0); 3408 return; 3409 } 3410 case AMDGPU::G_SBFX: 3411 applyMappingBFE(B, OpdMapper, /*Signed*/ true); 3412 return; 3413 case AMDGPU::G_UBFX: 3414 applyMappingBFE(B, OpdMapper, /*Signed*/ false); 3415 return; 3416 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3417 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3418 applyMappingMAD_64_32(B, OpdMapper); 3419 return; 3420 case AMDGPU::G_PREFETCH: { 3421 if (!Subtarget.hasPrefetch()) { 3422 MI.eraseFromParent(); 3423 return; 3424 } 3425 Register PtrReg = MI.getOperand(0).getReg(); 3426 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); 3427 if (PtrBank == AMDGPU::VGPRRegBankID) { 3428 MI.eraseFromParent(); 3429 return; 3430 } 3431 unsigned AS = MRI.getType(PtrReg).getAddressSpace(); 3432 if (!AMDGPU::isFlatGlobalAddrSpace(AS) && 3433 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3434 MI.eraseFromParent(); 3435 return; 3436 } 3437 applyDefaultMapping(OpdMapper); 3438 return; 3439 } 3440 default: 3441 break; 3442 } 3443 3444 return applyDefaultMapping(OpdMapper); 3445 } 3446 3447 // vgpr, sgpr -> vgpr 3448 // vgpr, agpr -> vgpr 3449 // agpr, agpr -> agpr 3450 // agpr, sgpr -> vgpr 3451 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3452 if (RB0 == AMDGPU::InvalidRegBankID) 3453 return RB1; 3454 if (RB1 == AMDGPU::InvalidRegBankID) 3455 return RB0; 3456 3457 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3458 return AMDGPU::SGPRRegBankID; 3459 3460 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3461 return AMDGPU::AGPRRegBankID; 3462 3463 return AMDGPU::VGPRRegBankID; 3464 } 3465 3466 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3467 if (RB0 == AMDGPU::InvalidRegBankID) 3468 return RB1; 3469 if (RB1 == AMDGPU::InvalidRegBankID) 3470 return RB0; 3471 3472 // vcc, vcc -> vcc 3473 // vcc, sgpr -> vcc 3474 // vcc, vgpr -> vcc 3475 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3476 return AMDGPU::VCCRegBankID; 3477 3478 // vcc, vgpr -> vgpr 3479 return regBankUnion(RB0, RB1); 3480 } 3481 3482 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3483 const MachineInstr &MI) const { 3484 unsigned RegBank = AMDGPU::InvalidRegBankID; 3485 3486 for (const MachineOperand &MO : MI.operands()) { 3487 if (!MO.isReg()) 3488 continue; 3489 Register Reg = MO.getReg(); 3490 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3491 RegBank = regBankUnion(RegBank, Bank->getID()); 3492 if (RegBank == AMDGPU::VGPRRegBankID) 3493 break; 3494 } 3495 } 3496 3497 return RegBank; 3498 } 3499 3500 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3501 const MachineFunction &MF = *MI.getParent()->getParent(); 3502 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3503 for (const MachineOperand &MO : MI.operands()) { 3504 if (!MO.isReg()) 3505 continue; 3506 Register Reg = MO.getReg(); 3507 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3508 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3509 return false; 3510 } 3511 } 3512 return true; 3513 } 3514 3515 const RegisterBankInfo::InstructionMapping & 3516 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3517 const MachineFunction &MF = *MI.getParent()->getParent(); 3518 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3519 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3520 3521 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3522 const MachineOperand &SrcOp = MI.getOperand(i); 3523 
if (!SrcOp.isReg())
3524 continue;
3525
3526 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3527 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3528 }
3529 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3530 MI.getNumOperands());
3531 }
3532
3533 const RegisterBankInfo::InstructionMapping &
3534 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3535 const MachineFunction &MF = *MI.getParent()->getParent();
3536 const MachineRegisterInfo &MRI = MF.getRegInfo();
3537 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3538
3539 // Even though we technically could use SGPRs, this would require knowledge of
3540 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3541 //
3542 // TODO: Unary ops are trivially OK, so accept SGPRs?
3543 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3544 const MachineOperand &Src = MI.getOperand(i);
3545 if (!Src.isReg())
3546 continue;
3547
3548 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3549 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3550 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3551 }
3552
3553 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3554 MI.getNumOperands());
3555 }
3556
3557 const RegisterBankInfo::InstructionMapping &
3558 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3559 const MachineFunction &MF = *MI.getParent()->getParent();
3560 const MachineRegisterInfo &MRI = MF.getRegInfo();
3561 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3562
3563 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3564 const MachineOperand &Op = MI.getOperand(I);
3565 if (!Op.isReg())
3566 continue;
3567
3568 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3569 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3570 }
3571
3572 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3573 MI.getNumOperands());
3574 }
3575
3576 const RegisterBankInfo::InstructionMapping &
3577 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3578 const MachineInstr &MI,
3579 int RsrcIdx) const {
3580 // The reported argument index is relative to the IR intrinsic call arguments,
3581 // so we need to shift by the number of defs and the intrinsic ID.
3582 RsrcIdx += MI.getNumExplicitDefs() + 1;
3583
3584 const int NumOps = MI.getNumOperands();
3585 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3586
3587 // TODO: Should packed/unpacked D16 difference be reported here as part of
3588 // the value mapping?
3589 for (int I = 0; I != NumOps; ++I) {
3590 if (!MI.getOperand(I).isReg())
3591 continue;
3592
3593 Register OpReg = MI.getOperand(I).getReg();
3594 // We replace some dead address operands with $noreg.
3595 if (!OpReg)
3596 continue;
3597
3598 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3599
3600 // FIXME: Probably need a new intrinsic register bank searchable table to
3601 // handle arbitrary intrinsics easily.
3602 //
3603 // If this has a sampler, it immediately follows rsrc.
3604 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3605
3606 if (MustBeSGPR) {
3607 // This must be an SGPR, so we must report whatever it is as legal.
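// getRegBankID falls back to SGPR when no bank has been assigned yet; if the
// rsrc or sampler does end up in a VGPR, applyMappingImage legalizes it later
// with a waterfall loop instead of rejecting the mapping here.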
3608 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3609 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3610 } else { 3611 // Some operands must be VGPR, and these are easy to copy to. 3612 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3613 } 3614 } 3615 3616 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3617 } 3618 3619 /// Return the mapping for a pointer argument. 3620 const RegisterBankInfo::ValueMapping * 3621 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3622 Register PtrReg) const { 3623 LLT PtrTy = MRI.getType(PtrReg); 3624 unsigned Size = PtrTy.getSizeInBits(); 3625 if (Subtarget.useFlatForGlobal() || 3626 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3627 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3628 3629 // If we're using MUBUF instructions for global memory, an SGPR base register 3630 // is possible. Otherwise this needs to be a VGPR. 3631 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3632 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3633 } 3634 3635 const RegisterBankInfo::InstructionMapping & 3636 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3637 3638 const MachineFunction &MF = *MI.getParent()->getParent(); 3639 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3640 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3641 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3642 Register PtrReg = MI.getOperand(1).getReg(); 3643 LLT PtrTy = MRI.getType(PtrReg); 3644 unsigned AS = PtrTy.getAddressSpace(); 3645 unsigned PtrSize = PtrTy.getSizeInBits(); 3646 3647 const ValueMapping *ValMapping; 3648 const ValueMapping *PtrMapping; 3649 3650 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3651 3652 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3653 if (isScalarLoadLegal(MI)) { 3654 // We have a uniform instruction so we want to use an SMRD load 3655 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3656 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3657 } else { 3658 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3659 3660 // If we're using MUBUF instructions for global memory, an SGPR base 3661 // register is possible. Otherwise this needs to be a VGPR. 3662 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 3663 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3664 3665 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3666 } 3667 } else { 3668 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3669 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3670 } 3671 3672 OpdsMapping[0] = ValMapping; 3673 OpdsMapping[1] = PtrMapping; 3674 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3675 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3676 return Mapping; 3677 3678 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3679 // handle that during instruction selection? 3680 } 3681 3682 unsigned 3683 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3684 const MachineRegisterInfo &MRI, 3685 unsigned Default) const { 3686 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3687 return Bank ? 
Bank->getID() : Default;
3688 }
3689
3690 const RegisterBankInfo::ValueMapping *
3691 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3692 const MachineRegisterInfo &MRI,
3693 const TargetRegisterInfo &TRI) const {
3694 // Lie and claim anything is legal, even though this needs to be an SGPR;
3695 // applyMapping will have to deal with it as a waterfall loop.
3696 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3697 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3698 return AMDGPU::getValueMapping(Bank, Size);
3699 }
3700
3701 const RegisterBankInfo::ValueMapping *
3702 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3703 const MachineRegisterInfo &MRI,
3704 const TargetRegisterInfo &TRI) const {
3705 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3706 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3707 }
3708
3709 const RegisterBankInfo::ValueMapping *
3710 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3711 const MachineRegisterInfo &MRI,
3712 const TargetRegisterInfo &TRI) const {
3713 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3714 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3715 }
3716
3717 ///
3718 /// This function must return a legal mapping, because
3719 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3720 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a VGPR-to-SGPR
3721 /// copy to be generated is illegal.
3722 ///
3723 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3724 // legal. These will be dealt with in applyMappingImpl.
3725 //
3726 const RegisterBankInfo::InstructionMapping &
3727 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3728 const MachineFunction &MF = *MI.getParent()->getParent();
3729 const MachineRegisterInfo &MRI = MF.getRegInfo();
3730
3731 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3732 // The default logic bothers to analyze impossible alternative mappings. We
3733 // want the most straightforward mapping, so just directly handle this.
3734 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3735 *TRI);
3736 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3737 *TRI);
3738 assert(SrcBank && "src bank should have been assigned already");
3739 if (!DstBank)
3740 DstBank = SrcBank;
3741
3742 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3743 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3744 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3745 return getInvalidInstructionMapping();
3746
3747 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3748 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3749 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3750 OpdsMapping[0] = &ValMap;
3751 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3752 OpdsMapping[1] = &ValMap;
3753
3754 return getInstructionMapping(
3755 1, /*Cost*/ 1,
3756 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3757 }
3758
3759 if (MI.isRegSequence()) {
3760 // If any input is a VGPR, the result must be a VGPR. The default handling
3761 // assumes any copy between banks is legal.
3762 unsigned BankID = AMDGPU::SGPRRegBankID;
3763
3764 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3765 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3766 // It doesn't make sense to use vcc or scc banks here, so just ignore
3767 // them.
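// Any input that is not known to be an SGPR (including inputs with no bank
// assigned yet) forces the whole REG_SEQUENCE onto the VGPR bank.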
3768 if (OpBank != AMDGPU::SGPRRegBankID) { 3769 BankID = AMDGPU::VGPRRegBankID; 3770 break; 3771 } 3772 } 3773 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3774 3775 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3776 return getInstructionMapping( 3777 1, /*Cost*/ 1, 3778 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3779 } 3780 3781 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3782 // properly. 3783 // 3784 // TODO: There are additional exec masking dependencies to analyze. 3785 if (auto *PHI = dyn_cast<GPhi>(&MI)) { 3786 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3787 Register DstReg = PHI->getReg(0); 3788 3789 // Sometimes the result may have already been assigned a bank. 3790 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3791 ResultBank = DstBank->getID(); 3792 3793 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { 3794 Register Reg = PHI->getIncomingValue(I); 3795 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3796 3797 // FIXME: Assuming VGPR for any undetermined inputs. 3798 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3799 ResultBank = AMDGPU::VGPRRegBankID; 3800 break; 3801 } 3802 3803 // FIXME: Need to promote SGPR case to s32 3804 unsigned OpBank = Bank->getID(); 3805 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3806 } 3807 3808 assert(ResultBank != AMDGPU::InvalidRegBankID); 3809 3810 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3811 3812 const ValueMapping &ValMap = 3813 getValueMapping(0, Size, getRegBank(ResultBank)); 3814 return getInstructionMapping( 3815 1, /*Cost*/ 1, 3816 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3817 } 3818 3819 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3820 if (Mapping.isValid()) 3821 return Mapping; 3822 3823 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3824 3825 switch (MI.getOpcode()) { 3826 default: 3827 return getInvalidInstructionMapping(); 3828 3829 case AMDGPU::G_AND: 3830 case AMDGPU::G_OR: 3831 case AMDGPU::G_XOR: 3832 case AMDGPU::G_MUL: { 3833 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3834 if (Size == 1) { 3835 const RegisterBank *DstBank 3836 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3837 3838 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3839 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3840 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3841 if (DstBank) { 3842 TargetBankID = DstBank->getID(); 3843 if (DstBank == &AMDGPU::VCCRegBank) { 3844 TargetBankID = AMDGPU::VCCRegBankID; 3845 BankLHS = AMDGPU::VCCRegBankID; 3846 BankRHS = AMDGPU::VCCRegBankID; 3847 } else { 3848 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3849 AMDGPU::SGPRRegBankID); 3850 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3851 AMDGPU::SGPRRegBankID); 3852 } 3853 } else { 3854 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3855 AMDGPU::VCCRegBankID); 3856 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3857 AMDGPU::VCCRegBankID); 3858 3859 // Both inputs should be true booleans to produce a boolean result. 
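// Resolve the result bank from the inputs: any VGPR input makes this a plain
// VALU op, any VCC input turns the whole operation into a lane-mask (VCC) op,
// and only an all-SGPR combination stays on the SALU.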
3860 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3861 TargetBankID = AMDGPU::VGPRRegBankID; 3862 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3863 TargetBankID = AMDGPU::VCCRegBankID; 3864 BankLHS = AMDGPU::VCCRegBankID; 3865 BankRHS = AMDGPU::VCCRegBankID; 3866 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3867 TargetBankID = AMDGPU::SGPRRegBankID; 3868 } 3869 } 3870 3871 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3872 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3873 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3874 break; 3875 } 3876 3877 if (Size == 64) { 3878 3879 if (isSALUMapping(MI)) { 3880 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3881 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3882 } else { 3883 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3884 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3885 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3886 3887 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3888 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3889 } 3890 3891 break; 3892 } 3893 3894 [[fallthrough]]; 3895 } 3896 case AMDGPU::G_PTR_ADD: 3897 case AMDGPU::G_PTRMASK: 3898 case AMDGPU::G_ADD: 3899 case AMDGPU::G_SUB: 3900 case AMDGPU::G_SHL: 3901 case AMDGPU::G_LSHR: 3902 case AMDGPU::G_ASHR: 3903 case AMDGPU::G_UADDO: 3904 case AMDGPU::G_USUBO: 3905 case AMDGPU::G_UADDE: 3906 case AMDGPU::G_SADDE: 3907 case AMDGPU::G_USUBE: 3908 case AMDGPU::G_SSUBE: 3909 case AMDGPU::G_SMIN: 3910 case AMDGPU::G_SMAX: 3911 case AMDGPU::G_UMIN: 3912 case AMDGPU::G_UMAX: 3913 case AMDGPU::G_ABS: 3914 case AMDGPU::G_SHUFFLE_VECTOR: 3915 case AMDGPU::G_SBFX: 3916 case AMDGPU::G_UBFX: 3917 case AMDGPU::G_AMDGPU_S_MUL_I64_I32: 3918 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: 3919 if (isSALUMapping(MI)) 3920 return getDefaultMappingSOP(MI); 3921 return getDefaultMappingVOP(MI); 3922 case AMDGPU::G_FADD: 3923 case AMDGPU::G_FSUB: 3924 case AMDGPU::G_FMUL: 3925 case AMDGPU::G_FMA: 3926 case AMDGPU::G_FFLOOR: 3927 case AMDGPU::G_FCEIL: 3928 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 3929 case AMDGPU::G_FMINNUM: 3930 case AMDGPU::G_FMAXNUM: 3931 case AMDGPU::G_FMINIMUM: 3932 case AMDGPU::G_FMAXIMUM: 3933 case AMDGPU::G_INTRINSIC_TRUNC: 3934 case AMDGPU::G_STRICT_FADD: 3935 case AMDGPU::G_STRICT_FSUB: 3936 case AMDGPU::G_STRICT_FMUL: 3937 case AMDGPU::G_STRICT_FMA: { 3938 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3939 unsigned Size = Ty.getSizeInBits(); 3940 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && 3941 (Size == 32 || Size == 16) && isSALUMapping(MI)) 3942 return getDefaultMappingSOP(MI); 3943 return getDefaultMappingVOP(MI); 3944 } 3945 case AMDGPU::G_FPTOSI: 3946 case AMDGPU::G_FPTOUI: 3947 case AMDGPU::G_SITOFP: 3948 case AMDGPU::G_UITOFP: { 3949 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3950 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3951 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && 3952 isSALUMapping(MI)) 3953 return getDefaultMappingSOP(MI); 3954 return getDefaultMappingVOP(MI); 3955 } 3956 case AMDGPU::G_FPTRUNC: 3957 case AMDGPU::G_FPEXT: { 3958 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3959 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3960 if 
(Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && 3961 isSALUMapping(MI)) 3962 return getDefaultMappingSOP(MI); 3963 return getDefaultMappingVOP(MI); 3964 } 3965 case AMDGPU::G_FSQRT: 3966 case AMDGPU::G_FEXP2: 3967 case AMDGPU::G_FLOG2: { 3968 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3969 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 3970 isSALUMapping(MI)) 3971 return getDefaultMappingSOP(MI); 3972 return getDefaultMappingVOP(MI); 3973 } 3974 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3975 case AMDGPU::G_SSUBSAT: 3976 case AMDGPU::G_UADDSAT: 3977 case AMDGPU::G_USUBSAT: 3978 case AMDGPU::G_FMAD: 3979 case AMDGPU::G_FLDEXP: 3980 case AMDGPU::G_FMINNUM_IEEE: 3981 case AMDGPU::G_FMAXNUM_IEEE: 3982 case AMDGPU::G_FCANONICALIZE: 3983 case AMDGPU::G_STRICT_FLDEXP: 3984 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 3985 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3986 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3987 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3988 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3989 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3990 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3991 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3992 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3993 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3994 case AMDGPU::G_AMDGPU_SMED3: 3995 case AMDGPU::G_AMDGPU_FMED3: 3996 return getDefaultMappingVOP(MI); 3997 case AMDGPU::G_UMULH: 3998 case AMDGPU::G_SMULH: { 3999 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 4000 return getDefaultMappingSOP(MI); 4001 return getDefaultMappingVOP(MI); 4002 } 4003 case AMDGPU::G_AMDGPU_MAD_U64_U32: 4004 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 4005 // Three possible mappings: 4006 // 4007 // - Default SOP 4008 // - Default VOP 4009 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 4010 // 4011 // This allows instruction selection to keep the multiplication part of the 4012 // instruction on the SALU. 4013 bool AllSalu = true; 4014 bool MulSalu = true; 4015 for (unsigned i = 0; i < 5; ++i) { 4016 Register Reg = MI.getOperand(i).getReg(); 4017 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 4018 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 4019 AllSalu = false; 4020 if (i == 2 || i == 3) { 4021 MulSalu = false; 4022 break; 4023 } 4024 } 4025 } 4026 } 4027 4028 if (AllSalu) 4029 return getDefaultMappingSOP(MI); 4030 4031 // If the multiply-add is full-rate in VALU, use that even if the 4032 // multiplication part is scalar. Accumulating separately on the VALU would 4033 // take two instructions. 4034 if (!MulSalu || Subtarget.hasFullRate64Ops()) 4035 return getDefaultMappingVOP(MI); 4036 4037 // Keep the multiplication on the SALU, then accumulate on the VALU. 
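// Operand layout is (dst s64, carry-out s1, src0 s32, src1 s32, src2 s64).
// src0/src1 are kept on the SGPR bank so the multiply part can stay scalar,
// while the 64-bit destination, carry-out and accumulator are mapped for the
// VALU.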
4038 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4039 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4040 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4041 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4042 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4043 break; 4044 } 4045 case AMDGPU::G_IMPLICIT_DEF: { 4046 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4047 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4048 break; 4049 } 4050 case AMDGPU::G_FCONSTANT: 4051 case AMDGPU::G_CONSTANT: 4052 case AMDGPU::G_GLOBAL_VALUE: 4053 case AMDGPU::G_BLOCK_ADDR: 4054 case AMDGPU::G_READCYCLECOUNTER: { 4055 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4056 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4057 break; 4058 } 4059 case AMDGPU::G_FRAME_INDEX: { 4060 // TODO: This should be the same as other constants, but eliminateFrameIndex 4061 // currently assumes VALU uses. 4062 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4063 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4064 break; 4065 } 4066 case AMDGPU::G_DYN_STACKALLOC: { 4067 // Result is always uniform, and a wave reduction is needed for the source. 4068 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4069 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4070 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 4071 break; 4072 } 4073 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 4074 // This case is weird because we expect a physical register in the source, 4075 // but need to set a bank anyway. 4076 // 4077 // TODO: We could select the result to SGPR or VGPR 4078 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4079 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4080 break; 4081 } 4082 case AMDGPU::G_INSERT: { 4083 unsigned BankID = getMappingType(MRI, MI); 4084 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4085 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4086 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 4087 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4088 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4089 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 4090 OpdsMapping[3] = nullptr; 4091 break; 4092 } 4093 case AMDGPU::G_EXTRACT: { 4094 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4095 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4096 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4097 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4098 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4099 OpdsMapping[2] = nullptr; 4100 break; 4101 } 4102 case AMDGPU::G_BUILD_VECTOR: 4103 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 4104 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 4105 if (DstTy == LLT::fixed_vector(2, 16)) { 4106 unsigned DstSize = DstTy.getSizeInBits(); 4107 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4108 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4109 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4110 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 4111 4112 OpdsMapping[0] = 
AMDGPU::getValueMapping(DstBankID, DstSize); 4113 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 4114 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 4115 break; 4116 } 4117 4118 [[fallthrough]]; 4119 } 4120 case AMDGPU::G_MERGE_VALUES: 4121 case AMDGPU::G_CONCAT_VECTORS: { 4122 unsigned Bank = getMappingType(MRI, MI); 4123 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4124 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4125 4126 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4127 // Op1 and Dst should use the same register bank. 4128 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 4129 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 4130 break; 4131 } 4132 case AMDGPU::G_BITREVERSE: 4133 case AMDGPU::G_BITCAST: 4134 case AMDGPU::G_INTTOPTR: 4135 case AMDGPU::G_PTRTOINT: 4136 case AMDGPU::G_FABS: 4137 case AMDGPU::G_FNEG: { 4138 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4139 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4140 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4141 break; 4142 } 4143 case AMDGPU::G_AMDGPU_FFBH_U32: 4144 case AMDGPU::G_AMDGPU_FFBL_B32: 4145 case AMDGPU::G_CTLZ_ZERO_UNDEF: 4146 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 4147 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4148 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4149 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 4150 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 4151 break; 4152 } 4153 case AMDGPU::G_CTPOP: { 4154 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4155 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4156 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 4157 4158 // This should really be getValueMappingSGPR64Only, but allowing the generic 4159 // code to handle the register split just makes using LegalizerHelper more 4160 // difficult. 4161 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4162 break; 4163 } 4164 case AMDGPU::G_TRUNC: { 4165 Register Dst = MI.getOperand(0).getReg(); 4166 Register Src = MI.getOperand(1).getReg(); 4167 unsigned Bank = getRegBankID(Src, MRI); 4168 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4169 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4170 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4171 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 4172 break; 4173 } 4174 case AMDGPU::G_ZEXT: 4175 case AMDGPU::G_SEXT: 4176 case AMDGPU::G_ANYEXT: 4177 case AMDGPU::G_SEXT_INREG: { 4178 Register Dst = MI.getOperand(0).getReg(); 4179 Register Src = MI.getOperand(1).getReg(); 4180 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4181 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4182 4183 unsigned DstBank; 4184 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 4185 assert(SrcBank); 4186 switch (SrcBank->getID()) { 4187 case AMDGPU::SGPRRegBankID: 4188 DstBank = AMDGPU::SGPRRegBankID; 4189 break; 4190 default: 4191 DstBank = AMDGPU::VGPRRegBankID; 4192 break; 4193 } 4194 4195 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 4196 // 32-bits, and then to 64. 
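// The SGPR64-only mapping keeps a 64-bit scalar result in one piece but
// describes a 64-bit VGPR result as two 32-bit parts, matching the split
// described above.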
4197 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 4198 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 4199 SrcSize); 4200 break; 4201 } 4202 case AMDGPU::G_IS_FPCLASS: { 4203 Register SrcReg = MI.getOperand(1).getReg(); 4204 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4205 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4206 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4207 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4208 break; 4209 } 4210 case AMDGPU::G_STORE: { 4211 assert(MI.getOperand(0).isReg()); 4212 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4213 4214 // FIXME: We need to specify a different reg bank once scalar stores are 4215 // supported. 4216 const ValueMapping *ValMapping = 4217 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4218 OpdsMapping[0] = ValMapping; 4219 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4220 break; 4221 } 4222 case AMDGPU::G_ICMP: 4223 case AMDGPU::G_FCMP: { 4224 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4225 4226 // See if the result register has already been constrained to vcc, which may 4227 // happen due to control flow intrinsic lowering. 4228 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4229 AMDGPU::SGPRRegBankID); 4230 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4231 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 4232 4233 auto canUseSCCICMP = [&]() { 4234 auto Pred = 4235 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 4236 return Size == 32 || 4237 (Size == 64 && 4238 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 4239 Subtarget.hasScalarCompareEq64()); 4240 }; 4241 auto canUseSCCFCMP = [&]() { 4242 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); 4243 }; 4244 4245 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; 4246 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 4247 Op2Bank == AMDGPU::SGPRRegBankID && 4248 Op3Bank == AMDGPU::SGPRRegBankID && 4249 (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); 4250 4251 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4252 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4253 4254 // TODO: Use 32-bit for scalar output size. 4255 // SCC results will need to be copied to a 32-bit SGPR virtual register. 4256 const unsigned ResultSize = 1; 4257 4258 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 4259 OpdsMapping[1] = nullptr; // Predicate Operand. 4260 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 4261 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 4262 break; 4263 } 4264 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 4265 // VGPR index can be used for waterfall when indexing a SGPR vector. 
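// The index is therefore reported in whatever bank it already has; if a
// divergent index ends up indexing an SGPR vector, applyMappingImpl moves the
// access into a waterfall loop.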
4266 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4267 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4268 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4269 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4270 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4271 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4272
4273 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4274 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4275
4276 // The index can be in either bank if the source vector is VGPR.
4277 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4278 break;
4279 }
4280 case AMDGPU::G_INSERT_VECTOR_ELT: {
4281 unsigned OutputBankID = isSALUMapping(MI) ?
4282 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4283
4284 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4285 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4286 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4287 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4288 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4289
4290 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4291 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4292
4293 // This is a weird case, because we need to break down the mapping based on
4294 // the register bank of a different operand.
4295 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4296 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4297 InsertSize);
4298 } else {
4299 assert(InsertSize == 32 || InsertSize == 64);
4300 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4301 }
4302
4303 // The index can be in either bank if the source vector is VGPR.
4304 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4305 break;
4306 }
4307 case AMDGPU::G_UNMERGE_VALUES: {
4308 unsigned Bank = getMappingType(MRI, MI);
4309
4310 // Op1 and Dst should use the same register bank.
4311 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4312 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4313 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 4314 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 4315 } 4316 break; 4317 } 4318 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 4319 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4320 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4321 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4322 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4323 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 4324 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 4325 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 4326 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 4327 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 4328 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 4329 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 4330 case AMDGPU::G_AMDGPU_BUFFER_STORE: 4331 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 4332 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 4333 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 4334 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 4335 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4336 4337 // rsrc 4338 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4339 4340 // vindex 4341 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4342 4343 // voffset 4344 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4345 4346 // soffset 4347 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4348 4349 // Any remaining operands are immediates and were correctly null 4350 // initialized. 4351 break; 4352 } 4353 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 4354 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 4355 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 4356 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 4357 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 4358 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 4359 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 4360 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 4361 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 4362 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 4363 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 4364 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 4365 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 4366 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: 4367 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 4368 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 4369 // vdata_out 4370 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4371 4372 // vdata_in 4373 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4374 4375 // rsrc 4376 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4377 4378 // vindex 4379 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4380 4381 // voffset 4382 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4383 4384 // soffset 4385 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4386 4387 // Any remaining operands are immediates and were correctly null 4388 // initialized. 
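// A VGPR rsrc or soffset is still accepted here; applyMappingImpl then runs
// the atomic inside a waterfall loop over operands {2, 5}.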
4389 break;
4390 }
4391 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4392 // vdata_out
4393 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4394
4395 // vdata_in
4396 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4397
4398 // cmp
4399 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4400
4401 // rsrc
4402 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4403
4404 // vindex
4405 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4406
4407 // voffset
4408 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4409
4410 // soffset
4411 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4412
4413 // Any remaining operands are immediates and were correctly null
4414 // initialized.
4415 break;
4416 }
4417 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4418 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4419 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4420 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4421 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4422 // Lie and claim everything is legal, even though some need to be
4423 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4424 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4425 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4426
4427 // We need to convert this to a MUBUF if either the resource or offset is
4428 // VGPR.
4429 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4430 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4431 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4432
4433 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4434 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4435 break;
4436 }
4437 case AMDGPU::G_INTRINSIC:
4438 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4439 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4440 default:
4441 return getInvalidInstructionMapping();
4442 case Intrinsic::amdgcn_div_fmas:
4443 case Intrinsic::amdgcn_div_fixup:
4444 case Intrinsic::amdgcn_trig_preop:
4445 case Intrinsic::amdgcn_sin:
4446 case Intrinsic::amdgcn_cos:
4447 case Intrinsic::amdgcn_log_clamp:
4448 case Intrinsic::amdgcn_rcp_legacy:
4449 case Intrinsic::amdgcn_rsq_legacy:
4450 case Intrinsic::amdgcn_rsq_clamp:
4451 case Intrinsic::amdgcn_fmul_legacy:
4452 case Intrinsic::amdgcn_fma_legacy:
4453 case Intrinsic::amdgcn_frexp_mant:
4454 case Intrinsic::amdgcn_frexp_exp:
4455 case Intrinsic::amdgcn_fract:
4456 case Intrinsic::amdgcn_cvt_pknorm_i16:
4457 case Intrinsic::amdgcn_cvt_pknorm_u16:
4458 case Intrinsic::amdgcn_cvt_pk_i16:
4459 case Intrinsic::amdgcn_cvt_pk_u16:
4460 case Intrinsic::amdgcn_fmed3:
4461 case Intrinsic::amdgcn_cubeid:
4462 case Intrinsic::amdgcn_cubema:
4463 case Intrinsic::amdgcn_cubesc:
4464 case Intrinsic::amdgcn_cubetc:
4465 case Intrinsic::amdgcn_sffbh:
4466 case Intrinsic::amdgcn_fmad_ftz:
4467 case Intrinsic::amdgcn_mbcnt_lo:
4468 case Intrinsic::amdgcn_mbcnt_hi:
4469 case Intrinsic::amdgcn_mul_u24:
4470 case Intrinsic::amdgcn_mul_i24:
4471 case Intrinsic::amdgcn_mulhi_u24:
4472 case Intrinsic::amdgcn_mulhi_i24:
4473 case Intrinsic::amdgcn_lerp:
4474 case Intrinsic::amdgcn_sad_u8:
4475 case Intrinsic::amdgcn_msad_u8:
4476 case Intrinsic::amdgcn_sad_hi_u8:
4477 case Intrinsic::amdgcn_sad_u16:
4478 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4479 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4480 case
Intrinsic::amdgcn_mqsad_u32_u8: 4481 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4482 case Intrinsic::amdgcn_alignbyte: 4483 case Intrinsic::amdgcn_perm: 4484 case Intrinsic::amdgcn_fdot2: 4485 case Intrinsic::amdgcn_sdot2: 4486 case Intrinsic::amdgcn_udot2: 4487 case Intrinsic::amdgcn_sdot4: 4488 case Intrinsic::amdgcn_udot4: 4489 case Intrinsic::amdgcn_sdot8: 4490 case Intrinsic::amdgcn_udot8: 4491 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4492 case Intrinsic::amdgcn_fdot2_f16_f16: 4493 case Intrinsic::amdgcn_fdot2_f32_bf16: 4494 case Intrinsic::amdgcn_sudot4: 4495 case Intrinsic::amdgcn_sudot8: 4496 case Intrinsic::amdgcn_dot4_f32_fp8_bf8: 4497 case Intrinsic::amdgcn_dot4_f32_bf8_fp8: 4498 case Intrinsic::amdgcn_dot4_f32_fp8_fp8: 4499 case Intrinsic::amdgcn_dot4_f32_bf8_bf8: 4500 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4501 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4502 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: 4503 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: 4504 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4505 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4506 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4507 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4508 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: 4509 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: 4510 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: 4511 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: 4512 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: 4513 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 4514 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 4515 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 4516 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 4517 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 4518 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 4519 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: 4520 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 4521 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 4522 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 4523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: 4524 return getDefaultMappingVOP(MI); 4525 case Intrinsic::amdgcn_log: 4526 case Intrinsic::amdgcn_exp2: 4527 case Intrinsic::amdgcn_rcp: 4528 case Intrinsic::amdgcn_rsq: 4529 case Intrinsic::amdgcn_sqrt: { 4530 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4531 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 4532 isSALUMapping(MI)) 4533 return getDefaultMappingSOP(MI); 4534 return getDefaultMappingVOP(MI); 4535 } 4536 case Intrinsic::amdgcn_sbfe: 4537 case Intrinsic::amdgcn_ubfe: 4538 if (isSALUMapping(MI)) 4539 return getDefaultMappingSOP(MI); 4540 return getDefaultMappingVOP(MI); 4541 case Intrinsic::amdgcn_ds_swizzle: 4542 case Intrinsic::amdgcn_ds_permute: 4543 case Intrinsic::amdgcn_ds_bpermute: 4544 case Intrinsic::amdgcn_update_dpp: 4545 case Intrinsic::amdgcn_mov_dpp8: 4546 case Intrinsic::amdgcn_mov_dpp: 4547 case Intrinsic::amdgcn_strict_wwm: 4548 case Intrinsic::amdgcn_wwm: 4549 case Intrinsic::amdgcn_strict_wqm: 4550 case Intrinsic::amdgcn_wqm: 4551 case Intrinsic::amdgcn_softwqm: 4552 case Intrinsic::amdgcn_set_inactive: 4553 case Intrinsic::amdgcn_set_inactive_chain_arg: 4554 case Intrinsic::amdgcn_permlane64: 4555 return getDefaultMappingAllVGPR(MI); 4556 case Intrinsic::amdgcn_cvt_pkrtz: 4557 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) 4558 return getDefaultMappingSOP(MI); 4559 return getDefaultMappingVOP(MI); 4560 case Intrinsic::amdgcn_kernarg_segment_ptr: 4561 case 
Intrinsic::amdgcn_s_getpc: 4562 case Intrinsic::amdgcn_groupstaticsize: 4563 case Intrinsic::amdgcn_reloc_constant: 4564 case Intrinsic::returnaddress: { 4565 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4566 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4567 break; 4568 } 4569 case Intrinsic::amdgcn_wqm_vote: { 4570 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4571 OpdsMapping[0] = OpdsMapping[2] 4572 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4573 break; 4574 } 4575 case Intrinsic::amdgcn_ps_live: { 4576 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4577 break; 4578 } 4579 case Intrinsic::amdgcn_div_scale: { 4580 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4581 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4582 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4583 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4584 4585 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4586 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4587 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4588 break; 4589 } 4590 case Intrinsic::amdgcn_class: { 4591 Register Src0Reg = MI.getOperand(2).getReg(); 4592 Register Src1Reg = MI.getOperand(3).getReg(); 4593 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4594 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4595 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4596 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4597 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4598 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4599 break; 4600 } 4601 case Intrinsic::amdgcn_icmp: 4602 case Intrinsic::amdgcn_fcmp: { 4603 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4604 // This is not VCCRegBank because this is not used in boolean contexts. 4605 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4606 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4607 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4608 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4609 break; 4610 } 4611 case Intrinsic::amdgcn_readlane: { 4612 // This must be an SGPR, but accept a VGPR. 
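// A VGPR index is legalized later in applyMappingImpl, which forces operand 3
// through a readfirstlane.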
4613 Register IdxReg = MI.getOperand(3).getReg(); 4614 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4615 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4616 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4617 [[fallthrough]]; 4618 } 4619 case Intrinsic::amdgcn_readfirstlane: { 4620 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4621 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4622 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4623 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4624 break; 4625 } 4626 case Intrinsic::amdgcn_writelane: { 4627 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4628 Register SrcReg = MI.getOperand(2).getReg(); 4629 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4630 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4631 Register IdxReg = MI.getOperand(3).getReg(); 4632 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4633 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4634 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4635 4636 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4637 // to legalize. 4638 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4639 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4640 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4641 break; 4642 } 4643 case Intrinsic::amdgcn_if_break: { 4644 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4645 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4646 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4647 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4648 break; 4649 } 4650 case Intrinsic::amdgcn_permlane16: 4651 case Intrinsic::amdgcn_permlanex16: { 4652 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4653 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4654 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4655 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4656 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4657 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4658 break; 4659 } 4660 case Intrinsic::amdgcn_permlane16_var: 4661 case Intrinsic::amdgcn_permlanex16_var: { 4662 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4663 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4664 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4665 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4666 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4667 break; 4668 } 4669 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4670 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4671 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4672 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4673 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4674 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4675 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4676 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4677 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4678 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4679 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4680 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix
      // it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_inverse_ballot: {
      // This must be an SGPR, but accept a VGPR.
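      // The mask must ultimately be uniform; if it ends up on the VGPR bank
      // it is reported as-is here, with the assumption that later
      // legalization copies it back to an SGPR (e.g. via readfirstlane).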
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_s_quadmask:
    case Intrinsic::amdgcn_s_wqm: {
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_wave_reduce_umin:
    case Intrinsic::amdgcn_wave_reduce_umax: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      auto regBankID =
          isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_s_bitreplicate:
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = AMDGPU::getIntrinsicID(MI);
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
        AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
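    // getImageMapping essentially keeps the resource/sampler descriptor
    // operands on whatever bank they already have (preferring SGPR) and maps
    // the remaining register operands to VGPRs.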
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_atomic_cond_sub_u32:
    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
    case Intrinsic::amdgcn_global_load_tr:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap:
    case Intrinsic::amdgcn_ds_fadd_v2bf16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank =
          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
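      // Each export source is an independent 32-bit VGPR value.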
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_ttracedata: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
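      // As with the other GWS intrinsics, a VGPR operand here is still
      // reported as legal; it is expected to be fixed up during later
      // legalization (typically with a readfirstlane).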
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix
      // it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }
    case Intrinsic::amdgcn_s_sleep_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank =
        getRegBankID(MI.getOperand(3).getReg(), MRI, AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault =
        SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID
                        ? AMDGPU::SGPRRegBankID
                        : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank =
        getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}