//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect.
/// Any VALU G_* operation should have its source operands all mapped to VGPRs
/// (except for VCC), inserting copies from any SGPR operands. This is the most
/// trivial legal mapping. Anything beyond the simplest 1:1 instruction
/// selection would be too complicated to solve here. Every optimization pattern
/// or instruction selected to multiple outputs would have to enforce this rule,
/// and there would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
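  // Each table entry becomes one alternative InstructionMapping; the entry's
  // cost is what lets RegBankSelect weigh, for example, an all-SGPR mapping
  // against one that will require a waterfall loop.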
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, original value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
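      // Offer both forms: a widened 32-bit SGPR mapping for uniform booleans,
      // and a VCC mapping that keeps the value as a per-lane mask.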
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in building the compares that identify the
/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
      .addDef(PhiExec)
      .addReg(InitSaveExecReg)
      .addMBB(&MBB)
      .addReg(NewExec)
      .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
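  // This goes at the end of the original block, which now falls through into
  // LoopBB; the MachineIRBuilder insert point is still inside the loop here.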
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
      .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p SGPROperandRegs.
// Returns true if there are any operands to handle and a waterfall loop is
// necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
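      // With 16-byte alignment we can instead widen to a single 128-bit load;
      // below that we split into a 64-bit and a 32-bit piece.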
      if (MMO->getAlign() < Align(16)) {
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
  LegalizerHelper Helper(B.getMF(), O, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
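  // For now only a uniform (SGPR) allocation size is handled; a divergent size
  // would need that reduction to find the per-wave maximum.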
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineIRBuilder &B, MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(B, MI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);

    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
                                        SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
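  // The MMO only records what we actually know here: a dereferenceable,
  // invariant load of MemSize bytes with at least 4-byte alignment.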
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // If we did not use a waterfall loop, the original instruction still needs to
  // be removed.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
                                             const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There is no 64-bit vgpr bitfield extract instruction, so the operation
    // is expanded to a sequence of instructions that implement the operation.
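    // The expansion below shifts the field down to bit 0, then either uses the
    // 32-bit BFE instructions when the width is a known constant, or a pair of
    // shifts when it is not.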
    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Pack the offset and width of a BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source, bits [5:0] contain the offset
  // and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
                           : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1533 1534 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1535 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1536 llvm_unreachable("failed to constrain BFE"); 1537 1538 MI.eraseFromParent(); 1539 return true; 1540 } 1541 1542 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1543 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 1544 MachineInstr &MI = OpdMapper.getMI(); 1545 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1546 1547 // Insert basic copies. 1548 applyDefaultMapping(OpdMapper); 1549 1550 Register Dst0 = MI.getOperand(0).getReg(); 1551 Register Dst1 = MI.getOperand(1).getReg(); 1552 Register Src0 = MI.getOperand(2).getReg(); 1553 Register Src1 = MI.getOperand(3).getReg(); 1554 Register Src2 = MI.getOperand(4).getReg(); 1555 1556 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1557 return true; 1558 1559 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1560 LLT S1 = LLT::scalar(1); 1561 LLT S32 = LLT::scalar(32); 1562 1563 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1564 bool Accumulate = true; 1565 1566 if (!DstOnValu) { 1567 if (mi_match(Src2, MRI, m_ZeroInt())) 1568 Accumulate = false; 1569 } 1570 1571 // Keep the multiplication on the SALU. 1572 Register DstHi; 1573 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1574 bool MulHiInVgpr = false; 1575 1576 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1577 1578 if (Subtarget.hasSMulHi()) { 1579 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1580 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1581 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1582 } else { 1583 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1584 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1585 1586 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1587 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1588 1589 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1590 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1591 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1592 1593 if (!DstOnValu) { 1594 DstHi = buildReadFirstLane(B, MRI, DstHi); 1595 } else { 1596 MulHiInVgpr = true; 1597 } 1598 } 1599 1600 // Accumulate and produce the "carry-out" bit. 1601 // 1602 // The "carry-out" is defined as bit 64 of the result when computed as a 1603 // big integer. For unsigned multiply-add, this matches the usual definition 1604 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1605 // result, which is determined as: 1606 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1607 LLT CarryType = DstOnValu ? S1 : S32; 1608 const RegisterBank &CarryBank = 1609 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1610 const RegisterBank &DstBank = 1611 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1612 Register Carry; 1613 Register Zero; 1614 1615 if (!IsUnsigned) { 1616 Zero = B.buildConstant(S32, 0).getReg(0); 1617 MRI.setRegBank(Zero, 1618 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1619 1620 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1621 .getReg(0); 1622 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1623 : AMDGPU::SGPRRegBank); 1624 1625 if (DstOnValu && !MulHiInVgpr) { 1626 Carry = B.buildTrunc(S1, Carry).getReg(0); 1627 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1628 } 1629 } 1630 1631 if (Accumulate) { 1632 if (DstOnValu) { 1633 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1634 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1635 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1636 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1637 } 1638 1639 auto Unmerge = B.buildUnmerge(S32, Src2); 1640 Register Src2Lo = Unmerge.getReg(0); 1641 Register Src2Hi = Unmerge.getReg(1); 1642 MRI.setRegBank(Src2Lo, DstBank); 1643 MRI.setRegBank(Src2Hi, DstBank); 1644 1645 if (!IsUnsigned) { 1646 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1647 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1648 1649 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1650 MRI.setRegBank(Carry, CarryBank); 1651 } 1652 1653 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1654 DstLo = AddLo.getReg(0); 1655 Register CarryLo = AddLo.getReg(1); 1656 MRI.setRegBank(DstLo, DstBank); 1657 MRI.setRegBank(CarryLo, CarryBank); 1658 1659 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1660 DstHi = AddHi.getReg(0); 1661 MRI.setRegBank(DstHi, DstBank); 1662 1663 Register CarryHi = AddHi.getReg(1); 1664 MRI.setRegBank(CarryHi, CarryBank); 1665 1666 if (IsUnsigned) { 1667 Carry = CarryHi; 1668 } else { 1669 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1670 MRI.setRegBank(Carry, CarryBank); 1671 } 1672 } else { 1673 if (IsUnsigned) { 1674 Carry = B.buildConstant(CarryType, 0).getReg(0); 1675 MRI.setRegBank(Carry, CarryBank); 1676 } 1677 } 1678 1679 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); 1680 1681 if (DstOnValu) { 1682 B.buildCopy(Dst1, Carry); 1683 } else { 1684 B.buildTrunc(Dst1, Carry); 1685 } 1686 1687 MI.eraseFromParent(); 1688 return true; 1689 } 1690 1691 // Return a suitable opcode for extending the operands of Opc when widening. 1692 static unsigned getExtendOp(unsigned Opc) { 1693 switch (Opc) { 1694 case TargetOpcode::G_ASHR: 1695 case TargetOpcode::G_SMIN: 1696 case TargetOpcode::G_SMAX: 1697 return TargetOpcode::G_SEXT; 1698 case TargetOpcode::G_LSHR: 1699 case TargetOpcode::G_UMIN: 1700 case TargetOpcode::G_UMAX: 1701 return TargetOpcode::G_ZEXT; 1702 default: 1703 return TargetOpcode::G_ANYEXT; 1704 } 1705 } 1706 1707 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1708 // any illegal vector extend or unmerge operations. 1709 static std::pair<Register, Register> 1710 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1711 const LLT S32 = LLT::scalar(32); 1712 auto Bitcast = B.buildBitcast(S32, Src); 1713 1714 if (ExtOpcode == TargetOpcode::G_SEXT) { 1715 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1716 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1717 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1718 } 1719 1720 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1721 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1722 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1723 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1724 } 1725 1726 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1727 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1728 } 1729 1730 // For cases where only a single copy is inserted for matching register banks. 
1731 // Replace the register in the instruction operand 1732 static bool substituteSimpleCopyRegs( 1733 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1734 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1735 if (!SrcReg.empty()) { 1736 assert(SrcReg.size() == 1); 1737 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1738 return true; 1739 } 1740 1741 return false; 1742 } 1743 1744 /// Handle register layout difference for f16 images for some subtargets. 1745 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1746 MachineRegisterInfo &MRI, 1747 Register Reg) const { 1748 if (!Subtarget.hasUnpackedD16VMem()) 1749 return Reg; 1750 1751 const LLT S16 = LLT::scalar(16); 1752 LLT StoreVT = MRI.getType(Reg); 1753 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1754 return Reg; 1755 1756 auto Unmerge = B.buildUnmerge(S16, Reg); 1757 1758 1759 SmallVector<Register, 4> WideRegs; 1760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1761 WideRegs.push_back(Unmerge.getReg(I)); 1762 1763 const LLT S32 = LLT::scalar(32); 1764 int NumElts = StoreVT.getNumElements(); 1765 1766 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs) 1767 .getReg(0); 1768 } 1769 1770 static std::pair<Register, unsigned> 1771 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1772 int64_t Const; 1773 if (mi_match(Reg, MRI, m_ICst(Const))) 1774 return std::pair(Register(), Const); 1775 1776 Register Base; 1777 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1778 return std::pair(Base, Const); 1779 1780 // TODO: Handle G_OR used for add case 1781 return std::pair(Reg, 0); 1782 } 1783 1784 std::pair<Register, unsigned> 1785 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1786 Register OrigOffset) const { 1787 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget); 1788 Register BaseReg; 1789 unsigned ImmOffset; 1790 const LLT S32 = LLT::scalar(32); 1791 1792 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. 1793 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1794 OrigOffset); 1795 1796 unsigned C1 = 0; 1797 if (ImmOffset != 0) { 1798 // If the immediate value is too big for the immoffset field, put only bits 1799 // that would normally fit in the immoffset field. The remaining value that 1800 // is copied/added for the voffset field is a large power of 2, and it 1801 // stands more chance of being CSEd with the copy/add for another similar 1802 // load/store. 1803 // However, do not do that rounding down if that is a negative 1804 // number, as it appears to be illegal to have a negative offset in the 1805 // vgpr, even if adding the immediate offset makes it positive. 
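    //
    // A worked example with made-up numbers (assuming MaxImm == 4095, i.e. a
    // 12-bit immediate field): ImmOffset == 5000 gives Overflow == 4096 and
    // ImmOffset == 904, so 4096 is folded into the voffset add below and 904
    // stays in the immediate field. If Overflow ends up negative as a 32-bit
    // value, everything is pushed back into voffset and the immediate becomes
    // 0, per the check below.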
1806 unsigned Overflow = ImmOffset & ~MaxImm; 1807 ImmOffset -= Overflow; 1808 if ((int32_t)Overflow < 0) { 1809 Overflow += ImmOffset; 1810 ImmOffset = 0; 1811 } 1812 1813 C1 = ImmOffset; 1814 if (Overflow != 0) { 1815 if (!BaseReg) 1816 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1817 else { 1818 auto OverflowVal = B.buildConstant(S32, Overflow); 1819 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1820 } 1821 } 1822 } 1823 1824 if (!BaseReg) 1825 BaseReg = B.buildConstant(S32, 0).getReg(0); 1826 1827 return {BaseReg, C1}; 1828 } 1829 1830 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1831 Register SrcReg) const { 1832 MachineRegisterInfo &MRI = *B.getMRI(); 1833 LLT SrcTy = MRI.getType(SrcReg); 1834 if (SrcTy.getSizeInBits() == 32) { 1835 // Use a v_mov_b32 here to make the exec dependency explicit. 1836 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1837 .addDef(DstReg) 1838 .addUse(SrcReg); 1839 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1840 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1841 } 1842 1843 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1844 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1845 1846 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1847 .addDef(TmpReg0) 1848 .addUse(SrcReg, 0, AMDGPU::sub0); 1849 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1850 .addDef(TmpReg1) 1851 .addUse(SrcReg, 0, AMDGPU::sub1); 1852 B.buildInstr(AMDGPU::REG_SEQUENCE) 1853 .addDef(DstReg) 1854 .addUse(TmpReg0) 1855 .addImm(AMDGPU::sub0) 1856 .addUse(TmpReg1) 1857 .addImm(AMDGPU::sub1); 1858 1859 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1860 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1861 } 1862 1863 /// Utility function for pushing dynamic vector indexes with a constant offset 1864 /// into waterfall loops. 1865 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1866 MachineInstr &IdxUseInstr, 1867 unsigned OpIdx, 1868 unsigned ConstOffset) { 1869 MachineRegisterInfo &MRI = *B.getMRI(); 1870 const LLT S32 = LLT::scalar(32); 1871 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1872 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1873 1874 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1875 1876 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1877 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1878 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1879 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1880 } 1881 1882 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1883 /// original 32-bit source value (to be inserted in the low part of the combined 1884 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1885 /// value. 1886 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1887 Register Hi32Reg, Register Lo32Reg, 1888 unsigned ExtOpc, 1889 const RegisterBank &RegBank, 1890 bool IsBooleanSrc = false) { 1891 if (ExtOpc == AMDGPU::G_ZEXT) { 1892 B.buildConstant(Hi32Reg, 0); 1893 } else if (ExtOpc == AMDGPU::G_SEXT) { 1894 if (IsBooleanSrc) { 1895 // If we know the original source was an s1, the high half is the same as 1896 // the low. 1897 B.buildCopy(Hi32Reg, Lo32Reg); 1898 } else { 1899 // Replicate sign bit from 32-bit extended part. 
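      // (i.e. Hi = Lo >>s 31: all zeros for a non-negative Lo, all ones for a
      // negative one.)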
1900 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1901 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1902 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1903 } 1904 } else { 1905 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1906 B.buildUndef(Hi32Reg); 1907 } 1908 } 1909 1910 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1911 MachineIRBuilder &B, MachineInstr &MI, 1912 const OperandsMapper &OpdMapper) const { 1913 MachineRegisterInfo &MRI = *B.getMRI(); 1914 1915 Register VecReg = MI.getOperand(1).getReg(); 1916 Register Idx = MI.getOperand(2).getReg(); 1917 1918 const RegisterBank &IdxBank = 1919 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1920 1921 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1922 1923 LLT VecTy = MRI.getType(VecReg); 1924 unsigned EltSize = VecTy.getScalarSizeInBits(); 1925 unsigned NumElem = VecTy.getNumElements(); 1926 1927 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1928 IsDivergentIdx, &Subtarget)) 1929 return false; 1930 1931 LLT S32 = LLT::scalar(32); 1932 1933 const RegisterBank &DstBank = 1934 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1935 const RegisterBank &SrcBank = 1936 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1937 1938 const RegisterBank &CCBank = 1939 (DstBank == AMDGPU::SGPRRegBank && 1940 SrcBank == AMDGPU::SGPRRegBank && 1941 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1942 : AMDGPU::VCCRegBank; 1943 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1944 1945 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1946 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1947 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1948 } 1949 1950 LLT EltTy = VecTy.getScalarType(); 1951 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1952 unsigned NumLanes = DstRegs.size(); 1953 if (!NumLanes) 1954 NumLanes = 1; 1955 else 1956 EltTy = MRI.getType(DstRegs[0]); 1957 1958 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1959 SmallVector<Register, 2> Res(NumLanes); 1960 for (unsigned L = 0; L < NumLanes; ++L) 1961 Res[L] = UnmergeToEltTy.getReg(L); 1962 1963 for (unsigned I = 1; I < NumElem; ++I) { 1964 auto IC = B.buildConstant(S32, I); 1965 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1966 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1967 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1968 1969 for (unsigned L = 0; L < NumLanes; ++L) { 1970 auto S = B.buildSelect(EltTy, Cmp, 1971 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1972 1973 for (unsigned N : { 0, 2, 3 }) 1974 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1975 1976 Res[L] = S->getOperand(0).getReg(); 1977 } 1978 } 1979 1980 for (unsigned L = 0; L < NumLanes; ++L) { 1981 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 1982 B.buildCopy(DstReg, Res[L]); 1983 MRI.setRegBank(DstReg, DstBank); 1984 } 1985 1986 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1987 MI.eraseFromParent(); 1988 1989 return true; 1990 } 1991 1992 // Insert a cross regbank copy for a register if it already has a bank that 1993 // differs from the one we want to set. 
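// Returns the register to use afterwards: a fresh copy in the requested bank
// when a conflicting bank was already assigned, or Reg itself (now assigned
// to Bank) otherwise.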
1994 static Register constrainRegToBank(MachineRegisterInfo &MRI, 1995 MachineIRBuilder &B, Register &Reg, 1996 const RegisterBank &Bank) { 1997 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 1998 if (CurrBank && *CurrBank != Bank) { 1999 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2000 MRI.setRegBank(Copy, Bank); 2001 return Copy; 2002 } 2003 2004 MRI.setRegBank(Reg, Bank); 2005 return Reg; 2006 } 2007 2008 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2009 MachineIRBuilder &B, MachineInstr &MI, 2010 const OperandsMapper &OpdMapper) const { 2011 2012 MachineRegisterInfo &MRI = *B.getMRI(); 2013 Register VecReg = MI.getOperand(1).getReg(); 2014 Register Idx = MI.getOperand(3).getReg(); 2015 2016 const RegisterBank &IdxBank = 2017 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2018 2019 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2020 2021 LLT VecTy = MRI.getType(VecReg); 2022 unsigned EltSize = VecTy.getScalarSizeInBits(); 2023 unsigned NumElem = VecTy.getNumElements(); 2024 2025 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2026 IsDivergentIdx, &Subtarget)) 2027 return false; 2028 2029 LLT S32 = LLT::scalar(32); 2030 2031 const RegisterBank &DstBank = 2032 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2033 const RegisterBank &SrcBank = 2034 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2035 const RegisterBank &InsBank = 2036 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2037 2038 const RegisterBank &CCBank = 2039 (DstBank == AMDGPU::SGPRRegBank && 2040 SrcBank == AMDGPU::SGPRRegBank && 2041 InsBank == AMDGPU::SGPRRegBank && 2042 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2043 : AMDGPU::VCCRegBank; 2044 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2045 2046 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2047 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2048 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2049 } 2050 2051 LLT EltTy = VecTy.getScalarType(); 2052 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2053 unsigned NumLanes = InsRegs.size(); 2054 if (!NumLanes) { 2055 NumLanes = 1; 2056 InsRegs.push_back(MI.getOperand(2).getReg()); 2057 } else { 2058 EltTy = MRI.getType(InsRegs[0]); 2059 } 2060 2061 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2062 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2063 2064 for (unsigned I = 0; I < NumElem; ++I) { 2065 auto IC = B.buildConstant(S32, I); 2066 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2067 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2068 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2069 2070 for (unsigned L = 0; L < NumLanes; ++L) { 2071 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2072 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2073 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2074 2075 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2076 MRI.setRegBank(Select, DstBank); 2077 2078 Ops[I * NumLanes + L] = Select; 2079 } 2080 } 2081 2082 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2083 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2084 B.buildBuildVector(MI.getOperand(0), Ops); 2085 } else { 2086 auto Vec = B.buildBuildVector(MergeTy, Ops); 2087 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2088 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2089 } 2090 2091 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2092 MI.eraseFromParent(); 2093 2094 return true; 2095 } 2096 2097 // Break s_mul_u64 into 32-bit vector operations. 2098 void AMDGPURegisterBankInfo::applyMappingSMULU64( 2099 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2100 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2101 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2102 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2103 2104 // All inputs are SGPRs, nothing special to do. 2105 if (DefRegs.empty()) { 2106 assert(Src0Regs.empty() && Src1Regs.empty()); 2107 applyDefaultMapping(OpdMapper); 2108 return; 2109 } 2110 2111 assert(DefRegs.size() == 2); 2112 assert(Src0Regs.size() == Src1Regs.size() && 2113 (Src0Regs.empty() || Src0Regs.size() == 2)); 2114 2115 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2116 MachineInstr &MI = OpdMapper.getMI(); 2117 Register DstReg = MI.getOperand(0).getReg(); 2118 LLT HalfTy = LLT::scalar(32); 2119 2120 // Depending on where the source registers came from, the generic code may 2121 // have decided to split the inputs already or not. If not, we still need to 2122 // extract the values. 
2123 2124 if (Src0Regs.empty()) 2125 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2126 else 2127 setRegsToType(MRI, Src0Regs, HalfTy); 2128 2129 if (Src1Regs.empty()) 2130 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2131 else 2132 setRegsToType(MRI, Src1Regs, HalfTy); 2133 2134 setRegsToType(MRI, DefRegs, HalfTy); 2135 2136 // The multiplication is done as follows: 2137 // 2138 // Op1H Op1L 2139 // * Op0H Op0L 2140 // -------------------- 2141 // Op1H*Op0L Op1L*Op0L 2142 // + Op1H*Op0H Op1L*Op0H 2143 // ----------------------------------------- 2144 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L 2145 // 2146 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit 2147 // value and that would overflow. 2148 // The low 32-bit value is Op1L*Op0L. 2149 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from 2150 // Op1L*Op0L). 2151 2152 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); 2153 2154 Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); 2155 Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); 2156 Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); 2157 Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); 2158 B.buildAdd(DefRegs[1], Add, MulHiLo); 2159 B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); 2160 2161 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2162 MI.eraseFromParent(); 2163 } 2164 2165 void AMDGPURegisterBankInfo::applyMappingImpl( 2166 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { 2167 MachineInstr &MI = OpdMapper.getMI(); 2168 B.setInstrAndDebugLoc(MI); 2169 unsigned Opc = MI.getOpcode(); 2170 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2171 switch (Opc) { 2172 case AMDGPU::G_CONSTANT: 2173 case AMDGPU::G_IMPLICIT_DEF: { 2174 Register DstReg = MI.getOperand(0).getReg(); 2175 LLT DstTy = MRI.getType(DstReg); 2176 if (DstTy != LLT::scalar(1)) 2177 break; 2178 2179 const RegisterBank *DstBank = 2180 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2181 if (DstBank == &AMDGPU::VCCRegBank) 2182 break; 2183 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2184 if (DefRegs.empty()) 2185 DefRegs.push_back(DstReg); 2186 2187 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2188 2189 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 2190 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 2191 2192 MI.getOperand(0).setReg(NewDstReg); 2193 if (Opc != AMDGPU::G_IMPLICIT_DEF) { 2194 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); 2195 MI.getOperand(1).setCImm( 2196 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); 2197 } 2198 2199 MRI.setRegBank(NewDstReg, *DstBank); 2200 B.buildTrunc(DefRegs[0], NewDstReg); 2201 return; 2202 } 2203 case AMDGPU::G_PHI: { 2204 Register DstReg = MI.getOperand(0).getReg(); 2205 LLT DstTy = MRI.getType(DstReg); 2206 if (DstTy != LLT::scalar(1)) 2207 break; 2208 2209 const LLT S32 = LLT::scalar(32); 2210 const RegisterBank *DstBank = 2211 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2212 if (DstBank == &AMDGPU::VCCRegBank) { 2213 applyDefaultMapping(OpdMapper); 2214 // The standard handling only considers the result register bank for 2215 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2216 // produce an invalid copy. We can only copy with some kind of compare to 2217 // get a vector boolean result. 
Insert a register bank copy that will be 2218 // correctly lowered to a compare. 2219 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2220 Register SrcReg = MI.getOperand(I).getReg(); 2221 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2222 2223 if (SrcBank != &AMDGPU::VCCRegBank) { 2224 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2225 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2226 2227 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2228 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2229 MI.getOperand(I).setReg(Copy.getReg(0)); 2230 } 2231 } 2232 2233 return; 2234 } 2235 2236 // Phi handling is strange and only considers the bank of the destination. 2237 substituteSimpleCopyRegs(OpdMapper, 0); 2238 2239 // Promote SGPR/VGPR booleans to s32 2240 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2241 B.setInsertPt(B.getMBB(), MI); 2242 LegalizerHelper Helper(B.getMF(), ApplyBank, B); 2243 2244 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2245 llvm_unreachable("widen scalar should have succeeded"); 2246 2247 return; 2248 } 2249 case AMDGPU::G_FCMP: 2250 if (!Subtarget.hasSALUFloatInsts()) 2251 break; 2252 LLVM_FALLTHROUGH; 2253 case AMDGPU::G_ICMP: 2254 case AMDGPU::G_UADDO: 2255 case AMDGPU::G_USUBO: 2256 case AMDGPU::G_UADDE: 2257 case AMDGPU::G_SADDE: 2258 case AMDGPU::G_USUBE: 2259 case AMDGPU::G_SSUBE: { 2260 unsigned BoolDstOp = 2261 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; 2262 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2263 2264 const RegisterBank *DstBank = 2265 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2266 if (DstBank != &AMDGPU::SGPRRegBank) 2267 break; 2268 2269 const bool HasCarryIn = MI.getNumOperands() == 5; 2270 2271 // If this is a scalar compare, promote the result to s32, as the selection 2272 // will end up using a copy to a 32-bit vreg. 2273 const LLT S32 = LLT::scalar(32); 2274 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2275 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2276 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2277 2278 if (HasCarryIn) { 2279 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2280 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2281 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2282 MI.getOperand(4).setReg(NewSrcReg); 2283 } 2284 2285 MachineBasicBlock *MBB = MI.getParent(); 2286 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2287 2288 // If we had a constrained VCC result register, a copy was inserted to VCC 2289 // from SGPR. 
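    //
    // As a rough illustration of the overall rewrite (not literal output):
    //   s1 %c = G_ICMP intpred(eq), %a, %b
    // becomes
    //   s32 %c32 = G_ICMP intpred(eq), %a, %b   ; SGPR bank
    //   s1  %c   = G_TRUNC %c32
    // and a carry-in operand, when present, is zero-extended to s32 first.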
2290 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2291 if (DefRegs.empty()) 2292 DefRegs.push_back(DstReg); 2293 B.buildTrunc(DefRegs[0], NewDstReg); 2294 return; 2295 } 2296 case AMDGPU::G_SELECT: { 2297 Register DstReg = MI.getOperand(0).getReg(); 2298 LLT DstTy = MRI.getType(DstReg); 2299 2300 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2301 if (CondRegs.empty()) 2302 CondRegs.push_back(MI.getOperand(1).getReg()); 2303 else { 2304 assert(CondRegs.size() == 1); 2305 } 2306 2307 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2308 if (CondBank == &AMDGPU::SGPRRegBank) { 2309 const LLT S32 = LLT::scalar(32); 2310 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2311 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2312 2313 MI.getOperand(1).setReg(NewCondReg); 2314 B.buildZExt(NewCondReg, CondRegs[0]); 2315 } 2316 2317 if (DstTy.getSizeInBits() != 64) 2318 break; 2319 2320 LLT HalfTy = getHalfSizedType(DstTy); 2321 2322 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2323 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2324 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2325 2326 // All inputs are SGPRs, nothing special to do. 2327 if (DefRegs.empty()) { 2328 assert(Src1Regs.empty() && Src2Regs.empty()); 2329 break; 2330 } 2331 2332 if (Src1Regs.empty()) 2333 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2334 else { 2335 setRegsToType(MRI, Src1Regs, HalfTy); 2336 } 2337 2338 if (Src2Regs.empty()) 2339 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2340 else 2341 setRegsToType(MRI, Src2Regs, HalfTy); 2342 2343 setRegsToType(MRI, DefRegs, HalfTy); 2344 2345 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2346 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2347 2348 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2349 MI.eraseFromParent(); 2350 return; 2351 } 2352 case AMDGPU::G_BRCOND: { 2353 Register CondReg = MI.getOperand(0).getReg(); 2354 // FIXME: Should use legalizer helper, but should change bool ext type. 2355 const RegisterBank *CondBank = 2356 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2357 2358 if (CondBank == &AMDGPU::SGPRRegBank) { 2359 const LLT S32 = LLT::scalar(32); 2360 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2361 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2362 2363 MI.getOperand(0).setReg(NewCondReg); 2364 B.buildZExt(NewCondReg, CondReg); 2365 return; 2366 } 2367 2368 break; 2369 } 2370 case AMDGPU::G_AND: 2371 case AMDGPU::G_OR: 2372 case AMDGPU::G_XOR: { 2373 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2374 // there is a VGPR input. 
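    //
    // Conceptually (illustrative only; the unmerge/merge pieces are partly
    // produced by the generic mapping code):
    //   s64 %r = G_AND %x, %y
    // turns into
    //   s32 %rlo = G_AND %xlo, %ylo
    //   s32 %rhi = G_AND %xhi, %yhi
    // with the halves recombined into the original 64-bit result.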
2375 Register DstReg = MI.getOperand(0).getReg(); 2376 LLT DstTy = MRI.getType(DstReg); 2377 2378 if (DstTy.getSizeInBits() == 1) { 2379 const RegisterBank *DstBank = 2380 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2381 if (DstBank == &AMDGPU::VCCRegBank) 2382 break; 2383 2384 MachineFunction *MF = MI.getParent()->getParent(); 2385 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); 2386 LegalizerHelper Helper(*MF, ApplyBank, B); 2387 2388 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2389 LegalizerHelper::Legalized) 2390 llvm_unreachable("widen scalar should have succeeded"); 2391 return; 2392 } 2393 2394 if (DstTy.getSizeInBits() != 64) 2395 break; 2396 2397 LLT HalfTy = getHalfSizedType(DstTy); 2398 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2399 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2400 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2401 2402 // All inputs are SGPRs, nothing special to do. 2403 if (DefRegs.empty()) { 2404 assert(Src0Regs.empty() && Src1Regs.empty()); 2405 break; 2406 } 2407 2408 assert(DefRegs.size() == 2); 2409 assert(Src0Regs.size() == Src1Regs.size() && 2410 (Src0Regs.empty() || Src0Regs.size() == 2)); 2411 2412 // Depending on where the source registers came from, the generic code may 2413 // have decided to split the inputs already or not. If not, we still need to 2414 // extract the values. 2415 2416 if (Src0Regs.empty()) 2417 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2418 else 2419 setRegsToType(MRI, Src0Regs, HalfTy); 2420 2421 if (Src1Regs.empty()) 2422 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2423 else 2424 setRegsToType(MRI, Src1Regs, HalfTy); 2425 2426 setRegsToType(MRI, DefRegs, HalfTy); 2427 2428 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2429 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2430 2431 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2432 MI.eraseFromParent(); 2433 return; 2434 } 2435 case AMDGPU::G_ABS: { 2436 Register SrcReg = MI.getOperand(1).getReg(); 2437 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2438 2439 // There is no VALU abs instruction so we need to replace it with a sub and 2440 // max combination. 2441 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2442 MachineFunction *MF = MI.getParent()->getParent(); 2443 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); 2444 LegalizerHelper Helper(*MF, Apply, B); 2445 2446 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2447 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2448 return; 2449 } 2450 [[fallthrough]]; 2451 } 2452 case AMDGPU::G_ADD: 2453 case AMDGPU::G_SUB: 2454 case AMDGPU::G_MUL: 2455 case AMDGPU::G_SHL: 2456 case AMDGPU::G_LSHR: 2457 case AMDGPU::G_ASHR: 2458 case AMDGPU::G_SMIN: 2459 case AMDGPU::G_SMAX: 2460 case AMDGPU::G_UMIN: 2461 case AMDGPU::G_UMAX: { 2462 Register DstReg = MI.getOperand(0).getReg(); 2463 LLT DstTy = MRI.getType(DstReg); 2464 2465 // Special case for s_mul_u64. There is not a vector equivalent of 2466 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector 2467 // multiplications. 2468 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { 2469 applyMappingSMULU64(B, OpdMapper); 2470 return; 2471 } 2472 2473 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2474 // Packed 16-bit operations need to be scalarized and promoted. 
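    //
    // For example (illustrative): a uniform <2 x s16> G_ADD is handled below
    // by unpacking both sources into s32 halves, doing two s32 G_ADDs, and
    // repacking with G_BUILD_VECTOR_TRUNC, while a plain s16 operation is
    // simply widened to s32.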
2475 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2476 break; 2477 2478 const RegisterBank *DstBank = 2479 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2480 if (DstBank == &AMDGPU::VGPRRegBank) 2481 break; 2482 2483 const LLT S32 = LLT::scalar(32); 2484 MachineBasicBlock *MBB = MI.getParent(); 2485 MachineFunction *MF = MBB->getParent(); 2486 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); 2487 2488 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { 2489 Register WideSrcLo, WideSrcHi; 2490 2491 std::tie(WideSrcLo, WideSrcHi) = 2492 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT); 2493 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); 2494 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); 2495 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2496 MI.eraseFromParent(); 2497 return; 2498 } 2499 2500 if (DstTy.isVector()) { 2501 Register WideSrc0Lo, WideSrc0Hi; 2502 Register WideSrc1Lo, WideSrc1Hi; 2503 2504 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2505 std::tie(WideSrc0Lo, WideSrc0Hi) 2506 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2507 std::tie(WideSrc1Lo, WideSrc1Hi) 2508 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2509 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2510 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2511 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2512 MI.eraseFromParent(); 2513 } else { 2514 LegalizerHelper Helper(*MF, ApplySALU, B); 2515 2516 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2517 llvm_unreachable("widen scalar should have succeeded"); 2518 2519 // FIXME: s16 shift amounts should be legal. 2520 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2521 Opc == AMDGPU::G_ASHR) { 2522 B.setInsertPt(*MBB, MI.getIterator()); 2523 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2524 llvm_unreachable("widen scalar should have succeeded"); 2525 } 2526 } 2527 2528 return; 2529 } 2530 case AMDGPU::G_AMDGPU_S_MUL_I64_I32: 2531 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { 2532 // This is a special case for s_mul_u64. We use 2533 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation 2534 // where the 33 higher bits are sign-extended and 2535 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation 2536 // where the 32 higher bits are zero-extended. In case scalar registers are 2537 // selected, both opcodes are lowered as s_mul_u64. If the vector registers 2538 // are selected, then G_AMDGPU_S_MUL_I64_I32 and 2539 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. 2540 2541 // Insert basic copies. 2542 applyDefaultMapping(OpdMapper); 2543 2544 Register DstReg = MI.getOperand(0).getReg(); 2545 Register SrcReg0 = MI.getOperand(1).getReg(); 2546 Register SrcReg1 = MI.getOperand(2).getReg(); 2547 const LLT S32 = LLT::scalar(32); 2548 const LLT S64 = LLT::scalar(64); 2549 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " 2550 "that handles only 64-bit operands."); 2551 const RegisterBank *DstBank = 2552 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2553 2554 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2555 // with s_mul_u64 operation. 
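    //
    // Sketch of the two paths below (illustrative only):
    //   SGPR result: the generic opcode is simply rewritten to S_MUL_U64.
    //   VGPR result: both sources are truncated to s32 and fed into
    //                G_AMDGPU_MAD_U64_U32 (or MAD_I64_I32 for the signed
    //                variant) with a zero addend, producing the s64 result.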
2556 if (DstBank == &AMDGPU::SGPRRegBank) { 2557 MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); 2558 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); 2559 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); 2560 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); 2561 return; 2562 } 2563 2564 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 2565 // with a vector mad. 2566 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && 2567 "The destination operand should be in vector registers."); 2568 2569 DebugLoc DL = MI.getDebugLoc(); 2570 2571 // Extract the lower subregister from the first operand. 2572 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2573 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); 2574 MRI.setType(Op0L, S32); 2575 B.buildTrunc(Op0L, SrcReg0); 2576 2577 // Extract the lower subregister from the second operand. 2578 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2579 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); 2580 MRI.setType(Op1L, S32); 2581 B.buildTrunc(Op1L, SrcReg1); 2582 2583 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 2584 ? AMDGPU::G_AMDGPU_MAD_U64_U32 2585 : AMDGPU::G_AMDGPU_MAD_I64_I32; 2586 2587 MachineIRBuilder B(MI); 2588 Register Zero64 = B.buildConstant(S64, 0).getReg(0); 2589 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); 2590 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2591 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); 2592 B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); 2593 MI.eraseFromParent(); 2594 return; 2595 } 2596 case AMDGPU::G_SEXT_INREG: { 2597 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2598 if (SrcRegs.empty()) 2599 break; // Nothing to repair 2600 2601 const LLT S32 = LLT::scalar(32); 2602 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); 2603 2604 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2605 // we would need to further expand, and doesn't let us directly set the 2606 // result registers. 2607 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2608 2609 int Amt = MI.getOperand(2).getImm(); 2610 if (Amt <= 32) { 2611 // Downstream users have expectations for the high bit behavior, so freeze 2612 // incoming undefined bits. 2613 if (Amt == 32) { 2614 // The low bits are unchanged. 2615 B.buildFreeze(DstRegs[0], SrcRegs[0]); 2616 } else { 2617 auto Freeze = B.buildFreeze(S32, SrcRegs[0]); 2618 // Extend in the low bits and propagate the sign bit to the high half. 2619 B.buildSExtInReg(DstRegs[0], Freeze, Amt); 2620 } 2621 2622 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2623 } else { 2624 // The low bits are unchanged, and extend in the high bits. 
2625 // No freeze required 2626 B.buildCopy(DstRegs[0], SrcRegs[0]); 2627 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2628 } 2629 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2632 MI.eraseFromParent(); 2633 return; 2634 } 2635 case AMDGPU::G_CTPOP: 2636 case AMDGPU::G_BITREVERSE: { 2637 const RegisterBank *DstBank = 2638 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2639 if (DstBank == &AMDGPU::SGPRRegBank) 2640 break; 2641 2642 Register SrcReg = MI.getOperand(1).getReg(); 2643 const LLT S32 = LLT::scalar(32); 2644 LLT Ty = MRI.getType(SrcReg); 2645 if (Ty == S32) 2646 break; 2647 2648 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2649 2650 MachineFunction &MF = B.getMF(); 2651 LegalizerHelper Helper(MF, ApplyVALU, B); 2652 2653 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2654 llvm_unreachable("narrowScalar should have succeeded"); 2655 return; 2656 } 2657 case AMDGPU::G_AMDGPU_FFBH_U32: 2658 case AMDGPU::G_AMDGPU_FFBL_B32: 2659 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2660 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2661 const RegisterBank *DstBank = 2662 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2663 if (DstBank == &AMDGPU::SGPRRegBank) 2664 break; 2665 2666 Register SrcReg = MI.getOperand(1).getReg(); 2667 const LLT S32 = LLT::scalar(32); 2668 LLT Ty = MRI.getType(SrcReg); 2669 if (Ty == S32) 2670 break; 2671 2672 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2673 // which return -1 when the input is zero: 2674 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2675 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2676 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2677 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2678 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); 2679 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2680 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2681 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2682 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2683 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2684 : Opc; 2685 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2686 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2687 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2688 unsigned AddOpc = 2689 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2690 ? AMDGPU::G_ADD 2691 : AMDGPU::G_UADDSAT; 2692 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2693 Register DstReg = MI.getOperand(0).getReg(); 2694 B.buildUMin(DstReg, X, Y); 2695 MI.eraseFromParent(); 2696 return; 2697 } 2698 case AMDGPU::G_SEXT: 2699 case AMDGPU::G_ZEXT: 2700 case AMDGPU::G_ANYEXT: { 2701 Register SrcReg = MI.getOperand(1).getReg(); 2702 LLT SrcTy = MRI.getType(SrcReg); 2703 const bool Signed = Opc == AMDGPU::G_SEXT; 2704 2705 assert(OpdMapper.getVRegs(1).empty()); 2706 2707 const RegisterBank *SrcBank = 2708 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2709 2710 Register DstReg = MI.getOperand(0).getReg(); 2711 LLT DstTy = MRI.getType(DstReg); 2712 if (DstTy.isScalar() && 2713 SrcBank != &AMDGPU::SGPRRegBank && 2714 SrcBank != &AMDGPU::VCCRegBank && 2715 // FIXME: Should handle any type that round to s64 when irregular 2716 // breakdowns supported. 
2717 DstTy.getSizeInBits() == 64 && 2718 SrcTy.getSizeInBits() <= 32) { 2719 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2720 2721 // Extend to 32-bit, and then extend the low half. 2722 if (Signed) { 2723 // TODO: Should really be buildSExtOrCopy 2724 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2725 } else if (Opc == AMDGPU::G_ZEXT) { 2726 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2727 } else { 2728 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2729 } 2730 2731 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2732 MRI.setRegBank(DstReg, *SrcBank); 2733 MI.eraseFromParent(); 2734 return; 2735 } 2736 2737 if (SrcTy != LLT::scalar(1)) 2738 return; 2739 2740 // It is not legal to have a legalization artifact with a VCC source. Rather 2741 // than introducing a copy, insert the select we would have to select the 2742 // copy to. 2743 if (SrcBank == &AMDGPU::VCCRegBank) { 2744 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2745 2746 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2747 2748 unsigned DstSize = DstTy.getSizeInBits(); 2749 // 64-bit select is SGPR only 2750 const bool UseSel64 = DstSize > 32 && 2751 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2752 2753 // TODO: Should s16 select be legal? 2754 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2755 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 2756 auto False = B.buildConstant(SelType, 0); 2757 2758 MRI.setRegBank(True.getReg(0), *DstBank); 2759 MRI.setRegBank(False.getReg(0), *DstBank); 2760 MRI.setRegBank(DstReg, *DstBank); 2761 2762 if (DstSize > 32) { 2763 B.buildSelect(DefRegs[0], SrcReg, True, False); 2764 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2765 } else if (DstSize < 32) { 2766 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2767 MRI.setRegBank(Sel.getReg(0), *DstBank); 2768 B.buildTrunc(DstReg, Sel); 2769 } else { 2770 B.buildSelect(DstReg, SrcReg, True, False); 2771 } 2772 2773 MI.eraseFromParent(); 2774 return; 2775 } 2776 2777 break; 2778 } 2779 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2780 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2781 2782 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2783 2784 Register DstReg = MI.getOperand(0).getReg(); 2785 Register SrcReg = MI.getOperand(1).getReg(); 2786 2787 const LLT S32 = LLT::scalar(32); 2788 LLT DstTy = MRI.getType(DstReg); 2789 LLT SrcTy = MRI.getType(SrcReg); 2790 2791 if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) 2792 return; 2793 2794 const ValueMapping &DstMapping 2795 = OpdMapper.getInstrMapping().getOperandMapping(0); 2796 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2797 const RegisterBank *SrcBank = 2798 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2799 const RegisterBank *IdxBank = 2800 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2801 2802 Register BaseIdxReg; 2803 unsigned ConstOffset; 2804 std::tie(BaseIdxReg, ConstOffset) = 2805 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2806 2807 // See if the index is an add of a constant which will be foldable by moving 2808 // the base register of the index later if this is going to be executed in a 2809 // waterfall loop. This is essentially to reassociate the add of a constant 2810 // with the readfirstlane. 2811 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2812 ConstOffset > 0 && 2813 ConstOffset < SrcTy.getNumElements(); 2814 2815 // Move the base register. 
We'll re-insert the add later. 2816 if (ShouldMoveIndexIntoLoop) 2817 MI.getOperand(2).setReg(BaseIdxReg); 2818 2819 // If this is a VGPR result only because the index was a VGPR result, the 2820 // actual indexing will be done on the SGPR source vector, which will 2821 // produce a scalar result. We need to copy to the VGPR result inside the 2822 // waterfall loop. 2823 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2824 SrcBank == &AMDGPU::SGPRRegBank; 2825 if (DstRegs.empty()) { 2826 applyDefaultMapping(OpdMapper); 2827 2828 executeInWaterfallLoop(B, MI, {2}); 2829 2830 if (NeedCopyToVGPR) { 2831 // We don't want a phi for this temporary reg. 2832 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2833 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2834 MI.getOperand(0).setReg(TmpReg); 2835 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2836 2837 // Use a v_mov_b32 here to make the exec dependency explicit. 2838 buildVCopy(B, DstReg, TmpReg); 2839 } 2840 2841 // Re-insert the constant offset add inside the waterfall loop. 2842 if (ShouldMoveIndexIntoLoop) 2843 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2844 2845 return; 2846 } 2847 2848 assert(DstTy.getSizeInBits() == 64); 2849 2850 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2851 2852 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2853 auto One = B.buildConstant(S32, 1); 2854 2855 MachineBasicBlock::iterator MII = MI.getIterator(); 2856 2857 // Split the vector index into 32-bit pieces. Prepare to move all of the 2858 // new instructions into a waterfall loop if necessary. 2859 // 2860 // Don't put the bitcast or constant in the loop. 2861 MachineInstrSpan Span(MII, &B.getMBB()); 2862 2863 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2864 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2865 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2866 2867 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2868 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2869 2870 MRI.setRegBank(DstReg, *DstBank); 2871 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2872 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2873 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2874 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2875 2876 SmallSet<Register, 4> OpsToWaterfall; 2877 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2878 MI.eraseFromParent(); 2879 return; 2880 } 2881 2882 // Remove the original instruction to avoid potentially confusing the 2883 // waterfall loop logic. 
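    // (executeInWaterfallLoop below will then wrap every instruction in Span
    // in a loop that readfirstlanes the divergent index; illustrative note.)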
2884 B.setInstr(*Span.begin()); 2885 MI.eraseFromParent(); 2886 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2887 OpsToWaterfall); 2888 2889 if (NeedCopyToVGPR) { 2890 MachineBasicBlock *LoopBB = Extract1->getParent(); 2891 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2892 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2893 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2894 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2895 2896 Extract0->getOperand(0).setReg(TmpReg0); 2897 Extract1->getOperand(0).setReg(TmpReg1); 2898 2899 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2900 2901 buildVCopy(B, DstRegs[0], TmpReg0); 2902 buildVCopy(B, DstRegs[1], TmpReg1); 2903 } 2904 2905 if (ShouldMoveIndexIntoLoop) 2906 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2907 2908 return; 2909 } 2910 case AMDGPU::G_INSERT_VECTOR_ELT: { 2911 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2912 2913 Register DstReg = MI.getOperand(0).getReg(); 2914 LLT VecTy = MRI.getType(DstReg); 2915 2916 assert(OpdMapper.getVRegs(0).empty()); 2917 assert(OpdMapper.getVRegs(3).empty()); 2918 2919 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2920 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2921 2922 if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) 2923 return; 2924 2925 const RegisterBank *IdxBank = 2926 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2927 2928 Register SrcReg = MI.getOperand(1).getReg(); 2929 Register InsReg = MI.getOperand(2).getReg(); 2930 LLT InsTy = MRI.getType(InsReg); 2931 (void)InsTy; 2932 2933 Register BaseIdxReg; 2934 unsigned ConstOffset; 2935 std::tie(BaseIdxReg, ConstOffset) = 2936 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2937 2938 // See if the index is an add of a constant which will be foldable by moving 2939 // the base register of the index later if this is going to be executed in a 2940 // waterfall loop. This is essentially to reassociate the add of a constant 2941 // with the readfirstlane. 2942 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2943 ConstOffset > 0 && 2944 ConstOffset < VecTy.getNumElements(); 2945 2946 // Move the base register. We'll re-insert the add later. 2947 if (ShouldMoveIndexIntoLoop) 2948 MI.getOperand(3).setReg(BaseIdxReg); 2949 2950 2951 if (InsRegs.empty()) { 2952 executeInWaterfallLoop(B, MI, {3}); 2953 2954 // Re-insert the constant offset add inside the waterfall loop. 2955 if (ShouldMoveIndexIntoLoop) { 2956 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2957 } 2958 2959 return; 2960 } 2961 2962 assert(InsTy.getSizeInBits() == 64); 2963 2964 const LLT S32 = LLT::scalar(32); 2965 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2966 2967 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2968 auto One = B.buildConstant(S32, 1); 2969 2970 // Split the vector index into 32-bit pieces. Prepare to move all of the 2971 // new instructions into a waterfall loop if necessary. 2972 // 2973 // Don't put the bitcast or constant in the loop. 2974 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2975 2976 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
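    // For example (illustrative), inserting an s64 element into <4 x s64> is
    // done on the <8 x s32> view of the vector: element i of the original
    // vector corresponds to 32-bit elements 2*i and 2*i+1 here.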
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);

    // The insertion point is now right after the waterfall loop, where the
    // original instruction used to be.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
3022 if (ShouldMoveIndexIntoLoop) 3023 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 3024 3025 return; 3026 } 3027 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3028 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3029 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3030 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3031 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3032 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3033 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 3034 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3035 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3036 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3037 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3038 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3039 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3040 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3041 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 3042 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3043 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 3044 applyDefaultMapping(OpdMapper); 3045 executeInWaterfallLoop(B, MI, {1, 4}); 3046 return; 3047 } 3048 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3049 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3050 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3051 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3052 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3053 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3054 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3055 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3056 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3057 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3058 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3059 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 3060 applyDefaultMapping(OpdMapper); 3061 executeInWaterfallLoop(B, MI, {2, 5}); 3062 return; 3063 } 3064 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3065 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3066 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3067 applyDefaultMapping(OpdMapper); 3068 executeInWaterfallLoop(B, MI, {2, 5}); 3069 return; 3070 } 3071 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 3072 applyDefaultMapping(OpdMapper); 3073 executeInWaterfallLoop(B, MI, {3, 6}); 3074 return; 3075 } 3076 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 3077 applyMappingSBufferLoad(B, OpdMapper); 3078 return; 3079 } 3080 case AMDGPU::G_INTRINSIC: 3081 case AMDGPU::G_INTRINSIC_CONVERGENT: { 3082 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3083 case Intrinsic::amdgcn_readlane: { 3084 substituteSimpleCopyRegs(OpdMapper, 2); 3085 3086 assert(OpdMapper.getVRegs(0).empty()); 3087 assert(OpdMapper.getVRegs(3).empty()); 3088 3089 // Make sure the index is an SGPR. It doesn't make sense to run this in a 3090 // waterfall loop, so assume it's a uniform value. 3091 constrainOpWithReadfirstlane(B, MI, 3); // Index 3092 return; 3093 } 3094 case Intrinsic::amdgcn_writelane: { 3095 assert(OpdMapper.getVRegs(0).empty()); 3096 assert(OpdMapper.getVRegs(2).empty()); 3097 assert(OpdMapper.getVRegs(3).empty()); 3098 3099 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 3100 constrainOpWithReadfirstlane(B, MI, 2); // Source value 3101 constrainOpWithReadfirstlane(B, MI, 3); // Index 3102 return; 3103 } 3104 case Intrinsic::amdgcn_interp_p1: 3105 case Intrinsic::amdgcn_interp_p2: 3106 case Intrinsic::amdgcn_interp_mov: 3107 case Intrinsic::amdgcn_interp_p1_f16: 3108 case Intrinsic::amdgcn_interp_p2_f16: 3109 case Intrinsic::amdgcn_lds_param_load: { 3110 applyDefaultMapping(OpdMapper); 3111 3112 // Readlane for m0 value, which is always the last operand. 3113 // FIXME: Should this be a waterfall loop instead? 
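      // (constrainOpWithReadfirstlane is a no-op when the operand is already
      // in the SGPR bank; otherwise it inserts a readfirstlane and rewrites
      // the operand to use it. Summary only; see its definition earlier in
      // this file.)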
3114 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3115 return;
3116 }
3117 case Intrinsic::amdgcn_interp_inreg_p10:
3118 case Intrinsic::amdgcn_interp_inreg_p2:
3119 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3120 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3121 applyDefaultMapping(OpdMapper);
3122 return;
3123 case Intrinsic::amdgcn_permlane16:
3124 case Intrinsic::amdgcn_permlanex16: {
3125 // Doing a waterfall loop over these wouldn't make any sense.
3126 substituteSimpleCopyRegs(OpdMapper, 2);
3127 substituteSimpleCopyRegs(OpdMapper, 3);
3128 constrainOpWithReadfirstlane(B, MI, 4);
3129 constrainOpWithReadfirstlane(B, MI, 5);
3130 return;
3131 }
3132 case Intrinsic::amdgcn_sbfe:
3133 applyMappingBFE(B, OpdMapper, true);
3134 return;
3135 case Intrinsic::amdgcn_ubfe:
3136 applyMappingBFE(B, OpdMapper, false);
3137 return;
3138 case Intrinsic::amdgcn_inverse_ballot:
3139 case Intrinsic::amdgcn_s_bitreplicate:
3140 case Intrinsic::amdgcn_s_quadmask:
3141 case Intrinsic::amdgcn_s_wqm:
3142 applyDefaultMapping(OpdMapper);
3143 constrainOpWithReadfirstlane(B, MI, 2); // Mask
3144 return;
3145 case Intrinsic::amdgcn_ballot:
3146 // Use default handling and insert copy to vcc source.
3147 break;
3148 }
3149 break;
3150 }
3151 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3152 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3153 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3154 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3155 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3156 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3157 assert(RSrcIntrin && RSrcIntrin->IsImage);
3158 // Non-images can have complications from operands that allow both SGPR
3159 // and VGPR. For now it's too complicated to figure out the final opcode
3160 // to derive the register bank from the MCInstrDesc.
3161 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3162 return;
3163 }
3164 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3165 unsigned N = MI.getNumExplicitOperands() - 2;
3166 applyDefaultMapping(OpdMapper);
3167 executeInWaterfallLoop(B, MI, {N});
3168 return;
3169 }
3170 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3171 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3172 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3173 switch (IntrID) {
3174 case Intrinsic::amdgcn_ds_ordered_add:
3175 case Intrinsic::amdgcn_ds_ordered_swap: {
3176 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3177 assert(OpdMapper.getVRegs(0).empty());
3178 substituteSimpleCopyRegs(OpdMapper, 3);
3179 constrainOpWithReadfirstlane(B, MI, 2); // M0
3180 return;
3181 }
3182 case Intrinsic::amdgcn_ds_gws_init:
3183 case Intrinsic::amdgcn_ds_gws_barrier:
3184 case Intrinsic::amdgcn_ds_gws_sema_br: {
3185 // Only the first lane executes, so readfirstlane is safe.
3186 substituteSimpleCopyRegs(OpdMapper, 1);
3187 constrainOpWithReadfirstlane(B, MI, 2); // M0
3188 return;
3189 }
3190 case Intrinsic::amdgcn_ds_gws_sema_v:
3191 case Intrinsic::amdgcn_ds_gws_sema_p:
3192 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3193 // Only the first lane executes, so readfirstlane is safe.
3194 constrainOpWithReadfirstlane(B, MI, 1); // M0
3195 return;
3196 }
3197 case Intrinsic::amdgcn_ds_append:
3198 case Intrinsic::amdgcn_ds_consume: {
3199 constrainOpWithReadfirstlane(B, MI, 2); // M0
3200 return;
3201 }
3202 case Intrinsic::amdgcn_s_sendmsg:
3203 case Intrinsic::amdgcn_s_sendmsghalt: {
3204 // FIXME: Should this use a waterfall loop?
3205 constrainOpWithReadfirstlane(B, MI, 2); // M0 3206 return; 3207 } 3208 case Intrinsic::amdgcn_s_setreg: { 3209 constrainOpWithReadfirstlane(B, MI, 2); 3210 return; 3211 } 3212 case Intrinsic::amdgcn_s_ttracedata: 3213 constrainOpWithReadfirstlane(B, MI, 1); // M0 3214 return; 3215 case Intrinsic::amdgcn_raw_buffer_load_lds: 3216 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { 3217 applyDefaultMapping(OpdMapper); 3218 constrainOpWithReadfirstlane(B, MI, 1); // rsrc 3219 constrainOpWithReadfirstlane(B, MI, 2); // M0 3220 constrainOpWithReadfirstlane(B, MI, 5); // soffset 3221 return; 3222 } 3223 case Intrinsic::amdgcn_struct_buffer_load_lds: 3224 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 3225 applyDefaultMapping(OpdMapper); 3226 constrainOpWithReadfirstlane(B, MI, 1); // rsrc 3227 constrainOpWithReadfirstlane(B, MI, 2); // M0 3228 constrainOpWithReadfirstlane(B, MI, 6); // soffset 3229 return; 3230 } 3231 case Intrinsic::amdgcn_global_load_lds: { 3232 applyDefaultMapping(OpdMapper); 3233 constrainOpWithReadfirstlane(B, MI, 2); 3234 return; 3235 } 3236 case Intrinsic::amdgcn_lds_direct_load: { 3237 applyDefaultMapping(OpdMapper); 3238 // Readlane for m0 value, which is always the last operand. 3239 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index 3240 return; 3241 } 3242 case Intrinsic::amdgcn_exp_row: 3243 applyDefaultMapping(OpdMapper); 3244 constrainOpWithReadfirstlane(B, MI, 8); // M0 3245 return; 3246 case Intrinsic::amdgcn_s_sleep_var: 3247 assert(OpdMapper.getVRegs(1).empty()); 3248 constrainOpWithReadfirstlane(B, MI, 1); 3249 return; 3250 case Intrinsic::amdgcn_s_barrier_signal_var: 3251 case Intrinsic::amdgcn_s_barrier_join: 3252 case Intrinsic::amdgcn_s_wakeup_barrier: 3253 constrainOpWithReadfirstlane(B, MI, 1); 3254 return; 3255 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: 3256 constrainOpWithReadfirstlane(B, MI, 2); 3257 return; 3258 case Intrinsic::amdgcn_s_barrier_init: 3259 constrainOpWithReadfirstlane(B, MI, 1); 3260 constrainOpWithReadfirstlane(B, MI, 2); 3261 return; 3262 case Intrinsic::amdgcn_s_get_barrier_state: { 3263 constrainOpWithReadfirstlane(B, MI, 2); 3264 return; 3265 } 3266 default: { 3267 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 3268 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 3269 // Non-images can have complications from operands that allow both SGPR 3270 // and VGPR. For now it's too complicated to figure out the final opcode 3271 // to derive the register bank from the MCInstrDesc. 3272 if (RSrcIntrin->IsImage) { 3273 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); 3274 return; 3275 } 3276 } 3277 3278 break; 3279 } 3280 } 3281 break; 3282 } 3283 case AMDGPU::G_SI_CALL: { 3284 // Use a set to avoid extra readfirstlanes in the case where multiple 3285 // operands are the same register. 3286 SmallSet<Register, 4> SGPROperandRegs; 3287 3288 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3289 break; 3290 3291 // Move all copies to physical SGPRs that are used by the call instruction 3292 // into the loop block. Start searching for these copies until the 3293 // ADJCALLSTACKUP. 3294 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3295 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3296 3297 // Move all non-copies before the copies, so that a complete range can be 3298 // moved into the waterfall loop. 3299 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3300 // Count of NonCopyInstrs found until the current LastCopy. 
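// Walking backwards from the call to the ADJCALLSTACKUP, we remember the
// earliest copy feeding the call (LastCopy) and stash every intervening
// non-copy so it can be hoisted above the copies. That leaves a contiguous
// copy+call range that executeInWaterfallLoop can move as a unit.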
3301 unsigned NonCopyInstrsLen = 0; 3302 MachineBasicBlock::iterator Start(&MI); 3303 MachineBasicBlock::iterator LastCopy = Start; 3304 MachineBasicBlock *MBB = MI.getParent(); 3305 const SIMachineFunctionInfo *Info = 3306 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3307 while (Start->getOpcode() != FrameSetupOpcode) { 3308 --Start; 3309 bool IsCopy = false; 3310 if (Start->getOpcode() == AMDGPU::COPY) { 3311 auto &Dst = Start->getOperand(0); 3312 if (Dst.isReg()) { 3313 Register Reg = Dst.getReg(); 3314 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3315 IsCopy = true; 3316 } else { 3317 // Also move the copy from the scratch rsrc descriptor into the loop 3318 // to allow it to be optimized away. 3319 auto &Src = Start->getOperand(1); 3320 if (Src.isReg()) { 3321 Reg = Src.getReg(); 3322 IsCopy = Info->getScratchRSrcReg() == Reg; 3323 } 3324 } 3325 } 3326 } 3327 3328 if (IsCopy) { 3329 LastCopy = Start; 3330 NonCopyInstrsLen = NonCopyInstrs.size(); 3331 } else { 3332 NonCopyInstrs.push_back(&*Start); 3333 } 3334 } 3335 NonCopyInstrs.resize(NonCopyInstrsLen); 3336 3337 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3338 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3339 } 3340 Start = LastCopy; 3341 3342 // Do the same for copies after the loop 3343 NonCopyInstrs.clear(); 3344 NonCopyInstrsLen = 0; 3345 MachineBasicBlock::iterator End(&MI); 3346 LastCopy = End; 3347 while (End->getOpcode() != FrameDestroyOpcode) { 3348 ++End; 3349 bool IsCopy = false; 3350 if (End->getOpcode() == AMDGPU::COPY) { 3351 auto &Src = End->getOperand(1); 3352 if (Src.isReg()) { 3353 Register Reg = Src.getReg(); 3354 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3355 } 3356 } 3357 3358 if (IsCopy) { 3359 LastCopy = End; 3360 NonCopyInstrsLen = NonCopyInstrs.size(); 3361 } else { 3362 NonCopyInstrs.push_back(&*End); 3363 } 3364 } 3365 NonCopyInstrs.resize(NonCopyInstrsLen); 3366 3367 End = LastCopy; 3368 ++LastCopy; 3369 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3370 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3371 } 3372 3373 ++End; 3374 B.setInsertPt(B.getMBB(), Start); 3375 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); 3376 break; 3377 } 3378 case AMDGPU::G_LOAD: 3379 case AMDGPU::G_ZEXTLOAD: 3380 case AMDGPU::G_SEXTLOAD: { 3381 if (applyMappingLoad(B, OpdMapper, MI)) 3382 return; 3383 break; 3384 } 3385 case AMDGPU::G_DYN_STACKALLOC: 3386 applyMappingDynStackAlloc(B, OpdMapper, MI); 3387 return; 3388 case AMDGPU::G_STACKRESTORE: { 3389 applyDefaultMapping(OpdMapper); 3390 constrainOpWithReadfirstlane(B, MI, 0); 3391 return; 3392 } 3393 case AMDGPU::G_SBFX: 3394 applyMappingBFE(B, OpdMapper, /*Signed*/ true); 3395 return; 3396 case AMDGPU::G_UBFX: 3397 applyMappingBFE(B, OpdMapper, /*Signed*/ false); 3398 return; 3399 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3400 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3401 applyMappingMAD_64_32(B, OpdMapper); 3402 return; 3403 case AMDGPU::G_PREFETCH: { 3404 if (!Subtarget.hasPrefetch()) { 3405 MI.eraseFromParent(); 3406 return; 3407 } 3408 Register PtrReg = MI.getOperand(0).getReg(); 3409 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); 3410 if (PtrBank == AMDGPU::VGPRRegBankID) { 3411 MI.eraseFromParent(); 3412 return; 3413 } 3414 unsigned AS = MRI.getType(PtrReg).getAddressSpace(); 3415 if (!AMDGPU::isFlatGlobalAddrSpace(AS) && 3416 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3417 MI.eraseFromParent(); 3418 return; 3419 } 3420 applyDefaultMapping(OpdMapper); 3421 return; 3422 } 3423 default: 3424 
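// Anything not handled above keeps its computed mapping; the
// applyDefaultMapping call below just rewrites the virtual registers to
// match it.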
break;
3425 }
3426
3427 return applyDefaultMapping(OpdMapper);
3428 }
3429
3430 // vgpr, sgpr -> vgpr
3431 // vgpr, agpr -> vgpr
3432 // agpr, agpr -> agpr
3433 // agpr, sgpr -> vgpr
3434 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3435 if (RB0 == AMDGPU::InvalidRegBankID)
3436 return RB1;
3437 if (RB1 == AMDGPU::InvalidRegBankID)
3438 return RB0;
3439
3440 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3441 return AMDGPU::SGPRRegBankID;
3442
3443 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3444 return AMDGPU::AGPRRegBankID;
3445
3446 return AMDGPU::VGPRRegBankID;
3447 }
3448
3449 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3450 if (RB0 == AMDGPU::InvalidRegBankID)
3451 return RB1;
3452 if (RB1 == AMDGPU::InvalidRegBankID)
3453 return RB0;
3454
3455 // vcc, vcc -> vcc
3456 // vcc, sgpr -> vcc
3457 // vcc, vgpr -> vcc
3458 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3459 return AMDGPU::VCCRegBankID;
3460
3461 // Neither bank is vcc at this point; use the normal bank union rules.
3462 return regBankUnion(RB0, RB1);
3463 }
3464
3465 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3466 const MachineInstr &MI) const {
3467 unsigned RegBank = AMDGPU::InvalidRegBankID;
3468
3469 for (const MachineOperand &MO : MI.operands()) {
3470 if (!MO.isReg())
3471 continue;
3472 Register Reg = MO.getReg();
3473 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3474 RegBank = regBankUnion(RegBank, Bank->getID());
3475 if (RegBank == AMDGPU::VGPRRegBankID)
3476 break;
3477 }
3478 }
3479
3480 return RegBank;
3481 }
3482
3483 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3484 const MachineFunction &MF = *MI.getParent()->getParent();
3485 const MachineRegisterInfo &MRI = MF.getRegInfo();
3486 for (const MachineOperand &MO : MI.operands()) {
3487 if (!MO.isReg())
3488 continue;
3489 Register Reg = MO.getReg();
3490 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3491 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3492 return false;
3493 }
3494 }
3495 return true;
3496 }
3497
3498 const RegisterBankInfo::InstructionMapping &
3499 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3500 const MachineFunction &MF = *MI.getParent()->getParent();
3501 const MachineRegisterInfo &MRI = MF.getRegInfo();
3502 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3503
3504 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3505 const MachineOperand &SrcOp = MI.getOperand(i);
3506 if (!SrcOp.isReg())
3507 continue;
3508
3509 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3510 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3511 }
3512 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3513 MI.getNumOperands());
3514 }
3515
3516 const RegisterBankInfo::InstructionMapping &
3517 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3518 const MachineFunction &MF = *MI.getParent()->getParent();
3519 const MachineRegisterInfo &MRI = MF.getRegInfo();
3520 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3521
3522 // Even though we technically could use SGPRs, this would require knowledge of
3523 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3524 //
3525 // TODO: Unary ops are trivially OK, so accept SGPRs?
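// Note: s1 operands are reported in the VCC bank rather than VGPR;
// everything else goes to VGPR.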
3526 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3527 const MachineOperand &Src = MI.getOperand(i);
3528 if (!Src.isReg())
3529 continue;
3530
3531 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3532 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3533 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3534 }
3535
3536 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3537 MI.getNumOperands());
3538 }
3539
3540 const RegisterBankInfo::InstructionMapping &
3541 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3542 const MachineFunction &MF = *MI.getParent()->getParent();
3543 const MachineRegisterInfo &MRI = MF.getRegInfo();
3544 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3545
3546 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3547 const MachineOperand &Op = MI.getOperand(I);
3548 if (!Op.isReg())
3549 continue;
3550
3551 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3552 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3553 }
3554
3555 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3556 MI.getNumOperands());
3557 }
3558
3559 const RegisterBankInfo::InstructionMapping &
3560 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3561 const MachineInstr &MI,
3562 int RsrcIdx) const {
3563 // The reported argument index is relative to the IR intrinsic call arguments,
3564 // so we need to shift by the number of defs and the intrinsic ID.
3565 RsrcIdx += MI.getNumExplicitDefs() + 1;
3566
3567 const int NumOps = MI.getNumOperands();
3568 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3569
3570 // TODO: Should packed/unpacked D16 difference be reported here as part of
3571 // the value mapping?
3572 for (int I = 0; I != NumOps; ++I) {
3573 if (!MI.getOperand(I).isReg())
3574 continue;
3575
3576 Register OpReg = MI.getOperand(I).getReg();
3577 // We replace some dead address operands with $noreg.
3578 if (!OpReg)
3579 continue;
3580
3581 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3582
3583 // FIXME: Probably need a new intrinsic register bank searchable table to
3584 // handle arbitrary intrinsics easily.
3585 //
3586 // If this has a sampler, it immediately follows rsrc.
3587 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3588
3589 if (MustBeSGPR) {
3590 // This must be an SGPR, but report whatever bank it currently has as legal.
3591 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3592 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3593 } else {
3594 // Some operands must be VGPR, and these are easy to copy to.
3595 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3596 }
3597 }
3598
3599 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3600 }
3601
3602 /// Return the mapping for a pointer argument.
3603 const RegisterBankInfo::ValueMapping *
3604 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3605 Register PtrReg) const {
3606 LLT PtrTy = MRI.getType(PtrReg);
3607 unsigned Size = PtrTy.getSizeInBits();
3608 if (Subtarget.useFlatForGlobal() ||
3609 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3610 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3611
3612 // If we're using MUBUF instructions for global memory, an SGPR base register
3613 // is possible. Otherwise this needs to be a VGPR.
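// Report whatever bank the pointer already has rather than forcing SGPR;
// selection can still use a VGPR address via MUBUF if that is what we get.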
3614 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3615 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3616 } 3617 3618 const RegisterBankInfo::InstructionMapping & 3619 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3620 3621 const MachineFunction &MF = *MI.getParent()->getParent(); 3622 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3623 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3624 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3625 Register PtrReg = MI.getOperand(1).getReg(); 3626 LLT PtrTy = MRI.getType(PtrReg); 3627 unsigned AS = PtrTy.getAddressSpace(); 3628 unsigned PtrSize = PtrTy.getSizeInBits(); 3629 3630 const ValueMapping *ValMapping; 3631 const ValueMapping *PtrMapping; 3632 3633 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3634 3635 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3636 if (isScalarLoadLegal(MI)) { 3637 // We have a uniform instruction so we want to use an SMRD load 3638 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3639 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3640 } else { 3641 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3642 3643 // If we're using MUBUF instructions for global memory, an SGPR base 3644 // register is possible. Otherwise this needs to be a VGPR. 3645 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 3646 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3647 3648 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3649 } 3650 } else { 3651 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3652 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3653 } 3654 3655 OpdsMapping[0] = ValMapping; 3656 OpdsMapping[1] = PtrMapping; 3657 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3658 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3659 return Mapping; 3660 3661 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3662 // handle that during instruction selection? 3663 } 3664 3665 unsigned 3666 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3667 const MachineRegisterInfo &MRI, 3668 unsigned Default) const { 3669 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3670 return Bank ? Bank->getID() : Default; 3671 } 3672 3673 const RegisterBankInfo::ValueMapping * 3674 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3675 const MachineRegisterInfo &MRI, 3676 const TargetRegisterInfo &TRI) const { 3677 // Lie and claim anything is legal, even though this needs to be an SGPR 3678 // applyMapping will have to deal with it as a waterfall loop. 
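// If the register has no bank yet, default to SGPR; otherwise report its
// current bank (possibly VGPR) and let applyMappingImpl fix it up with a
// readfirstlane or waterfall loop.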
3679 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3680 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3681 return AMDGPU::getValueMapping(Bank, Size);
3682 }
3683
3684 const RegisterBankInfo::ValueMapping *
3685 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3686 const MachineRegisterInfo &MRI,
3687 const TargetRegisterInfo &TRI) const {
3688 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3689 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3690 }
3691
3692 const RegisterBankInfo::ValueMapping *
3693 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3694 const MachineRegisterInfo &MRI,
3695 const TargetRegisterInfo &TRI) const {
3696 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3697 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3698 }
3699
3700 ///
3701 /// This function must return a legal mapping, because
3702 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3703 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3704 /// VGPR to SGPR copy to be generated is illegal.
3705 ///
3706 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3707 // legal. These will be dealt with in applyMappingImpl.
3708 //
3709 const RegisterBankInfo::InstructionMapping &
3710 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3711 const MachineFunction &MF = *MI.getParent()->getParent();
3712 const MachineRegisterInfo &MRI = MF.getRegInfo();
3713
3714 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3715 // The default logic wastes time analyzing impossible alternative mappings. We
3716 // want the most straightforward mapping, so just directly handle this.
3717 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3718 *TRI);
3719 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3720 *TRI);
3721 assert(SrcBank && "src bank should have been assigned already");
3722 if (!DstBank)
3723 DstBank = SrcBank;
3724
3725 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3726 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3727 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3728 return getInvalidInstructionMapping();
3729
3730 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3731 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3732 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3733 OpdsMapping[0] = &ValMap;
3734 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3735 OpdsMapping[1] = &ValMap;
3736
3737 return getInstructionMapping(
3738 1, /*Cost*/ 1,
3739 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3740 }
3741
3742 if (MI.isRegSequence()) {
3743 // If any input is a VGPR, the result must be a VGPR. The default handling
3744 // assumes any copy between banks is legal.
3745 unsigned BankID = AMDGPU::SGPRRegBankID;
3746
3747 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3748 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3749 // It doesn't make sense to use vcc or scc banks here, so just ignore
3750 // them.
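// Any input that is not known to be SGPR (including operands with no bank
// assigned yet) forces the whole REG_SEQUENCE result to VGPR.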
3751 if (OpBank != AMDGPU::SGPRRegBankID) { 3752 BankID = AMDGPU::VGPRRegBankID; 3753 break; 3754 } 3755 } 3756 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3757 3758 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3759 return getInstructionMapping( 3760 1, /*Cost*/ 1, 3761 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3762 } 3763 3764 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3765 // properly. 3766 // 3767 // TODO: There are additional exec masking dependencies to analyze. 3768 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3769 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3770 Register DstReg = MI.getOperand(0).getReg(); 3771 3772 // Sometimes the result may have already been assigned a bank. 3773 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3774 ResultBank = DstBank->getID(); 3775 3776 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3777 Register Reg = MI.getOperand(I).getReg(); 3778 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3779 3780 // FIXME: Assuming VGPR for any undetermined inputs. 3781 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3782 ResultBank = AMDGPU::VGPRRegBankID; 3783 break; 3784 } 3785 3786 // FIXME: Need to promote SGPR case to s32 3787 unsigned OpBank = Bank->getID(); 3788 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3789 } 3790 3791 assert(ResultBank != AMDGPU::InvalidRegBankID); 3792 3793 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3794 3795 const ValueMapping &ValMap = 3796 getValueMapping(0, Size, getRegBank(ResultBank)); 3797 return getInstructionMapping( 3798 1, /*Cost*/ 1, 3799 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3800 } 3801 3802 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3803 if (Mapping.isValid()) 3804 return Mapping; 3805 3806 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3807 3808 switch (MI.getOpcode()) { 3809 default: 3810 return getInvalidInstructionMapping(); 3811 3812 case AMDGPU::G_AND: 3813 case AMDGPU::G_OR: 3814 case AMDGPU::G_XOR: 3815 case AMDGPU::G_MUL: { 3816 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3817 if (Size == 1) { 3818 const RegisterBank *DstBank 3819 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3820 3821 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3822 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3823 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3824 if (DstBank) { 3825 TargetBankID = DstBank->getID(); 3826 if (DstBank == &AMDGPU::VCCRegBank) { 3827 TargetBankID = AMDGPU::VCCRegBankID; 3828 BankLHS = AMDGPU::VCCRegBankID; 3829 BankRHS = AMDGPU::VCCRegBankID; 3830 } else { 3831 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3832 AMDGPU::SGPRRegBankID); 3833 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3834 AMDGPU::SGPRRegBankID); 3835 } 3836 } else { 3837 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3838 AMDGPU::VCCRegBankID); 3839 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3840 AMDGPU::VCCRegBankID); 3841 3842 // Both inputs should be true booleans to produce a boolean result. 
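// Resolution order below: any VGPR input makes the result VGPR; otherwise
// any VCC input makes everything VCC; only an all-SGPR case stays SGPR.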
3843 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3844 TargetBankID = AMDGPU::VGPRRegBankID; 3845 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3846 TargetBankID = AMDGPU::VCCRegBankID; 3847 BankLHS = AMDGPU::VCCRegBankID; 3848 BankRHS = AMDGPU::VCCRegBankID; 3849 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3850 TargetBankID = AMDGPU::SGPRRegBankID; 3851 } 3852 } 3853 3854 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3855 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3856 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3857 break; 3858 } 3859 3860 if (Size == 64) { 3861 3862 if (isSALUMapping(MI)) { 3863 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3864 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3865 } else { 3866 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3867 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3868 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3869 3870 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3871 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3872 } 3873 3874 break; 3875 } 3876 3877 [[fallthrough]]; 3878 } 3879 case AMDGPU::G_PTR_ADD: 3880 case AMDGPU::G_PTRMASK: 3881 case AMDGPU::G_ADD: 3882 case AMDGPU::G_SUB: 3883 case AMDGPU::G_SHL: 3884 case AMDGPU::G_LSHR: 3885 case AMDGPU::G_ASHR: 3886 case AMDGPU::G_UADDO: 3887 case AMDGPU::G_USUBO: 3888 case AMDGPU::G_UADDE: 3889 case AMDGPU::G_SADDE: 3890 case AMDGPU::G_USUBE: 3891 case AMDGPU::G_SSUBE: 3892 case AMDGPU::G_SMIN: 3893 case AMDGPU::G_SMAX: 3894 case AMDGPU::G_UMIN: 3895 case AMDGPU::G_UMAX: 3896 case AMDGPU::G_ABS: 3897 case AMDGPU::G_SHUFFLE_VECTOR: 3898 case AMDGPU::G_SBFX: 3899 case AMDGPU::G_UBFX: 3900 case AMDGPU::G_AMDGPU_S_MUL_I64_I32: 3901 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: 3902 if (isSALUMapping(MI)) 3903 return getDefaultMappingSOP(MI); 3904 return getDefaultMappingVOP(MI); 3905 case AMDGPU::G_FADD: 3906 case AMDGPU::G_FSUB: 3907 case AMDGPU::G_FMUL: 3908 case AMDGPU::G_FMA: 3909 case AMDGPU::G_FFLOOR: 3910 case AMDGPU::G_FCEIL: 3911 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 3912 case AMDGPU::G_FMINNUM: 3913 case AMDGPU::G_FMAXNUM: 3914 case AMDGPU::G_FMINIMUM: 3915 case AMDGPU::G_FMAXIMUM: 3916 case AMDGPU::G_INTRINSIC_TRUNC: 3917 case AMDGPU::G_STRICT_FADD: 3918 case AMDGPU::G_STRICT_FSUB: 3919 case AMDGPU::G_STRICT_FMUL: 3920 case AMDGPU::G_STRICT_FMA: { 3921 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3922 unsigned Size = Ty.getSizeInBits(); 3923 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && 3924 (Size == 32 || Size == 16) && isSALUMapping(MI)) 3925 return getDefaultMappingSOP(MI); 3926 return getDefaultMappingVOP(MI); 3927 } 3928 case AMDGPU::G_FPTOSI: 3929 case AMDGPU::G_FPTOUI: 3930 case AMDGPU::G_SITOFP: 3931 case AMDGPU::G_UITOFP: { 3932 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3933 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3934 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && 3935 isSALUMapping(MI)) 3936 return getDefaultMappingSOP(MI); 3937 return getDefaultMappingVOP(MI); 3938 } 3939 case AMDGPU::G_FPTRUNC: 3940 case AMDGPU::G_FPEXT: { 3941 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3942 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3943 if 
(Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && 3944 isSALUMapping(MI)) 3945 return getDefaultMappingSOP(MI); 3946 return getDefaultMappingVOP(MI); 3947 } 3948 case AMDGPU::G_FSQRT: 3949 case AMDGPU::G_FEXP2: 3950 case AMDGPU::G_FLOG2: { 3951 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3952 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 3953 isSALUMapping(MI)) 3954 return getDefaultMappingSOP(MI); 3955 return getDefaultMappingVOP(MI); 3956 } 3957 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3958 case AMDGPU::G_SSUBSAT: 3959 case AMDGPU::G_UADDSAT: 3960 case AMDGPU::G_USUBSAT: 3961 case AMDGPU::G_FMAD: 3962 case AMDGPU::G_FLDEXP: 3963 case AMDGPU::G_FMINNUM_IEEE: 3964 case AMDGPU::G_FMAXNUM_IEEE: 3965 case AMDGPU::G_FCANONICALIZE: 3966 case AMDGPU::G_STRICT_FLDEXP: 3967 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 3968 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3969 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3970 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3971 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3972 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3973 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3974 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3975 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3976 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3977 case AMDGPU::G_AMDGPU_SMED3: 3978 case AMDGPU::G_AMDGPU_FMED3: 3979 return getDefaultMappingVOP(MI); 3980 case AMDGPU::G_UMULH: 3981 case AMDGPU::G_SMULH: { 3982 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3983 return getDefaultMappingSOP(MI); 3984 return getDefaultMappingVOP(MI); 3985 } 3986 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3987 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 3988 // Three possible mappings: 3989 // 3990 // - Default SOP 3991 // - Default VOP 3992 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 3993 // 3994 // This allows instruction selection to keep the multiplication part of the 3995 // instruction on the SALU. 3996 bool AllSalu = true; 3997 bool MulSalu = true; 3998 for (unsigned i = 0; i < 5; ++i) { 3999 Register Reg = MI.getOperand(i).getReg(); 4000 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 4001 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 4002 AllSalu = false; 4003 if (i == 2 || i == 3) { 4004 MulSalu = false; 4005 break; 4006 } 4007 } 4008 } 4009 } 4010 4011 if (AllSalu) 4012 return getDefaultMappingSOP(MI); 4013 4014 // If the multiply-add is full-rate in VALU, use that even if the 4015 // multiplication part is scalar. Accumulating separately on the VALU would 4016 // take two instructions. 4017 if (!MulSalu || Subtarget.hasFullRate64Ops()) 4018 return getDefaultMappingVOP(MI); 4019 4020 // Keep the multiplication on the SALU, then accumulate on the VALU. 
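// Mixed mapping: 64-bit VGPR result and carry-out in VCC, 32-bit SGPR
// multiply operands, and a 64-bit VGPR accumulator.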
4021 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4022 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4023 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4024 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4025 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 4026 break; 4027 } 4028 case AMDGPU::G_IMPLICIT_DEF: { 4029 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4030 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4031 break; 4032 } 4033 case AMDGPU::G_FCONSTANT: 4034 case AMDGPU::G_CONSTANT: 4035 case AMDGPU::G_GLOBAL_VALUE: 4036 case AMDGPU::G_BLOCK_ADDR: 4037 case AMDGPU::G_READCYCLECOUNTER: { 4038 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4039 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4040 break; 4041 } 4042 case AMDGPU::G_FRAME_INDEX: { 4043 // TODO: This should be the same as other constants, but eliminateFrameIndex 4044 // currently assumes VALU uses. 4045 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4046 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4047 break; 4048 } 4049 case AMDGPU::G_DYN_STACKALLOC: { 4050 // Result is always uniform, and a wave reduction is needed for the source. 4051 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4052 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4053 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 4054 break; 4055 } 4056 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 4057 // This case is weird because we expect a physical register in the source, 4058 // but need to set a bank anyway. 4059 // 4060 // TODO: We could select the result to SGPR or VGPR 4061 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4062 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 4063 break; 4064 } 4065 case AMDGPU::G_INSERT: { 4066 unsigned BankID = getMappingType(MRI, MI); 4067 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4068 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4069 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 4070 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4071 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4072 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 4073 OpdsMapping[3] = nullptr; 4074 break; 4075 } 4076 case AMDGPU::G_EXTRACT: { 4077 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4078 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4079 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4080 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 4081 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 4082 OpdsMapping[2] = nullptr; 4083 break; 4084 } 4085 case AMDGPU::G_BUILD_VECTOR: 4086 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 4087 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 4088 if (DstTy == LLT::fixed_vector(2, 16)) { 4089 unsigned DstSize = DstTy.getSizeInBits(); 4090 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4091 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4092 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4093 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 4094 4095 OpdsMapping[0] = 
AMDGPU::getValueMapping(DstBankID, DstSize); 4096 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 4097 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 4098 break; 4099 } 4100 4101 [[fallthrough]]; 4102 } 4103 case AMDGPU::G_MERGE_VALUES: 4104 case AMDGPU::G_CONCAT_VECTORS: { 4105 unsigned Bank = getMappingType(MRI, MI); 4106 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4107 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4108 4109 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4110 // Op1 and Dst should use the same register bank. 4111 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 4112 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 4113 break; 4114 } 4115 case AMDGPU::G_BITREVERSE: 4116 case AMDGPU::G_BITCAST: 4117 case AMDGPU::G_INTTOPTR: 4118 case AMDGPU::G_PTRTOINT: 4119 case AMDGPU::G_FABS: 4120 case AMDGPU::G_FNEG: { 4121 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4122 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4123 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4124 break; 4125 } 4126 case AMDGPU::G_AMDGPU_FFBH_U32: 4127 case AMDGPU::G_AMDGPU_FFBL_B32: 4128 case AMDGPU::G_CTLZ_ZERO_UNDEF: 4129 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 4130 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4131 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4132 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 4133 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 4134 break; 4135 } 4136 case AMDGPU::G_CTPOP: { 4137 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4138 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4139 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 4140 4141 // This should really be getValueMappingSGPR64Only, but allowing the generic 4142 // code to handle the register split just makes using LegalizerHelper more 4143 // difficult. 4144 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 4145 break; 4146 } 4147 case AMDGPU::G_TRUNC: { 4148 Register Dst = MI.getOperand(0).getReg(); 4149 Register Src = MI.getOperand(1).getReg(); 4150 unsigned Bank = getRegBankID(Src, MRI); 4151 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4152 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4153 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 4154 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 4155 break; 4156 } 4157 case AMDGPU::G_ZEXT: 4158 case AMDGPU::G_SEXT: 4159 case AMDGPU::G_ANYEXT: 4160 case AMDGPU::G_SEXT_INREG: { 4161 Register Dst = MI.getOperand(0).getReg(); 4162 Register Src = MI.getOperand(1).getReg(); 4163 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 4164 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 4165 4166 unsigned DstBank; 4167 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 4168 assert(SrcBank); 4169 switch (SrcBank->getID()) { 4170 case AMDGPU::SGPRRegBankID: 4171 DstBank = AMDGPU::SGPRRegBankID; 4172 break; 4173 default: 4174 DstBank = AMDGPU::VGPRRegBankID; 4175 break; 4176 } 4177 4178 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 4179 // 32-bits, and then to 64. 
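// So the SGPR side is reported as one 64-bit piece, while a 64-bit VGPR
// result is presumably broken into two 32-bit halves by
// getValueMappingSGPR64Only.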
4180 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 4181 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 4182 SrcSize); 4183 break; 4184 } 4185 case AMDGPU::G_IS_FPCLASS: { 4186 Register SrcReg = MI.getOperand(1).getReg(); 4187 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4188 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4189 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4190 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4191 break; 4192 } 4193 case AMDGPU::G_STORE: { 4194 assert(MI.getOperand(0).isReg()); 4195 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4196 4197 // FIXME: We need to specify a different reg bank once scalar stores are 4198 // supported. 4199 const ValueMapping *ValMapping = 4200 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4201 OpdsMapping[0] = ValMapping; 4202 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4203 break; 4204 } 4205 case AMDGPU::G_ICMP: 4206 case AMDGPU::G_FCMP: { 4207 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4208 4209 // See if the result register has already been constrained to vcc, which may 4210 // happen due to control flow intrinsic lowering. 4211 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4212 AMDGPU::SGPRRegBankID); 4213 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4214 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 4215 4216 auto canUseSCCICMP = [&]() { 4217 auto Pred = 4218 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 4219 return Size == 32 || 4220 (Size == 64 && 4221 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 4222 Subtarget.hasScalarCompareEq64()); 4223 }; 4224 auto canUseSCCFCMP = [&]() { 4225 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); 4226 }; 4227 4228 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; 4229 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 4230 Op2Bank == AMDGPU::SGPRRegBankID && 4231 Op3Bank == AMDGPU::SGPRRegBankID && 4232 (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); 4233 4234 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4235 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4236 4237 // TODO: Use 32-bit for scalar output size. 4238 // SCC results will need to be copied to a 32-bit SGPR virtual register. 4239 const unsigned ResultSize = 1; 4240 4241 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 4242 OpdsMapping[1] = nullptr; // Predicate Operand. 4243 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 4244 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 4245 break; 4246 } 4247 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 4248 // VGPR index can be used for waterfall when indexing a SGPR vector. 
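// The result bank is the union of the vector and index banks: an SGPR
// vector indexed by a VGPR still produces a VGPR result (via a waterfall
// loop in applyMappingImpl).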
4249 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4250 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4251 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4252 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4253 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4254 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4255
4256 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4257 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4258
4259 // The index can be in either bank if the source vector is VGPR.
4260 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4261 break;
4262 }
4263 case AMDGPU::G_INSERT_VECTOR_ELT: {
4264 unsigned OutputBankID = isSALUMapping(MI) ?
4265 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4266
4267 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4268 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4269 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4270 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4271 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4272
4273 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4274 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4275
4276 // This is a weird case, because we need to break down the mapping based on
4277 // the register bank of a different operand.
4278 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4279 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4280 InsertSize);
4281 } else {
4282 assert(InsertSize == 32 || InsertSize == 64);
4283 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4284 }
4285
4286 // The index can be in either bank if the source vector is VGPR.
4287 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4288 break;
4289 }
4290 case AMDGPU::G_UNMERGE_VALUES: {
4291 unsigned Bank = getMappingType(MRI, MI);
4292
4293 // Op1 and Dst should use the same register bank.
4294 // FIXME: Shouldn't this be the default? Why do we need to handle this?
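// All defs and the source get the common bank computed by getMappingType
// above, sized per operand.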
4295 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4296 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 4297 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 4298 } 4299 break; 4300 } 4301 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 4302 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4303 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4304 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4305 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4306 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 4307 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 4308 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 4309 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 4310 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 4311 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 4312 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 4313 case AMDGPU::G_AMDGPU_BUFFER_STORE: 4314 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 4315 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 4316 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 4317 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 4318 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4319 4320 // rsrc 4321 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4322 4323 // vindex 4324 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4325 4326 // voffset 4327 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4328 4329 // soffset 4330 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4331 4332 // Any remaining operands are immediates and were correctly null 4333 // initialized. 4334 break; 4335 } 4336 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 4337 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 4338 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 4339 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 4340 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 4341 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 4342 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 4343 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 4344 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 4345 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 4346 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 4347 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 4348 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 4349 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 4350 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 4351 // vdata_out 4352 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4353 4354 // vdata_in 4355 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4356 4357 // rsrc 4358 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4359 4360 // vindex 4361 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4362 4363 // voffset 4364 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4365 4366 // soffset 4367 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4368 4369 // Any remaining operands are immediates and were correctly null 4370 // initialized. 
4371 break;
4372 }
4373 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4374 // vdata_out
4375 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4376
4377 // vdata_in
4378 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4379
4380 // cmp
4381 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4382
4383 // rsrc
4384 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4385
4386 // vindex
4387 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4388
4389 // voffset
4390 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4391
4392 // soffset
4393 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4394
4395 // Any remaining operands are immediates and were correctly null
4396 // initialized.
4397 break;
4398 }
4399 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4400 // Lie and claim everything is legal, even though some need to be
4401 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4402 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4403 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4404
4405 // We need to convert this to a MUBUF if either the resource or offset is
4406 // VGPR.
4407 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4408 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4409 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4410
4411 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4412 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4413 break;
4414 }
4415 case AMDGPU::G_INTRINSIC:
4416 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4417 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4418 default:
4419 return getInvalidInstructionMapping();
4420 case Intrinsic::amdgcn_div_fmas:
4421 case Intrinsic::amdgcn_div_fixup:
4422 case Intrinsic::amdgcn_trig_preop:
4423 case Intrinsic::amdgcn_sin:
4424 case Intrinsic::amdgcn_cos:
4425 case Intrinsic::amdgcn_log_clamp:
4426 case Intrinsic::amdgcn_rcp_legacy:
4427 case Intrinsic::amdgcn_rsq_legacy:
4428 case Intrinsic::amdgcn_rsq_clamp:
4429 case Intrinsic::amdgcn_fmul_legacy:
4430 case Intrinsic::amdgcn_fma_legacy:
4431 case Intrinsic::amdgcn_frexp_mant:
4432 case Intrinsic::amdgcn_frexp_exp:
4433 case Intrinsic::amdgcn_fract:
4434 case Intrinsic::amdgcn_cvt_pknorm_i16:
4435 case Intrinsic::amdgcn_cvt_pknorm_u16:
4436 case Intrinsic::amdgcn_cvt_pk_i16:
4437 case Intrinsic::amdgcn_cvt_pk_u16:
4438 case Intrinsic::amdgcn_fmed3:
4439 case Intrinsic::amdgcn_cubeid:
4440 case Intrinsic::amdgcn_cubema:
4441 case Intrinsic::amdgcn_cubesc:
4442 case Intrinsic::amdgcn_cubetc:
4443 case Intrinsic::amdgcn_sffbh:
4444 case Intrinsic::amdgcn_fmad_ftz:
4445 case Intrinsic::amdgcn_mbcnt_lo:
4446 case Intrinsic::amdgcn_mbcnt_hi:
4447 case Intrinsic::amdgcn_mul_u24:
4448 case Intrinsic::amdgcn_mul_i24:
4449 case Intrinsic::amdgcn_mulhi_u24:
4450 case Intrinsic::amdgcn_mulhi_i24:
4451 case Intrinsic::amdgcn_lerp:
4452 case Intrinsic::amdgcn_sad_u8:
4453 case Intrinsic::amdgcn_msad_u8:
4454 case Intrinsic::amdgcn_sad_hi_u8:
4455 case Intrinsic::amdgcn_sad_u16:
4456 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4457 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4458 case Intrinsic::amdgcn_mqsad_u32_u8:
4459 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4460 case Intrinsic::amdgcn_alignbyte:
4461 case Intrinsic::amdgcn_perm:
4462 case Intrinsic::amdgcn_fdot2:
4463 case
Intrinsic::amdgcn_sdot2: 4464 case Intrinsic::amdgcn_udot2: 4465 case Intrinsic::amdgcn_sdot4: 4466 case Intrinsic::amdgcn_udot4: 4467 case Intrinsic::amdgcn_sdot8: 4468 case Intrinsic::amdgcn_udot8: 4469 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4470 case Intrinsic::amdgcn_fdot2_f16_f16: 4471 case Intrinsic::amdgcn_fdot2_f32_bf16: 4472 case Intrinsic::amdgcn_sudot4: 4473 case Intrinsic::amdgcn_sudot8: 4474 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4475 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4476 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: 4477 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: 4478 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4479 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4480 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4481 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4482 return getDefaultMappingVOP(MI); 4483 case Intrinsic::amdgcn_log: 4484 case Intrinsic::amdgcn_exp2: 4485 case Intrinsic::amdgcn_rcp: 4486 case Intrinsic::amdgcn_rsq: 4487 case Intrinsic::amdgcn_sqrt: { 4488 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4489 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && 4490 isSALUMapping(MI)) 4491 return getDefaultMappingSOP(MI); 4492 return getDefaultMappingVOP(MI); 4493 } 4494 case Intrinsic::amdgcn_sbfe: 4495 case Intrinsic::amdgcn_ubfe: 4496 if (isSALUMapping(MI)) 4497 return getDefaultMappingSOP(MI); 4498 return getDefaultMappingVOP(MI); 4499 case Intrinsic::amdgcn_ds_swizzle: 4500 case Intrinsic::amdgcn_ds_permute: 4501 case Intrinsic::amdgcn_ds_bpermute: 4502 case Intrinsic::amdgcn_update_dpp: 4503 case Intrinsic::amdgcn_mov_dpp8: 4504 case Intrinsic::amdgcn_mov_dpp: 4505 case Intrinsic::amdgcn_strict_wwm: 4506 case Intrinsic::amdgcn_wwm: 4507 case Intrinsic::amdgcn_strict_wqm: 4508 case Intrinsic::amdgcn_wqm: 4509 case Intrinsic::amdgcn_softwqm: 4510 case Intrinsic::amdgcn_set_inactive: 4511 case Intrinsic::amdgcn_set_inactive_chain_arg: 4512 case Intrinsic::amdgcn_permlane64: 4513 return getDefaultMappingAllVGPR(MI); 4514 case Intrinsic::amdgcn_cvt_pkrtz: 4515 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) 4516 return getDefaultMappingSOP(MI); 4517 return getDefaultMappingVOP(MI); 4518 case Intrinsic::amdgcn_kernarg_segment_ptr: 4519 case Intrinsic::amdgcn_s_getpc: 4520 case Intrinsic::amdgcn_groupstaticsize: 4521 case Intrinsic::amdgcn_reloc_constant: 4522 case Intrinsic::returnaddress: { 4523 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4524 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4525 break; 4526 } 4527 case Intrinsic::amdgcn_wqm_vote: { 4528 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4529 OpdsMapping[0] = OpdsMapping[2] 4530 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4531 break; 4532 } 4533 case Intrinsic::amdgcn_ps_live: { 4534 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4535 break; 4536 } 4537 case Intrinsic::amdgcn_div_scale: { 4538 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4539 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4540 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4541 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4542 4543 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4544 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4545 OpdsMapping[4] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4546 break; 4547 } 4548 case Intrinsic::amdgcn_class: { 4549 Register Src0Reg = MI.getOperand(2).getReg(); 4550 Register Src1Reg = MI.getOperand(3).getReg(); 4551 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4552 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4553 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4554 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4555 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4556 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4557 break; 4558 } 4559 case Intrinsic::amdgcn_icmp: 4560 case Intrinsic::amdgcn_fcmp: { 4561 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4562 // This is not VCCRegBank because this is not used in boolean contexts. 4563 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4564 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4565 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4566 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4567 break; 4568 } 4569 case Intrinsic::amdgcn_readlane: { 4570 // This must be an SGPR, but accept a VGPR. 4571 Register IdxReg = MI.getOperand(3).getReg(); 4572 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4573 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4574 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4575 [[fallthrough]]; 4576 } 4577 case Intrinsic::amdgcn_readfirstlane: { 4578 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4579 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4580 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4581 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4582 break; 4583 } 4584 case Intrinsic::amdgcn_writelane: { 4585 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4586 Register SrcReg = MI.getOperand(2).getReg(); 4587 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4588 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4589 Register IdxReg = MI.getOperand(3).getReg(); 4590 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4591 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4592 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4593 4594 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4595 // to legalize. 
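// Report the current banks (defaulting to SGPR when unset); if either ends
// up in a VGPR, applyMappingImpl constrains it with a readfirstlane.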
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_permlane16_var:
    case Intrinsic::amdgcn_permlanex16_var: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
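      // (The trailing operand is ultimately copied into M0, which is why it
      // has to end up scalar before selection.)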
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_inverse_ballot: {
      // This must be an SGPR, but accept a VGPR.
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_s_quadmask:
    case Intrinsic::amdgcn_s_wqm: {
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_wave_reduce_umin:
    case Intrinsic::amdgcn_wave_reduce_umax: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      auto regBankID =
          isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_s_bitreplicate:
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = AMDGPU::getIntrinsicID(MI);
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
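    // getImageMapping keeps the descriptor (rsrc/sampler) operands in SGPRs
    // and uses VGPRs for the remaining address/data operands (see the helper
    // for the exact per-operand assignment).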
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap:
    case Intrinsic::amdgcn_ds_fadd_v2bf16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
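      // The four 32-bit export source components all go in VGPRs.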
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_ttracedata: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
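    // Buffer stores: vdata and voffset go in VGPRs; the resource descriptor
    // and soffset are SGPRs (and are expected to be waterfalled later if they
    // turn out to be divergent).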
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
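      // (Same pattern as ds_gws_init above: whatever bank the operand has is
      // recorded here and fixed up when the mapping is applied.)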
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }
    case Intrinsic::amdgcn_s_sleep_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
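    // A select only stays on the SALU if both value operands and the condition
    // are scalar; any VGPR input forces the values to VGPRs and the condition
    // to the VCC bank.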
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
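    // The condition is an s1. A scalar condition stays on the SGPR bank (it
    // will be selected to branch on SCC); anything else is treated as a lane
    // mask in the VCC bank.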
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}