1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the RegisterBankInfo class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPURegisterBankInfo.h" 15 #include "AMDGPUInstrInfo.h" 16 #include "AMDGPUSubtarget.h" 17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 18 #include "SIMachineFunctionInfo.h" 19 #include "SIRegisterInfo.h" 20 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 23 #include "llvm/CodeGen/GlobalISel/RegisterBank.h" 24 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 25 #include "llvm/CodeGen/TargetRegisterInfo.h" 26 #include "llvm/CodeGen/TargetSubtargetInfo.h" 27 #include "llvm/IR/Constants.h" 28 29 #define GET_TARGET_REGBANK_IMPL 30 #include "AMDGPUGenRegisterBank.inc" 31 32 // This file will be TableGen'ed at some point. 33 #include "AMDGPUGenRegisterBankInfo.def" 34 35 using namespace llvm; 36 using namespace MIPatternMatch; 37 38 namespace { 39 40 // Observer to apply a register bank to new registers created by LegalizerHelper. 41 class ApplyRegBankMapping final : public GISelChangeObserver { 42 private: 43 MachineRegisterInfo &MRI; 44 const RegisterBank *NewBank; 45 SmallVector<MachineInstr *, 4> NewInsts; 46 47 public: 48 ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB) 49 : MRI(MRI_), NewBank(RB) {} 50 51 ~ApplyRegBankMapping() { 52 for (MachineInstr *MI : NewInsts) 53 applyBank(*MI); 54 } 55 56 /// Set any registers that don't have a set register class or bank to SALU. 57 void applyBank(MachineInstr &MI) { 58 for (MachineOperand &Op : MI.operands()) { 59 if (!Op.isReg()) 60 continue; 61 62 Register Reg = Op.getReg(); 63 if (MRI.getRegClassOrRegBank(Reg)) 64 continue; 65 66 const RegisterBank *RB = NewBank; 67 // FIXME: This might not be enough to detect when SCC should be used. 68 if (MRI.getType(Reg) == LLT::scalar(1)) 69 RB = (NewBank == &AMDGPU::SGPRRegBank ? 70 &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank); 71 72 MRI.setRegBank(Reg, *RB); 73 } 74 } 75 76 void erasingInstr(MachineInstr &MI) override {} 77 78 void createdInstr(MachineInstr &MI) override { 79 // At this point, the instruction was just inserted and has no operands. 80 NewInsts.push_back(&MI); 81 } 82 83 void changingInstr(MachineInstr &MI) override {} 84 void changedInstr(MachineInstr &MI) override {} 85 }; 86 87 } 88 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) 89 : AMDGPUGenRegisterBankInfo(), 90 Subtarget(ST), 91 TRI(Subtarget.getRegisterInfo()), 92 TII(Subtarget.getInstrInfo()) { 93 94 // HACK: Until this is fully tablegen'd. 
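// Note: the register banks are shared, TableGen'erated global objects, so the
// assertions below only need to run once per process; hence the static guard
// that follows.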
95 static bool AlreadyInit = false; 96 if (AlreadyInit) 97 return; 98 99 AlreadyInit = true; 100 101 const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); 102 (void)RBSGPR; 103 assert(&RBSGPR == &AMDGPU::SGPRRegBank); 104 105 const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); 106 (void)RBVGPR; 107 assert(&RBVGPR == &AMDGPU::VGPRRegBank); 108 109 } 110 111 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, 112 const RegisterBank &Src, 113 unsigned Size) const { 114 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? 115 if (Dst.getID() == AMDGPU::SGPRRegBankID && 116 Src.getID() == AMDGPU::VGPRRegBankID) { 117 return std::numeric_limits<unsigned>::max(); 118 } 119 120 // Bool values are tricky, because the meaning is based on context. The SCC 121 // and VCC banks are for the natural scalar and vector conditions produced by 122 // a compare. 123 // 124 // Legalization doesn't know about the necessary context, so an s1 use may 125 // have been a truncate from an arbitrary value, in which case a copy (lowered 126 // as a compare with 0) needs to be inserted. 127 if (Size == 1 && 128 (Dst.getID() == AMDGPU::SCCRegBankID || 129 Dst.getID() == AMDGPU::SGPRRegBankID) && 130 (Src.getID() == AMDGPU::SGPRRegBankID || 131 Src.getID() == AMDGPU::VGPRRegBankID || 132 Src.getID() == AMDGPU::VCCRegBankID)) 133 return std::numeric_limits<unsigned>::max(); 134 135 if (Dst.getID() == AMDGPU::SCCRegBankID && 136 Src.getID() == AMDGPU::VCCRegBankID) 137 return std::numeric_limits<unsigned>::max(); 138 139 return RegisterBankInfo::copyCost(Dst, Src, Size); 140 } 141 142 unsigned AMDGPURegisterBankInfo::getBreakDownCost( 143 const ValueMapping &ValMapping, 144 const RegisterBank *CurBank) const { 145 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to 146 // VGPR. 147 // FIXME: Is there a better way to do this? 148 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) 149 return 10; // This is expensive. 150 151 assert(ValMapping.NumBreakDowns == 2 && 152 ValMapping.BreakDown[0].Length == 32 && 153 ValMapping.BreakDown[0].StartIdx == 0 && 154 ValMapping.BreakDown[1].Length == 32 && 155 ValMapping.BreakDown[1].StartIdx == 32 && 156 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); 157 158 // 32-bit extract of a 64-bit value is just access of a subregister, so free. 159 // TODO: Cost of 0 hits assert, though it's not clear it's what we really 160 // want. 161 162 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR 163 // alignment restrictions, but this probably isn't important. 164 return 1; 165 } 166 167 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( 168 const TargetRegisterClass &RC) const { 169 if (&RC == &AMDGPU::SReg_1RegClass) 170 return AMDGPU::VCCRegBank; 171 172 return TRI->isSGPRClass(&RC) ? 
AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank; 173 } 174 175 template <unsigned NumOps> 176 RegisterBankInfo::InstructionMappings 177 AMDGPURegisterBankInfo::addMappingFromTable( 178 const MachineInstr &MI, const MachineRegisterInfo &MRI, 179 const std::array<unsigned, NumOps> RegSrcOpIdx, 180 ArrayRef<OpRegBankEntry<NumOps>> Table) const { 181 182 InstructionMappings AltMappings; 183 184 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); 185 186 unsigned Sizes[NumOps]; 187 for (unsigned I = 0; I < NumOps; ++I) { 188 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); 189 Sizes[I] = getSizeInBits(Reg, MRI, *TRI); 190 } 191 192 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { 193 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); 194 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); 195 } 196 197 // getInstrMapping's default mapping uses ID 1, so start at 2. 198 unsigned MappingID = 2; 199 for (const auto &Entry : Table) { 200 for (unsigned I = 0; I < NumOps; ++I) { 201 int OpIdx = RegSrcOpIdx[I]; 202 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); 203 } 204 205 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, 206 getOperandsMapping(Operands), 207 Operands.size())); 208 } 209 210 return AltMappings; 211 } 212 213 RegisterBankInfo::InstructionMappings 214 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( 215 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 216 switch (MI.getIntrinsicID()) { 217 case Intrinsic::amdgcn_readlane: { 218 static const OpRegBankEntry<3> Table[2] = { 219 // Perfectly legal. 220 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 221 222 // Need a readfirstlane for the index. 223 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 224 }; 225 226 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 227 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 228 } 229 case Intrinsic::amdgcn_writelane: { 230 static const OpRegBankEntry<4> Table[4] = { 231 // Perfectly legal. 232 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 233 234 // Need readfirstlane of first op 235 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 236 237 // Need readfirstlane of second op 238 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, 239 240 // Need readfirstlane of both ops 241 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } 242 }; 243 244 // rsrc, voffset, offset 245 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; 246 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 247 } 248 default: 249 return RegisterBankInfo::getInstrAlternativeMappings(MI); 250 } 251 } 252 253 RegisterBankInfo::InstructionMappings 254 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( 255 const MachineInstr &MI, const MachineRegisterInfo &MRI) const { 256 257 switch (MI.getIntrinsicID()) { 258 case Intrinsic::amdgcn_buffer_load: { 259 static const OpRegBankEntry<3> Table[4] = { 260 // Perfectly legal. 261 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 262 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 263 264 // Waterfall loop needed for rsrc. 
In the worst case this will execute 265 // approximately an extra 10 * wavesize + 2 instructions. 266 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 267 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 } 268 }; 269 270 // rsrc, voffset, offset 271 const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } }; 272 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 273 } 274 case Intrinsic::amdgcn_s_buffer_load: { 275 static const OpRegBankEntry<2> Table[4] = { 276 // Perfectly legal. 277 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 278 279 // Only need 1 register in loop 280 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, 281 282 // Have to waterfall the resource. 283 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, 284 285 // Have to waterfall the resource, and the offset. 286 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } 287 }; 288 289 // rsrc, offset 290 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; 291 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 292 } 293 case Intrinsic::amdgcn_ds_ordered_add: 294 case Intrinsic::amdgcn_ds_ordered_swap: { 295 // VGPR = M0, VGPR 296 static const OpRegBankEntry<3> Table[2] = { 297 // Perfectly legal. 298 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 299 300 // Need a readfirstlane for m0 301 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } 302 }; 303 304 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; 305 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 306 } 307 case Intrinsic::amdgcn_s_sendmsg: 308 case Intrinsic::amdgcn_s_sendmsghalt: { 309 // FIXME: Should have no register for immediate 310 static const OpRegBankEntry<1> Table[2] = { 311 // Perfectly legal. 312 { { AMDGPU::SGPRRegBankID }, 1 }, 313 314 // Need readlane 315 { { AMDGPU::VGPRRegBankID }, 3 } 316 }; 317 318 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; 319 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 320 } 321 default: 322 return RegisterBankInfo::getInstrAlternativeMappings(MI); 323 } 324 } 325 326 // FIXME: Returns uniform if there's no source value information. This is 327 // probably wrong. 
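// A load qualifies for the scalar mapping only if it has exactly one memory
// operand that is at least 4 bytes, at least 4-byte aligned, and known to be
// uniform across the wave.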
328 static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) { 329 if (!MI.hasOneMemOperand()) 330 return false; 331 332 const MachineMemOperand *MMO = *MI.memoperands_begin(); 333 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && 334 AMDGPUInstrInfo::isUniformMMO(MMO); 335 } 336 337 RegisterBankInfo::InstructionMappings 338 AMDGPURegisterBankInfo::getInstrAlternativeMappings( 339 const MachineInstr &MI) const { 340 341 const MachineFunction &MF = *MI.getParent()->getParent(); 342 const MachineRegisterInfo &MRI = MF.getRegInfo(); 343 344 345 InstructionMappings AltMappings; 346 switch (MI.getOpcode()) { 347 case TargetOpcode::G_CONSTANT: { 348 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 349 if (Size == 1) { 350 static const OpRegBankEntry<1> Table[4] = { 351 { { AMDGPU::VGPRRegBankID }, 1 }, 352 { { AMDGPU::SGPRRegBankID }, 1 }, 353 { { AMDGPU::VCCRegBankID }, 1 }, 354 { { AMDGPU::SCCRegBankID }, 1 } 355 }; 356 357 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 358 } 359 360 LLVM_FALLTHROUGH; 361 } 362 case TargetOpcode::G_FCONSTANT: 363 case TargetOpcode::G_FRAME_INDEX: 364 case TargetOpcode::G_GLOBAL_VALUE: { 365 static const OpRegBankEntry<1> Table[2] = { 366 { { AMDGPU::VGPRRegBankID }, 1 }, 367 { { AMDGPU::SGPRRegBankID }, 1 } 368 }; 369 370 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); 371 } 372 case TargetOpcode::G_AND: 373 case TargetOpcode::G_OR: 374 case TargetOpcode::G_XOR: { 375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 376 377 if (Size == 1) { 378 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. 379 const InstructionMapping &SCCMapping = getInstructionMapping( 380 1, 1, getOperandsMapping( 381 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size), 382 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 383 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 384 3); // Num Operands 385 AltMappings.push_back(&SCCMapping); 386 387 const InstructionMapping &SGPRMapping = getInstructionMapping( 388 1, 1, getOperandsMapping( 389 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 390 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 391 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 392 3); // Num Operands 393 AltMappings.push_back(&SGPRMapping); 394 395 const InstructionMapping &VCCMapping0 = getInstructionMapping( 396 2, 10, getOperandsMapping( 397 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 398 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), 399 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), 400 3); // Num Operands 401 AltMappings.push_back(&VCCMapping0); 402 return AltMappings; 403 } 404 405 if (Size != 64) 406 break; 407 408 const InstructionMapping &SSMapping = getInstructionMapping( 409 1, 1, getOperandsMapping( 410 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 411 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 412 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 413 3); // Num Operands 414 AltMappings.push_back(&SSMapping); 415 416 const InstructionMapping &VVMapping = getInstructionMapping( 417 2, 2, getOperandsMapping( 418 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 419 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 420 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 421 3); // Num Operands 422 AltMappings.push_back(&VVMapping); 423 424 const InstructionMapping &SVMapping = getInstructionMapping( 425 3, 3, getOperandsMapping( 426 
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make VS more expensive than SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isInstrUniformNonExtLoadAlign4(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1, getOperandsMapping(
                  {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
                   AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably only for
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
499 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 500 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 501 4); // Num Operands 502 AltMappings.push_back(&VSMapping); 503 504 const InstructionMapping &VVMapping = getInstructionMapping(4, 1, 505 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 506 nullptr, // Predicate operand. 507 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 508 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), 509 4); // Num Operands 510 AltMappings.push_back(&VVMapping); 511 512 return AltMappings; 513 } 514 case TargetOpcode::G_SELECT: { 515 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 516 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 517 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 518 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), 519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 520 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), 521 4); // Num Operands 522 AltMappings.push_back(&SSMapping); 523 524 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 525 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 526 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), 528 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 529 4); // Num Operands 530 AltMappings.push_back(&VVMapping); 531 532 return AltMappings; 533 } 534 case TargetOpcode::G_SMIN: 535 case TargetOpcode::G_SMAX: 536 case TargetOpcode::G_UMIN: 537 case TargetOpcode::G_UMAX: { 538 static const OpRegBankEntry<3> Table[4] = { 539 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 540 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, 541 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, 542 543 // Scalar requires cmp+select, and extends if 16-bit. 
544 // FIXME: Should there be separate costs for 32 and 16-bit 545 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } 546 }; 547 548 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; 549 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); 550 } 551 case TargetOpcode::G_UADDE: 552 case TargetOpcode::G_USUBE: 553 case TargetOpcode::G_SADDE: 554 case TargetOpcode::G_SSUBE: { 555 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 556 const InstructionMapping &SSMapping = getInstructionMapping(1, 1, 557 getOperandsMapping( 558 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 559 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), 560 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 561 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), 562 AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}), 563 5); // Num Operands 564 AltMappings.push_back(&SSMapping); 565 566 const InstructionMapping &VVMapping = getInstructionMapping(2, 1, 567 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 568 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), 569 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 570 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), 571 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), 572 5); // Num Operands 573 AltMappings.push_back(&VVMapping); 574 return AltMappings; 575 } 576 case AMDGPU::G_BRCOND: { 577 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 578 579 const InstructionMapping &SMapping = getInstructionMapping( 580 1, 1, getOperandsMapping( 581 {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}), 582 2); // Num Operands 583 AltMappings.push_back(&SMapping); 584 585 const InstructionMapping &VMapping = getInstructionMapping( 586 1, 1, getOperandsMapping( 587 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), 588 2); // Num Operands 589 AltMappings.push_back(&VMapping); 590 return AltMappings; 591 } 592 case AMDGPU::G_INTRINSIC: 593 return getInstrAlternativeMappingsIntrinsic(MI, MRI); 594 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: 595 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); 596 default: 597 break; 598 } 599 return RegisterBankInfo::getInstrAlternativeMappings(MI); 600 } 601 602 void AMDGPURegisterBankInfo::split64BitValueForMapping( 603 MachineIRBuilder &B, 604 SmallVector<Register, 2> &Regs, 605 LLT HalfTy, 606 Register Reg) const { 607 assert(HalfTy.getSizeInBits() == 32); 608 MachineRegisterInfo *MRI = B.getMRI(); 609 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); 610 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); 611 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); 612 MRI->setRegBank(LoLHS, *Bank); 613 MRI->setRegBank(HiLHS, *Bank); 614 615 Regs.push_back(LoLHS); 616 Regs.push_back(HiLHS); 617 618 B.buildInstr(AMDGPU::G_UNMERGE_VALUES) 619 .addDef(LoLHS) 620 .addDef(HiLHS) 621 .addUse(Reg); 622 } 623 624 /// Replace the current type each register in \p Regs has with \p NewTy 625 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, 626 LLT NewTy) { 627 for (Register Reg : Regs) { 628 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); 629 MRI.setType(Reg, NewTy); 630 } 631 } 632 633 static LLT getHalfSizedType(LLT Ty) { 634 if (Ty.isVector()) { 635 assert(Ty.getNumElements() % 2 == 0); 636 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); 637 } 638 639 assert(Ty.getSizeInBits() % 2 == 0); 640 
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: the values are compared across lanes, so
/// the loop only runs once per unique value rather than once per lane.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
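  // RemainderBB takes over MBB's successors (updating any PHIs that referenced
  // MBB) and receives every instruction after the waterfalled range.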
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  // Figure out the iterator range after splicing the instructions.
  auto NewBegin = std::prev(LoopBB->end());

  // Move the instruction into the loop. Note that everything after Range.end()
  // has already been moved into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      if (SGPROperandRegs.count(Op.getReg())) {
        LLT OpTy = MRI.getType(Op.getReg());
        unsigned OpSize = OpTy.getSizeInBits();

        // Can only do a readlane of 32-bit pieces.
        if (OpSize == 32) {
          // Avoid extra copies in the simple case of one 32-bit register.
          Register CurrentLaneOpReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
          MRI.setType(CurrentLaneOpReg, OpTy);

          constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
          // Read the next variant <- also loop target.
          BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                  CurrentLaneOpReg)
            .addReg(Op.getReg());

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          // Compare the value just read from the first active lane against the
          // value in every lane.
          B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(Op.getReg());
          Op.setReg(CurrentLaneOpReg);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        } else {
          LLT S32 = LLT::scalar(32);
          SmallVector<Register, 8> ReadlanePieces;

          // The compares can be done as 64-bit, but the extract needs to be
          // done in 32-bit pieces.

          bool Is64 = OpSize % 64 == 0;

          LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
          unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
            : AMDGPU::V_CMP_EQ_U32_e64;

          // Insert the unmerge before the loop.
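          // The unmerge only reads the original VGPR operand, so it is
          // loop-invariant and can be built once in the block preceding the
          // loop.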

          B.setMBB(MBB);
          auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
          B.setInstr(*I);

          unsigned NumPieces = Unmerge->getNumOperands() - 1;
          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
            Register UnmergePiece = Unmerge.getReg(PieceIdx);

            Register CurrentLaneOpReg;
            if (Is64) {
              Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
              Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

              MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
              MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
              MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegLo)
                .addReg(UnmergePiece, 0, AMDGPU::sub0);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegHi)
                .addReg(UnmergePiece, 0, AMDGPU::sub1);

              CurrentLaneOpReg =
                B.buildMerge(LLT::scalar(64),
                             {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                .getReg(0);

              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

              if (OpTy.getScalarSizeInBits() == 64) {
                // If we need to produce a 64-bit element vector, use the
                // merged pieces.
                ReadlanePieces.push_back(CurrentLaneOpReg);
              } else {
                // 32-bit element type.
                ReadlanePieces.push_back(CurrentLaneOpRegLo);
                ReadlanePieces.push_back(CurrentLaneOpRegHi);
              }
            } else {
              CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
              MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpReg)
                .addReg(UnmergePiece);
              ReadlanePieces.push_back(CurrentLaneOpReg);
            }

            Register NewCondReg = MRI.createVirtualRegister(WaveRC);
            bool First = CondReg == AMDGPU::NoRegister;
            if (First)
              CondReg = NewCondReg;

            B.buildInstr(CmpOp)
              .addDef(NewCondReg)
              .addReg(CurrentLaneOpReg)
              .addReg(UnmergePiece);

            if (!First) {
              Register AndReg = MRI.createVirtualRegister(WaveRC);

              // If there are multiple operands to consider, AND the conditions
              // together.
              B.buildInstr(WaveAndOpc)
                .addDef(AndReg)
                .addReg(NewCondReg)
                .addReg(CondReg);
              CondReg = AndReg;
            }
          }

          // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
          // BUILD_VECTOR.
          if (OpTy.isVector()) {
            auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          } else {
            auto Merge = B.buildMerge(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          }

          MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
        }
      }
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, saving the original EXEC value to NewExec.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Restore the insert point before the original instruction.
  B.setInsertPt(MBB, MBB.end());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the target will either delete or rewrite the instruction that
// originally wrote to the repaired register. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
1025 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, 1026 Register Reg, 1027 const MachineInstr &MI) { 1028 // Is there some way we can assert that there are exactly 2 def instructions? 1029 for (MachineInstr &Other : MRI.def_instructions(Reg)) { 1030 if (&Other != &MI) 1031 return &Other; 1032 } 1033 1034 return nullptr; 1035 } 1036 1037 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, 1038 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1039 MachineRegisterInfo &MRI) const { 1040 Register DstReg = MI.getOperand(0).getReg(); 1041 const LLT LoadTy = MRI.getType(DstReg); 1042 unsigned LoadSize = LoadTy.getSizeInBits(); 1043 const unsigned MaxNonSmrdLoadSize = 128; 1044 // 128-bit loads are supported for all instruction types. 1045 if (LoadSize <= MaxNonSmrdLoadSize) 1046 return false; 1047 1048 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); 1049 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); 1050 1051 // If the pointer is an SGPR, we have nothing to do. 1052 if (SrcRegs.empty()) { 1053 Register PtrReg = MI.getOperand(1).getReg(); 1054 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 1055 if (PtrBank == &AMDGPU::SGPRRegBank) 1056 return false; 1057 SrcRegs.push_back(PtrReg); 1058 } 1059 1060 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1061 1062 // We want to get the repair instruction now, because it will help us 1063 // determine which instruction the legalizer inserts that will also 1064 // write to DstReg. 1065 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); 1066 1067 // RegBankSelect only emits scalar types, so we need to reset the pointer 1068 // operand to a pointer type. 1069 Register BasePtrReg = SrcRegs[0]; 1070 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1071 MRI.setType(BasePtrReg, PtrTy); 1072 1073 MachineIRBuilder B(MI); 1074 1075 unsigned SplitElts = 1076 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); 1077 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); 1078 ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); 1079 GISelObserverWrapper Observer(&O); 1080 B.setChangeObserver(Observer); 1081 LegalizerHelper Helper(B.getMF(), Observer, B); 1082 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1083 return false; 1084 1085 // At this point, the legalizer has split the original load into smaller 1086 // loads. At the end of lowering, it inserts an instruction (LegalizedInst) 1087 // that combines the outputs of the lower loads and writes it to DstReg. 1088 // The register bank selector has also added the RepairInst which writes to 1089 // DstReg as well. 1090 1091 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); 1092 1093 // Replace the output of the LegalizedInst with a temporary register, since 1094 // RepairInst already defines DstReg. 
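// The mapped pieces (DefRegs) are then rebuilt by extracting each element of
// that temporary value right in front of RepairInst.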
1095 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); 1096 LegalizedInst->getOperand(0).setReg(TmpReg); 1097 B.setInsertPt(*RepairInst->getParent(), RepairInst); 1098 1099 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { 1100 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 1101 B.buildConstant(IdxReg, DefIdx); 1102 MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); 1103 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); 1104 } 1105 1106 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1107 return true; 1108 } 1109 1110 bool AMDGPURegisterBankInfo::applyMappingImage( 1111 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1112 MachineRegisterInfo &MRI, int RsrcIdx) const { 1113 const int NumDefs = MI.getNumExplicitDefs(); 1114 1115 // The reported argument index is relative to the IR intrinsic call arguments, 1116 // so we need to shift by the number of defs and the intrinsic ID. 1117 RsrcIdx += NumDefs + 1; 1118 1119 // Insert copies to VGPR arguments. 1120 applyDefaultMapping(OpdMapper); 1121 1122 // Fixup any SGPR arguments. 1123 SmallVector<unsigned, 4> SGPRIndexes; 1124 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1125 if (!MI.getOperand(I).isReg()) 1126 continue; 1127 1128 // If this intrinsic has a sampler, it immediately follows rsrc. 1129 if (I == RsrcIdx || I == RsrcIdx + 1) 1130 SGPRIndexes.push_back(I); 1131 } 1132 1133 executeInWaterfallLoop(MI, MRI, SGPRIndexes); 1134 return true; 1135 } 1136 1137 // For cases where only a single copy is inserted for matching register banks. 1138 // Replace the register in the instruction operand 1139 static void substituteSimpleCopyRegs( 1140 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1141 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1142 if (!SrcReg.empty()) { 1143 assert(SrcReg.size() == 1); 1144 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1145 } 1146 } 1147 1148 /// Handle register layout difference for f16 images for some subtargets. 
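/// In effect, on subtargets with unpacked D16 memory instructions each 16-bit
/// element is expected to occupy its own 32-bit register, so <N x s16> data is
/// rewritten here into a 32-bit-per-element form before selection.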
1149 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1150 MachineRegisterInfo &MRI, 1151 Register Reg) const { 1152 if (!Subtarget.hasUnpackedD16VMem()) 1153 return Reg; 1154 1155 const LLT S16 = LLT::scalar(16); 1156 LLT StoreVT = MRI.getType(Reg); 1157 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1158 return Reg; 1159 1160 auto Unmerge = B.buildUnmerge(S16, Reg); 1161 1162 1163 SmallVector<Register, 4> WideRegs; 1164 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1165 WideRegs.push_back(Unmerge.getReg(I)); 1166 1167 const LLT S32 = LLT::scalar(32); 1168 int NumElts = StoreVT.getNumElements(); 1169 1170 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); 1171 } 1172 1173 static std::pair<Register, unsigned> 1174 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1175 int64_t Const; 1176 if (mi_match(Reg, MRI, m_ICst(Const))) 1177 return std::make_pair(Register(), Const); 1178 1179 Register Base; 1180 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1181 return std::make_pair(Base, Const); 1182 1183 // TODO: Handle G_OR used for add case 1184 return std::make_pair(Reg, 0); 1185 } 1186 1187 std::pair<Register, unsigned> 1188 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1189 Register OrigOffset) const { 1190 const unsigned MaxImm = 4095; 1191 Register BaseReg; 1192 unsigned ImmOffset; 1193 const LLT S32 = LLT::scalar(32); 1194 1195 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1196 OrigOffset); 1197 1198 unsigned C1 = 0; 1199 if (ImmOffset != 0) { 1200 // If the immediate value is too big for the immoffset field, put the value 1201 // and -4096 into the immoffset field so that the value that is copied/added 1202 // for the voffset field is a multiple of 4096, and it stands more chance 1203 // of being CSEd with the copy/add for another similar load/store. 1204 // However, do not do that rounding down to a multiple of 4096 if that is a 1205 // negative number, as it appears to be illegal to have a negative offset 1206 // in the vgpr, even if adding the immediate offset makes it positive. 1207 unsigned Overflow = ImmOffset & ~MaxImm; 1208 ImmOffset -= Overflow; 1209 if ((int32_t)Overflow < 0) { 1210 Overflow += ImmOffset; 1211 ImmOffset = 0; 1212 } 1213 1214 C1 = ImmOffset; 1215 if (Overflow != 0) { 1216 if (!BaseReg) 1217 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1218 else { 1219 auto OverflowVal = B.buildConstant(S32, Overflow); 1220 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1221 } 1222 } 1223 } 1224 1225 if (!BaseReg) 1226 BaseReg = B.buildConstant(S32, 0).getReg(0); 1227 1228 return {BaseReg, C1}; 1229 } 1230 1231 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 1232 int64_t C; 1233 return mi_match(Reg, MRI, m_ICst(C)) && C == 0; 1234 } 1235 1236 static unsigned extractGLC(unsigned CachePolicy) { 1237 return CachePolicy & 1; 1238 } 1239 1240 static unsigned extractSLC(unsigned CachePolicy) { 1241 return (CachePolicy >> 1) & 1; 1242 } 1243 1244 static unsigned extractDLC(unsigned CachePolicy) { 1245 return (CachePolicy >> 2) & 1; 1246 } 1247 1248 MachineInstr * 1249 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, 1250 MachineInstr &MI) const { 1251 MachineRegisterInfo &MRI = *B.getMRI(); 1252 executeInWaterfallLoop(B, MI, MRI, {2, 4}); 1253 1254 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. 
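  // This manually selects a MUBUF store: the offset is split into an immediate
  // part and a remaining voffset, and the OFFEN form is used only when a
  // non-zero voffset is left over.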
1255 1256 Register VData = MI.getOperand(1).getReg(); 1257 LLT Ty = MRI.getType(VData); 1258 1259 int EltSize = Ty.getScalarSizeInBits(); 1260 int Size = Ty.getSizeInBits(); 1261 1262 // FIXME: Broken integer truncstore. 1263 if (EltSize != 32) 1264 report_fatal_error("unhandled intrinsic store"); 1265 1266 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 1267 const int MemSize = (*MI.memoperands_begin())->getSize(); 1268 1269 1270 Register RSrc = MI.getOperand(2).getReg(); 1271 Register VOffset = MI.getOperand(3).getReg(); 1272 Register SOffset = MI.getOperand(4).getReg(); 1273 unsigned CachePolicy = MI.getOperand(5).getImm(); 1274 1275 unsigned ImmOffset; 1276 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 1277 1278 const bool Offen = !isZero(VOffset, MRI); 1279 1280 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; 1281 switch (8 * MemSize) { 1282 case 8: 1283 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 1284 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 1285 break; 1286 case 16: 1287 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 1288 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 1289 break; 1290 default: 1291 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 1292 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 1293 if (Size > 32) 1294 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 1295 break; 1296 } 1297 1298 1299 // Set the insertion point back to the instruction in case it was moved into a 1300 // loop. 1301 B.setInstr(MI); 1302 1303 MachineInstrBuilder MIB = B.buildInstr(Opc) 1304 .addUse(VData); 1305 1306 if (Offen) 1307 MIB.addUse(VOffset); 1308 1309 MIB.addUse(RSrc) 1310 .addUse(SOffset) 1311 .addImm(ImmOffset) 1312 .addImm(extractGLC(CachePolicy)) 1313 .addImm(extractSLC(CachePolicy)) 1314 .addImm(0) // tfe: FIXME: Remove from inst 1315 .addImm(extractDLC(CachePolicy)) 1316 .cloneMemRefs(MI); 1317 1318 // FIXME: We need a way to report failure from applyMappingImpl. 1319 // Insert constrain copies before inserting the loop. 1320 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1321 report_fatal_error("failed to constrain selected store intrinsic"); 1322 1323 return MIB; 1324 } 1325 1326 void AMDGPURegisterBankInfo::applyMappingImpl( 1327 const OperandsMapper &OpdMapper) const { 1328 MachineInstr &MI = OpdMapper.getMI(); 1329 unsigned Opc = MI.getOpcode(); 1330 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1331 switch (Opc) { 1332 case AMDGPU::G_SELECT: { 1333 Register DstReg = MI.getOperand(0).getReg(); 1334 LLT DstTy = MRI.getType(DstReg); 1335 if (DstTy.getSizeInBits() != 64) 1336 break; 1337 1338 LLT HalfTy = getHalfSizedType(DstTy); 1339 1340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1341 SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1)); 1342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1343 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 1344 1345 // All inputs are SGPRs, nothing special to do. 
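    // (OpdMapper only creates new virtual registers for an operand when the
    // chosen mapping breaks it into pieces, so an empty list for the result
    // means the plain SGPR mapping was used.)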
1346 if (DefRegs.empty()) { 1347 assert(Src1Regs.empty() && Src2Regs.empty()); 1348 break; 1349 } 1350 1351 MachineIRBuilder B(MI); 1352 if (Src0Regs.empty()) 1353 Src0Regs.push_back(MI.getOperand(1).getReg()); 1354 else { 1355 assert(Src0Regs.size() == 1); 1356 } 1357 1358 if (Src1Regs.empty()) 1359 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1360 else { 1361 setRegsToType(MRI, Src1Regs, HalfTy); 1362 } 1363 1364 if (Src2Regs.empty()) 1365 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 1366 else 1367 setRegsToType(MRI, Src2Regs, HalfTy); 1368 1369 setRegsToType(MRI, DefRegs, HalfTy); 1370 1371 B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]); 1372 B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]); 1373 1374 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1375 MI.eraseFromParent(); 1376 return; 1377 } 1378 case AMDGPU::G_AND: 1379 case AMDGPU::G_OR: 1380 case AMDGPU::G_XOR: { 1381 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 1382 // there is a VGPR input. 1383 Register DstReg = MI.getOperand(0).getReg(); 1384 LLT DstTy = MRI.getType(DstReg); 1385 if (DstTy.getSizeInBits() != 64) 1386 break; 1387 1388 LLT HalfTy = getHalfSizedType(DstTy); 1389 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1390 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 1391 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 1392 1393 // All inputs are SGPRs, nothing special to do. 1394 if (DefRegs.empty()) { 1395 assert(Src0Regs.empty() && Src1Regs.empty()); 1396 break; 1397 } 1398 1399 assert(DefRegs.size() == 2); 1400 assert(Src0Regs.size() == Src1Regs.size() && 1401 (Src0Regs.empty() || Src0Regs.size() == 2)); 1402 1403 // Depending on where the source registers came from, the generic code may 1404 // have decided to split the inputs already or not. If not, we still need to 1405 // extract the values. 1406 MachineIRBuilder B(MI); 1407 1408 if (Src0Regs.empty()) 1409 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 1410 else 1411 setRegsToType(MRI, Src0Regs, HalfTy); 1412 1413 if (Src1Regs.empty()) 1414 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 1415 else 1416 setRegsToType(MRI, Src1Regs, HalfTy); 1417 1418 setRegsToType(MRI, DefRegs, HalfTy); 1419 1420 B.buildInstr(Opc) 1421 .addDef(DefRegs[0]) 1422 .addUse(Src0Regs[0]) 1423 .addUse(Src1Regs[0]); 1424 1425 B.buildInstr(Opc) 1426 .addDef(DefRegs[1]) 1427 .addUse(Src0Regs[1]) 1428 .addUse(Src1Regs[1]); 1429 1430 MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); 1431 MI.eraseFromParent(); 1432 return; 1433 } 1434 case AMDGPU::G_ADD: 1435 case AMDGPU::G_SUB: 1436 case AMDGPU::G_MUL: { 1437 Register DstReg = MI.getOperand(0).getReg(); 1438 LLT DstTy = MRI.getType(DstReg); 1439 if (DstTy != LLT::scalar(16)) 1440 break; 1441 1442 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 1443 if (DstBank == &AMDGPU::VGPRRegBank) 1444 break; 1445 1446 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 
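    // Widen the operation to 32 bits; the observer assigns the SGPR bank to
    // any registers the LegalizerHelper creates while doing so.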
1447 MachineFunction *MF = MI.getParent()->getParent(); 1448 MachineIRBuilder B(MI); 1449 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); 1450 GISelObserverWrapper Observer(&ApplySALU); 1451 LegalizerHelper Helper(*MF, Observer, B); 1452 1453 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 1454 LegalizerHelper::Legalized) 1455 llvm_unreachable("widen scalar should have succeeded"); 1456 return; 1457 } 1458 case AMDGPU::G_SMIN: 1459 case AMDGPU::G_SMAX: 1460 case AMDGPU::G_UMIN: 1461 case AMDGPU::G_UMAX: { 1462 Register DstReg = MI.getOperand(0).getReg(); 1463 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 1464 if (DstBank == &AMDGPU::VGPRRegBank) 1465 break; 1466 1467 MachineFunction *MF = MI.getParent()->getParent(); 1468 MachineIRBuilder B(MI); 1469 ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); 1470 GISelObserverWrapper Observer(&ApplySALU); 1471 LegalizerHelper Helper(*MF, Observer, B); 1472 1473 // Turn scalar min/max into a compare and select. 1474 LLT Ty = MRI.getType(DstReg); 1475 LLT S32 = LLT::scalar(32); 1476 LLT S16 = LLT::scalar(16); 1477 1478 if (Ty == S16) { 1479 // Need to widen to s32, and expand as cmp + select. 1480 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 1481 llvm_unreachable("widenScalar should have succeeded"); 1482 1483 // FIXME: This is relying on widenScalar leaving MI in place. 1484 if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized) 1485 llvm_unreachable("lower should have succeeded"); 1486 } else { 1487 if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized) 1488 llvm_unreachable("lower should have succeeded"); 1489 } 1490 1491 return; 1492 } 1493 case AMDGPU::G_SEXT: 1494 case AMDGPU::G_ZEXT: { 1495 Register SrcReg = MI.getOperand(1).getReg(); 1496 LLT SrcTy = MRI.getType(SrcReg); 1497 bool Signed = Opc == AMDGPU::G_SEXT; 1498 1499 MachineIRBuilder B(MI); 1500 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 1501 1502 Register DstReg = MI.getOperand(0).getReg(); 1503 LLT DstTy = MRI.getType(DstReg); 1504 if (DstTy.isScalar() && 1505 SrcBank != &AMDGPU::SGPRRegBank && 1506 SrcBank != &AMDGPU::SCCRegBank && 1507 SrcBank != &AMDGPU::VCCRegBank && 1508 // FIXME: Should handle any type that round to s64 when irregular 1509 // breakdowns supported. 1510 DstTy.getSizeInBits() == 64 && 1511 SrcTy.getSizeInBits() <= 32) { 1512 const LLT S32 = LLT::scalar(32); 1513 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1514 1515 // Extend to 32-bit, and then extend the low half. 1516 if (Signed) { 1517 // TODO: Should really be buildSExtOrCopy 1518 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 1519 1520 // Replicate sign bit from 32-bit extended part. 1521 auto ShiftAmt = B.buildConstant(S32, 31); 1522 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1523 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); 1524 } else { 1525 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 1526 B.buildConstant(DefRegs[1], 0); 1527 } 1528 1529 MRI.setRegBank(DstReg, *SrcBank); 1530 MI.eraseFromParent(); 1531 return; 1532 } 1533 1534 if (SrcTy != LLT::scalar(1)) 1535 return; 1536 1537 if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) { 1538 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 1539 1540 const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ? 
1541 &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank; 1542 1543 unsigned DstSize = DstTy.getSizeInBits(); 1544 // 64-bit select is SGPR only 1545 const bool UseSel64 = DstSize > 32 && 1546 SrcBank->getID() == AMDGPU::SCCRegBankID; 1547 1548 // TODO: Should s16 select be legal? 1549 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 1550 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 1551 auto False = B.buildConstant(SelType, 0); 1552 1553 MRI.setRegBank(True.getReg(0), *DstBank); 1554 MRI.setRegBank(False.getReg(0), *DstBank); 1555 MRI.setRegBank(DstReg, *DstBank); 1556 1557 if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) { 1558 B.buildSelect(DefRegs[0], SrcReg, True, False); 1559 B.buildCopy(DefRegs[1], DefRegs[0]); 1560 } else if (DstSize < 32) { 1561 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 1562 MRI.setRegBank(Sel.getReg(0), *DstBank); 1563 B.buildTrunc(DstReg, Sel); 1564 } else { 1565 B.buildSelect(DstReg, SrcReg, True, False); 1566 } 1567 1568 MI.eraseFromParent(); 1569 return; 1570 } 1571 1572 // Fixup the case with an s1 src that isn't a condition register. Use shifts 1573 // instead of introducing a compare to avoid an unnecessary condition 1574 // register (and since there's no scalar 16-bit compares). 1575 auto Ext = B.buildAnyExt(DstTy, SrcReg); 1576 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); 1577 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); 1578 1579 if (MI.getOpcode() == AMDGPU::G_SEXT) 1580 B.buildAShr(DstReg, Shl, ShiftAmt); 1581 else 1582 B.buildLShr(DstReg, Shl, ShiftAmt); 1583 1584 MRI.setRegBank(DstReg, *SrcBank); 1585 MRI.setRegBank(Ext.getReg(0), *SrcBank); 1586 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); 1587 MRI.setRegBank(Shl.getReg(0), *SrcBank); 1588 MI.eraseFromParent(); 1589 return; 1590 } 1591 case AMDGPU::G_BUILD_VECTOR: 1592 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 1593 Register DstReg = MI.getOperand(0).getReg(); 1594 LLT DstTy = MRI.getType(DstReg); 1595 if (DstTy != LLT::vector(2, 16)) 1596 break; 1597 1598 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 1599 substituteSimpleCopyRegs(OpdMapper, 1); 1600 substituteSimpleCopyRegs(OpdMapper, 2); 1601 1602 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); 1603 if (DstBank == &AMDGPU::SGPRRegBank) 1604 break; // Can use S_PACK_* instructions. 
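    // For the VGPR case there is no pack instruction, so the v2s16
    // build_vector is lowered manually below: extend/mask the low element,
    // shift the high element into place, OR them together, and bitcast the
    // 32-bit result.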
1605 1606 MachineIRBuilder B(MI); 1607 1608 Register Lo = MI.getOperand(1).getReg(); 1609 Register Hi = MI.getOperand(2).getReg(); 1610 const LLT S32 = LLT::scalar(32); 1611 1612 const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI); 1613 const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI); 1614 1615 Register ZextLo; 1616 Register ShiftHi; 1617 1618 if (Opc == AMDGPU::G_BUILD_VECTOR) { 1619 ZextLo = B.buildZExt(S32, Lo).getReg(0); 1620 MRI.setRegBank(ZextLo, *BankLo); 1621 1622 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 1623 MRI.setRegBank(ZextHi, *BankHi); 1624 1625 auto ShiftAmt = B.buildConstant(S32, 16); 1626 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 1627 1628 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 1629 MRI.setRegBank(ShiftHi, *BankHi); 1630 } else { 1631 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 1632 MRI.setRegBank(MaskLo, *BankLo); 1633 1634 auto ShiftAmt = B.buildConstant(S32, 16); 1635 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 1636 1637 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 1638 MRI.setRegBank(ShiftHi, *BankHi); 1639 1640 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 1641 MRI.setRegBank(ZextLo, *BankLo); 1642 } 1643 1644 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 1645 MRI.setRegBank(Or.getReg(0), *DstBank); 1646 1647 B.buildBitcast(DstReg, Or); 1648 MI.eraseFromParent(); 1649 return; 1650 } 1651 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 1652 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1653 1654 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 1655 1656 if (DstRegs.empty()) { 1657 applyDefaultMapping(OpdMapper); 1658 executeInWaterfallLoop(MI, MRI, { 2 }); 1659 return; 1660 } 1661 1662 Register DstReg = MI.getOperand(0).getReg(); 1663 Register SrcReg = MI.getOperand(1).getReg(); 1664 Register IdxReg = MI.getOperand(2).getReg(); 1665 LLT DstTy = MRI.getType(DstReg); 1666 (void)DstTy; 1667 1668 assert(DstTy.getSizeInBits() == 64); 1669 1670 LLT SrcTy = MRI.getType(SrcReg); 1671 const LLT S32 = LLT::scalar(32); 1672 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 1673 1674 MachineIRBuilder B(MI); 1675 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 1676 auto One = B.buildConstant(S32, 1); 1677 1678 // Split the vector index into 32-bit pieces. Prepare to move all of the 1679 // new instructions into a waterfall loop if necessary. 1680 // 1681 // Don't put the bitcast or constant in the loop. 1682 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 1683 1684 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 1685 auto IdxLo = B.buildShl(S32, IdxReg, One); 1686 auto IdxHi = B.buildAdd(S32, IdxLo, One); 1687 B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 1688 B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 1689 1690 const ValueMapping &DstMapping 1691 = OpdMapper.getInstrMapping().getOperandMapping(0); 1692 1693 // FIXME: Should be getting from mapping or not? 
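// The instructions built above used a plain MachineIRBuilder with no change
// observer, so assign register banks to their results by hand. The index
// computation is kept in SGPRs; a divergent index is handled by the waterfall
// loop set up below.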
1694 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 1695 MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); 1696 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 1697 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 1698 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 1699 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 1700 1701 SmallSet<Register, 4> OpsToWaterfall; 1702 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 1703 MI.eraseFromParent(); 1704 return; 1705 } 1706 1707 // Remove the original instruction to avoid potentially confusing the 1708 // waterfall loop logic. 1709 B.setInstr(*Span.begin()); 1710 MI.eraseFromParent(); 1711 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1712 OpsToWaterfall, MRI); 1713 return; 1714 } 1715 case AMDGPU::G_INSERT_VECTOR_ELT: { 1716 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 1717 1718 assert(OpdMapper.getVRegs(0).empty()); 1719 assert(OpdMapper.getVRegs(1).empty()); 1720 assert(OpdMapper.getVRegs(3).empty()); 1721 1722 if (InsRegs.empty()) { 1723 applyDefaultMapping(OpdMapper); 1724 executeInWaterfallLoop(MI, MRI, { 3 }); 1725 return; 1726 } 1727 1728 Register DstReg = MI.getOperand(0).getReg(); 1729 Register SrcReg = MI.getOperand(1).getReg(); 1730 Register InsReg = MI.getOperand(2).getReg(); 1731 Register IdxReg = MI.getOperand(3).getReg(); 1732 LLT SrcTy = MRI.getType(SrcReg); 1733 LLT InsTy = MRI.getType(InsReg); 1734 (void)InsTy; 1735 1736 assert(InsTy.getSizeInBits() == 64); 1737 1738 const LLT S32 = LLT::scalar(32); 1739 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 1740 1741 MachineIRBuilder B(MI); 1742 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 1743 auto One = B.buildConstant(S32, 1); 1744 1745 // Split the vector index into 32-bit pieces. Prepare to move all of the 1746 // new instructions into a waterfall loop if necessary. 1747 // 1748 // Don't put the bitcast or constant in the loop. 1749 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 1750 1751 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
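// For example, inserting a 64-bit element at index 3 of a <4 x s64> becomes
// two 32-bit inserts at indices 6 and 7 of the bitcast <8 x s32>, taking the
// low and high halves of the inserted value respectively.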
1752 auto IdxLo = B.buildShl(S32, IdxReg, One);
1753 auto IdxHi = B.buildAdd(S32, IdxLo, One);
1754 
1755 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
1756 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
1757 B.buildBitcast(DstReg, InsHi);
1758 
1759 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1760 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1761 const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
1762 
1763 MRI.setRegBank(InsReg, *InsSrcBank);
1764 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1765 MRI.setRegBank(InsLo.getReg(0), *DstBank);
1766 MRI.setRegBank(InsHi.getReg(0), *DstBank);
1767 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
1768 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
1769 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
1770 
1771 
1772 SmallSet<Register, 4> OpsToWaterfall;
1773 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
1774 MI.eraseFromParent();
1775 return;
1776 }
1777 
1778 B.setInstr(*Span.begin());
1779 MI.eraseFromParent();
1780 
1781 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1782 OpsToWaterfall, MRI);
1783 return;
1784 }
1785 case AMDGPU::G_INTRINSIC: {
1786 switch (MI.getIntrinsicID()) {
1787 case Intrinsic::amdgcn_s_buffer_load: {
1788 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1789 executeInWaterfallLoop(MI, MRI, { 2, 3 });
1790 return;
1791 }
1792 case Intrinsic::amdgcn_readlane: {
1793 substituteSimpleCopyRegs(OpdMapper, 2);
1794 
1795 assert(OpdMapper.getVRegs(0).empty());
1796 assert(OpdMapper.getVRegs(3).empty());
1797 
1798 // Make sure the index is an SGPR. It doesn't make sense to run this in a
1799 // waterfall loop, so assume it's a uniform value.
1800 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1801 return;
1802 }
1803 case Intrinsic::amdgcn_writelane: {
1804 assert(OpdMapper.getVRegs(0).empty());
1805 assert(OpdMapper.getVRegs(2).empty());
1806 assert(OpdMapper.getVRegs(3).empty());
1807 
1808 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1809 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1810 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1811 return;
1812 }
1813 default:
1814 break;
1815 }
1816 break;
1817 }
1818 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
1819 auto IntrID = MI.getIntrinsicID();
1820 switch (IntrID) {
1821 case Intrinsic::amdgcn_buffer_load: {
1822 executeInWaterfallLoop(MI, MRI, { 2 });
1823 return;
1824 }
1825 case Intrinsic::amdgcn_ds_ordered_add:
1826 case Intrinsic::amdgcn_ds_ordered_swap: {
1827 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1828 assert(OpdMapper.getVRegs(0).empty());
1829 substituteSimpleCopyRegs(OpdMapper, 3);
1830 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1831 return;
1832 }
1833 case Intrinsic::amdgcn_ds_gws_init:
1834 case Intrinsic::amdgcn_ds_gws_barrier:
1835 case Intrinsic::amdgcn_ds_gws_sema_br: {
1836 // Only the first lane executes, so readfirstlane is safe.
1837 substituteSimpleCopyRegs(OpdMapper, 1);
1838 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1839 return;
1840 }
1841 case Intrinsic::amdgcn_ds_gws_sema_v:
1842 case Intrinsic::amdgcn_ds_gws_sema_p:
1843 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1844 // Only the first lane executes, so readfirstlane is safe.
1845 constrainOpWithReadfirstlane(MI, MRI, 1); // M0 1846 return; 1847 } 1848 case Intrinsic::amdgcn_s_sendmsg: 1849 case Intrinsic::amdgcn_s_sendmsghalt: { 1850 // FIXME: Should this use a waterfall loop? 1851 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 1852 return; 1853 } 1854 case Intrinsic::amdgcn_raw_buffer_load: 1855 case Intrinsic::amdgcn_raw_buffer_load_format: 1856 case Intrinsic::amdgcn_raw_tbuffer_load: 1857 case Intrinsic::amdgcn_raw_buffer_store: 1858 case Intrinsic::amdgcn_raw_buffer_store_format: 1859 case Intrinsic::amdgcn_raw_tbuffer_store: { 1860 applyDefaultMapping(OpdMapper); 1861 executeInWaterfallLoop(MI, MRI, {2, 4}); 1862 return; 1863 } 1864 case Intrinsic::amdgcn_struct_buffer_load: 1865 case Intrinsic::amdgcn_struct_buffer_store: 1866 case Intrinsic::amdgcn_struct_tbuffer_load: 1867 case Intrinsic::amdgcn_struct_tbuffer_store: { 1868 applyDefaultMapping(OpdMapper); 1869 executeInWaterfallLoop(MI, MRI, {2, 5}); 1870 return; 1871 } 1872 default: { 1873 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 1874 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 1875 // Non-images can have complications from operands that allow both SGPR 1876 // and VGPR. For now it's too complicated to figure out the final opcode 1877 // to derive the register bank from the MCInstrDesc. 1878 if (RSrcIntrin->IsImage) { 1879 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 1880 return; 1881 } 1882 } 1883 1884 break; 1885 } 1886 } 1887 break; 1888 } 1889 case AMDGPU::G_LOAD: 1890 case AMDGPU::G_ZEXTLOAD: 1891 case AMDGPU::G_SEXTLOAD: { 1892 if (applyMappingWideLoad(MI, OpdMapper, MRI)) 1893 return; 1894 break; 1895 } 1896 default: 1897 break; 1898 } 1899 1900 return applyDefaultMapping(OpdMapper); 1901 } 1902 1903 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 1904 const MachineFunction &MF = *MI.getParent()->getParent(); 1905 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1906 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { 1907 if (!MI.getOperand(i).isReg()) 1908 continue; 1909 Register Reg = MI.getOperand(i).getReg(); 1910 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 1911 if (Bank->getID() == AMDGPU::VGPRRegBankID) 1912 return false; 1913 1914 assert(Bank->getID() == AMDGPU::SGPRRegBankID || 1915 Bank->getID() == AMDGPU::SCCRegBankID); 1916 } 1917 } 1918 return true; 1919 } 1920 1921 const RegisterBankInfo::InstructionMapping & 1922 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 1923 const MachineFunction &MF = *MI.getParent()->getParent(); 1924 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1925 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1926 1927 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 1928 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 1929 unsigned BankID = Size == 1 ? 
AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID; 1930 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 1931 } 1932 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1933 MI.getNumOperands()); 1934 } 1935 1936 const RegisterBankInfo::InstructionMapping & 1937 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 1938 const MachineFunction &MF = *MI.getParent()->getParent(); 1939 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1940 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1941 unsigned OpdIdx = 0; 1942 1943 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 1944 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); 1945 1946 if (MI.getOperand(OpdIdx).isIntrinsicID()) 1947 OpdsMapping[OpdIdx++] = nullptr; 1948 1949 Register Reg1 = MI.getOperand(OpdIdx).getReg(); 1950 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); 1951 1952 unsigned DefaultBankID = Size1 == 1 ? 1953 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 1954 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID); 1955 1956 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); 1957 1958 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { 1959 const MachineOperand &MO = MI.getOperand(OpdIdx); 1960 if (!MO.isReg()) 1961 continue; 1962 1963 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); 1964 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 1965 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); 1966 } 1967 1968 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1969 MI.getNumOperands()); 1970 } 1971 1972 const RegisterBankInfo::InstructionMapping & 1973 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 1974 const MachineFunction &MF = *MI.getParent()->getParent(); 1975 const MachineRegisterInfo &MRI = MF.getRegInfo(); 1976 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 1977 1978 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 1979 const MachineOperand &Op = MI.getOperand(I); 1980 if (!Op.isReg()) 1981 continue; 1982 1983 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 1984 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 1985 } 1986 1987 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 1988 MI.getNumOperands()); 1989 } 1990 1991 const RegisterBankInfo::InstructionMapping & 1992 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 1993 const MachineInstr &MI, 1994 int RsrcIdx) const { 1995 // The reported argument index is relative to the IR intrinsic call arguments, 1996 // so we need to shift by the number of defs and the intrinsic ID. 1997 RsrcIdx += MI.getNumExplicitDefs() + 1; 1998 1999 const int NumOps = MI.getNumOperands(); 2000 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 2001 2002 // TODO: Should packed/unpacked D16 difference be reported here as part of 2003 // the value mapping? 2004 for (int I = 0; I != NumOps; ++I) { 2005 if (!MI.getOperand(I).isReg()) 2006 continue; 2007 2008 Register OpReg = MI.getOperand(I).getReg(); 2009 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 2010 2011 // FIXME: Probably need a new intrinsic register bank searchable table to 2012 // handle arbitrary intrinsics easily. 2013 // 2014 // If this has a sampler, it immediately follows rsrc. 
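// Operands at RsrcIdx and RsrcIdx + 1 (the sampler, when present) are the
// ones that ultimately need to be SGPRs; every other register operand is
// mapped to a VGPR.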
2015 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2016 
2017 if (MustBeSGPR) {
2018 // This must be an SGPR, so we must report whatever it is as legal.
2019 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2020 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2021 } else {
2022 // Some operands must be VGPR, and these are easy to copy to.
2023 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2024 }
2025 }
2026 
2027 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2028 }
2029 
2030 const RegisterBankInfo::InstructionMapping &
2031 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2032 
2033 const MachineFunction &MF = *MI.getParent()->getParent();
2034 const MachineRegisterInfo &MRI = MF.getRegInfo();
2035 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2036 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2037 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2038 Register PtrReg = MI.getOperand(1).getReg();
2039 LLT PtrTy = MRI.getType(PtrReg);
2040 unsigned AS = PtrTy.getAddressSpace();
2041 unsigned PtrSize = PtrTy.getSizeInBits();
2042 
2043 const ValueMapping *ValMapping;
2044 const ValueMapping *PtrMapping;
2045 
2046 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2047 
2048 if (PtrBank == &AMDGPU::SGPRRegBank &&
2049 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2050 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2051 isInstrUniformNonExtLoadAlign4(MI)) {
2052 // We have a uniform instruction so we want to use an SMRD load.
2053 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2054 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2055 } else {
2056 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2057 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2058 }
2059 
2060 OpdsMapping[0] = ValMapping;
2061 OpdsMapping[1] = PtrMapping;
2062 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2063 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2064 return Mapping;
2065 
2066 // FIXME: Do we want to add a mapping for FLAT load, or should we just
2067 // handle that during instruction selection?
2068 }
2069 
2070 unsigned
2071 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2072 const MachineRegisterInfo &MRI,
2073 const TargetRegisterInfo &TRI,
2074 unsigned Default) const {
2075 
2076 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2077 return Bank ? Bank->getID() : Default;
2078 }
2079 
2080 
2081 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2082 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2083 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2084 }
2085 
2086 const RegisterBankInfo::ValueMapping *
2087 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2088 const MachineRegisterInfo &MRI,
2089 const TargetRegisterInfo &TRI) const {
2090 // Lie and claim anything is legal, even though this needs to be an SGPR;
2091 // applyMapping will have to deal with it as a waterfall loop.
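// Report the register's current bank if it has one, defaulting to SGPR
// otherwise.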
2092 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2093 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2094 return AMDGPU::getValueMapping(Bank, Size);
2095 }
2096 
2097 const RegisterBankInfo::ValueMapping *
2098 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2099 const MachineRegisterInfo &MRI,
2100 const TargetRegisterInfo &TRI) const {
2101 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2102 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2103 }
2104 
2105 ///
2106 /// This function must return a legal mapping, because
2107 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
2108 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
2109 /// VGPR to SGPR copy to be generated is illegal.
2110 ///
2111 const RegisterBankInfo::InstructionMapping &
2112 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2113 const MachineFunction &MF = *MI.getParent()->getParent();
2114 const MachineRegisterInfo &MRI = MF.getRegInfo();
2115 
2116 if (MI.isRegSequence()) {
2117 // If any input is a VGPR, the result must be a VGPR. The default handling
2118 // assumes any copy between banks is legal.
2119 unsigned BankID = AMDGPU::SGPRRegBankID;
2120 
2121 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2122 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2123 // It doesn't make sense to use vcc or scc banks here, so just ignore
2124 // them.
2125 if (OpBank != AMDGPU::SGPRRegBankID) {
2126 BankID = AMDGPU::VGPRRegBankID;
2127 break;
2128 }
2129 }
2130 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2131 
2132 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2133 return getInstructionMapping(
2134 1, /*Cost*/ 1,
2135 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2136 }
2137 
2138 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
2139 // properly.
2140 //
2141 // TODO: There are additional exec masking dependencies to analyze.
2142 if (MI.getOpcode() == TargetOpcode::G_PHI) {
2143 // TODO: Generate proper invalid bank enum.
2144 int ResultBank = -1;
2145 
2146 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2147 Register Reg = MI.getOperand(I).getReg();
2148 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2149 
2150 // FIXME: Assuming VGPR for any undetermined inputs.
2151 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2152 ResultBank = AMDGPU::VGPRRegBankID;
2153 break;
2154 }
2155 
2156 unsigned OpBank = Bank->getID();
2157 // scc, scc -> sgpr
2158 if (OpBank == AMDGPU::SCCRegBankID) {
2159 // There's only one SCC register, so a phi requires copying to SGPR.
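// Treat the SCC input as an SGPR when merging it with the other incoming
// banks.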
2160 OpBank = AMDGPU::SGPRRegBankID;
2161 } else if (OpBank == AMDGPU::VCCRegBankID) {
2162 // vcc, vcc -> vcc
2163 // vcc, sgpr -> vgpr
2164 if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
2165 ResultBank = AMDGPU::VGPRRegBankID;
2166 break;
2167 }
2168 }
2169 
2170 ResultBank = OpBank;
2171 }
2172 
2173 assert(ResultBank != -1);
2174 
2175 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2176 
2177 const ValueMapping &ValMap =
2178 getValueMapping(0, Size, getRegBank(ResultBank));
2179 return getInstructionMapping(
2180 1, /*Cost*/ 1,
2181 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2182 }
2183 
2184 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2185 if (Mapping.isValid())
2186 return Mapping;
2187 
2188 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2189 
2190 switch (MI.getOpcode()) {
2191 default:
2192 return getInvalidInstructionMapping();
2193 
2194 case AMDGPU::G_AND:
2195 case AMDGPU::G_OR:
2196 case AMDGPU::G_XOR: {
2197 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2198 if (Size == 1) {
2199 const RegisterBank *DstBank
2200 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2201 
2202 unsigned TargetBankID = -1;
2203 unsigned BankLHS = -1;
2204 unsigned BankRHS = -1;
2205 if (DstBank) {
2206 TargetBankID = DstBank->getID();
2207 if (DstBank == &AMDGPU::VCCRegBank) {
2208 TargetBankID = AMDGPU::VCCRegBankID;
2209 BankLHS = AMDGPU::VCCRegBankID;
2210 BankRHS = AMDGPU::VCCRegBankID;
2211 } else if (DstBank == &AMDGPU::SCCRegBank) {
2212 TargetBankID = AMDGPU::SCCRegBankID;
2213 BankLHS = AMDGPU::SGPRRegBankID;
2214 BankRHS = AMDGPU::SGPRRegBankID;
2215 } else {
2216 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2217 AMDGPU::SGPRRegBankID);
2218 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2219 AMDGPU::SGPRRegBankID);
2220 }
2221 } else {
2222 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2223 AMDGPU::VCCRegBankID);
2224 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2225 AMDGPU::VCCRegBankID);
2226 
2227 // Both inputs should be true booleans to produce a boolean result.
2228 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2229 TargetBankID = AMDGPU::VGPRRegBankID;
2230 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2231 TargetBankID = AMDGPU::VCCRegBankID;
2232 BankLHS = AMDGPU::VCCRegBankID;
2233 BankRHS = AMDGPU::VCCRegBankID;
2234 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2235 TargetBankID = AMDGPU::SGPRRegBankID;
2236 } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
2237 // The operation must be done on a 32-bit register, but it will set
2238 // scc. The result type could interchangeably be SCC or SGPR, since
2239 // both values will be produced.
2240 TargetBankID = AMDGPU::SCCRegBankID; 2241 BankLHS = AMDGPU::SGPRRegBankID; 2242 BankRHS = AMDGPU::SGPRRegBankID; 2243 } 2244 } 2245 2246 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 2247 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 2248 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 2249 break; 2250 } 2251 2252 if (Size == 64) { 2253 2254 if (isSALUMapping(MI)) { 2255 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 2256 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 2257 } else { 2258 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 2259 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); 2260 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 2261 2262 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); 2263 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 2264 } 2265 2266 break; 2267 } 2268 2269 LLVM_FALLTHROUGH; 2270 } 2271 case AMDGPU::G_GEP: 2272 case AMDGPU::G_ADD: 2273 case AMDGPU::G_SUB: 2274 case AMDGPU::G_MUL: 2275 case AMDGPU::G_SHL: 2276 case AMDGPU::G_LSHR: 2277 case AMDGPU::G_ASHR: 2278 case AMDGPU::G_UADDO: 2279 case AMDGPU::G_USUBO: 2280 case AMDGPU::G_UADDE: 2281 case AMDGPU::G_SADDE: 2282 case AMDGPU::G_USUBE: 2283 case AMDGPU::G_SSUBE: 2284 case AMDGPU::G_SMIN: 2285 case AMDGPU::G_SMAX: 2286 case AMDGPU::G_UMIN: 2287 case AMDGPU::G_UMAX: 2288 if (isSALUMapping(MI)) 2289 return getDefaultMappingSOP(MI); 2290 LLVM_FALLTHROUGH; 2291 2292 case AMDGPU::G_FADD: 2293 case AMDGPU::G_FSUB: 2294 case AMDGPU::G_FPTOSI: 2295 case AMDGPU::G_FPTOUI: 2296 case AMDGPU::G_FMUL: 2297 case AMDGPU::G_FMA: 2298 case AMDGPU::G_FMAD: 2299 case AMDGPU::G_FSQRT: 2300 case AMDGPU::G_FFLOOR: 2301 case AMDGPU::G_FCEIL: 2302 case AMDGPU::G_FRINT: 2303 case AMDGPU::G_SITOFP: 2304 case AMDGPU::G_UITOFP: 2305 case AMDGPU::G_FPTRUNC: 2306 case AMDGPU::G_FPEXT: 2307 case AMDGPU::G_FEXP2: 2308 case AMDGPU::G_FLOG2: 2309 case AMDGPU::G_FMINNUM: 2310 case AMDGPU::G_FMAXNUM: 2311 case AMDGPU::G_FMINNUM_IEEE: 2312 case AMDGPU::G_FMAXNUM_IEEE: 2313 case AMDGPU::G_FCANONICALIZE: 2314 case AMDGPU::G_INTRINSIC_TRUNC: 2315 case AMDGPU::G_INTRINSIC_ROUND: 2316 case AMDGPU::G_AMDGPU_FFBH_U32: 2317 return getDefaultMappingVOP(MI); 2318 case AMDGPU::G_UMULH: 2319 case AMDGPU::G_SMULH: { 2320 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 2321 return getDefaultMappingSOP(MI); 2322 return getDefaultMappingVOP(MI); 2323 } 2324 case AMDGPU::G_IMPLICIT_DEF: { 2325 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2326 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2327 break; 2328 } 2329 case AMDGPU::G_FCONSTANT: 2330 case AMDGPU::G_CONSTANT: 2331 case AMDGPU::G_GLOBAL_VALUE: 2332 case AMDGPU::G_BLOCK_ADDR: { 2333 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2334 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2335 break; 2336 } 2337 case AMDGPU::G_FRAME_INDEX: { 2338 // TODO: This should be the same as other constants, but eliminateFrameIndex 2339 // currently assumes VALU uses. 2340 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2341 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2342 break; 2343 } 2344 case AMDGPU::G_INSERT: { 2345 unsigned BankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : 2346 AMDGPU::VGPRRegBankID; 2347 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2348 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2349 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 2350 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 2351 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 2352 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 2353 OpdsMapping[3] = nullptr; 2354 break; 2355 } 2356 case AMDGPU::G_EXTRACT: { 2357 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2358 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2359 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2360 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 2361 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 2362 OpdsMapping[2] = nullptr; 2363 break; 2364 } 2365 case AMDGPU::G_BUILD_VECTOR: 2366 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2367 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 2368 if (DstTy == LLT::vector(2, 16)) { 2369 unsigned DstSize = DstTy.getSizeInBits(); 2370 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2371 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2372 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2373 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 2374 2375 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 2376 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 2377 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 2378 break; 2379 } 2380 2381 LLVM_FALLTHROUGH; 2382 } 2383 case AMDGPU::G_MERGE_VALUES: 2384 case AMDGPU::G_CONCAT_VECTORS: { 2385 unsigned Bank = isSALUMapping(MI) ? 2386 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2387 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2388 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2389 2390 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 2391 // Op1 and Dst should use the same register bank. 
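// All of the source operands have the same type, so SrcSize applies to every
// one of them.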
2392 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 2393 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 2394 break; 2395 } 2396 case AMDGPU::G_BITCAST: 2397 case AMDGPU::G_INTTOPTR: 2398 case AMDGPU::G_PTRTOINT: 2399 case AMDGPU::G_CTLZ: 2400 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2401 case AMDGPU::G_CTTZ: 2402 case AMDGPU::G_CTTZ_ZERO_UNDEF: 2403 case AMDGPU::G_CTPOP: 2404 case AMDGPU::G_BSWAP: 2405 case AMDGPU::G_BITREVERSE: 2406 case AMDGPU::G_FABS: 2407 case AMDGPU::G_FNEG: { 2408 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2409 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2410 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 2411 break; 2412 } 2413 case AMDGPU::G_TRUNC: { 2414 Register Dst = MI.getOperand(0).getReg(); 2415 Register Src = MI.getOperand(1).getReg(); 2416 unsigned Bank = getRegBankID(Src, MRI, *TRI); 2417 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 2418 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 2419 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 2420 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 2421 break; 2422 } 2423 case AMDGPU::G_ZEXT: 2424 case AMDGPU::G_SEXT: 2425 case AMDGPU::G_ANYEXT: { 2426 Register Dst = MI.getOperand(0).getReg(); 2427 Register Src = MI.getOperand(1).getReg(); 2428 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 2429 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 2430 2431 unsigned DstBank; 2432 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 2433 assert(SrcBank); 2434 switch (SrcBank->getID()) { 2435 case AMDGPU::SCCRegBankID: 2436 case AMDGPU::SGPRRegBankID: 2437 DstBank = AMDGPU::SGPRRegBankID; 2438 break; 2439 default: 2440 DstBank = AMDGPU::VGPRRegBankID; 2441 break; 2442 } 2443 2444 // TODO: Should anyext be split into 32-bit part as well? 2445 if (MI.getOpcode() == AMDGPU::G_ANYEXT) { 2446 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); 2447 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); 2448 } else { 2449 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 2450 // 32-bits, and then to 64. 2451 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 2452 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 2453 SrcSize); 2454 } 2455 break; 2456 } 2457 case AMDGPU::G_FCMP: { 2458 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2459 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2460 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2461 OpdsMapping[1] = nullptr; // Predicate Operand. 2462 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 2463 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2464 break; 2465 } 2466 case AMDGPU::G_STORE: { 2467 assert(MI.getOperand(0).isReg()); 2468 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2469 // FIXME: We need to specify a different reg bank once scalar stores 2470 // are supported. 2471 const ValueMapping *ValMapping = 2472 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 2473 // FIXME: Depending on the type of store, the pointer could be in 2474 // the SGPR Reg bank. 2475 // FIXME: Pointer size should be based on the address space. 
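// For now the pointer is always reported as a 64-bit VGPR value.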
2476 const ValueMapping *PtrMapping = 2477 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 2478 2479 OpdsMapping[0] = ValMapping; 2480 OpdsMapping[1] = PtrMapping; 2481 break; 2482 } 2483 2484 case AMDGPU::G_ICMP: { 2485 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 2486 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2487 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2488 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2489 2490 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && 2491 Op3Bank == AMDGPU::SGPRRegBankID && 2492 (Size == 32 || (Size == 64 && 2493 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 2494 Subtarget.hasScalarCompareEq64())); 2495 2496 unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 2497 2498 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); 2499 OpdsMapping[1] = nullptr; // Predicate Operand. 2500 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 2501 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); 2502 break; 2503 } 2504 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2505 // VGPR index can be used for waterfall when indexing a SGPR vector. 2506 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2507 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2508 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2509 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2510 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2511 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 2512 2513 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 2514 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 2515 2516 // The index can be either if the source vector is VGPR. 2517 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2518 break; 2519 } 2520 case AMDGPU::G_INSERT_VECTOR_ELT: { 2521 unsigned OutputBankID = isSALUMapping(MI) ? 2522 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2523 2524 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2525 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2526 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 2527 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); 2528 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), 2529 MRI, *TRI); 2530 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2531 2532 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 2533 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); 2534 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, 2535 InsertSize); 2536 2537 // The index can be either if the source vector is VGPR. 2538 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 2539 break; 2540 } 2541 case AMDGPU::G_UNMERGE_VALUES: { 2542 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : 2543 AMDGPU::VGPRRegBankID; 2544 2545 // Op1 and Dst should use the same register bank. 2546 // FIXME: Shouldn't this be the default? Why do we need to handle this? 
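// The destination pieces and the source differ in size, so query each
// operand's size while assigning them all the same bank.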
2547 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2548 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 2549 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 2550 } 2551 break; 2552 } 2553 case AMDGPU::G_INTRINSIC: { 2554 switch (MI.getIntrinsicID()) { 2555 default: 2556 return getInvalidInstructionMapping(); 2557 case Intrinsic::amdgcn_div_fmas: 2558 case Intrinsic::amdgcn_trig_preop: 2559 case Intrinsic::amdgcn_sin: 2560 case Intrinsic::amdgcn_cos: 2561 case Intrinsic::amdgcn_log_clamp: 2562 case Intrinsic::amdgcn_rcp: 2563 case Intrinsic::amdgcn_rcp_legacy: 2564 case Intrinsic::amdgcn_rsq: 2565 case Intrinsic::amdgcn_rsq_legacy: 2566 case Intrinsic::amdgcn_rsq_clamp: 2567 case Intrinsic::amdgcn_ldexp: 2568 case Intrinsic::amdgcn_frexp_mant: 2569 case Intrinsic::amdgcn_frexp_exp: 2570 case Intrinsic::amdgcn_fract: 2571 case Intrinsic::amdgcn_cvt_pkrtz: 2572 case Intrinsic::amdgcn_cvt_pknorm_i16: 2573 case Intrinsic::amdgcn_cvt_pknorm_u16: 2574 case Intrinsic::amdgcn_cvt_pk_i16: 2575 case Intrinsic::amdgcn_cvt_pk_u16: 2576 case Intrinsic::amdgcn_fmed3: 2577 case Intrinsic::amdgcn_cubeid: 2578 case Intrinsic::amdgcn_cubema: 2579 case Intrinsic::amdgcn_cubesc: 2580 case Intrinsic::amdgcn_cubetc: 2581 case Intrinsic::amdgcn_sffbh: 2582 case Intrinsic::amdgcn_fmad_ftz: 2583 case Intrinsic::amdgcn_mbcnt_lo: 2584 case Intrinsic::amdgcn_mbcnt_hi: 2585 case Intrinsic::amdgcn_ubfe: 2586 case Intrinsic::amdgcn_sbfe: 2587 case Intrinsic::amdgcn_mul_u24: 2588 case Intrinsic::amdgcn_mul_i24: 2589 case Intrinsic::amdgcn_lerp: 2590 case Intrinsic::amdgcn_sad_u8: 2591 case Intrinsic::amdgcn_msad_u8: 2592 case Intrinsic::amdgcn_sad_hi_u8: 2593 case Intrinsic::amdgcn_sad_u16: 2594 case Intrinsic::amdgcn_qsad_pk_u16_u8: 2595 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 2596 case Intrinsic::amdgcn_mqsad_u32_u8: 2597 case Intrinsic::amdgcn_cvt_pk_u8_f32: 2598 case Intrinsic::amdgcn_alignbit: 2599 case Intrinsic::amdgcn_alignbyte: 2600 case Intrinsic::amdgcn_fdot2: 2601 case Intrinsic::amdgcn_sdot2: 2602 case Intrinsic::amdgcn_udot2: 2603 case Intrinsic::amdgcn_sdot4: 2604 case Intrinsic::amdgcn_udot4: 2605 case Intrinsic::amdgcn_sdot8: 2606 case Intrinsic::amdgcn_udot8: 2607 case Intrinsic::amdgcn_wwm: 2608 case Intrinsic::amdgcn_wqm: 2609 return getDefaultMappingVOP(MI); 2610 case Intrinsic::amdgcn_ds_swizzle: 2611 case Intrinsic::amdgcn_ds_permute: 2612 case Intrinsic::amdgcn_ds_bpermute: 2613 case Intrinsic::amdgcn_update_dpp: 2614 return getDefaultMappingAllVGPR(MI); 2615 case Intrinsic::amdgcn_kernarg_segment_ptr: 2616 case Intrinsic::amdgcn_s_getpc: 2617 case Intrinsic::amdgcn_groupstaticsize: { 2618 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2619 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2620 break; 2621 } 2622 case Intrinsic::amdgcn_wqm_vote: { 2623 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2624 OpdsMapping[0] = OpdsMapping[2] 2625 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 2626 break; 2627 } 2628 case Intrinsic::amdgcn_s_buffer_load: { 2629 // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS 2630 Register RSrc = MI.getOperand(2).getReg(); // SGPR 2631 Register Offset = MI.getOperand(3).getReg(); // SGPR/imm 2632 2633 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2634 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 2635 unsigned Size3 = MRI.getType(Offset).getSizeInBits(); 2636 2637 unsigned RSrcBank = getRegBankID(RSrc, MRI, 
*TRI); 2638 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); 2639 2640 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); 2641 OpdsMapping[1] = nullptr; // intrinsic id 2642 2643 // Lie and claim everything is legal, even though some need to be 2644 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 2645 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 2646 OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); 2647 OpdsMapping[4] = nullptr; 2648 break; 2649 } 2650 case Intrinsic::amdgcn_div_scale: { 2651 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2652 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 2653 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 2654 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 2655 2656 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 2657 OpdsMapping[3] = AMDGPU::getValueMapping( 2658 getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); 2659 OpdsMapping[4] = AMDGPU::getValueMapping( 2660 getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); 2661 2662 break; 2663 } 2664 case Intrinsic::amdgcn_class: { 2665 Register Src0Reg = MI.getOperand(2).getReg(); 2666 Register Src1Reg = MI.getOperand(3).getReg(); 2667 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 2668 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 2669 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2670 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 2671 OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), 2672 Src0Size); 2673 OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), 2674 Src1Size); 2675 break; 2676 } 2677 case Intrinsic::amdgcn_icmp: 2678 case Intrinsic::amdgcn_fcmp: { 2679 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2680 // This is not VCCRegBank because this is not used in boolean contexts. 2681 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 2682 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2683 unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); 2684 unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); 2685 OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); 2686 OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); 2687 break; 2688 } 2689 case Intrinsic::amdgcn_readlane: { 2690 // This must be an SGPR, but accept a VGPR. 
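// Default to SGPR here; if the index ends up in a VGPR, applyMappingImpl
// inserts a readfirstlane to legalize it.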
2691 Register IdxReg = MI.getOperand(3).getReg(); 2692 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 2693 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2694 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2695 LLVM_FALLTHROUGH; 2696 } 2697 case Intrinsic::amdgcn_readfirstlane: { 2698 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2699 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 2700 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 2701 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 2702 break; 2703 } 2704 case Intrinsic::amdgcn_writelane: { 2705 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2706 Register SrcReg = MI.getOperand(2).getReg(); 2707 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 2708 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2709 Register IdxReg = MI.getOperand(3).getReg(); 2710 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 2711 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); 2712 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 2713 2714 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 2715 // to legalize. 2716 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 2717 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 2718 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 2719 break; 2720 } 2721 case Intrinsic::amdgcn_if_break: { 2722 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 2723 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2724 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2725 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2726 break; 2727 } 2728 } 2729 break; 2730 } 2731 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 2732 auto IntrID = MI.getIntrinsicID(); 2733 switch (IntrID) { 2734 case Intrinsic::amdgcn_s_getreg: 2735 case Intrinsic::amdgcn_s_memtime: 2736 case Intrinsic::amdgcn_s_memrealtime: 2737 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 2738 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2739 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2740 break; 2741 } 2742 case Intrinsic::amdgcn_ds_append: 2743 case Intrinsic::amdgcn_ds_consume: 2744 case Intrinsic::amdgcn_ds_fadd: 2745 case Intrinsic::amdgcn_ds_fmin: 2746 case Intrinsic::amdgcn_ds_fmax: 2747 case Intrinsic::amdgcn_atomic_inc: 2748 case Intrinsic::amdgcn_atomic_dec: 2749 return getDefaultMappingAllVGPR(MI); 2750 case Intrinsic::amdgcn_ds_ordered_add: 2751 case Intrinsic::amdgcn_ds_ordered_swap: { 2752 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2753 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 2754 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2755 AMDGPU::SGPRRegBankID); 2756 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 2757 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2758 break; 2759 } 2760 case Intrinsic::amdgcn_exp_compr: 2761 OpdsMapping[0] = nullptr; // IntrinsicID 2762 // FIXME: These are immediate values which can't be read from registers. 
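// Report the immediate operands as 32-bit SGPR values for now.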
2763 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2764 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2765 // FIXME: Could we support packed types here? 2766 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2767 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2768 // FIXME: These are immediate values which can't be read from registers. 2769 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2770 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 2771 break; 2772 case Intrinsic::amdgcn_exp: 2773 // FIXME: Could we support packed types here? 2774 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2775 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2776 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2777 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2778 break; 2779 case Intrinsic::amdgcn_buffer_load: { 2780 Register RSrc = MI.getOperand(2).getReg(); // SGPR 2781 Register VIndex = MI.getOperand(3).getReg(); // VGPR 2782 Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm 2783 2784 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2785 unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); 2786 unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); 2787 unsigned Size4 = MRI.getType(Offset).getSizeInBits(); 2788 2789 unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); 2790 unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); 2791 2792 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); 2793 OpdsMapping[1] = nullptr; // intrinsic id 2794 2795 // Lie and claim everything is legal, even though some need to be 2796 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 2797 OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc 2798 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); 2799 OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); 2800 OpdsMapping[5] = nullptr; 2801 OpdsMapping[6] = nullptr; 2802 break; 2803 } 2804 case Intrinsic::amdgcn_s_sendmsg: 2805 case Intrinsic::amdgcn_s_sendmsghalt: { 2806 // This must be an SGPR, but accept a VGPR. 
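// As with the other M0 uses, a VGPR input here is legalized later with a
// readfirstlane.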
2807 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2808 AMDGPU::SGPRRegBankID); 2809 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 2810 break; 2811 } 2812 case Intrinsic::amdgcn_end_cf: 2813 case Intrinsic::amdgcn_init_exec: { 2814 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2815 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2816 break; 2817 } 2818 case Intrinsic::amdgcn_else: { 2819 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2820 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2821 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 2822 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 2823 break; 2824 } 2825 case Intrinsic::amdgcn_kill: { 2826 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 2827 break; 2828 } 2829 case Intrinsic::amdgcn_raw_buffer_load: 2830 case Intrinsic::amdgcn_raw_tbuffer_load: { 2831 // FIXME: Should make intrinsic ID the last operand of the instruction, 2832 // then this would be the same as store 2833 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 2834 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 2835 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 2836 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 2837 break; 2838 } 2839 case Intrinsic::amdgcn_raw_buffer_store: 2840 case Intrinsic::amdgcn_raw_buffer_store_format: 2841 case Intrinsic::amdgcn_raw_tbuffer_store: { 2842 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 2843 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 2844 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 2845 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 2846 break; 2847 } 2848 case Intrinsic::amdgcn_struct_buffer_load: 2849 case Intrinsic::amdgcn_struct_tbuffer_load: { 2850 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 2851 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 2852 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 2853 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 2854 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 2855 break; 2856 } 2857 case Intrinsic::amdgcn_struct_buffer_store: 2858 case Intrinsic::amdgcn_struct_tbuffer_store: { 2859 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 2860 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 2861 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 2862 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 2863 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 2864 break; 2865 } 2866 case Intrinsic::amdgcn_init_exec_from_input: { 2867 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 2868 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2869 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 2870 break; 2871 } 2872 case Intrinsic::amdgcn_ds_gws_init: 2873 case Intrinsic::amdgcn_ds_gws_barrier: 2874 case Intrinsic::amdgcn_ds_gws_sema_br: { 2875 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 2876 2877 // This must be an SGPR, but accept a VGPR. 
2878 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2879 AMDGPU::SGPRRegBankID); 2880 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 2881 break; 2882 } 2883 case Intrinsic::amdgcn_ds_gws_sema_v: 2884 case Intrinsic::amdgcn_ds_gws_sema_p: 2885 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 2886 // This must be an SGPR, but accept a VGPR. 2887 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 2888 AMDGPU::SGPRRegBankID); 2889 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 2890 break; 2891 } 2892 default: 2893 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 2894 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 2895 // Non-images can have complications from operands that allow both SGPR 2896 // and VGPR. For now it's too complicated to figure out the final opcode 2897 // to derive the register bank from the MCInstrDesc. 2898 if (RSrcIntrin->IsImage) 2899 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 2900 } 2901 2902 return getInvalidInstructionMapping(); 2903 } 2904 break; 2905 } 2906 case AMDGPU::G_SELECT: { 2907 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 2908 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, 2909 AMDGPU::SGPRRegBankID); 2910 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, 2911 AMDGPU::SGPRRegBankID); 2912 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 2913 Op3Bank == AMDGPU::SGPRRegBankID; 2914 2915 unsigned CondBankDefault = SGPRSrcs ? 2916 AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 2917 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, 2918 CondBankDefault); 2919 if (CondBank == AMDGPU::SGPRRegBankID) 2920 CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; 2921 else if (CondBank == AMDGPU::VGPRRegBankID) 2922 CondBank = AMDGPU::VCCRegBankID; 2923 2924 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ? 
2925 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 2926 2927 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID); 2928 2929 if (Size == 64) { 2930 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2931 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 2932 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2933 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 2934 } else { 2935 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 2936 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 2937 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 2938 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 2939 } 2940 2941 break; 2942 } 2943 2944 case AMDGPU::G_LOAD: 2945 case AMDGPU::G_ZEXTLOAD: 2946 case AMDGPU::G_SEXTLOAD: 2947 return getInstrMappingForLoad(MI); 2948 2949 case AMDGPU::G_ATOMICRMW_XCHG: 2950 case AMDGPU::G_ATOMICRMW_ADD: 2951 case AMDGPU::G_ATOMICRMW_SUB: 2952 case AMDGPU::G_ATOMICRMW_AND: 2953 case AMDGPU::G_ATOMICRMW_OR: 2954 case AMDGPU::G_ATOMICRMW_XOR: 2955 case AMDGPU::G_ATOMICRMW_MAX: 2956 case AMDGPU::G_ATOMICRMW_MIN: 2957 case AMDGPU::G_ATOMICRMW_UMAX: 2958 case AMDGPU::G_ATOMICRMW_UMIN: 2959 case AMDGPU::G_ATOMICRMW_FADD: 2960 case AMDGPU::G_ATOMIC_CMPXCHG: { 2961 return getDefaultMappingAllVGPR(MI); 2962 } 2963 case AMDGPU::G_BRCOND: { 2964 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, 2965 AMDGPU::SGPRRegBankID); 2966 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 2967 if (Bank != AMDGPU::SCCRegBankID) 2968 Bank = AMDGPU::VCCRegBankID; 2969 2970 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 2971 break; 2972 } 2973 } 2974 2975 return getInstructionMapping(/*ID*/1, /*Cost*/1, 2976 getOperandsMapping(OpdsMapping), 2977 MI.getNumOperands()); 2978 } 2979