1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPUInstrInfo.h" 18 #include "AMDGPURegisterBankInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 22 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" 23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 25 #include "llvm/IR/DiagnosticInfo.h" 26 27 #define DEBUG_TYPE "amdgpu-isel" 28 29 using namespace llvm; 30 using namespace MIPatternMatch; 31 32 static cl::opt<bool> AllowRiskySelect( 33 "amdgpu-global-isel-risky-select", 34 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"), 35 cl::init(false), 36 cl::ReallyHidden); 37 38 #define GET_GLOBALISEL_IMPL 39 #define AMDGPUSubtarget GCNSubtarget 40 #include "AMDGPUGenGlobalISel.inc" 41 #undef GET_GLOBALISEL_IMPL 42 #undef AMDGPUSubtarget 43 44 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 45 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 46 const AMDGPUTargetMachine &TM) 47 : InstructionSelector(), TII(*STI.getInstrInfo()), 48 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 49 STI(STI), 50 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 51 #define GET_GLOBALISEL_PREDICATES_INIT 52 #include "AMDGPUGenGlobalISel.inc" 53 #undef GET_GLOBALISEL_PREDICATES_INIT 54 #define GET_GLOBALISEL_TEMPORARIES_INIT 55 #include "AMDGPUGenGlobalISel.inc" 56 #undef GET_GLOBALISEL_TEMPORARIES_INIT 57 { 58 } 59 60 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 61 62 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, 63 CodeGenCoverage &CoverageInfo) { 64 MRI = &MF.getRegInfo(); 65 Subtarget = &MF.getSubtarget<GCNSubtarget>(); 66 InstructionSelector::setupMF(MF, KB, CoverageInfo); 67 } 68 69 bool AMDGPUInstructionSelector::isVCC(Register Reg, 70 const MachineRegisterInfo &MRI) const { 71 // The verifier is oblivious to s1 being a valid value for wavesize registers. 72 if (Reg.isPhysical()) 73 return false; 74 75 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 76 const TargetRegisterClass *RC = 77 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 78 if (RC) { 79 const LLT Ty = MRI.getType(Reg); 80 return RC->hasSuperClassEq(TRI.getBoolRC()) && 81 Ty.isValid() && Ty.getSizeInBits() == 1; 82 } 83 84 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 85 return RB->getID() == AMDGPU::VCCRegBankID; 86 } 87 88 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 89 unsigned NewOpc) const { 90 MI.setDesc(TII.get(NewOpc)); 91 MI.RemoveOperand(1); // Remove intrinsic ID. 
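  // Add an implicit use of exec for the new opcode; these copy-like
  // pseudos depend on the exec mask.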
92 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 93 94 MachineOperand &Dst = MI.getOperand(0); 95 MachineOperand &Src = MI.getOperand(1); 96 97 // TODO: This should be legalized to s32 if needed 98 if (MRI->getType(Dst.getReg()) == LLT::scalar(1)) 99 return false; 100 101 const TargetRegisterClass *DstRC 102 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 103 const TargetRegisterClass *SrcRC 104 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 105 if (!DstRC || DstRC != SrcRC) 106 return false; 107 108 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && 109 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); 110 } 111 112 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { 113 const DebugLoc &DL = I.getDebugLoc(); 114 MachineBasicBlock *BB = I.getParent(); 115 I.setDesc(TII.get(TargetOpcode::COPY)); 116 117 const MachineOperand &Src = I.getOperand(1); 118 MachineOperand &Dst = I.getOperand(0); 119 Register DstReg = Dst.getReg(); 120 Register SrcReg = Src.getReg(); 121 122 if (isVCC(DstReg, *MRI)) { 123 if (SrcReg == AMDGPU::SCC) { 124 const TargetRegisterClass *RC 125 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 126 if (!RC) 127 return true; 128 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 129 } 130 131 if (!isVCC(SrcReg, *MRI)) { 132 // TODO: Should probably leave the copy and let copyPhysReg expand it. 133 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) 134 return false; 135 136 const TargetRegisterClass *SrcRC 137 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 138 139 Register MaskedReg = MRI->createVirtualRegister(SrcRC); 140 141 // We can't trust the high bits at this point, so clear them. 142 143 // TODO: Skip masking high bits if def is known boolean. 144 145 unsigned AndOpc = TRI.isSGPRClass(SrcRC) ? 146 AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; 147 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) 148 .addImm(1) 149 .addReg(SrcReg); 150 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) 151 .addImm(0) 152 .addReg(MaskedReg); 153 154 if (!MRI->getRegClassOrNull(SrcReg)) 155 MRI->setRegClass(SrcReg, SrcRC); 156 I.eraseFromParent(); 157 return true; 158 } 159 160 const TargetRegisterClass *RC = 161 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 162 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 163 return false; 164 165 return true; 166 } 167 168 for (const MachineOperand &MO : I.operands()) { 169 if (MO.getReg().isPhysical()) 170 continue; 171 172 const TargetRegisterClass *RC = 173 TRI.getConstrainedRegClassForOperand(MO, *MRI); 174 if (!RC) 175 continue; 176 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 177 } 178 return true; 179 } 180 181 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 182 const Register DefReg = I.getOperand(0).getReg(); 183 const LLT DefTy = MRI->getType(DefReg); 184 if (DefTy == LLT::scalar(1)) { 185 if (!AllowRiskySelect) { 186 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n"); 187 return false; 188 } 189 190 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n"); 191 } 192 193 // TODO: Verify this doesn't have insane operands (i.e. 
  // VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ?
AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 262 default: 263 llvm_unreachable("not a bit op"); 264 } 265 } 266 267 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 268 Register DstReg = I.getOperand(0).getReg(); 269 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 270 271 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 272 if (DstRB->getID() != AMDGPU::SGPRRegBankID && 273 DstRB->getID() != AMDGPU::VCCRegBankID) 274 return false; 275 276 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID && 277 STI.isWave64()); 278 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64))); 279 280 // Dead implicit-def of scc 281 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 282 true, // isImp 283 false, // isKill 284 true)); // isDead 285 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 286 } 287 288 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 289 MachineBasicBlock *BB = I.getParent(); 290 MachineFunction *MF = BB->getParent(); 291 Register DstReg = I.getOperand(0).getReg(); 292 const DebugLoc &DL = I.getDebugLoc(); 293 LLT Ty = MRI->getType(DstReg); 294 if (Ty.isVector()) 295 return false; 296 297 unsigned Size = Ty.getSizeInBits(); 298 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 299 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 300 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 301 302 if (Size == 32) { 303 if (IsSALU) { 304 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 305 MachineInstr *Add = 306 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 307 .add(I.getOperand(1)) 308 .add(I.getOperand(2)); 309 I.eraseFromParent(); 310 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 311 } 312 313 if (STI.hasAddNoCarry()) { 314 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 315 I.setDesc(TII.get(Opc)); 316 I.addOperand(*MF, MachineOperand::CreateImm(0)); 317 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 318 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 319 } 320 321 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64; 322 323 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 324 MachineInstr *Add 325 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 326 .addDef(UnusedCarry, RegState::Dead) 327 .add(I.getOperand(1)) 328 .add(I.getOperand(2)) 329 .addImm(0); 330 I.eraseFromParent(); 331 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 332 } 333 334 assert(!Sub && "illegal sub should not reach here"); 335 336 const TargetRegisterClass &RC 337 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 338 const TargetRegisterClass &HalfRC 339 = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 340 341 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 342 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 343 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 344 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 345 346 Register DstLo = MRI->createVirtualRegister(&HalfRC); 347 Register DstHi = MRI->createVirtualRegister(&HalfRC); 348 349 if (IsSALU) { 350 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 351 .add(Lo1) 352 .add(Lo2); 353 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 354 .add(Hi1) 355 .add(Hi2); 356 } else { 357 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 358 Register CarryReg = MRI->createVirtualRegister(CarryRC); 359 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo) 360 .addDef(CarryReg) 361 .add(Lo1) 362 .add(Lo2) 363 .addImm(0); 364 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 365 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 366 .add(Hi1) 367 .add(Hi2) 368 .addReg(CarryReg, RegState::Kill) 369 .addImm(0); 370 371 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 372 return false; 373 } 374 375 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 376 .addReg(DstLo) 377 .addImm(AMDGPU::sub0) 378 .addReg(DstHi) 379 .addImm(AMDGPU::sub1); 380 381 382 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 383 return false; 384 385 I.eraseFromParent(); 386 return true; 387 } 388 389 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 390 MachineInstr &I) const { 391 MachineBasicBlock *BB = I.getParent(); 392 MachineFunction *MF = BB->getParent(); 393 const DebugLoc &DL = I.getDebugLoc(); 394 Register Dst0Reg = I.getOperand(0).getReg(); 395 Register Dst1Reg = I.getOperand(1).getReg(); 396 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 397 I.getOpcode() == AMDGPU::G_UADDE; 398 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 399 I.getOpcode() == AMDGPU::G_USUBE; 400 401 if (isVCC(Dst1Reg, *MRI)) { 402 unsigned NoCarryOpc = 403 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 404 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 405 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc)); 406 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 407 I.addOperand(*MF, MachineOperand::CreateImm(0)); 408 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 409 } 410 411 Register Src0Reg = I.getOperand(2).getReg(); 412 Register Src1Reg = I.getOperand(3).getReg(); 413 414 if (HasCarryIn) { 415 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 416 .addReg(I.getOperand(4).getReg()); 417 } 418 419 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 420 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 421 422 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc), Dst0Reg) 423 .add(I.getOperand(2)) 424 .add(I.getOperand(3)); 425 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 426 .addReg(AMDGPU::SCC); 427 428 if (!MRI->getRegClassOrNull(Dst1Reg)) 429 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 430 431 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 432 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 433 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 434 return false; 435 436 if (HasCarryIn && 437 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 438 AMDGPU::SReg_32RegClass, *MRI)) 439 return false; 440 441 I.eraseFromParent(); 442 return true; 443 } 444 445 // TODO: We should probably legalize these to only using 32-bit results. 446 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 447 MachineBasicBlock *BB = I.getParent(); 448 Register DstReg = I.getOperand(0).getReg(); 449 Register SrcReg = I.getOperand(1).getReg(); 450 LLT DstTy = MRI->getType(DstReg); 451 LLT SrcTy = MRI->getType(SrcReg); 452 const unsigned SrcSize = SrcTy.getSizeInBits(); 453 unsigned DstSize = DstTy.getSizeInBits(); 454 455 // TODO: Should handle any multiple of 32 offset. 456 unsigned Offset = I.getOperand(2).getImm(); 457 if (Offset % 32 != 0 || DstSize > 128) 458 return false; 459 460 // 16-bit operations really use 32-bit registers. 461 // FIXME: Probably should not allow 16-bit G_EXTRACT results. 462 if (DstSize == 16) 463 DstSize = 32; 464 465 const TargetRegisterClass *DstRC = 466 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 467 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 468 return false; 469 470 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 471 const TargetRegisterClass *SrcRC = 472 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 473 if (!SrcRC) 474 return false; 475 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, 476 DstSize / 32); 477 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); 478 if (!SrcRC) 479 return false; 480 481 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, 482 *SrcRC, I.getOperand(1)); 483 const DebugLoc &DL = I.getDebugLoc(); 484 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 485 .addReg(SrcReg, 0, SubReg); 486 487 I.eraseFromParent(); 488 return true; 489 } 490 491 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 492 MachineBasicBlock *BB = MI.getParent(); 493 Register DstReg = MI.getOperand(0).getReg(); 494 LLT DstTy = MRI->getType(DstReg); 495 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 496 497 const unsigned SrcSize = SrcTy.getSizeInBits(); 498 if (SrcSize < 32) 499 return selectImpl(MI, *CoverageInfo); 500 501 const DebugLoc &DL = MI.getDebugLoc(); 502 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 503 const unsigned DstSize = DstTy.getSizeInBits(); 504 const TargetRegisterClass *DstRC = 505 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 506 if (!DstRC) 507 return false; 508 509 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 510 MachineInstrBuilder MIB = 511 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 512 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 513 MachineOperand &Src = MI.getOperand(I + 1); 514 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 515 MIB.addImm(SubRegs[I]); 516 517 const TargetRegisterClass *SrcRC 518 = 
TRI.getConstrainedRegClassForOperand(Src, *MRI); 519 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 520 return false; 521 } 522 523 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 524 return false; 525 526 MI.eraseFromParent(); 527 return true; 528 } 529 530 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 531 MachineBasicBlock *BB = MI.getParent(); 532 const int NumDst = MI.getNumOperands() - 1; 533 534 MachineOperand &Src = MI.getOperand(NumDst); 535 536 Register SrcReg = Src.getReg(); 537 Register DstReg0 = MI.getOperand(0).getReg(); 538 LLT DstTy = MRI->getType(DstReg0); 539 LLT SrcTy = MRI->getType(SrcReg); 540 541 const unsigned DstSize = DstTy.getSizeInBits(); 542 const unsigned SrcSize = SrcTy.getSizeInBits(); 543 const DebugLoc &DL = MI.getDebugLoc(); 544 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 545 546 const TargetRegisterClass *SrcRC = 547 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); 548 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 549 return false; 550 551 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 552 // source, and this relies on the fact that the same subregister indices are 553 // used for both. 554 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 555 for (int I = 0, E = NumDst; I != E; ++I) { 556 MachineOperand &Dst = MI.getOperand(I); 557 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 558 .addReg(SrcReg, 0, SubRegs[I]); 559 560 // Make sure the subregister index is valid for the source register. 561 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]); 562 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 563 return false; 564 565 const TargetRegisterClass *DstRC = 566 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 567 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 568 return false; 569 } 570 571 MI.eraseFromParent(); 572 return true; 573 } 574 575 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( 576 MachineInstr &MI) const { 577 if (selectImpl(MI, *CoverageInfo)) 578 return true; 579 580 const LLT S32 = LLT::scalar(32); 581 const LLT V2S16 = LLT::vector(2, 16); 582 583 Register Dst = MI.getOperand(0).getReg(); 584 if (MRI->getType(Dst) != V2S16) 585 return false; 586 587 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI); 588 if (DstBank->getID() != AMDGPU::SGPRRegBankID) 589 return false; 590 591 Register Src0 = MI.getOperand(1).getReg(); 592 Register Src1 = MI.getOperand(2).getReg(); 593 if (MRI->getType(Src0) != S32) 594 return false; 595 596 const DebugLoc &DL = MI.getDebugLoc(); 597 MachineBasicBlock *BB = MI.getParent(); 598 599 auto ConstSrc1 = 600 getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true); 601 if (ConstSrc1) { 602 auto ConstSrc0 = 603 getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true); 604 if (ConstSrc0) { 605 const int64_t K0 = ConstSrc0->Value.getSExtValue(); 606 const int64_t K1 = ConstSrc1->Value.getSExtValue(); 607 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff; 608 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff; 609 610 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) 611 .addImm(Lo16 | (Hi16 << 16)); 612 MI.eraseFromParent(); 613 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); 614 } 615 } 616 617 // TODO: This should probably be a combine somewhere 618 // (build_vector_trunc $src0, undef -> copy $src0 619 MachineInstr *Src1Def = 
getDefIgnoringCopies(Src1, *MRI); 620 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { 621 MI.setDesc(TII.get(AMDGPU::COPY)); 622 MI.RemoveOperand(2); 623 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && 624 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); 625 } 626 627 Register ShiftSrc0; 628 Register ShiftSrc1; 629 630 // With multiple uses of the shift, this will duplicate the shift and 631 // increase register pressure. 632 // 633 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) 634 // => (S_PACK_HH_B32_B16 $src0, $src1) 635 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) 636 // => (S_PACK_LH_B32_B16 $src0, $src1) 637 // (build_vector_trunc $src0, $src1) 638 // => (S_PACK_LL_B32_B16 $src0, $src1) 639 640 bool Shift0 = mi_match( 641 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16)))); 642 643 bool Shift1 = mi_match( 644 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16)))); 645 646 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; 647 if (Shift0 && Shift1) { 648 Opc = AMDGPU::S_PACK_HH_B32_B16; 649 MI.getOperand(1).setReg(ShiftSrc0); 650 MI.getOperand(2).setReg(ShiftSrc1); 651 } else if (Shift1) { 652 Opc = AMDGPU::S_PACK_LH_B32_B16; 653 MI.getOperand(2).setReg(ShiftSrc1); 654 } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { 655 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 656 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) 657 .addReg(ShiftSrc0) 658 .addImm(16); 659 660 MI.eraseFromParent(); 661 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 662 } 663 664 MI.setDesc(TII.get(Opc)); 665 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 666 } 667 668 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { 669 return selectG_ADD_SUB(I); 670 } 671 672 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 673 const MachineOperand &MO = I.getOperand(0); 674 675 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 676 // regbank check here is to know why getConstrainedRegClassForOperand failed. 677 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 678 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 679 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 680 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 681 return true; 682 } 683 684 return false; 685 } 686 687 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 688 MachineBasicBlock *BB = I.getParent(); 689 690 Register DstReg = I.getOperand(0).getReg(); 691 Register Src0Reg = I.getOperand(1).getReg(); 692 Register Src1Reg = I.getOperand(2).getReg(); 693 LLT Src1Ty = MRI->getType(Src1Reg); 694 695 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 696 unsigned InsSize = Src1Ty.getSizeInBits(); 697 698 int64_t Offset = I.getOperand(3).getImm(); 699 700 // FIXME: These cases should have been illegal and unnecessary to check here. 701 if (Offset % 32 != 0 || InsSize % 32 != 0) 702 return false; 703 704 // Currently not handled by getSubRegFromChannel. 
705 if (InsSize > 128) 706 return false; 707 708 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 709 if (SubReg == AMDGPU::NoSubRegister) 710 return false; 711 712 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 713 const TargetRegisterClass *DstRC = 714 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 715 if (!DstRC) 716 return false; 717 718 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 719 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 720 const TargetRegisterClass *Src0RC = 721 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); 722 const TargetRegisterClass *Src1RC = 723 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); 724 725 // Deal with weird cases where the class only partially supports the subreg 726 // index. 727 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 728 if (!Src0RC || !Src1RC) 729 return false; 730 731 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 732 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 733 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 734 return false; 735 736 const DebugLoc &DL = I.getDebugLoc(); 737 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 738 .addReg(Src0Reg) 739 .addReg(Src1Reg) 740 .addImm(SubReg); 741 742 I.eraseFromParent(); 743 return true; 744 } 745 746 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 747 if (STI.getLDSBankCount() != 16) 748 return selectImpl(MI, *CoverageInfo); 749 750 Register Dst = MI.getOperand(0).getReg(); 751 Register Src0 = MI.getOperand(2).getReg(); 752 Register M0Val = MI.getOperand(6).getReg(); 753 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 754 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 755 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 756 return false; 757 758 // This requires 2 instructions. It is possible to write a pattern to support 759 // this, but the generated isel emitter doesn't correctly deal with multiple 760 // output instructions using the same physical register input. The copy to m0 761 // is incorrectly placed before the second instruction. 762 // 763 // TODO: Match source modifiers. 764 765 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 766 const DebugLoc &DL = MI.getDebugLoc(); 767 MachineBasicBlock *MBB = MI.getParent(); 768 769 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 770 .addReg(M0Val); 771 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) 772 .addImm(2) 773 .addImm(MI.getOperand(4).getImm()) // $attr 774 .addImm(MI.getOperand(3).getImm()); // $attrchan 775 776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) 777 .addImm(0) // $src0_modifiers 778 .addReg(Src0) // $src0 779 .addImm(MI.getOperand(4).getImm()) // $attr 780 .addImm(MI.getOperand(3).getImm()) // $attrchan 781 .addImm(0) // $src2_modifiers 782 .addReg(InterpMov) // $src2 - 2 f16 values selected by high 783 .addImm(MI.getOperand(5).getImm()) // $high 784 .addImm(0) // $clamp 785 .addImm(0); // $omod 786 787 MI.eraseFromParent(); 788 return true; 789 } 790 791 // Writelane is special in that it can use SGPR and M0 (which would normally 792 // count as using the constant bus twice - but in this case it is allowed since 793 // the lane selector doesn't count as a use of the constant bus). However, it is 794 // still required to abide by the 1 SGPR rule. Fix this up if we might have 795 // multiple SGPRs. 
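// Illustrative case (hypothetical MIR, not taken from the source): with a
// single constant bus slot, something like
//   $vgpr0 = V_WRITELANE_B32 $sgpr4, $sgpr5, $vgpr0
// would read two SGPRs in one VALU instruction. The selection below avoids
// that by folding a constant lane select into an immediate, using an
// inline-immediate value directly, or otherwise moving the lane select
// into m0.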
796 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const { 797 // With a constant bus limit of at least 2, there's no issue. 798 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1) 799 return selectImpl(MI, *CoverageInfo); 800 801 MachineBasicBlock *MBB = MI.getParent(); 802 const DebugLoc &DL = MI.getDebugLoc(); 803 Register VDst = MI.getOperand(0).getReg(); 804 Register Val = MI.getOperand(2).getReg(); 805 Register LaneSelect = MI.getOperand(3).getReg(); 806 Register VDstIn = MI.getOperand(4).getReg(); 807 808 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst); 809 810 Optional<ValueAndVReg> ConstSelect = 811 getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true); 812 if (ConstSelect) { 813 // The selector has to be an inline immediate, so we can use whatever for 814 // the other operands. 815 MIB.addReg(Val); 816 MIB.addImm(ConstSelect->Value.getSExtValue() & 817 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2())); 818 } else { 819 Optional<ValueAndVReg> ConstVal = 820 getConstantVRegValWithLookThrough(Val, *MRI, true, true); 821 822 // If the value written is an inline immediate, we can get away without a 823 // copy to m0. 824 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(), 825 STI.hasInv2PiInlineImm())) { 826 MIB.addImm(ConstVal->Value.getSExtValue()); 827 MIB.addReg(LaneSelect); 828 } else { 829 MIB.addReg(Val); 830 831 // If the lane selector was originally in a VGPR and copied with 832 // readfirstlane, there's a hazard to read the same SGPR from the 833 // VALU. Constrain to a different SGPR to help avoid needing a nop later. 834 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI); 835 836 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 837 .addReg(LaneSelect); 838 MIB.addReg(AMDGPU::M0); 839 } 840 } 841 842 MIB.addReg(VDstIn); 843 844 MI.eraseFromParent(); 845 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 846 } 847 848 // We need to handle this here because tablegen doesn't support matching 849 // instructions with multiple outputs. 850 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { 851 Register Dst0 = MI.getOperand(0).getReg(); 852 Register Dst1 = MI.getOperand(1).getReg(); 853 854 LLT Ty = MRI->getType(Dst0); 855 unsigned Opc; 856 if (Ty == LLT::scalar(32)) 857 Opc = AMDGPU::V_DIV_SCALE_F32_e64; 858 else if (Ty == LLT::scalar(64)) 859 Opc = AMDGPU::V_DIV_SCALE_F64_e64; 860 else 861 return false; 862 863 // TODO: Match source modifiers. 864 865 const DebugLoc &DL = MI.getDebugLoc(); 866 MachineBasicBlock *MBB = MI.getParent(); 867 868 Register Numer = MI.getOperand(3).getReg(); 869 Register Denom = MI.getOperand(4).getReg(); 870 unsigned ChooseDenom = MI.getOperand(5).getImm(); 871 872 Register Src0 = ChooseDenom != 0 ? 
                                     Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = I.getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::WWM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
    return selectIntrinsicIcmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ?
AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; 966 } 967 } 968 969 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 970 unsigned Size) const { 971 if (Size == 64) { 972 if (!STI.hasScalarCompareEq64()) 973 return -1; 974 975 switch (P) { 976 case CmpInst::ICMP_NE: 977 return AMDGPU::S_CMP_LG_U64; 978 case CmpInst::ICMP_EQ: 979 return AMDGPU::S_CMP_EQ_U64; 980 default: 981 return -1; 982 } 983 } 984 985 if (Size != 32) 986 return -1; 987 988 switch (P) { 989 case CmpInst::ICMP_NE: 990 return AMDGPU::S_CMP_LG_U32; 991 case CmpInst::ICMP_EQ: 992 return AMDGPU::S_CMP_EQ_U32; 993 case CmpInst::ICMP_SGT: 994 return AMDGPU::S_CMP_GT_I32; 995 case CmpInst::ICMP_SGE: 996 return AMDGPU::S_CMP_GE_I32; 997 case CmpInst::ICMP_SLT: 998 return AMDGPU::S_CMP_LT_I32; 999 case CmpInst::ICMP_SLE: 1000 return AMDGPU::S_CMP_LE_I32; 1001 case CmpInst::ICMP_UGT: 1002 return AMDGPU::S_CMP_GT_U32; 1003 case CmpInst::ICMP_UGE: 1004 return AMDGPU::S_CMP_GE_U32; 1005 case CmpInst::ICMP_ULT: 1006 return AMDGPU::S_CMP_LT_U32; 1007 case CmpInst::ICMP_ULE: 1008 return AMDGPU::S_CMP_LE_U32; 1009 default: 1010 llvm_unreachable("Unknown condition code!"); 1011 } 1012 } 1013 1014 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { 1015 MachineBasicBlock *BB = I.getParent(); 1016 const DebugLoc &DL = I.getDebugLoc(); 1017 1018 Register SrcReg = I.getOperand(2).getReg(); 1019 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1020 1021 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 1022 1023 Register CCReg = I.getOperand(0).getReg(); 1024 if (!isVCC(CCReg, *MRI)) { 1025 int Opcode = getS_CMPOpcode(Pred, Size); 1026 if (Opcode == -1) 1027 return false; 1028 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 1029 .add(I.getOperand(2)) 1030 .add(I.getOperand(3)); 1031 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 1032 .addReg(AMDGPU::SCC); 1033 bool Ret = 1034 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 1035 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 1036 I.eraseFromParent(); 1037 return Ret; 1038 } 1039 1040 int Opcode = getV_CMPOpcode(Pred, Size); 1041 if (Opcode == -1) 1042 return false; 1043 1044 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 1045 I.getOperand(0).getReg()) 1046 .add(I.getOperand(2)) 1047 .add(I.getOperand(3)); 1048 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 1049 *TRI.getBoolRC(), *MRI); 1050 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 1051 I.eraseFromParent(); 1052 return Ret; 1053 } 1054 1055 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const { 1056 Register Dst = I.getOperand(0).getReg(); 1057 if (isVCC(Dst, *MRI)) 1058 return false; 1059 1060 if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize()) 1061 return false; 1062 1063 MachineBasicBlock *BB = I.getParent(); 1064 const DebugLoc &DL = I.getDebugLoc(); 1065 Register SrcReg = I.getOperand(2).getReg(); 1066 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1067 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); 1068 1069 int Opcode = getV_CMPOpcode(Pred, Size); 1070 if (Opcode == -1) 1071 return false; 1072 1073 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst) 1074 .add(I.getOperand(2)) 1075 .add(I.getOperand(3)); 1076 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(), 1077 *MRI); 1078 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 1079 I.eraseFromParent(); 1080 return Ret; 
1081 } 1082 1083 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { 1084 MachineBasicBlock *BB = I.getParent(); 1085 const DebugLoc &DL = I.getDebugLoc(); 1086 Register DstReg = I.getOperand(0).getReg(); 1087 const unsigned Size = MRI->getType(DstReg).getSizeInBits(); 1088 const bool Is64 = Size == 64; 1089 1090 if (Size != STI.getWavefrontSize()) 1091 return false; 1092 1093 Optional<ValueAndVReg> Arg = 1094 getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true); 1095 1096 if (Arg.hasValue()) { 1097 const int64_t Value = Arg.getValue().Value.getSExtValue(); 1098 if (Value == 0) { 1099 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 1100 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); 1101 } else if (Value == -1) { // all ones 1102 Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; 1103 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); 1104 } else 1105 return false; 1106 } else { 1107 Register SrcReg = I.getOperand(2).getReg(); 1108 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); 1109 } 1110 1111 I.eraseFromParent(); 1112 return true; 1113 } 1114 1115 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { 1116 Register DstReg = I.getOperand(0).getReg(); 1117 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1118 const TargetRegisterClass *DstRC = 1119 TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI); 1120 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 1121 return false; 1122 1123 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID; 1124 1125 Module *M = MF->getFunction().getParent(); 1126 const MDNode *Metadata = I.getOperand(2).getMetadata(); 1127 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 1128 auto RelocSymbol = cast<GlobalVariable>( 1129 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 1130 1131 MachineBasicBlock *BB = I.getParent(); 1132 BuildMI(*BB, &I, I.getDebugLoc(), 1133 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg) 1134 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO); 1135 1136 I.eraseFromParent(); 1137 return true; 1138 } 1139 1140 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { 1141 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS(); 1142 1143 Register DstReg = I.getOperand(0).getReg(); 1144 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1145 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ? 
                     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
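  // Emit SI_END_CF directly on the saved exec mask operand, and make sure the
  // mask register ends up in the wave mask register class.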
1208 MachineBasicBlock *BB = MI.getParent(); 1209 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 1210 .add(MI.getOperand(1)); 1211 1212 Register Reg = MI.getOperand(1).getReg(); 1213 MI.eraseFromParent(); 1214 1215 if (!MRI->getRegClassOrNull(Reg)) 1216 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1217 return true; 1218 } 1219 1220 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1221 MachineInstr &MI, Intrinsic::ID IntrID) const { 1222 MachineBasicBlock *MBB = MI.getParent(); 1223 MachineFunction *MF = MBB->getParent(); 1224 const DebugLoc &DL = MI.getDebugLoc(); 1225 1226 unsigned IndexOperand = MI.getOperand(7).getImm(); 1227 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1228 bool WaveDone = MI.getOperand(9).getImm() != 0; 1229 1230 if (WaveDone && !WaveRelease) 1231 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1232 1233 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1234 IndexOperand &= ~0x3f; 1235 unsigned CountDw = 0; 1236 1237 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1238 CountDw = (IndexOperand >> 24) & 0xf; 1239 IndexOperand &= ~(0xf << 24); 1240 1241 if (CountDw < 1 || CountDw > 4) { 1242 report_fatal_error( 1243 "ds_ordered_count: dword count must be between 1 and 4"); 1244 } 1245 } 1246 1247 if (IndexOperand) 1248 report_fatal_error("ds_ordered_count: bad index operand"); 1249 1250 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; 1251 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); 1252 1253 unsigned Offset0 = OrderedCountIndex << 2; 1254 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | 1255 (Instruction << 4); 1256 1257 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1258 Offset1 |= (CountDw - 1) << 6; 1259 1260 unsigned Offset = Offset0 | (Offset1 << 8); 1261 1262 Register M0Val = MI.getOperand(2).getReg(); 1263 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1264 .addReg(M0Val); 1265 1266 Register DstReg = MI.getOperand(0).getReg(); 1267 Register ValReg = MI.getOperand(3).getReg(); 1268 MachineInstrBuilder DS = 1269 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1270 .addReg(ValReg) 1271 .addImm(Offset) 1272 .cloneMemRefs(MI); 1273 1274 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1275 return false; 1276 1277 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1278 MI.eraseFromParent(); 1279 return Ret; 1280 } 1281 1282 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1283 switch (IntrID) { 1284 case Intrinsic::amdgcn_ds_gws_init: 1285 return AMDGPU::DS_GWS_INIT; 1286 case Intrinsic::amdgcn_ds_gws_barrier: 1287 return AMDGPU::DS_GWS_BARRIER; 1288 case Intrinsic::amdgcn_ds_gws_sema_v: 1289 return AMDGPU::DS_GWS_SEMA_V; 1290 case Intrinsic::amdgcn_ds_gws_sema_br: 1291 return AMDGPU::DS_GWS_SEMA_BR; 1292 case Intrinsic::amdgcn_ds_gws_sema_p: 1293 return AMDGPU::DS_GWS_SEMA_P; 1294 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1295 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1296 default: 1297 llvm_unreachable("not a gws intrinsic"); 1298 } 1299 } 1300 1301 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1302 Intrinsic::ID IID) const { 1303 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1304 !STI.hasGWSSemaReleaseAll()) 1305 return false; 1306 1307 // intrinsic ID, vsrc, offset 1308 const bool HasVSrc = MI.getNumOperands() == 3; 1309 assert(HasVSrc || MI.getNumOperands() == 2); 1310 1311 Register BaseOffset = MI.getOperand(HasVSrc ? 
2 : 1).getReg(); 1312 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1313 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1314 return false; 1315 1316 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1317 assert(OffsetDef); 1318 1319 unsigned ImmOffset; 1320 1321 MachineBasicBlock *MBB = MI.getParent(); 1322 const DebugLoc &DL = MI.getDebugLoc(); 1323 1324 MachineInstr *Readfirstlane = nullptr; 1325 1326 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1327 // incoming offset, in case there's an add of a constant. We'll have to put it 1328 // back later. 1329 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1330 Readfirstlane = OffsetDef; 1331 BaseOffset = OffsetDef->getOperand(1).getReg(); 1332 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1333 } 1334 1335 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1336 // If we have a constant offset, try to use the 0 in m0 as the base. 1337 // TODO: Look into changing the default m0 initialization value. If the 1338 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1339 // the immediate offset. 1340 1341 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1342 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1343 .addImm(0); 1344 } else { 1345 std::tie(BaseOffset, ImmOffset) = 1346 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); 1347 1348 if (Readfirstlane) { 1349 // We have the constant offset now, so put the readfirstlane back on the 1350 // variable component. 1351 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1352 return false; 1353 1354 Readfirstlane->getOperand(1).setReg(BaseOffset); 1355 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1356 } else { 1357 if (!RBI.constrainGenericRegister(BaseOffset, 1358 AMDGPU::SReg_32RegClass, *MRI)) 1359 return false; 1360 } 1361 1362 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1363 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1364 .addReg(BaseOffset) 1365 .addImm(16); 1366 1367 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1368 .addReg(M0Base); 1369 } 1370 1371 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1372 // offset field) % 64. Some versions of the programming guide omit the m0 1373 // part, or claim it's from offset 0. 1374 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1375 1376 if (HasVSrc) { 1377 Register VSrc = MI.getOperand(1).getReg(); 1378 MIB.addReg(VSrc); 1379 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1380 return false; 1381 } 1382 1383 MIB.addImm(ImmOffset) 1384 .cloneMemRefs(MI); 1385 1386 MI.eraseFromParent(); 1387 return true; 1388 } 1389 1390 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1391 bool IsAppend) const { 1392 Register PtrBase = MI.getOperand(2).getReg(); 1393 LLT PtrTy = MRI->getType(PtrBase); 1394 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1395 1396 unsigned Offset; 1397 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1398 1399 // TODO: Should this try to look through readfirstlane like GWS? 1400 if (!isDSOffsetLegal(PtrBase, Offset)) { 1401 PtrBase = MI.getOperand(2).getReg(); 1402 Offset = 0; 1403 } 1404 1405 MachineBasicBlock *MBB = MI.getParent(); 1406 const DebugLoc &DL = MI.getDebugLoc(); 1407 const unsigned Opc = IsAppend ? 
AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1408 1409 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1410 .addReg(PtrBase); 1411 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) 1412 return false; 1413 1414 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1415 .addImm(Offset) 1416 .addImm(IsGDS ? -1 : 0) 1417 .cloneMemRefs(MI); 1418 MI.eraseFromParent(); 1419 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1420 } 1421 1422 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { 1423 if (TM.getOptLevel() > CodeGenOpt::None) { 1424 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; 1425 if (WGSize <= STI.getWavefrontSize()) { 1426 MachineBasicBlock *MBB = MI.getParent(); 1427 const DebugLoc &DL = MI.getDebugLoc(); 1428 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); 1429 MI.eraseFromParent(); 1430 return true; 1431 } 1432 } 1433 return selectImpl(MI, *CoverageInfo); 1434 } 1435 1436 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, 1437 bool &IsTexFail) { 1438 if (TexFailCtrl) 1439 IsTexFail = true; 1440 1441 TFE = (TexFailCtrl & 0x1) ? 1 : 0; 1442 TexFailCtrl &= ~(uint64_t)0x1; 1443 LWE = (TexFailCtrl & 0x2) ? 1 : 0; 1444 TexFailCtrl &= ~(uint64_t)0x2; 1445 1446 return TexFailCtrl == 0; 1447 } 1448 1449 static bool parseCachePolicy(uint64_t Value, 1450 bool *GLC, bool *SLC, bool *DLC) { 1451 if (GLC) { 1452 *GLC = (Value & 0x1) ? 1 : 0; 1453 Value &= ~(uint64_t)0x1; 1454 } 1455 if (SLC) { 1456 *SLC = (Value & 0x2) ? 1 : 0; 1457 Value &= ~(uint64_t)0x2; 1458 } 1459 if (DLC) { 1460 *DLC = (Value & 0x4) ? 1 : 0; 1461 Value &= ~(uint64_t)0x4; 1462 } 1463 1464 return Value == 0; 1465 } 1466 1467 bool AMDGPUInstructionSelector::selectImageIntrinsic( 1468 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 1469 MachineBasicBlock *MBB = MI.getParent(); 1470 const DebugLoc &DL = MI.getDebugLoc(); 1471 1472 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 1473 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 1474 1475 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 1476 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 1477 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); 1478 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = 1479 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); 1480 unsigned IntrOpcode = Intr->BaseOpcode; 1481 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); 1482 1483 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; 1484 1485 Register VDataIn, VDataOut; 1486 LLT VDataTy; 1487 int NumVDataDwords = -1; 1488 bool IsD16 = false; 1489 1490 bool Unorm; 1491 if (!BaseOpcode->Sampler) 1492 Unorm = true; 1493 else 1494 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0; 1495 1496 bool TFE; 1497 bool LWE; 1498 bool IsTexFail = false; 1499 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(), 1500 TFE, LWE, IsTexFail)) 1501 return false; 1502 1503 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm(); 1504 const bool IsA16 = (Flags & 1) != 0; 1505 const bool IsG16 = (Flags & 2) != 0; 1506 1507 // A16 implies 16 bit gradients 1508 if (IsA16 && !IsG16) 1509 return false; 1510 1511 unsigned DMask = 0; 1512 unsigned DMaskLanes = 0; 1513 1514 if (BaseOpcode->Atomic) { 1515 VDataOut = MI.getOperand(0).getReg(); 1516 VDataIn = MI.getOperand(2).getReg(); 1517 LLT Ty = MRI->getType(VDataIn); 1518 1519 // Be careful to allow atomic swap on 16-bit element vectors. 
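    // For an AtomicX2 (cmpswap) operation the data operand holds the packed
    // compare and swap values, so 64-bit elements show up here as a 128-bit
    // type; a plain swap of a v2s16 value is only 32 bits wide and is not the
    // 64-bit case.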
1520 const bool Is64Bit = BaseOpcode->AtomicX2 ? 1521 Ty.getSizeInBits() == 128 : 1522 Ty.getSizeInBits() == 64; 1523 1524 if (BaseOpcode->AtomicX2) { 1525 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); 1526 1527 DMask = Is64Bit ? 0xf : 0x3; 1528 NumVDataDwords = Is64Bit ? 4 : 2; 1529 } else { 1530 DMask = Is64Bit ? 0x3 : 0x1; 1531 NumVDataDwords = Is64Bit ? 2 : 1; 1532 } 1533 } else { 1534 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 1535 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); 1536 1537 // One memoperand is mandatory, except for getresinfo. 1538 // FIXME: Check this in verifier. 1539 if (!MI.memoperands_empty()) { 1540 const MachineMemOperand *MMO = *MI.memoperands_begin(); 1541 1542 // Infer d16 from the memory size, as the register type will be mangled by 1543 // unpacked subtargets, or by TFE. 1544 IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; 1545 } 1546 1547 if (BaseOpcode->Store) { 1548 VDataIn = MI.getOperand(1).getReg(); 1549 VDataTy = MRI->getType(VDataIn); 1550 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; 1551 } else { 1552 VDataOut = MI.getOperand(0).getReg(); 1553 VDataTy = MRI->getType(VDataOut); 1554 NumVDataDwords = DMaskLanes; 1555 1556 if (IsD16 && !STI.hasUnpackedD16VMem()) 1557 NumVDataDwords = (DMaskLanes + 1) / 2; 1558 } 1559 } 1560 1561 // Optimize _L to _LZ when _L is zero 1562 if (LZMappingInfo) { 1563 // The legalizer replaced the register with an immediate 0 if we need to 1564 // change the opcode. 1565 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex); 1566 if (Lod.isImm()) { 1567 assert(Lod.getImm() == 0); 1568 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l 1569 } 1570 } 1571 1572 // Optimize _mip away, when 'lod' is zero 1573 if (MIPMappingInfo) { 1574 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex); 1575 if (Lod.isImm()) { 1576 assert(Lod.getImm() == 0); 1577 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip 1578 } 1579 } 1580 1581 // Set G16 opcode 1582 if (IsG16 && !IsA16) { 1583 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 1584 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 1585 assert(G16MappingInfo); 1586 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 1587 } 1588 1589 // TODO: Check this in verifier. 1590 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 1591 1592 bool GLC = false; 1593 bool SLC = false; 1594 bool DLC = false; 1595 if (BaseOpcode->Atomic) { 1596 GLC = true; // TODO no-return optimization 1597 if (!parseCachePolicy( 1598 MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr, 1599 &SLC, IsGFX10Plus ? &DLC : nullptr)) 1600 return false; 1601 } else { 1602 if (!parseCachePolicy( 1603 MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC, 1604 &SLC, IsGFX10Plus ? &DLC : nullptr)) 1605 return false; 1606 } 1607 1608 int NumVAddrRegs = 0; 1609 int NumVAddrDwords = 0; 1610 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 1611 // Skip the $noregs and 0s inserted during legalization. 1612 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I); 1613 if (!AddrOp.isReg()) 1614 continue; // XXX - Break? 1615 1616 Register Addr = AddrOp.getReg(); 1617 if (!Addr) 1618 break; 1619 1620 ++NumVAddrRegs; 1621 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 1622 } 1623 1624 // The legalizer preprocessed the intrinsic arguments. 
If we aren't using
1625 // NSA, these should have been packed into a single value in the first
1626 // address register.
1627 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1628 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1629 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1630 return false;
1631 }
1632
1633 if (IsTexFail)
1634 ++NumVDataDwords;
1635
1636 int Opcode = -1;
1637 if (IsGFX10Plus) {
1638 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1639 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1640 : AMDGPU::MIMGEncGfx10Default,
1641 NumVDataDwords, NumVAddrDwords);
1642 } else {
1643 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1644 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1645 NumVDataDwords, NumVAddrDwords);
1646 if (Opcode == -1)
1647 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1648 NumVDataDwords, NumVAddrDwords);
1649 }
1650 assert(Opcode != -1);
1651
1652 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1653 .cloneMemRefs(MI);
1654
1655 if (VDataOut) {
1656 if (BaseOpcode->AtomicX2) {
1657 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1658
1659 Register TmpReg = MRI->createVirtualRegister(
1660 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1661 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1662
1663 MIB.addDef(TmpReg);
1664 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1665 .addReg(TmpReg, RegState::Kill, SubReg);
1666
1667 } else {
1668 MIB.addDef(VDataOut); // vdata output
1669 }
1670 }
1671
1672 if (VDataIn)
1673 MIB.addReg(VDataIn); // vdata input
1674
1675 for (int I = 0; I != NumVAddrRegs; ++I) {
1676 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1677 if (SrcOp.isReg()) {
1678 assert(SrcOp.getReg() != 0);
1679 MIB.addReg(SrcOp.getReg());
1680 }
1681 }
1682
1683 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1684 if (BaseOpcode->Sampler)
1685 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1686
1687 MIB.addImm(DMask); // dmask
1688
1689 if (IsGFX10Plus)
1690 MIB.addImm(DimInfo->Encoding);
1691 MIB.addImm(Unorm);
1692 if (IsGFX10Plus)
1693 MIB.addImm(DLC);
1694
1695 MIB.addImm(GLC);
1696 MIB.addImm(SLC);
1697 MIB.addImm(IsA16 && // a16 or r128
1698 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1699 if (IsGFX10Plus)
1700 MIB.addImm(IsA16 ? -1 : 0);
1701
1702 MIB.addImm(TFE); // tfe
1703 MIB.addImm(LWE); // lwe
1704 if (!IsGFX10Plus)
1705 MIB.addImm(DimInfo->DA ? -1 : 0);
1706 if (BaseOpcode->HasD16)
1707 MIB.addImm(IsD16 ?
-1 : 0); 1708 1709 MI.eraseFromParent(); 1710 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1711 } 1712 1713 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 1714 MachineInstr &I) const { 1715 unsigned IntrinsicID = I.getIntrinsicID(); 1716 switch (IntrinsicID) { 1717 case Intrinsic::amdgcn_end_cf: 1718 return selectEndCfIntrinsic(I); 1719 case Intrinsic::amdgcn_ds_ordered_add: 1720 case Intrinsic::amdgcn_ds_ordered_swap: 1721 return selectDSOrderedIntrinsic(I, IntrinsicID); 1722 case Intrinsic::amdgcn_ds_gws_init: 1723 case Intrinsic::amdgcn_ds_gws_barrier: 1724 case Intrinsic::amdgcn_ds_gws_sema_v: 1725 case Intrinsic::amdgcn_ds_gws_sema_br: 1726 case Intrinsic::amdgcn_ds_gws_sema_p: 1727 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1728 return selectDSGWSIntrinsic(I, IntrinsicID); 1729 case Intrinsic::amdgcn_ds_append: 1730 return selectDSAppendConsume(I, true); 1731 case Intrinsic::amdgcn_ds_consume: 1732 return selectDSAppendConsume(I, false); 1733 case Intrinsic::amdgcn_s_barrier: 1734 return selectSBarrier(I); 1735 case Intrinsic::amdgcn_global_atomic_fadd: 1736 return selectGlobalAtomicFaddIntrinsic(I); 1737 default: { 1738 return selectImpl(I, *CoverageInfo); 1739 } 1740 } 1741 } 1742 1743 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 1744 if (selectImpl(I, *CoverageInfo)) 1745 return true; 1746 1747 MachineBasicBlock *BB = I.getParent(); 1748 const DebugLoc &DL = I.getDebugLoc(); 1749 1750 Register DstReg = I.getOperand(0).getReg(); 1751 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 1752 assert(Size <= 32 || Size == 64); 1753 const MachineOperand &CCOp = I.getOperand(1); 1754 Register CCReg = CCOp.getReg(); 1755 if (!isVCC(CCReg, *MRI)) { 1756 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 1757 AMDGPU::S_CSELECT_B32; 1758 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 1759 .addReg(CCReg); 1760 1761 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 1762 // bank, because it does not cover the register class that we used to represent 1763 // for it. So we need to manually set the register class here. 1764 if (!MRI->getRegClassOrNull(CCReg)) 1765 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 1766 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 1767 .add(I.getOperand(2)) 1768 .add(I.getOperand(3)); 1769 1770 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | 1771 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 1772 I.eraseFromParent(); 1773 return Ret; 1774 } 1775 1776 // Wide VGPR select should have been split in RegBankSelect. 
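// Note (restating the assumption above): RegBankSelect is expected to have
// decomposed a 64-bit VGPR-bank G_SELECT into two 32-bit selects, so only
// the <= 32-bit case is lowered to V_CNDMASK_B32 below and anything wider
// is rejected.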
1777 if (Size > 32) 1778 return false; 1779 1780 MachineInstr *Select = 1781 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1782 .addImm(0) 1783 .add(I.getOperand(3)) 1784 .addImm(0) 1785 .add(I.getOperand(2)) 1786 .add(I.getOperand(1)); 1787 1788 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 1789 I.eraseFromParent(); 1790 return Ret; 1791 } 1792 1793 static int sizeToSubRegIndex(unsigned Size) { 1794 switch (Size) { 1795 case 32: 1796 return AMDGPU::sub0; 1797 case 64: 1798 return AMDGPU::sub0_sub1; 1799 case 96: 1800 return AMDGPU::sub0_sub1_sub2; 1801 case 128: 1802 return AMDGPU::sub0_sub1_sub2_sub3; 1803 case 256: 1804 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 1805 default: 1806 if (Size < 32) 1807 return AMDGPU::sub0; 1808 if (Size > 256) 1809 return -1; 1810 return sizeToSubRegIndex(PowerOf2Ceil(Size)); 1811 } 1812 } 1813 1814 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 1815 Register DstReg = I.getOperand(0).getReg(); 1816 Register SrcReg = I.getOperand(1).getReg(); 1817 const LLT DstTy = MRI->getType(DstReg); 1818 const LLT SrcTy = MRI->getType(SrcReg); 1819 const LLT S1 = LLT::scalar(1); 1820 1821 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 1822 const RegisterBank *DstRB; 1823 if (DstTy == S1) { 1824 // This is a special case. We don't treat s1 for legalization artifacts as 1825 // vcc booleans. 1826 DstRB = SrcRB; 1827 } else { 1828 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1829 if (SrcRB != DstRB) 1830 return false; 1831 } 1832 1833 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 1834 1835 unsigned DstSize = DstTy.getSizeInBits(); 1836 unsigned SrcSize = SrcTy.getSizeInBits(); 1837 1838 const TargetRegisterClass *SrcRC 1839 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); 1840 const TargetRegisterClass *DstRC 1841 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); 1842 if (!SrcRC || !DstRC) 1843 return false; 1844 1845 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 1846 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 1847 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 1848 return false; 1849 } 1850 1851 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { 1852 MachineBasicBlock *MBB = I.getParent(); 1853 const DebugLoc &DL = I.getDebugLoc(); 1854 1855 Register LoReg = MRI->createVirtualRegister(DstRC); 1856 Register HiReg = MRI->createVirtualRegister(DstRC); 1857 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 1858 .addReg(SrcReg, 0, AMDGPU::sub0); 1859 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 1860 .addReg(SrcReg, 0, AMDGPU::sub1); 1861 1862 if (IsVALU && STI.hasSDWA()) { 1863 // Write the low 16-bits of the high element into the high 16-bits of the 1864 // low element. 
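// Descriptive note on the SDWA move built below: dst_sel WORD_1 writes only
// the high 16 bits of DstReg, dst_unused UNUSED_PRESERVE keeps the low 16
// bits coming from the tied LoReg operand, and src0_sel WORD_0 reads the
// low 16 bits of HiReg.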
1865 MachineInstr *MovSDWA = 1866 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 1867 .addImm(0) // $src0_modifiers 1868 .addReg(HiReg) // $src0 1869 .addImm(0) // $clamp 1870 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 1871 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 1872 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 1873 .addReg(LoReg, RegState::Implicit); 1874 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 1875 } else { 1876 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 1877 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 1878 Register ImmReg = MRI->createVirtualRegister(DstRC); 1879 if (IsVALU) { 1880 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 1881 .addImm(16) 1882 .addReg(HiReg); 1883 } else { 1884 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 1885 .addReg(HiReg) 1886 .addImm(16); 1887 } 1888 1889 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1890 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 1891 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 1892 1893 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 1894 .addImm(0xffff); 1895 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 1896 .addReg(LoReg) 1897 .addReg(ImmReg); 1898 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 1899 .addReg(TmpReg0) 1900 .addReg(TmpReg1); 1901 } 1902 1903 I.eraseFromParent(); 1904 return true; 1905 } 1906 1907 if (!DstTy.isScalar()) 1908 return false; 1909 1910 if (SrcSize > 32) { 1911 int SubRegIdx = sizeToSubRegIndex(DstSize); 1912 if (SubRegIdx == -1) 1913 return false; 1914 1915 // Deal with weird cases where the class only partially supports the subreg 1916 // index. 1917 const TargetRegisterClass *SrcWithSubRC 1918 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 1919 if (!SrcWithSubRC) 1920 return false; 1921 1922 if (SrcWithSubRC != SrcRC) { 1923 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 1924 return false; 1925 } 1926 1927 I.getOperand(1).setSubReg(SubRegIdx); 1928 } 1929 1930 I.setDesc(TII.get(TargetOpcode::COPY)); 1931 return true; 1932 } 1933 1934 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 1935 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 1936 Mask = maskTrailingOnes<unsigned>(Size); 1937 int SignedMask = static_cast<int>(Mask); 1938 return SignedMask >= -16 && SignedMask <= 64; 1939 } 1940 1941 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 1942 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 1943 Register Reg, const MachineRegisterInfo &MRI, 1944 const TargetRegisterInfo &TRI) const { 1945 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 1946 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 1947 return RB; 1948 1949 // Ignore the type, since we don't use vcc in artifacts. 
1950 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 1951 return &RBI.getRegBankFromRegClass(*RC, LLT()); 1952 return nullptr; 1953 } 1954 1955 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 1956 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 1957 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 1958 const DebugLoc &DL = I.getDebugLoc(); 1959 MachineBasicBlock &MBB = *I.getParent(); 1960 const Register DstReg = I.getOperand(0).getReg(); 1961 const Register SrcReg = I.getOperand(1).getReg(); 1962 1963 const LLT DstTy = MRI->getType(DstReg); 1964 const LLT SrcTy = MRI->getType(SrcReg); 1965 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 1966 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 1967 const unsigned DstSize = DstTy.getSizeInBits(); 1968 if (!DstTy.isScalar()) 1969 return false; 1970 1971 // Artifact casts should never use vcc. 1972 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 1973 1974 // FIXME: This should probably be illegal and split earlier. 1975 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 1976 if (DstSize <= 32) 1977 return selectCOPY(I); 1978 1979 const TargetRegisterClass *SrcRC = 1980 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); 1981 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1982 const TargetRegisterClass *DstRC = 1983 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); 1984 1985 Register UndefReg = MRI->createVirtualRegister(SrcRC); 1986 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 1987 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1988 .addReg(SrcReg) 1989 .addImm(AMDGPU::sub0) 1990 .addReg(UndefReg) 1991 .addImm(AMDGPU::sub1); 1992 I.eraseFromParent(); 1993 1994 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 1995 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 1996 } 1997 1998 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 1999 // 64-bit should have been split up in RegBankSelect 2000 2001 // Try to use an and with a mask if it will save code size. 2002 unsigned Mask; 2003 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2004 MachineInstr *ExtI = 2005 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2006 .addImm(Mask) 2007 .addReg(SrcReg); 2008 I.eraseFromParent(); 2009 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2010 } 2011 2012 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2013 MachineInstr *ExtI = 2014 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2015 .addReg(SrcReg) 2016 .addImm(0) // Offset 2017 .addImm(SrcSize); // Width 2018 I.eraseFromParent(); 2019 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2020 } 2021 2022 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2023 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2024 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2025 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2026 return false; 2027 2028 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2029 const unsigned SextOpc = SrcSize == 8 ? 2030 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2031 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2032 .addReg(SrcReg); 2033 I.eraseFromParent(); 2034 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2035 } 2036 2037 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2038 const unsigned BFE32 = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2039 2040 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2041 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2042 // We need a 64-bit register source, but the high bits don't matter. 2043 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2044 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2045 unsigned SubReg = InReg ? AMDGPU::sub0 : 0; 2046 2047 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2048 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2049 .addReg(SrcReg, 0, SubReg) 2050 .addImm(AMDGPU::sub0) 2051 .addReg(UndefReg) 2052 .addImm(AMDGPU::sub1); 2053 2054 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2055 .addReg(ExtReg) 2056 .addImm(SrcSize << 16); 2057 2058 I.eraseFromParent(); 2059 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2060 } 2061 2062 unsigned Mask; 2063 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2064 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2065 .addReg(SrcReg) 2066 .addImm(Mask); 2067 } else { 2068 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2069 .addReg(SrcReg) 2070 .addImm(SrcSize << 16); 2071 } 2072 2073 I.eraseFromParent(); 2074 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2075 } 2076 2077 return false; 2078 } 2079 2080 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 2081 MachineBasicBlock *BB = I.getParent(); 2082 MachineOperand &ImmOp = I.getOperand(1); 2083 Register DstReg = I.getOperand(0).getReg(); 2084 unsigned Size = MRI->getType(DstReg).getSizeInBits(); 2085 2086 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 2087 if (ImmOp.isFPImm()) { 2088 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 2089 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 2090 } else if (ImmOp.isCImm()) { 2091 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 2092 } else { 2093 llvm_unreachable("Not supported by g_constants"); 2094 } 2095 2096 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2097 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; 2098 2099 unsigned Opcode; 2100 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 2101 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 2102 } else { 2103 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2104 2105 // We should never produce s1 values on banks other than VCC. If the user of 2106 // this already constrained the register, we may incorrectly think it's VCC 2107 // if it wasn't originally. 2108 if (Size == 1) 2109 return false; 2110 } 2111 2112 if (Size != 64) { 2113 I.setDesc(TII.get(Opcode)); 2114 I.addImplicitDefUseOperands(*MF); 2115 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2116 } 2117 2118 const DebugLoc &DL = I.getDebugLoc(); 2119 2120 APInt Imm(Size, I.getOperand(1).getImm()); 2121 2122 MachineInstr *ResInst; 2123 if (IsSgpr && TII.isInlineConstant(Imm)) { 2124 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2125 .addImm(I.getOperand(1).getImm()); 2126 } else { 2127 const TargetRegisterClass *RC = IsSgpr ? 
2128 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2129 Register LoReg = MRI->createVirtualRegister(RC); 2130 Register HiReg = MRI->createVirtualRegister(RC); 2131 2132 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2133 .addImm(Imm.trunc(32).getZExtValue()); 2134 2135 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2136 .addImm(Imm.ashr(32).getZExtValue()); 2137 2138 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2139 .addReg(LoReg) 2140 .addImm(AMDGPU::sub0) 2141 .addReg(HiReg) 2142 .addImm(AMDGPU::sub1); 2143 } 2144 2145 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2146 // work for target independent opcodes 2147 I.eraseFromParent(); 2148 const TargetRegisterClass *DstRC = 2149 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2150 if (!DstRC) 2151 return true; 2152 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2153 } 2154 2155 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2156 // Only manually handle the f64 SGPR case. 2157 // 2158 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2159 // the bit ops theoretically have a second result due to the implicit def of 2160 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2161 // that is easy by disabling the check. The result works, but uses a 2162 // nonsensical sreg32orlds_and_sreg_1 regclass. 2163 // 2164 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2165 // the variadic REG_SEQUENCE operands. 2166 2167 Register Dst = MI.getOperand(0).getReg(); 2168 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2169 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2170 MRI->getType(Dst) != LLT::scalar(64)) 2171 return false; 2172 2173 Register Src = MI.getOperand(1).getReg(); 2174 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2175 if (Fabs) 2176 Src = Fabs->getOperand(1).getReg(); 2177 2178 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2179 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2180 return false; 2181 2182 MachineBasicBlock *BB = MI.getParent(); 2183 const DebugLoc &DL = MI.getDebugLoc(); 2184 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2185 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2186 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2187 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2188 2189 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2190 .addReg(Src, 0, AMDGPU::sub0); 2191 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2192 .addReg(Src, 0, AMDGPU::sub1); 2193 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2194 .addImm(0x80000000); 2195 2196 // Set or toggle sign bit. 2197 unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2198 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2199 .addReg(HiReg)
2200 .addReg(ConstReg);
2201 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2202 .addReg(LoReg)
2203 .addImm(AMDGPU::sub0)
2204 .addReg(OpReg)
2205 .addImm(AMDGPU::sub1);
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2211 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2212 Register Dst = MI.getOperand(0).getReg();
2213 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2214 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2215 MRI->getType(Dst) != LLT::scalar(64))
2216 return false;
2217
2218 Register Src = MI.getOperand(1).getReg();
2219 MachineBasicBlock *BB = MI.getParent();
2220 const DebugLoc &DL = MI.getDebugLoc();
2221 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2222 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2223 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2224 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2225
2226 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2227 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2228 return false;
2229
2230 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2231 .addReg(Src, 0, AMDGPU::sub0);
2232 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2233 .addReg(Src, 0, AMDGPU::sub1);
2234 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2235 .addImm(0x7fffffff);
2236
2237 // Clear sign bit.
2238 // TODO: Should this use S_BITSET0_*?
2239 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2240 .addReg(HiReg)
2241 .addReg(ConstReg);
2242 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2243 .addReg(LoReg)
2244 .addImm(AMDGPU::sub0)
2245 .addReg(OpReg)
2246 .addImm(AMDGPU::sub1);
2247
2248 MI.eraseFromParent();
2249 return true;
2250 }
2251
2252 static bool isConstant(const MachineInstr &MI) {
2253 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2254 }
2255
2256 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2257 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2258
2259 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2260
2261 assert(PtrMI);
2262
2263 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2264 return;
2265
2266 GEPInfo GEPInfo(*PtrMI);
2267
2268 for (unsigned i = 1; i != 3; ++i) {
2269 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2270 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2271 assert(OpDef);
2272 if (i == 2 && isConstant(*OpDef)) {
2273 // TODO: Could handle constant base + variable offset, but a combine
2274 // probably should have commuted it.
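// Hypothetical illustration of the unhandled form mentioned above: a
// G_PTR_ADD whose base operand (operand 1) carries the constant part while
// the offset operand is variable. Only the i == 2 constant-offset case is
// folded into GEPInfo.Imm here; a commuted form just gets classified by
// register bank below.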
assert(GEPInfo.Imm == 0);
2276 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2277 continue;
2278 }
2279 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2280 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2281 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2282 else
2283 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2284 }
2285
2286 AddrInfo.push_back(GEPInfo);
2287 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2288 }
2289
2290 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2291 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2292 }
2293
2294 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2295 if (!MI.hasOneMemOperand())
2296 return false;
2297
2298 const MachineMemOperand *MMO = *MI.memoperands_begin();
2299 const Value *Ptr = MMO->getValue();
2300
2301 // UndefValue means this is a load of a kernel input. These are uniform.
2302 // Sometimes LDS instructions have constant pointers.
2303 // If Ptr is null, then that means this mem operand contains a
2304 // PseudoSourceValue like GOT.
2305 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2306 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2307 return true;
2308
2309 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2310 return true;
2311
2312 const Instruction *I = dyn_cast<Instruction>(Ptr);
2313 return I && I->getMetadata("amdgpu.uniform");
2314 }
2315
2316 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2317 for (const GEPInfo &GEPInfo : AddrInfo) {
2318 if (!GEPInfo.VgprParts.empty())
2319 return true;
2320 }
2321 return false;
2322 }
2323
2324 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2325 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2326 unsigned AS = PtrTy.getAddressSpace();
2327 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2328 STI.ldsRequiresM0Init()) {
2329 MachineBasicBlock *BB = I.getParent();
2330
2331 // If DS instructions require M0 initialization, insert it before selecting.
2332 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2333 .addImm(-1);
2334 }
2335 }
2336
2337 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2338 MachineInstr &I) const {
2339 initM0(I);
2340 return selectImpl(I, *CoverageInfo);
2341 }
2342
2343 // TODO: No rtn optimization.
2344 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2345 MachineInstr &MI) const {
2346 Register PtrReg = MI.getOperand(1).getReg();
2347 const LLT PtrTy = MRI->getType(PtrReg);
2348 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2349 STI.useFlatForGlobal())
2350 return selectImpl(MI, *CoverageInfo);
2351
2352 Register DstReg = MI.getOperand(0).getReg();
2353 const LLT Ty = MRI->getType(DstReg);
2354 const bool Is64 = Ty.getSizeInBits() == 64;
2355 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2356 Register TmpReg = MRI->createVirtualRegister(
2357 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2358
2359 const DebugLoc &DL = MI.getDebugLoc();
2360 MachineBasicBlock *BB = MI.getParent();
2361
2362 Register VAddr, RSrcReg, SOffset;
2363 int64_t Offset = 0;
2364
2365 unsigned Opcode;
2366 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2367 Opcode = Is64 ?
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : 2368 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; 2369 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, 2370 RSrcReg, SOffset, Offset)) { 2371 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : 2372 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; 2373 } else 2374 return selectImpl(MI, *CoverageInfo); 2375 2376 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) 2377 .addReg(MI.getOperand(2).getReg()); 2378 2379 if (VAddr) 2380 MIB.addReg(VAddr); 2381 2382 MIB.addReg(RSrcReg); 2383 if (SOffset) 2384 MIB.addReg(SOffset); 2385 else 2386 MIB.addImm(0); 2387 2388 MIB.addImm(Offset); 2389 MIB.addImm(1); // glc 2390 MIB.addImm(0); // slc 2391 MIB.cloneMemRefs(MI); 2392 2393 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) 2394 .addReg(TmpReg, RegState::Kill, SubReg); 2395 2396 MI.eraseFromParent(); 2397 2398 MRI->setRegClass( 2399 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); 2400 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2401 } 2402 2403 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2404 MachineBasicBlock *BB = I.getParent(); 2405 MachineOperand &CondOp = I.getOperand(0); 2406 Register CondReg = CondOp.getReg(); 2407 const DebugLoc &DL = I.getDebugLoc(); 2408 2409 unsigned BrOpcode; 2410 Register CondPhysReg; 2411 const TargetRegisterClass *ConstrainRC; 2412 2413 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2414 // whether the branch is uniform when selecting the instruction. In 2415 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2416 // RegBankSelect knows what it's doing if the branch condition is scc, even 2417 // though it currently does not. 2418 if (!isVCC(CondReg, *MRI)) { 2419 if (MRI->getType(CondReg) != LLT::scalar(32)) 2420 return false; 2421 2422 CondPhysReg = AMDGPU::SCC; 2423 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2424 ConstrainRC = &AMDGPU::SReg_32RegClass; 2425 } else { 2426 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 2427 // We sort of know that a VCC producer based on the register bank, that ands 2428 // inactive lanes with 0. What if there was a logical operation with vcc 2429 // producers in different blocks/with different exec masks? 2430 // FIXME: Should scc->vcc copies and with exec? 2431 CondPhysReg = TRI.getVCC(); 2432 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2433 ConstrainRC = TRI.getBoolRC(); 2434 } 2435 2436 if (!MRI->getRegClassOrNull(CondReg)) 2437 MRI->setRegClass(CondReg, ConstrainRC); 2438 2439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2440 .addReg(CondReg); 2441 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2442 .addMBB(I.getOperand(1).getMBB()); 2443 2444 I.eraseFromParent(); 2445 return true; 2446 } 2447 2448 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2449 MachineInstr &I) const { 2450 Register DstReg = I.getOperand(0).getReg(); 2451 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2452 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2453 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2454 if (IsVGPR) 2455 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2456 2457 return RBI.constrainGenericRegister( 2458 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2459 } 2460 2461 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2462 Register DstReg = I.getOperand(0).getReg(); 2463 Register SrcReg = I.getOperand(1).getReg(); 2464 Register MaskReg = I.getOperand(2).getReg(); 2465 LLT Ty = MRI->getType(DstReg); 2466 LLT MaskTy = MRI->getType(MaskReg); 2467 2468 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2469 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2470 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2471 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2472 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2473 return false; 2474 2475 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2476 const TargetRegisterClass &RegRC 2477 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2478 2479 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, 2480 *MRI); 2481 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, 2482 *MRI); 2483 const TargetRegisterClass *MaskRC = 2484 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); 2485 2486 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2487 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2488 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2489 return false; 2490 2491 MachineBasicBlock *BB = I.getParent(); 2492 const DebugLoc &DL = I.getDebugLoc(); 2493 if (Ty.getSizeInBits() == 32) { 2494 assert(MaskTy.getSizeInBits() == 32 && 2495 "ptrmask should have been narrowed during legalize"); 2496 2497 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2498 .addReg(SrcReg) 2499 .addReg(MaskReg); 2500 I.eraseFromParent(); 2501 return true; 2502 } 2503 2504 Register HiReg = MRI->createVirtualRegister(&RegRC); 2505 Register LoReg = MRI->createVirtualRegister(&RegRC); 2506 2507 // Extract the subregisters from the source pointer. 2508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2509 .addReg(SrcReg, 0, AMDGPU::sub0); 2510 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2511 .addReg(SrcReg, 0, AMDGPU::sub1); 2512 2513 Register MaskedLo, MaskedHi; 2514 2515 // Try to avoid emitting a bit operation when we only need to touch half of 2516 // the 64-bit pointer. 2517 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); 2518 2519 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2520 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2521 if ((MaskOnes & MaskLo32) == MaskLo32) { 2522 // If all the bits in the low half are 1, we only need a copy for it. 2523 MaskedLo = LoReg; 2524 } else { 2525 // Extract the mask subregister and apply the and. 2526 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2527 MaskedLo = MRI->createVirtualRegister(&RegRC); 2528 2529 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2530 .addReg(MaskReg, 0, AMDGPU::sub0); 2531 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2532 .addReg(LoReg) 2533 .addReg(MaskLo); 2534 } 2535 2536 if ((MaskOnes & MaskHi32) == MaskHi32) { 2537 // If all the bits in the high half are 1, we only need a copy for it. 
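// Worked example (illustrative only): a ptrmask with a known mask of
// 0xFFFFFFFF'FFFFF000 has all high-half bits set, so MaskedHi is just
// HiReg and only the low half needs an AND with 0xFFFFF000.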
2538 MaskedHi = HiReg; 2539 } else { 2540 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2541 MaskedHi = MRI->createVirtualRegister(&RegRC); 2542 2543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2544 .addReg(MaskReg, 0, AMDGPU::sub1); 2545 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2546 .addReg(HiReg) 2547 .addReg(MaskHi); 2548 } 2549 2550 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2551 .addReg(MaskedLo) 2552 .addImm(AMDGPU::sub0) 2553 .addReg(MaskedHi) 2554 .addImm(AMDGPU::sub1); 2555 I.eraseFromParent(); 2556 return true; 2557 } 2558 2559 /// Return the register to use for the index value, and the subregister to use 2560 /// for the indirectly accessed register. 2561 static std::pair<Register, unsigned> 2562 computeIndirectRegIndex(MachineRegisterInfo &MRI, 2563 const SIRegisterInfo &TRI, 2564 const TargetRegisterClass *SuperRC, 2565 Register IdxReg, 2566 unsigned EltSize) { 2567 Register IdxBaseReg; 2568 int Offset; 2569 2570 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); 2571 if (IdxBaseReg == AMDGPU::NoRegister) { 2572 // This will happen if the index is a known constant. This should ordinarily 2573 // be legalized out, but handle it as a register just in case. 2574 assert(Offset == 0); 2575 IdxBaseReg = IdxReg; 2576 } 2577 2578 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 2579 2580 // Skip out of bounds offsets, or else we would end up using an undefined 2581 // register. 2582 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 2583 return std::make_pair(IdxReg, SubRegs[0]); 2584 return std::make_pair(IdxBaseReg, SubRegs[Offset]); 2585 } 2586 2587 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 2588 MachineInstr &MI) const { 2589 Register DstReg = MI.getOperand(0).getReg(); 2590 Register SrcReg = MI.getOperand(1).getReg(); 2591 Register IdxReg = MI.getOperand(2).getReg(); 2592 2593 LLT DstTy = MRI->getType(DstReg); 2594 LLT SrcTy = MRI->getType(SrcReg); 2595 2596 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2597 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2598 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2599 2600 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2601 // into a waterfall loop. 2602 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2603 return false; 2604 2605 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, 2606 *MRI); 2607 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, 2608 *MRI); 2609 if (!SrcRC || !DstRC) 2610 return false; 2611 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2612 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2613 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2614 return false; 2615 2616 MachineBasicBlock *BB = MI.getParent(); 2617 const DebugLoc &DL = MI.getDebugLoc(); 2618 const bool Is64 = DstTy.getSizeInBits() == 64; 2619 2620 unsigned SubReg; 2621 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, 2622 DstTy.getSizeInBits() / 8); 2623 2624 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 2625 if (DstTy.getSizeInBits() != 32 && !Is64) 2626 return false; 2627 2628 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2629 .addReg(IdxReg); 2630 2631 unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 2632 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 2633 .addReg(SrcReg, 0, SubReg) 2634 .addReg(SrcReg, RegState::Implicit); 2635 MI.eraseFromParent(); 2636 return true; 2637 } 2638 2639 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 2640 return false; 2641 2642 if (!STI.useVGPRIndexMode()) { 2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2644 .addReg(IdxReg); 2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 2646 .addReg(SrcReg, 0, SubReg) 2647 .addReg(SrcReg, RegState::Implicit); 2648 MI.eraseFromParent(); 2649 return true; 2650 } 2651 2652 const MCInstrDesc &GPRIDXDesc = 2653 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 2654 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2655 .addReg(SrcReg) 2656 .addReg(IdxReg) 2657 .addImm(SubReg); 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 2664 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 2665 MachineInstr &MI) const { 2666 Register DstReg = MI.getOperand(0).getReg(); 2667 Register VecReg = MI.getOperand(1).getReg(); 2668 Register ValReg = MI.getOperand(2).getReg(); 2669 Register IdxReg = MI.getOperand(3).getReg(); 2670 2671 LLT VecTy = MRI->getType(DstReg); 2672 LLT ValTy = MRI->getType(ValReg); 2673 unsigned VecSize = VecTy.getSizeInBits(); 2674 unsigned ValSize = ValTy.getSizeInBits(); 2675 2676 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 2677 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 2678 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 2679 2680 assert(VecTy.getElementType() == ValTy); 2681 2682 // The index must be scalar. If it wasn't RegBankSelect should have moved this 2683 // into a waterfall loop. 
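// Background note (assumed RegBankSelect behaviour, not stated here): a
// divergent index would have been wrapped in a waterfall loop that iterates
// over the unique index values, so the index operand is expected to already
// be on the SGPR bank by the time selection runs.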
2684 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 2685 return false; 2686 2687 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, 2688 *MRI); 2689 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, 2690 *MRI); 2691 2692 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 2693 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 2694 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 2695 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 2696 return false; 2697 2698 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 2699 return false; 2700 2701 unsigned SubReg; 2702 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, 2703 ValSize / 8); 2704 2705 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 2706 STI.useVGPRIndexMode(); 2707 2708 MachineBasicBlock *BB = MI.getParent(); 2709 const DebugLoc &DL = MI.getDebugLoc(); 2710 2711 if (!IndexMode) { 2712 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 2713 .addReg(IdxReg); 2714 2715 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 2716 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 2717 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 2718 .addReg(VecReg) 2719 .addReg(ValReg) 2720 .addImm(SubReg); 2721 MI.eraseFromParent(); 2722 return true; 2723 } 2724 2725 const MCInstrDesc &GPRIDXDesc = 2726 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 2727 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 2728 .addReg(VecReg) 2729 .addReg(ValReg) 2730 .addReg(IdxReg) 2731 .addImm(SubReg); 2732 2733 MI.eraseFromParent(); 2734 return true; 2735 } 2736 2737 static bool isZeroOrUndef(int X) { 2738 return X == 0 || X == -1; 2739 } 2740 2741 static bool isOneOrUndef(int X) { 2742 return X == 1 || X == -1; 2743 } 2744 2745 static bool isZeroOrOneOrUndef(int X) { 2746 return X == 0 || X == 1 || X == -1; 2747 } 2748 2749 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single 2750 // 32-bit register. 2751 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, 2752 ArrayRef<int> Mask) { 2753 NewMask[0] = Mask[0]; 2754 NewMask[1] = Mask[1]; 2755 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) 2756 return Src0; 2757 2758 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); 2759 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); 2760 2761 // Shift the mask inputs to be 0/1; 2762 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; 2763 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; 2764 return Src1; 2765 } 2766 2767 // This is only legal with VOP3P instructions as an aid to op_sel matching. 
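// Illustrative summary of the masks that survive to the chain of cases
// below (derived from the handled cases, not from the original source):
// <0,1> and its undef variants become a plain copy, <1,undef> and <undef,0>
// become 16-bit right/left shifts, <0,0> and <1,1> broadcast one half, and
// <1,0> swaps the halves (e.g. v_alignbit_b32 with shift 16 on the VALU
// path).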
2768 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( 2769 MachineInstr &MI) const { 2770 Register DstReg = MI.getOperand(0).getReg(); 2771 Register Src0Reg = MI.getOperand(1).getReg(); 2772 Register Src1Reg = MI.getOperand(2).getReg(); 2773 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); 2774 2775 const LLT V2S16 = LLT::vector(2, 16); 2776 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) 2777 return false; 2778 2779 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) 2780 return false; 2781 2782 assert(ShufMask.size() == 2); 2783 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); 2784 2785 MachineBasicBlock *MBB = MI.getParent(); 2786 const DebugLoc &DL = MI.getDebugLoc(); 2787 2788 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2789 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2790 const TargetRegisterClass &RC = IsVALU ? 2791 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2792 2793 // Handle the degenerate case which should have folded out. 2794 if (ShufMask[0] == -1 && ShufMask[1] == -1) { 2795 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); 2796 2797 MI.eraseFromParent(); 2798 return RBI.constrainGenericRegister(DstReg, RC, *MRI); 2799 } 2800 2801 // A legal VOP3P mask only reads one of the sources. 2802 int Mask[2]; 2803 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); 2804 2805 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || 2806 !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) 2807 return false; 2808 2809 // TODO: This also should have been folded out 2810 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { 2811 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) 2812 .addReg(SrcVec); 2813 2814 MI.eraseFromParent(); 2815 return true; 2816 } 2817 2818 if (Mask[0] == 1 && Mask[1] == -1) { 2819 if (IsVALU) { 2820 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 2821 .addImm(16) 2822 .addReg(SrcVec); 2823 } else { 2824 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 2825 .addReg(SrcVec) 2826 .addImm(16); 2827 } 2828 } else if (Mask[0] == -1 && Mask[1] == 0) { 2829 if (IsVALU) { 2830 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) 2831 .addImm(16) 2832 .addReg(SrcVec); 2833 } else { 2834 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) 2835 .addReg(SrcVec) 2836 .addImm(16); 2837 } 2838 } else if (Mask[0] == 0 && Mask[1] == 0) { 2839 if (IsVALU) { 2840 // Write low half of the register into the high half. 2841 MachineInstr *MovSDWA = 2842 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2843 .addImm(0) // $src0_modifiers 2844 .addReg(SrcVec) // $src0 2845 .addImm(0) // $clamp 2846 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2847 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2848 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2849 .addReg(SrcVec, RegState::Implicit); 2850 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2851 } else { 2852 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2853 .addReg(SrcVec) 2854 .addReg(SrcVec); 2855 } 2856 } else if (Mask[0] == 1 && Mask[1] == 1) { 2857 if (IsVALU) { 2858 // Write high half of the register into the low half. 
2859 MachineInstr *MovSDWA = 2860 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2861 .addImm(0) // $src0_modifiers 2862 .addReg(SrcVec) // $src0 2863 .addImm(0) // $clamp 2864 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel 2865 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2866 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel 2867 .addReg(SrcVec, RegState::Implicit); 2868 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2869 } else { 2870 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) 2871 .addReg(SrcVec) 2872 .addReg(SrcVec); 2873 } 2874 } else if (Mask[0] == 1 && Mask[1] == 0) { 2875 if (IsVALU) { 2876 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg) 2877 .addReg(SrcVec) 2878 .addReg(SrcVec) 2879 .addImm(16); 2880 } else { 2881 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2882 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) 2883 .addReg(SrcVec) 2884 .addImm(16); 2885 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) 2886 .addReg(TmpReg) 2887 .addReg(SrcVec); 2888 } 2889 } else 2890 llvm_unreachable("all shuffle masks should be handled"); 2891 2892 MI.eraseFromParent(); 2893 return true; 2894 } 2895 2896 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( 2897 MachineInstr &MI) const { 2898 2899 MachineBasicBlock *MBB = MI.getParent(); 2900 const DebugLoc &DL = MI.getDebugLoc(); 2901 2902 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 2903 Function &F = MBB->getParent()->getFunction(); 2904 DiagnosticInfoUnsupported 2905 NoFpRet(F, "return versions of fp atomics not supported", 2906 MI.getDebugLoc(), DS_Error); 2907 F.getContext().diagnose(NoFpRet); 2908 return false; 2909 } 2910 2911 // FIXME: This is only needed because tablegen requires number of dst operands 2912 // in match and replace pattern to be the same. Otherwise patterns can be 2913 // exported from SDag path. 2914 MachineOperand &VDataIn = MI.getOperand(1); 2915 MachineOperand &VIndex = MI.getOperand(3); 2916 MachineOperand &VOffset = MI.getOperand(4); 2917 MachineOperand &SOffset = MI.getOperand(5); 2918 int16_t Offset = MI.getOperand(6).getImm(); 2919 2920 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); 2921 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); 2922 2923 unsigned Opcode; 2924 if (HasVOffset) { 2925 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN 2926 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; 2927 } else { 2928 Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN 2929 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; 2930 } 2931 2932 if (MRI->getType(VDataIn.getReg()).isVector()) { 2933 switch (Opcode) { 2934 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: 2935 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; 2936 break; 2937 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: 2938 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; 2939 break; 2940 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: 2941 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; 2942 break; 2943 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: 2944 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; 2945 break; 2946 } 2947 } 2948 2949 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); 2950 I.add(VDataIn); 2951 2952 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || 2953 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { 2954 Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); 2955 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 2956 .addReg(VIndex.getReg()) 2957 .addImm(AMDGPU::sub0) 2958 .addReg(VOffset.getReg()) 2959 .addImm(AMDGPU::sub1); 2960 2961 I.addReg(IdxReg); 2962 } else if (HasVIndex) { 2963 I.add(VIndex); 2964 } else if (HasVOffset) { 2965 I.add(VOffset); 2966 } 2967 2968 I.add(MI.getOperand(2)); // rsrc 2969 I.add(SOffset); 2970 I.addImm(Offset); 2971 renderExtractSLC(I, MI, 7); 2972 I.cloneMemRefs(MI); 2973 2974 MI.eraseFromParent(); 2975 2976 return true; 2977 } 2978 2979 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( 2980 MachineInstr &MI) const{ 2981 2982 MachineBasicBlock *MBB = MI.getParent(); 2983 const DebugLoc &DL = MI.getDebugLoc(); 2984 2985 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { 2986 Function &F = MBB->getParent()->getFunction(); 2987 DiagnosticInfoUnsupported 2988 NoFpRet(F, "return versions of fp atomics not supported", 2989 MI.getDebugLoc(), DS_Error); 2990 F.getContext().diagnose(NoFpRet); 2991 return false; 2992 } 2993 2994 // FIXME: This is only needed because tablegen requires number of dst operands 2995 // in match and replace pattern to be the same. Otherwise patterns can be 2996 // exported from SDag path. 2997 auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2)); 2998 2999 Register Data = MI.getOperand(3).getReg(); 3000 const unsigned Opc = MRI->getType(Data).isVector() ? 
3001 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; 3002 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3003 .addReg(Addr.first) 3004 .addReg(Data) 3005 .addImm(Addr.second) 3006 .addImm(0) // SLC 3007 .cloneMemRefs(MI); 3008 3009 MI.eraseFromParent(); 3010 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3011 } 3012 3013 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ 3014 MI.setDesc(TII.get(MI.getOperand(1).getImm())); 3015 MI.RemoveOperand(1); 3016 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3017 return true; 3018 } 3019 3020 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 3021 if (I.isPHI()) 3022 return selectPHI(I); 3023 3024 if (!I.isPreISelOpcode()) { 3025 if (I.isCopy()) 3026 return selectCOPY(I); 3027 return true; 3028 } 3029 3030 switch (I.getOpcode()) { 3031 case TargetOpcode::G_AND: 3032 case TargetOpcode::G_OR: 3033 case TargetOpcode::G_XOR: 3034 if (selectImpl(I, *CoverageInfo)) 3035 return true; 3036 return selectG_AND_OR_XOR(I); 3037 case TargetOpcode::G_ADD: 3038 case TargetOpcode::G_SUB: 3039 if (selectImpl(I, *CoverageInfo)) 3040 return true; 3041 return selectG_ADD_SUB(I); 3042 case TargetOpcode::G_UADDO: 3043 case TargetOpcode::G_USUBO: 3044 case TargetOpcode::G_UADDE: 3045 case TargetOpcode::G_USUBE: 3046 return selectG_UADDO_USUBO_UADDE_USUBE(I); 3047 case TargetOpcode::G_INTTOPTR: 3048 case TargetOpcode::G_BITCAST: 3049 case TargetOpcode::G_PTRTOINT: 3050 return selectCOPY(I); 3051 case TargetOpcode::G_CONSTANT: 3052 case TargetOpcode::G_FCONSTANT: 3053 return selectG_CONSTANT(I); 3054 case TargetOpcode::G_FNEG: 3055 if (selectImpl(I, *CoverageInfo)) 3056 return true; 3057 return selectG_FNEG(I); 3058 case TargetOpcode::G_FABS: 3059 if (selectImpl(I, *CoverageInfo)) 3060 return true; 3061 return selectG_FABS(I); 3062 case TargetOpcode::G_EXTRACT: 3063 return selectG_EXTRACT(I); 3064 case TargetOpcode::G_MERGE_VALUES: 3065 case TargetOpcode::G_BUILD_VECTOR: 3066 case TargetOpcode::G_CONCAT_VECTORS: 3067 return selectG_MERGE_VALUES(I); 3068 case TargetOpcode::G_UNMERGE_VALUES: 3069 return selectG_UNMERGE_VALUES(I); 3070 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 3071 return selectG_BUILD_VECTOR_TRUNC(I); 3072 case TargetOpcode::G_PTR_ADD: 3073 return selectG_PTR_ADD(I); 3074 case TargetOpcode::G_IMPLICIT_DEF: 3075 return selectG_IMPLICIT_DEF(I); 3076 case TargetOpcode::G_FREEZE: 3077 return selectCOPY(I); 3078 case TargetOpcode::G_INSERT: 3079 return selectG_INSERT(I); 3080 case TargetOpcode::G_INTRINSIC: 3081 return selectG_INTRINSIC(I); 3082 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3083 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 3084 case TargetOpcode::G_ICMP: 3085 if (selectG_ICMP(I)) 3086 return true; 3087 return selectImpl(I, *CoverageInfo); 3088 case TargetOpcode::G_LOAD: 3089 case TargetOpcode::G_STORE: 3090 case TargetOpcode::G_ATOMIC_CMPXCHG: 3091 case TargetOpcode::G_ATOMICRMW_XCHG: 3092 case TargetOpcode::G_ATOMICRMW_ADD: 3093 case TargetOpcode::G_ATOMICRMW_SUB: 3094 case TargetOpcode::G_ATOMICRMW_AND: 3095 case TargetOpcode::G_ATOMICRMW_OR: 3096 case TargetOpcode::G_ATOMICRMW_XOR: 3097 case TargetOpcode::G_ATOMICRMW_MIN: 3098 case TargetOpcode::G_ATOMICRMW_MAX: 3099 case TargetOpcode::G_ATOMICRMW_UMIN: 3100 case TargetOpcode::G_ATOMICRMW_UMAX: 3101 case TargetOpcode::G_ATOMICRMW_FADD: 3102 case AMDGPU::G_AMDGPU_ATOMIC_INC: 3103 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 3104 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 3105 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: 3106 return 
selectG_LOAD_STORE_ATOMICRMW(I); 3107 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 3108 return selectG_AMDGPU_ATOMIC_CMPXCHG(I); 3109 case TargetOpcode::G_SELECT: 3110 return selectG_SELECT(I); 3111 case TargetOpcode::G_TRUNC: 3112 return selectG_TRUNC(I); 3113 case TargetOpcode::G_SEXT: 3114 case TargetOpcode::G_ZEXT: 3115 case TargetOpcode::G_ANYEXT: 3116 case TargetOpcode::G_SEXT_INREG: 3117 if (selectImpl(I, *CoverageInfo)) 3118 return true; 3119 return selectG_SZA_EXT(I); 3120 case TargetOpcode::G_BRCOND: 3121 return selectG_BRCOND(I); 3122 case TargetOpcode::G_GLOBAL_VALUE: 3123 return selectG_GLOBAL_VALUE(I); 3124 case TargetOpcode::G_PTRMASK: 3125 return selectG_PTRMASK(I); 3126 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3127 return selectG_EXTRACT_VECTOR_ELT(I); 3128 case TargetOpcode::G_INSERT_VECTOR_ELT: 3129 return selectG_INSERT_VECTOR_ELT(I); 3130 case TargetOpcode::G_SHUFFLE_VECTOR: 3131 return selectG_SHUFFLE_VECTOR(I); 3132 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3133 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 3134 const AMDGPU::ImageDimIntrinsicInfo *Intr 3135 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); 3136 assert(Intr && "not an image intrinsic with image pseudo"); 3137 return selectImageIntrinsic(I, Intr); 3138 } 3139 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: 3140 return selectBVHIntrinsic(I); 3141 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3142 return selectAMDGPU_BUFFER_ATOMIC_FADD(I); 3143 default: 3144 return selectImpl(I, *CoverageInfo); 3145 } 3146 return false; 3147 } 3148 3149 InstructionSelector::ComplexRendererFns 3150 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 3151 return {{ 3152 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3153 }}; 3154 3155 } 3156 3157 std::pair<Register, unsigned> 3158 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, 3159 bool AllowAbs) const { 3160 Register Src = Root.getReg(); 3161 Register OrigSrc = Src; 3162 unsigned Mods = 0; 3163 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 3164 3165 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { 3166 Src = MI->getOperand(1).getReg(); 3167 Mods |= SISrcMods::NEG; 3168 MI = getDefIgnoringCopies(Src, *MRI); 3169 } 3170 3171 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) { 3172 Src = MI->getOperand(1).getReg(); 3173 Mods |= SISrcMods::ABS; 3174 } 3175 3176 if (Mods != 0 && 3177 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 3178 MachineInstr *UseMI = Root.getParent(); 3179 3180 // If we looked through copies to find source modifiers on an SGPR operand, 3181 // we now have an SGPR register source. To avoid potentially violating the 3182 // constant bus restriction, we need to insert a copy to a VGPR. 3183 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); 3184 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 3185 TII.get(AMDGPU::COPY), VGPRSrc) 3186 .addReg(Src); 3187 Src = VGPRSrc; 3188 } 3189 3190 return std::make_pair(Src, Mods); 3191 } 3192 3193 /// 3194 /// This will select either an SGPR or VGPR operand and will save us from 3195 /// having to write an extra tablegen pattern. 
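// Note (describing selectVSRC0 below): like selectVCSRC above, it simply
// re-adds the root operand unchanged; the renderer exists so one pattern
// can accept an operand from either register bank without separate
// SGPR/VGPR patterns.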
3196 InstructionSelector::ComplexRendererFns 3197 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 3198 return {{ 3199 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3200 }}; 3201 } 3202 3203 InstructionSelector::ComplexRendererFns 3204 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 3205 Register Src; 3206 unsigned Mods; 3207 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3208 3209 return {{ 3210 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3211 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3212 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3213 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3214 }}; 3215 } 3216 3217 InstructionSelector::ComplexRendererFns 3218 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 3219 Register Src; 3220 unsigned Mods; 3221 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3222 3223 return {{ 3224 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3225 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3226 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3227 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3228 }}; 3229 } 3230 3231 InstructionSelector::ComplexRendererFns 3232 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 3233 return {{ 3234 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 3235 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3236 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3237 }}; 3238 } 3239 3240 InstructionSelector::ComplexRendererFns 3241 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 3242 Register Src; 3243 unsigned Mods; 3244 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3245 3246 return {{ 3247 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3248 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3249 }}; 3250 } 3251 3252 InstructionSelector::ComplexRendererFns 3253 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 3254 Register Src; 3255 unsigned Mods; 3256 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); 3257 3258 return {{ 3259 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3261 }}; 3262 } 3263 3264 InstructionSelector::ComplexRendererFns 3265 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3266 Register Reg = Root.getReg(); 3267 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3268 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || 3269 Def->getOpcode() == AMDGPU::G_FABS)) 3270 return {}; 3271 return {{ 3272 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3273 }}; 3274 } 3275 3276 std::pair<Register, unsigned> 3277 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3278 Register Src, const MachineRegisterInfo &MRI) const { 3279 unsigned Mods = 0; 3280 MachineInstr *MI = MRI.getVRegDef(Src); 3281 3282 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3283 // It's possible to see an f32 fneg here, but unlikely. 3284 // TODO: Treat f32 fneg as only high bit. 3285 MRI.getType(Src) == LLT::vector(2, 16)) { 3286 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3287 Src = MI->getOperand(1).getReg(); 3288 MI = MRI.getVRegDef(Src); 3289 } 3290 3291 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3292 3293 // Packed instructions do not have abs modifiers. 
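// Note (assumption about the flag's meaning): OP_SEL_1 is the op_sel_hi
// default for a packed operand, i.e. the high half of the source feeds the
// high half of the result; explicit op_sel matching is still the TODO
// above.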
  Mods |= SISrcMods::OP_SEL_1;

  return std::make_pair(Src, Mods);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
  if (!isKnownNeverNaN(Src, *MRI))
    return None;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
  if (!EncodedImm)
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  Register PtrReg = GEPInfo.SgprParts[0];
  Optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
  if (!EncodedImm)
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  // SGPR offset is unsigned.
  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
  Register PtrReg = GEPInfo.SgprParts[0];
  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}

template <bool Signed>
std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  auto Default = std::make_pair(Root.getReg(), 0);

  if (!STI.hasFlatInstOffsets())
    return Default;

  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
  if (ConstOffset == 0)
    return Default;

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
    return Default;

  return std::make_pair(PtrBase, ConstOffset);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

/// Match a zero extend from a 32-bit value to 64 bits.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
  Register ZExtSrc;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();

  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt()))
    return Def->getOperand(1).getReg();

  return Register();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
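  // For example, for an address of the form:
  //   %off:_(s64) = G_CONSTANT i64 4096
  //   %addr:_(p1) = G_PTR_ADD %base:_(p1), %off
  // this yields PtrBase = %base and ConstOffset = 4096, looking through any
  // intervening copies.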
3483 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3484 3485 if (ConstOffset != 0) { 3486 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) { 3487 Addr = PtrBase; 3488 ImmOffset = ConstOffset; 3489 } else if (ConstOffset > 0) { 3490 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); 3491 if (!PtrBaseDef) 3492 return None; 3493 3494 if (isSGPR(PtrBaseDef->Reg)) { 3495 // Offset is too large. 3496 // 3497 // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) 3498 // + (large_offset & MaxOffset); 3499 int64_t SplitImmOffset, RemainderOffset; 3500 std::tie(SplitImmOffset, RemainderOffset) 3501 = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true); 3502 3503 if (isUInt<32>(RemainderOffset)) { 3504 MachineInstr *MI = Root.getParent(); 3505 MachineBasicBlock *MBB = MI->getParent(); 3506 Register HighBits 3507 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3508 3509 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3510 HighBits) 3511 .addImm(RemainderOffset); 3512 3513 return {{ 3514 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr 3515 [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset 3516 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, 3517 }}; 3518 } 3519 } 3520 } 3521 } 3522 3523 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3524 if (!AddrDef) 3525 return None; 3526 3527 // Match the variable offset. 3528 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) { 3529 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 3530 // drop this. 3531 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 3532 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT) 3533 return None; 3534 3535 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 3536 // moves required to copy a 64-bit SGPR to VGPR. 3537 const Register SAddr = AddrDef->Reg; 3538 if (!isSGPR(SAddr)) 3539 return None; 3540 3541 MachineInstr *MI = Root.getParent(); 3542 MachineBasicBlock *MBB = MI->getParent(); 3543 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3544 3545 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 3546 VOffset) 3547 .addImm(0); 3548 3549 return {{ 3550 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 3551 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 3552 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3553 }}; 3554 } 3555 3556 // Look through the SGPR->VGPR copy. 3557 Register SAddr = 3558 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3559 if (!SAddr || !isSGPR(SAddr)) 3560 return None; 3561 3562 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3563 3564 // It's possible voffset is an SGPR here, but the copy to VGPR will be 3565 // inserted later. 
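  // matchZeroExtendFromS32 accepts either a plain G_ZEXT from s32 or the
  // legalized form %zext:_(s64) = G_MERGE_VALUES %lo:_(s32), 0.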
3566 Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset); 3567 if (!VOffset) 3568 return None; 3569 3570 return {{[=](MachineInstrBuilder &MIB) { // saddr 3571 MIB.addReg(SAddr); 3572 }, 3573 [=](MachineInstrBuilder &MIB) { // voffset 3574 MIB.addReg(VOffset); 3575 }, 3576 [=](MachineInstrBuilder &MIB) { // offset 3577 MIB.addImm(ImmOffset); 3578 }}}; 3579 } 3580 3581 InstructionSelector::ComplexRendererFns 3582 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 3583 Register Addr = Root.getReg(); 3584 Register PtrBase; 3585 int64_t ConstOffset; 3586 int64_t ImmOffset = 0; 3587 3588 // Match the immediate offset first, which canonically is moved as low as 3589 // possible. 3590 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 3591 3592 if (ConstOffset != 0 && 3593 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { 3594 Addr = PtrBase; 3595 ImmOffset = ConstOffset; 3596 } 3597 3598 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3599 if (!AddrDef) 3600 return None; 3601 3602 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 3603 int FI = AddrDef->MI->getOperand(1).getIndex(); 3604 return {{ 3605 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 3606 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3607 }}; 3608 } 3609 3610 Register SAddr = AddrDef->Reg; 3611 3612 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3613 Register LHS = AddrDef->MI->getOperand(1).getReg(); 3614 Register RHS = AddrDef->MI->getOperand(2).getReg(); 3615 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 3616 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 3617 3618 if (LHSDef && RHSDef && 3619 LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 3620 isSGPR(RHSDef->Reg)) { 3621 int FI = LHSDef->MI->getOperand(1).getIndex(); 3622 MachineInstr &I = *Root.getParent(); 3623 MachineBasicBlock *BB = I.getParent(); 3624 const DebugLoc &DL = I.getDebugLoc(); 3625 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3626 3627 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr) 3628 .addFrameIndex(FI) 3629 .addReg(RHSDef->Reg); 3630 } 3631 } 3632 3633 if (!isSGPR(SAddr)) 3634 return None; 3635 3636 return {{ 3637 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 3638 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 3639 }}; 3640 } 3641 3642 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { 3643 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); 3644 return PSV && PSV->isStack(); 3645 } 3646 3647 InstructionSelector::ComplexRendererFns 3648 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 3649 MachineInstr *MI = Root.getParent(); 3650 MachineBasicBlock *MBB = MI->getParent(); 3651 MachineFunction *MF = MBB->getParent(); 3652 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3653 3654 int64_t Offset = 0; 3655 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 3656 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 3657 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3658 3659 // TODO: Should this be inside the render function? The iterator seems to 3660 // move. 
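    // The MUBUF immediate offset field is only 12 bits, so split the constant
    // address into a 4096-byte aligned base materialized into a VGPR and a
    // 12-bit remainder, e.g. 0x12345 -> HighBits = 0x12000, offset = 0x345.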
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
      .addImm(Offset & ~4095);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & 4095);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  Optional<int> FI;
  Register VAddr = Root.getReg();
  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
    if (isBaseWithConstantOffset(Root, *MRI)) {
      const MachineOperand &LHS = RootDef->getOperand(1);
      const MachineOperand &RHS = RootDef->getOperand(2);
      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
      if (LHSDef && RHSDef) {
        int64_t PossibleOffset =
            RHSDef->getOperand(1).getCImm()->getSExtValue();
        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
            (!STI.privateMemoryResourceIsRangeChecked() ||
             KnownBits->signBitIsZero(LHS.getReg()))) {
          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
            FI = LHSDef->getOperand(1).getIndex();
          else
            VAddr = LHS.getReg();
          Offset = PossibleOffset;
        }
      }
    } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
      FI = RootDef->getOperand(1).getIndex();
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI.hasValue())
               MIB.addFrameIndex(FI.getValue());
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return KnownBits->signBitIsZero(Base);
}

bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
                                                 int64_t Offset1,
                                                 unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3756 return KnownBits->signBitIsZero(Base); 3757 } 3758 3759 InstructionSelector::ComplexRendererFns 3760 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 3761 MachineOperand &Root) const { 3762 MachineInstr *MI = Root.getParent(); 3763 MachineBasicBlock *MBB = MI->getParent(); 3764 3765 int64_t Offset = 0; 3766 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 3767 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) 3768 return {}; 3769 3770 const MachineFunction *MF = MBB->getParent(); 3771 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 3772 const MachineMemOperand *MMO = *MI->memoperands_begin(); 3773 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); 3774 3775 return {{ 3776 [=](MachineInstrBuilder &MIB) { // rsrc 3777 MIB.addReg(Info->getScratchRSrcReg()); 3778 }, 3779 [=](MachineInstrBuilder &MIB) { // soffset 3780 if (isStackPtrRelative(PtrInfo)) 3781 MIB.addReg(Info->getStackPtrOffsetReg()); 3782 else 3783 MIB.addImm(0); 3784 }, 3785 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 3786 }}; 3787 } 3788 3789 std::pair<Register, unsigned> 3790 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 3791 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3792 if (!RootDef) 3793 return std::make_pair(Root.getReg(), 0); 3794 3795 int64_t ConstAddr = 0; 3796 3797 Register PtrBase; 3798 int64_t Offset; 3799 std::tie(PtrBase, Offset) = 3800 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3801 3802 if (Offset) { 3803 if (isDSOffsetLegal(PtrBase, Offset)) { 3804 // (add n0, c0) 3805 return std::make_pair(PtrBase, Offset); 3806 } 3807 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3808 // TODO 3809 3810 3811 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3812 // TODO 3813 3814 } 3815 3816 return std::make_pair(Root.getReg(), 0); 3817 } 3818 3819 InstructionSelector::ComplexRendererFns 3820 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 3821 Register Reg; 3822 unsigned Offset; 3823 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 3824 return {{ 3825 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3826 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 3827 }}; 3828 } 3829 3830 InstructionSelector::ComplexRendererFns 3831 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 3832 return selectDSReadWrite2(Root, 4); 3833 } 3834 3835 InstructionSelector::ComplexRendererFns 3836 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { 3837 return selectDSReadWrite2(Root, 8); 3838 } 3839 3840 InstructionSelector::ComplexRendererFns 3841 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, 3842 unsigned Size) const { 3843 Register Reg; 3844 unsigned Offset; 3845 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size); 3846 return {{ 3847 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3848 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 3849 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 3850 }}; 3851 } 3852 3853 std::pair<Register, unsigned> 3854 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, 3855 unsigned Size) const { 3856 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 3857 if (!RootDef) 3858 return std::make_pair(Root.getReg(), 0); 3859 3860 int64_t ConstAddr = 0; 3861 3862 Register PtrBase; 3863 int64_t Offset; 3864 std::tie(PtrBase, Offset) = 3865 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 3866 3867 
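  // The two read2/write2 offsets are encoded in units of the element size, so
  // e.g. with Size == 4 a byte offset of 8 becomes offset0 = 2 and
  // offset1 = 3 (the renderer above adds Offset and Offset + 1).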
if (Offset) { 3868 int64_t OffsetValue0 = Offset; 3869 int64_t OffsetValue1 = Offset + Size; 3870 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) { 3871 // (add n0, c0) 3872 return std::make_pair(PtrBase, OffsetValue0 / Size); 3873 } 3874 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 3875 // TODO 3876 3877 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 3878 // TODO 3879 3880 } 3881 3882 return std::make_pair(Root.getReg(), 0); 3883 } 3884 3885 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 3886 /// the base value with the constant offset. There may be intervening copies 3887 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 3888 /// not match the pattern. 3889 std::pair<Register, int64_t> 3890 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 3891 Register Root, const MachineRegisterInfo &MRI) const { 3892 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); 3893 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 3894 return {Root, 0}; 3895 3896 MachineOperand &RHS = RootI->getOperand(2); 3897 Optional<ValueAndVReg> MaybeOffset 3898 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); 3899 if (!MaybeOffset) 3900 return {Root, 0}; 3901 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; 3902 } 3903 3904 static void addZeroImm(MachineInstrBuilder &MIB) { 3905 MIB.addImm(0); 3906 } 3907 3908 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 3909 /// BasePtr is not valid, a null base pointer will be used. 3910 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3911 uint32_t FormatLo, uint32_t FormatHi, 3912 Register BasePtr) { 3913 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3914 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 3915 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3916 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 3917 3918 B.buildInstr(AMDGPU::S_MOV_B32) 3919 .addDef(RSrc2) 3920 .addImm(FormatLo); 3921 B.buildInstr(AMDGPU::S_MOV_B32) 3922 .addDef(RSrc3) 3923 .addImm(FormatHi); 3924 3925 // Build the half of the subregister with the constants before building the 3926 // full 128-bit register. If we are building multiple resource descriptors, 3927 // this will allow CSEing of the 2-component register. 3928 B.buildInstr(AMDGPU::REG_SEQUENCE) 3929 .addDef(RSrcHi) 3930 .addReg(RSrc2) 3931 .addImm(AMDGPU::sub0) 3932 .addReg(RSrc3) 3933 .addImm(AMDGPU::sub1); 3934 3935 Register RSrcLo = BasePtr; 3936 if (!BasePtr) { 3937 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3938 B.buildInstr(AMDGPU::S_MOV_B64) 3939 .addDef(RSrcLo) 3940 .addImm(0); 3941 } 3942 3943 B.buildInstr(AMDGPU::REG_SEQUENCE) 3944 .addDef(RSrc) 3945 .addReg(RSrcLo) 3946 .addImm(AMDGPU::sub0_sub1) 3947 .addReg(RSrcHi) 3948 .addImm(AMDGPU::sub2_sub3); 3949 3950 return RSrc; 3951 } 3952 3953 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 3954 const SIInstrInfo &TII, Register BasePtr) { 3955 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 3956 3957 // FIXME: Why are half the "default" bits ignored based on the addressing 3958 // mode? 
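  // Word 2 of the descriptor is left as 0 here, while the OFFSET variant
  // below passes ~0u instead; only the high half of the default data format
  // is used in either case.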
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted.
    // FIXME: We don't know that this was defined by operand 0.
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
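        // buildRSRC materializes an all-zero base (S_MOV_B64 0) when given a
        // null BasePtr, which is what happens here since SRDPtr is left
        // unset.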
4055 VAddr = N0; 4056 } else { 4057 SRDPtr = N3; 4058 VAddr = N2; 4059 } 4060 } else { 4061 // N2 is not divergent. 4062 SRDPtr = N2; 4063 VAddr = N3; 4064 } 4065 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 4066 // Use the default null pointer in the resource 4067 VAddr = N0; 4068 } else { 4069 // N0 -> offset, or 4070 // (N0 + C1) -> offset 4071 SRDPtr = N0; 4072 } 4073 4074 MachineIRBuilder B(*Root.getParent()); 4075 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 4076 splitIllegalMUBUFOffset(B, SOffset, Offset); 4077 return true; 4078 } 4079 4080 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 4081 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 4082 int64_t &Offset) const { 4083 4084 // FIXME: Pattern should not reach here. 4085 if (STI.useFlatForGlobal()) 4086 return false; 4087 4088 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 4089 if (shouldUseAddr64(AddrData)) 4090 return false; 4091 4092 // N0 -> offset, or 4093 // (N0 + C1) -> offset 4094 Register SRDPtr = AddrData.N0; 4095 Offset = AddrData.Offset; 4096 4097 // TODO: Look through extensions for 32-bit soffset. 4098 MachineIRBuilder B(*Root.getParent()); 4099 4100 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 4101 splitIllegalMUBUFOffset(B, SOffset, Offset); 4102 return true; 4103 } 4104 4105 InstructionSelector::ComplexRendererFns 4106 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 4107 Register VAddr; 4108 Register RSrcReg; 4109 Register SOffset; 4110 int64_t Offset = 0; 4111 4112 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4113 return {}; 4114 4115 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4116 // pattern. 4117 return {{ 4118 [=](MachineInstrBuilder &MIB) { // rsrc 4119 MIB.addReg(RSrcReg); 4120 }, 4121 [=](MachineInstrBuilder &MIB) { // vaddr 4122 MIB.addReg(VAddr); 4123 }, 4124 [=](MachineInstrBuilder &MIB) { // soffset 4125 if (SOffset) 4126 MIB.addReg(SOffset); 4127 else 4128 MIB.addImm(0); 4129 }, 4130 [=](MachineInstrBuilder &MIB) { // offset 4131 MIB.addImm(Offset); 4132 }, 4133 addZeroImm, // glc 4134 addZeroImm, // slc 4135 addZeroImm, // tfe 4136 addZeroImm, // dlc 4137 addZeroImm // swz 4138 }}; 4139 } 4140 4141 InstructionSelector::ComplexRendererFns 4142 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 4143 Register RSrcReg; 4144 Register SOffset; 4145 int64_t Offset = 0; 4146 4147 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4148 return {}; 4149 4150 return {{ 4151 [=](MachineInstrBuilder &MIB) { // rsrc 4152 MIB.addReg(RSrcReg); 4153 }, 4154 [=](MachineInstrBuilder &MIB) { // soffset 4155 if (SOffset) 4156 MIB.addReg(SOffset); 4157 else 4158 MIB.addImm(0); 4159 }, 4160 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4161 addZeroImm, // glc 4162 addZeroImm, // slc 4163 addZeroImm, // tfe 4164 addZeroImm, // dlc 4165 addZeroImm // swz 4166 }}; 4167 } 4168 4169 InstructionSelector::ComplexRendererFns 4170 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { 4171 Register VAddr; 4172 Register RSrcReg; 4173 Register SOffset; 4174 int64_t Offset = 0; 4175 4176 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 4177 return {}; 4178 4179 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 4180 // pattern. 
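  // Unlike the non-atomic variant above, only the slc bit is rendered here;
  // the other cache policy operands are not part of the atomic addressing
  // pattern.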
4181 return {{ 4182 [=](MachineInstrBuilder &MIB) { // rsrc 4183 MIB.addReg(RSrcReg); 4184 }, 4185 [=](MachineInstrBuilder &MIB) { // vaddr 4186 MIB.addReg(VAddr); 4187 }, 4188 [=](MachineInstrBuilder &MIB) { // soffset 4189 if (SOffset) 4190 MIB.addReg(SOffset); 4191 else 4192 MIB.addImm(0); 4193 }, 4194 [=](MachineInstrBuilder &MIB) { // offset 4195 MIB.addImm(Offset); 4196 }, 4197 addZeroImm // slc 4198 }}; 4199 } 4200 4201 InstructionSelector::ComplexRendererFns 4202 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { 4203 Register RSrcReg; 4204 Register SOffset; 4205 int64_t Offset = 0; 4206 4207 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 4208 return {}; 4209 4210 return {{ 4211 [=](MachineInstrBuilder &MIB) { // rsrc 4212 MIB.addReg(RSrcReg); 4213 }, 4214 [=](MachineInstrBuilder &MIB) { // soffset 4215 if (SOffset) 4216 MIB.addReg(SOffset); 4217 else 4218 MIB.addImm(0); 4219 }, 4220 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 4221 addZeroImm // slc 4222 }}; 4223 } 4224 4225 /// Get an immediate that must be 32-bits, and treated as zero extended. 4226 static Optional<uint64_t> getConstantZext32Val(Register Reg, 4227 const MachineRegisterInfo &MRI) { 4228 // getConstantVRegVal sexts any values, so see if that matters. 4229 Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI); 4230 if (!OffsetVal || !isInt<32>(*OffsetVal)) 4231 return None; 4232 return Lo_32(*OffsetVal); 4233 } 4234 4235 InstructionSelector::ComplexRendererFns 4236 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 4237 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4238 if (!OffsetVal) 4239 return {}; 4240 4241 Optional<int64_t> EncodedImm = 4242 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); 4243 if (!EncodedImm) 4244 return {}; 4245 4246 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4247 } 4248 4249 InstructionSelector::ComplexRendererFns 4250 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 4251 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 4252 4253 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 4254 if (!OffsetVal) 4255 return {}; 4256 4257 Optional<int64_t> EncodedImm 4258 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 4259 if (!EncodedImm) 4260 return {}; 4261 4262 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 4263 } 4264 4265 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, 4266 const MachineInstr &MI, 4267 int OpIdx) const { 4268 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4269 "Expected G_CONSTANT"); 4270 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); 4271 } 4272 4273 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, 4274 const MachineInstr &MI, 4275 int OpIdx) const { 4276 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4277 "Expected G_CONSTANT"); 4278 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); 4279 } 4280 4281 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, 4282 const MachineInstr &MI, 4283 int OpIdx) const { 4284 assert(OpIdx == -1); 4285 4286 const MachineOperand &Op = MI.getOperand(1); 4287 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) 4288 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 4289 else { 4290 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected 
G_CONSTANT"); 4291 MIB.addImm(Op.getCImm()->getSExtValue()); 4292 } 4293 } 4294 4295 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, 4296 const MachineInstr &MI, 4297 int OpIdx) const { 4298 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && 4299 "Expected G_CONSTANT"); 4300 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); 4301 } 4302 4303 /// This only really exists to satisfy DAG type checking machinery, so is a 4304 /// no-op here. 4305 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, 4306 const MachineInstr &MI, 4307 int OpIdx) const { 4308 MIB.addImm(MI.getOperand(OpIdx).getImm()); 4309 } 4310 4311 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, 4312 const MachineInstr &MI, 4313 int OpIdx) const { 4314 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4315 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); 4316 } 4317 4318 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, 4319 const MachineInstr &MI, 4320 int OpIdx) const { 4321 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4322 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); 4323 } 4324 4325 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, 4326 const MachineInstr &MI, 4327 int OpIdx) const { 4328 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4329 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); 4330 } 4331 4332 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, 4333 const MachineInstr &MI, 4334 int OpIdx) const { 4335 assert(OpIdx >= 0 && "expected to match an immediate operand"); 4336 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); 4337 } 4338 4339 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, 4340 const MachineInstr &MI, 4341 int OpIdx) const { 4342 MIB.addFrameIndex((MI.getOperand(1).getIndex())); 4343 } 4344 4345 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { 4346 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); 4347 } 4348 4349 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const { 4350 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm()); 4351 } 4352 4353 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const { 4354 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm()); 4355 } 4356 4357 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { 4358 return TII.isInlineConstant(Imm); 4359 } 4360